/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tmu_read;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        struct schedule_node *last_unifa;
        struct schedule_node *last_setmsf;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

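/* Adds a dependency edge between two nodes, flipping before/after when the
 * pass walks the instructions in reverse.  The edge data records whether
 * this is a write-after-read dependency discovered on the reverse pass.
 */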
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        uintptr_t edge_data = write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
              struct schedule_node *before,
              struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

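/* Returns true if the instruction accesses the TLB, either via the
 * ldtlb/ldtlbu signals or by a magic write to TLB/TLBU from the add or mul
 * ALU.
 */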
static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->sig.ldtlb || inst->sig.ldtlbu)
                return true;

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.op != V3D_QPU_A_NOP &&
            inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.op != V3D_QPU_M_NOP &&
            inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

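/* Adds the read dependencies for one ALU input on pre-7.1 hardware, where
 * input muxes select between the accumulators and the raddr_a/raddr_b
 * register file reads.
 */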
static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        assert(state->devinfo->ver < 71);
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                if (!n->inst->qpu.sig.small_imm_b) {
                        add_read_dep(state,
                                     state->last_rf[n->inst->qpu.raddr_b], n);
                }
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}

static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint8_t raddr, bool is_small_imm)
{
        assert(state->devinfo->ver >= 71);

        if (!is_small_imm)
                add_read_dep(state, state->last_rf[raddr], n);
}

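/* Writes to these magic TMU addresses trigger the TMU lookup, terminating
 * the current sequence of TMU register writes.
 */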
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
        switch (waddr) {
        case V3D_QPU_WADDR_TMUS:
        case V3D_QPU_WADDR_TMUSCM:
        case V3D_QPU_WADDR_TMUSF:
        case V3D_QPU_WADDR_TMUSLOD:
        case V3D_QPU_WADDR_TMUA:
        case V3D_QPU_WADDR_TMUAU:
                return true;
        default:
                return false;
        }
}

static bool
is_tmu_sequence_terminator(struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) {
                if (!inst->qpu.alu.add.magic_write)
                        return false;
                return tmu_write_is_sequence_terminator(inst->qpu.alu.add.waddr);
        }

        if (inst->qpu.alu.mul.op != V3D_QPU_M_NOP) {
                if (!inst->qpu.alu.mul.magic_write)
                        return false;
                return tmu_write_is_sequence_terminator(inst->qpu.alu.mul.waddr);
        }

        return false;
}

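/* TMU register writes can be reordered against each other as long as they
 * are neither sequence terminators nor data writes (TMUD), which must keep
 * their position within the lookup sequence.
 */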
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
        if (tmu_write_is_sequence_terminator(waddr))
                return false;

        if (waddr == V3D_QPU_WADDR_TMUD)
                return false;

        return true;
}

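/* Adds the write dependencies for an instruction's write address: the
 * register file when the write is not magic, otherwise the TMU, SFU, VPM,
 * TLB, sync or accumulator destination it names.
 */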
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
                if (can_reorder_tmu_write(state->devinfo, waddr))
                        add_read_dep(state, state->last_tmu_write, n);
                else
                        add_write_dep(state, &state->last_tmu_write, n);

                if (tmu_write_is_sequence_terminator(waddr))
                        add_write_dep(state, &state->last_tmu_config, n);
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_SYNC:
                case V3D_QPU_WADDR_SYNCB:
                case V3D_QPU_WADDR_SYNCU:
                        /* For CS barrier(): Sync against any other memory
                         * accesses.  There doesn't appear to be any need for
                         * barriers to affect ALU operations.
                         */
                        add_write_dep(state, &state->last_tmu_write, n);
                        add_write_dep(state, &state->last_tmu_read, n);
                        break;

                case V3D_QPU_WADDR_UNIFA:
                        add_write_dep(state, &state->last_unifa, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;
        /* If the input and output segments are shared, then all VPM reads to
         * a location need to happen before all writes.  We handle this by
         * serializing all VPM operations for now.
         *
         * FIXME: we are assuming that the segments are shared. That is
         * correct right now as we are only using shared, but technically you
         * can choose.
         */
        bool separate_vpm_segment = false;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.add.a.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.add.a.raddr,
                                           inst->sig.small_imm_a);
                }
        }
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.add.b.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.add.b.raddr,
                                           inst->sig.small_imm_b);
                }
        }

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.mul.a.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.mul.a.raddr,
                                           inst->sig.small_imm_c);
                }
        }
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.mul.b.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.mul.b.raddr,
                                           inst->sig.small_imm_d);
                }
        }

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_LDVPMV_IN:
        case V3D_QPU_A_LDVPMD_IN:
        case V3D_QPU_A_LDVPMG_IN:
        case V3D_QPU_A_LDVPMP:
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                add_read_dep(state, state->last_setmsf, n);
                break;

        case V3D_QPU_A_SETMSF:
                add_write_dep(state, &state->last_setmsf, n);
                add_write_dep(state, &state->last_tmu_write, n);
                FALLTHROUGH;
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        case V3D_QPU_A_BALLOT:
        case V3D_QPU_A_BCASTF:
        case V3D_QPU_A_ALLEQ:
        case V3D_QPU_A_ALLFEQ:
                add_read_dep(state, state->last_setmsf, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);
        if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
                add_write_dep(state, &state->last_rf[0], n);

        /* If we add any more dependencies here we should consider whether we
         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
         */
        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);
                add_write_dep(state, &state->last_rtop, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (v3d_qpu_waits_on_tmu(inst)) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_read, n);
                /* Keep TMU loads after their TMU lookup terminator */
                add_read_dep(state, state->last_tmu_config, n);
        }

        /* Allow wrtmuc to be reordered with other instructions in the
         * same TMU sequence by using a read dependency on the last TMU
         * sequence terminator.
         */
        if (inst->sig.wrtmuc)
                add_read_dep(state, state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_write_dep(state, &state->last_tlb, n);

        if (inst->sig.ldvpm) {
                add_write_dep(state, &state->last_vpm_read, n);

                /* At least for now, we're doing shared I/O segments, so queue
                 * all writes after all reads.
                 */
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
        }

        /* inst->sig.ldunif or sideband uniform read */
        if (vir_has_uniform(qinst))
                add_write_dep(state, &state->last_unif, n);

        /* Both unifa and ldunifa must preserve ordering */
        if (inst->sig.ldunifa || inst->sig.ldunifarf)
                add_write_dep(state, &state->last_unifa, n);

        if (v3d_qpu_reads_flags(inst))
                add_read_dep(state, state->last_sf, n);
        if (v3d_qpu_writes_flags(inst))
                add_write_dep(state, &state->last_sf, n);
}

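/* The forward pass below picks up read-after-write and write-after-write
 * dependencies; the reverse pass then catches the write-after-read ones.
 */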
static void
calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

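/* Tracks the ticks at which hazard-relevant events were last scheduled so
 * that candidate instructions can be rejected (or deprioritized) while a
 * hazard window is still open.
 */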
struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_magic_sfu_write_tick;
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;
        bool first_thrsw_emitted;
        bool last_thrsw_emitted;
        bool fixup_ldvary;
        int ldvary_count;
        int pending_ldtmu_count;
        bool first_ldtmu_after_thrsw;

        /* V3D 7.x */
        int last_implicit_rf0_write_tick;
        bool has_rf0_flops_conflict;
};

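/* r4 is loaded by magic SFU writes with a two-instruction delay and r5 by
 * ldvary with a one-instruction delay, so reads inside those windows would
 * observe stale values.
 */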
static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon(struct choose_scoreboard *scoreboard,
               const struct v3d_qpu_instr *inst, uint8_t raddr)
{
        switch (raddr) {
        case 0: /* ldvary delayed write of C coefficient to rf0 */
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon_after_write(const struct v3d_device_info *devinfo,
                           struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
                                        return true;
                        }
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
                                        return true;
                        }
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
                                        return true;
                        }
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
                                        return true;
                        }
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        if (devinfo->ver == 42)
                return false;

        /* Don't schedule anything that writes rf0 right after ldvary, since
         * that would clash with the ldvary's delayed rf0 write (the exception
         * is another ldvary, since its implicit rf0 write would also have
         * one cycle of delay and would not clash).
         */
        if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
            (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
             (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
              !inst->sig.ldvary))) {
                return true;
        }

        return false;
}

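/* Returns whether the pixel scoreboard is considered locked: the relevant
 * thread switch (the first or the last one, per
 * lock_scoreboard_on_first_thrsw) has been emitted and its three delay
 * slots have passed.
 */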
static bool
scoreboard_is_locked(struct choose_scoreboard *scoreboard,
                     bool lock_scoreboard_on_first_thrsw)
{
        if (lock_scoreboard_on_first_thrsw) {
                return scoreboard->first_thrsw_emitted &&
                       scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
        }

        return scoreboard->last_thrsw_emitted &&
               scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
}

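/* TLB accesses implicitly lock the pixel scoreboard, so they must wait
 * until the scoreboard has been locked by the relevant thread switch.
 */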
static bool
pixel_scoreboard_too_soon(struct v3d_compile *c,
                          struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return qpu_inst_is_tlb(inst) &&
               !scoreboard_is_locked(scoreboard,
                                     c->lock_scoreboard_on_first_thrsw);
}

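/* Returns true if the instruction reads the given register file address,
 * checking the raddr_a/raddr_b muxes on 4.x and the per-source raddrs on
 * 7.x.
 */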
static bool
qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
                        const struct v3d_qpu_instr *inst,
                        uint32_t waddr)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (devinfo->ver < 71) {
                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
                    inst->raddr_a == waddr)
                        return true;

                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
                    !inst->sig.small_imm_b && (inst->raddr_b == waddr))
                        return true;
        } else {
                if (v3d71_qpu_reads_raddr(inst, waddr))
                        return true;
        }

        return false;
}

static bool
read_stalls(const struct v3d_device_info *devinfo,
            struct choose_scoreboard *scoreboard,
            const struct v3d_qpu_instr *inst)
{
        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
                qpu_instruction_uses_rf(devinfo, inst,
                                        scoreboard->last_stallable_sfu_reg);
}

/* We define a maximum schedule priority so that we can subtract it when an
 * instruction stalls: stalling instructions end up with a negative priority
 * and therefore rank below regular instructions.
 */
#define MAX_SCHEDULE_PRIORITY 16

static int
get_instruction_priority(const struct v3d_device_info *devinfo,
                         const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Empirical testing shows that using priorities to hide latency of
         * TMU operations when scheduling QPU leads to slightly worse
         * performance, even at 2 threads. We think this is because the thread
         * switching is already quite effective at hiding latency and NIR
         * scheduling (and possibly TMU pipelining too) are sufficient to hide
         * TMU latency, so piling up on that here doesn't provide any benefits
         * and instead may cause us to postpone critical paths that depend on
         * the TMU results.
         */
#if 0
        /* Schedule texture read results collection late to hide latency. */
        if (v3d_qpu_waits_on_tmu(inst))
                return next_score;
        next_score++;
#endif

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

#if 0
        /* Schedule texture read setup early to hide their latency better. */
        if (v3d_qpu_writes_tmu(devinfo, inst))
                return next_score;
        next_score++;
#endif

        /* If this assert fires, MAX_SCHEDULE_PRIORITY needs to be raised. */
        assert(next_score < MAX_SCHEDULE_PRIORITY);

        return baseline_score;
}

enum {
        V3D_PERIPHERAL_VPM_READ           = (1 << 0),
        V3D_PERIPHERAL_VPM_WRITE          = (1 << 1),
        V3D_PERIPHERAL_VPM_WAIT           = (1 << 2),
        V3D_PERIPHERAL_SFU                = (1 << 3),
        V3D_PERIPHERAL_TMU_WRITE          = (1 << 4),
        V3D_PERIPHERAL_TMU_READ           = (1 << 5),
        V3D_PERIPHERAL_TMU_WAIT           = (1 << 6),
        V3D_PERIPHERAL_TMU_WRTMUC_SIG     = (1 << 7),
        V3D_PERIPHERAL_TSY                = (1 << 8),
        V3D_PERIPHERAL_TLB_READ           = (1 << 9),
        V3D_PERIPHERAL_TLB_WRITE          = (1 << 10),
};

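/* Returns a bitmask of the peripheral units an instruction accesses, used
 * below to decide whether two instructions can be merged into a single QPU
 * instruction.
 */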
static uint32_t
qpu_peripherals(const struct v3d_device_info *devinfo,
                const struct v3d_qpu_instr *inst)
{
        uint32_t result = 0;
        if (v3d_qpu_reads_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_READ;
        if (v3d_qpu_writes_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_WRITE;
        if (v3d_qpu_waits_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_WAIT;

        if (v3d_qpu_writes_tmu(devinfo, inst))
                result |= V3D_PERIPHERAL_TMU_WRITE;
        if (inst->sig.ldtmu)
                result |= V3D_PERIPHERAL_TMU_READ;
        if (inst->sig.wrtmuc)
                result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;

        if (v3d_qpu_uses_sfu(inst))
                result |= V3D_PERIPHERAL_SFU;

        if (v3d_qpu_reads_tlb(inst))
                result |= V3D_PERIPHERAL_TLB_READ;
        if (v3d_qpu_writes_tlb(inst))
                result |= V3D_PERIPHERAL_TLB_WRITE;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
                        result |= V3D_PERIPHERAL_TSY;
                }

                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
                        result |= V3D_PERIPHERAL_TMU_WAIT;
        }

        return result;
}

static bool
qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
                                 const struct v3d_qpu_instr *a,
                                 const struct v3d_qpu_instr *b)
{
        const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
        const uint32_t b_peripherals = qpu_peripherals(devinfo, b);

        /* We can always do one peripheral access per instruction. */
        if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
                return true;

        /* V3D 4.x can't do more than one peripheral access except in a
         * few cases:
         */
        if (devinfo->ver == 42) {
                /* WRTMUC signal with TMU register write (other than tmuc). */
                if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                    b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
                        return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
                }
                if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                    a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
                        return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
                }

                /* TMU read with VPM read/write. */
                if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
                    (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
                     b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
                        return true;
                }
                if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
                    (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
                     a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
                        return true;
                }

                return false;
        }

        /* V3D 7.x can't have more than one of these restricted peripherals */
        const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
                                    V3D_PERIPHERAL_TMU_WRTMUC_SIG |
                                    V3D_PERIPHERAL_TSY |
                                    V3D_PERIPHERAL_TLB_READ |
                                    V3D_PERIPHERAL_SFU |
                                    V3D_PERIPHERAL_VPM_READ |
                                    V3D_PERIPHERAL_VPM_WRITE;

        const uint32_t a_restricted = a_peripherals & restricted;
        const uint32_t b_restricted = b_peripherals & restricted;
        if (a_restricted && b_restricted) {
                /* WRTMUC signal with TMU register write (other than tmuc) is
                 * allowed though.
                 */
                if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                       b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
                       v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
                      (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                       a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
                       v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
                        return false;
                }
        }

        /* Only one TMU read per instruction */
        if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
            (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
                return false;
        }

        /* Only one TLB access per instruction */
        if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
                              V3D_PERIPHERAL_TLB_READ)) &&
            (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
                              V3D_PERIPHERAL_TLB_READ))) {
                return false;
        }

        return true;
}

/* Compute a bitmask of which rf registers are used between
 * the two instructions.
 */
static uint64_t
qpu_raddrs_used(const struct v3d_qpu_instr *a,
                const struct v3d_qpu_instr *b)
{
        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);

        uint64_t raddrs_used = 0;
        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
                raddrs_used |= (UINT64_C(1) << a->raddr_a);
        if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
                raddrs_used |= (UINT64_C(1) << a->raddr_b);
        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
                raddrs_used |= (UINT64_C(1) << b->raddr_a);
        if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
                raddrs_used |= (UINT64_C(1) << b->raddr_b);

        return raddrs_used;
}

/* Takes two instructions and attempts to merge their raddr fields (including
 * small immediates) into one merged instruction. For V3D 4.x, returns false
 * if the two instructions access more than two different rf registers between
 * them, or more than one rf register and one small immediate. For 7.x returns
 * false if both instructions use small immediates.
 */
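/* For example, on 4.x an add op reading rf1 and rf2 can merge with a mul op
 * reading rf1: the set {rf1, rf2} fits in the shared raddr_a/raddr_b slots,
 * and the muxes of both halves are remapped below to whichever slot their
 * register landed in.
 */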
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
                 const struct v3d_qpu_instr *add_instr,
                 const struct v3d_qpu_instr *mul_instr,
                 const struct v3d_device_info *devinfo)
{
        if (devinfo->ver >= 71) {
                assert(add_instr->sig.small_imm_a +
                       add_instr->sig.small_imm_b <= 1);
                assert(add_instr->sig.small_imm_c +
                       add_instr->sig.small_imm_d == 0);
                assert(mul_instr->sig.small_imm_a +
                       mul_instr->sig.small_imm_b == 0);
                assert(mul_instr->sig.small_imm_c +
                       mul_instr->sig.small_imm_d <= 1);

                result->sig.small_imm_a = add_instr->sig.small_imm_a;
                result->sig.small_imm_b = add_instr->sig.small_imm_b;
                result->sig.small_imm_c = mul_instr->sig.small_imm_c;
                result->sig.small_imm_d = mul_instr->sig.small_imm_d;

                return (result->sig.small_imm_a +
                        result->sig.small_imm_b +
                        result->sig.small_imm_c +
                        result->sig.small_imm_d) <= 1;
        }

        assert(devinfo->ver == 42);

        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
        int naddrs = util_bitcount64(raddrs_used);

        if (naddrs > 2)
                return false;

        if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
                if (naddrs > 1)
                        return false;

                if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
                        if (add_instr->raddr_b != mul_instr->raddr_b)
                                return false;

                result->sig.small_imm_b = true;
                result->raddr_b = add_instr->sig.small_imm_b ?
                        add_instr->raddr_b : mul_instr->raddr_b;
        }

        if (naddrs == 0)
                return true;

        int raddr_a = ffsll(raddrs_used) - 1;
        raddrs_used &= ~(UINT64_C(1) << raddr_a);
        result->raddr_a = raddr_a;

        if (!result->sig.small_imm_b) {
                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
                    raddr_a == add_instr->raddr_b) {
                        if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
                                result->alu.add.a.mux = V3D_QPU_MUX_A;
                        if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                                result->alu.add.b.mux = V3D_QPU_MUX_A;
                        }
                }
                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
                    raddr_a == mul_instr->raddr_b) {
                        if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
                                result->alu.mul.a.mux = V3D_QPU_MUX_A;
                        if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                                result->alu.mul.b.mux = V3D_QPU_MUX_A;
                        }
                }
        }
        if (!raddrs_used)
                return true;

        int raddr_b = ffsll(raddrs_used) - 1;
        result->raddr_b = raddr_b;
        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
            raddr_b == add_instr->raddr_a) {
                if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
                        result->alu.add.a.mux = V3D_QPU_MUX_B;
                if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                        result->alu.add.b.mux = V3D_QPU_MUX_B;
                }
        }
        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
            raddr_b == mul_instr->raddr_a) {
                if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
                        result->alu.mul.a.mux = V3D_QPU_MUX_B;
                if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                        result->alu.mul.b.mux = V3D_QPU_MUX_B;
                }
        }

        return true;
}

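/* ADD and SUB have equivalent opcodes in the mul ALU, so an add-ALU op can
 * be moved over to the mul ALU to make room when merging two instructions.
 */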
static bool
can_do_add_as_mul(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
        case V3D_QPU_A_SUB:
                return true;
        default:
                return false;
        }
}

static enum v3d_qpu_mul_op
add_op_as_mul_op(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
                return V3D_QPU_M_ADD;
        case V3D_QPU_A_SUB:
                return V3D_QPU_M_SUB;
        default:
                unreachable("unexpected add opcode");
        }
}

static void
qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
                       struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
        assert(inst->alu.add.op != V3D_QPU_A_NOP);
        assert(inst->alu.mul.op == V3D_QPU_M_NOP);

        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
        inst->alu.add.op = V3D_QPU_A_NOP;

        inst->flags.mc = inst->flags.ac;
        inst->flags.mpf = inst->flags.apf;
        inst->flags.muf = inst->flags.auf;
        inst->flags.ac = V3D_QPU_COND_NONE;
        inst->flags.apf = V3D_QPU_PF_NONE;
        inst->flags.auf = V3D_QPU_UF_NONE;

        inst->alu.mul.output_pack = inst->alu.add.output_pack;

        inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
        inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
        inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
        inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
        inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;

        if (devinfo->ver >= 71) {
                assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
                assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
                if (inst->sig.small_imm_a) {
                        inst->sig.small_imm_c = true;
                        inst->sig.small_imm_a = false;
                } else if (inst->sig.small_imm_b) {
                        inst->sig.small_imm_d = true;
                        inst->sig.small_imm_b = false;
                }
        }
}

static bool
can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
{
        switch (op) {
        case V3D_QPU_M_MOV:
        case V3D_QPU_M_FMOV:
                return devinfo->ver >= 71;
        default:
                return false;
        }
}

static enum v3d_qpu_add_op
mul_op_as_add_op(enum v3d_qpu_mul_op op)
{
        switch (op) {
        case V3D_QPU_M_MOV:
                return V3D_QPU_A_MOV;
        case V3D_QPU_M_FMOV:
                return V3D_QPU_A_FMOV;
        default:
                unreachable("unexpected mov opcode");
        }
}

static void
qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
        assert(inst->alu.mul.op != V3D_QPU_M_NOP);
        assert(inst->alu.add.op == V3D_QPU_A_NOP);

        memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
        inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
        inst->alu.mul.op = V3D_QPU_M_NOP;

        inst->flags.ac = inst->flags.mc;
        inst->flags.apf = inst->flags.mpf;
        inst->flags.auf = inst->flags.muf;
        inst->flags.mc = V3D_QPU_COND_NONE;
        inst->flags.mpf = V3D_QPU_PF_NONE;
        inst->flags.muf = V3D_QPU_UF_NONE;

        inst->alu.add.output_pack = inst->alu.mul.output_pack;
        inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
        inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
        inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
        inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
        inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;

        assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
        assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
        if (inst->sig.small_imm_c) {
                inst->sig.small_imm_a = true;
                inst->sig.small_imm_c = false;
        } else if (inst->sig.small_imm_d) {
                inst->sig.small_imm_b = true;
                inst->sig.small_imm_d = false;
        }
}

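/* Tries to merge instructions a and b into a single QPU instruction,
 * converting ops between the add and mul ALUs when that frees up a slot.
 * Returns false if their ALUs, signals or peripheral accesses conflict.
 */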
1214 static bool
qpu_merge_inst(const struct v3d_device_info * devinfo,struct v3d_qpu_instr * result,const struct v3d_qpu_instr * a,const struct v3d_qpu_instr * b)1215 qpu_merge_inst(const struct v3d_device_info *devinfo,
1216                struct v3d_qpu_instr *result,
1217                const struct v3d_qpu_instr *a,
1218                const struct v3d_qpu_instr *b)
1219 {
1220         if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
1221             b->type != V3D_QPU_INSTR_TYPE_ALU) {
1222                 return false;
1223         }
1224 
1225         if (!qpu_compatible_peripheral_access(devinfo, a, b))
1226                 return false;
1227 
1228         struct v3d_qpu_instr merge = *a;
1229         const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
1230 
1231         struct v3d_qpu_instr mul_inst;
1232         if (b->alu.add.op != V3D_QPU_A_NOP) {
1233                 if (a->alu.add.op == V3D_QPU_A_NOP) {
1234                         merge.alu.add = b->alu.add;
1235 
1236                         merge.flags.ac = b->flags.ac;
1237                         merge.flags.apf = b->flags.apf;
1238                         merge.flags.auf = b->flags.auf;
1239 
1240                         add_instr = b;
1241                         mul_instr = a;
1242                 }
1243                 /* If a's add op is used but its mul op is not, then see if we
1244                  * can convert either a's add op or b's add op to a mul op
1245                  * so we can merge.
1246                  */
1247                 else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1248                          can_do_add_as_mul(b->alu.add.op)) {
1249                         mul_inst = *b;
1250                         qpu_convert_add_to_mul(devinfo, &mul_inst);
1251 
1252                         merge.alu.mul = mul_inst.alu.mul;
1253 
1254                         merge.flags.mc = mul_inst.flags.mc;
1255                         merge.flags.mpf = mul_inst.flags.mpf;
1256                         merge.flags.muf = mul_inst.flags.muf;
1257 
1258                         add_instr = a;
1259                         mul_instr = &mul_inst;
1260                 } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1261                            can_do_add_as_mul(a->alu.add.op)) {
1262                         mul_inst = *a;
1263                         qpu_convert_add_to_mul(devinfo, &mul_inst);
1264 
1265                         merge = mul_inst;
1266                         merge.alu.add = b->alu.add;
1267 
1268                         merge.flags.ac = b->flags.ac;
1269                         merge.flags.apf = b->flags.apf;
1270                         merge.flags.auf = b->flags.auf;
1271 
1272                         add_instr = b;
1273                         mul_instr = &mul_inst;
1274                 } else {
1275                         return false;
1276                 }
1277         }
1278 
1279         struct v3d_qpu_instr add_inst;
1280         if (b->alu.mul.op != V3D_QPU_M_NOP) {
1281                 if (a->alu.mul.op == V3D_QPU_M_NOP) {
1282                         merge.alu.mul = b->alu.mul;
1283 
1284                         merge.flags.mc = b->flags.mc;
1285                         merge.flags.mpf = b->flags.mpf;
1286                         merge.flags.muf = b->flags.muf;
1287 
1288                         mul_instr = b;
1289                         add_instr = a;
1290                 }
1291                 /* If a's mul op is used but its add op is not, then see if we
1292                  * can convert either a's mul op or b's mul op to an add op
1293                  * so we can merge.
1294                  */
1295                 else if (a->alu.add.op == V3D_QPU_A_NOP &&
1296                          can_do_mul_as_add(devinfo, b->alu.mul.op)) {
1297                         add_inst = *b;
1298                         qpu_convert_mul_to_add(&add_inst);
1299 
1300                         merge.alu.add = add_inst.alu.add;
1301 
1302                         merge.flags.ac = add_inst.flags.ac;
1303                         merge.flags.apf = add_inst.flags.apf;
1304                         merge.flags.auf = add_inst.flags.auf;
1305 
1306                         mul_instr = a;
1307                         add_instr = &add_inst;
1308                 } else if (a->alu.add.op == V3D_QPU_A_NOP &&
1309                            can_do_mul_as_add(devinfo, a->alu.mul.op)) {
1310                         add_inst = *a;
1311                         qpu_convert_mul_to_add(&add_inst);
1312 
1313                         merge = add_inst;
1314                         merge.alu.mul = b->alu.mul;
1315 
1316                         merge.flags.mc = b->flags.mc;
1317                         merge.flags.mpf = b->flags.mpf;
1318                         merge.flags.muf = b->flags.muf;
1319 
1320                         mul_instr = b;
1321                         add_instr = &add_inst;
1322                 } else {
1323                         return false;
1324                 }
1325         }
1326 
1327         /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
1328          * they have restrictions on the number of raddrs that can be addressed
1329          * in a single instruction. In V3D 7.x, we don't have that restriction,
1330          * but we are still limited to a single small immediate per instruction.
1331          */
1332         if (add_instr && mul_instr &&
1333             !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
1334                 return false;
1335         }
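
        /* For example (illustrative operands): on V3D 4.x the two ALUs
         * share the raddr_a/raddr_b fields, so merging
         *
         *   add rf10, rf1, rf2   with   mul rf11, rf3, rf4
         *
         * would need four distinct regfile reads and is expected to fail,
         * while
         *
         *   add rf10, rf1, rf2   with   mul rf11, rf2, rf1
         *
         * reuses the same two raddrs and can pack. On V3D 7.x each source
         * carries its own raddr, so only the single small immediate per
         * instruction limits merging here.
         */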
1336 
1337         merge.sig.thrsw |= b->sig.thrsw;
1338         merge.sig.ldunif |= b->sig.ldunif;
1339         merge.sig.ldunifrf |= b->sig.ldunifrf;
1340         merge.sig.ldunifa |= b->sig.ldunifa;
1341         merge.sig.ldunifarf |= b->sig.ldunifarf;
1342         merge.sig.ldtmu |= b->sig.ldtmu;
1343         merge.sig.ldvary |= b->sig.ldvary;
1344         merge.sig.ldvpm |= b->sig.ldvpm;
1345         merge.sig.ldtlb |= b->sig.ldtlb;
1346         merge.sig.ldtlbu |= b->sig.ldtlbu;
1347         merge.sig.ucb |= b->sig.ucb;
1348         merge.sig.rotate |= b->sig.rotate;
1349         merge.sig.wrtmuc |= b->sig.wrtmuc;
1350 
1351         if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
1352             v3d_qpu_sig_writes_address(devinfo, &b->sig))
1353                 return false;
1354         merge.sig_addr |= b->sig_addr;
1355         merge.sig_magic |= b->sig_magic;
1356 
1357         uint64_t packed;
1358         bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
1359 
1360         *result = merge;
1361         /* No modifying the real instructions on failure. */
1362         assert(ok || (a != result && b != result));
1363 
1364         return ok;
1365 }
1366 
1367 static inline bool
1368 try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
1369 {
1370         return inst->sig.ldunif || inst->sig.ldunifrf;
1371 }
1372 
1373 static bool
1374 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1375                                          struct choose_scoreboard *scoreboard,
1376                                          const struct qinst *qinst);
1377 
1378 static struct schedule_node *
1379 choose_instruction_to_schedule(struct v3d_compile *c,
1380                                struct choose_scoreboard *scoreboard,
1381                                struct schedule_node *prev_inst)
1382 {
1383         struct schedule_node *chosen = NULL;
1384         int chosen_prio = 0;
1385 
1386         /* Don't pair up anything with a thread switch signal -- emit_thrsw()
1387          * will handle pairing it along with filling the delay slots.
1388          */
1389         if (prev_inst) {
1390                 if (prev_inst->inst->qpu.sig.thrsw)
1391                         return NULL;
1392         }
1393 
1394         bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
1395                                  scoreboard->ldvary_count < c->num_inputs;
1396         bool skipped_insts_for_ldvary_pipelining = false;
1397 retry:
1398         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
1399                             dag.link) {
1400                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
1401 
1402                 if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
1403                         skipped_insts_for_ldvary_pipelining = true;
1404                         continue;
1405                 }
1406 
1407                 /* Don't choose the branch instruction until it's the last one
1408                  * left.  We'll move it up to fit its delay slots after we
1409                  * choose it.
1410                  */
1411                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
1412                     !list_is_singular(&scoreboard->dag->heads)) {
1413                         continue;
1414                 }
1415 
1416                 /* We need to have 3 delay slots between a write to unifa and
1417                  * a follow-up ldunifa.
1418                  */
1419                 if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
1420                     scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
1421                         continue;
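
                /* Timing sketch (illustrative tick numbers): a unifa write
                 * at tick N keeps any ldunifa from being chosen until
                 * tick - last_unifa_write_tick > 3, i.e.:
                 *
                 *   N         : mov unifa, addr
                 *   N+1 .. N+3: three delay slots (no ldunifa)
                 *   N+4       : ldunifa
                 */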
1422 
1423                 /* "An instruction must not read from a location in physical
1424                  *  regfile A or B that was written to by the previous
1425                  *  instruction."
1426                  */
1427                 if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1428                         continue;
1429 
1430                 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1431                         continue;
1432 
1433                 /* "Before doing a TLB access a scoreboard wait must have been
1434                  *  done. This happens either on the first or last thread
1435                  *  switch, depending on a setting (scb_wait_on_first_thrsw) in
1436                  *  the shader state."
1437                  */
1438                 if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1439                         continue;
1440 
1441                 /* ldunif and ldvary both write the same register (r5 for v42
1442                  * and below, rf0 for v71), but ldunif does so a tick sooner.
1443                  * If the ldvary's register wasn't used, then ldunif might
1444                  * otherwise get scheduled so ldunif and ldvary try to update
1445                  * the register in the same tick.
1446                  */
1447                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
1448                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
1449                         continue;
1450                 }
1451 
1452                 /* If we are in a thrsw delay slot check that this instruction
1453                  * is valid for that.
1454                  */
1455                 if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
1456                     !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
1457                                                               n->inst)) {
1458                         continue;
1459                 }
1460 
1461                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1462                         /* Don't try to put a branch in the delay slots of another
1463                          * branch or a unifa write.
1464                          */
1465                         if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
1466                                 continue;
1467                         if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
1468                                 continue;
1469 
1470                         /* No branch with cond != 0,2,3 and msfign != 0 after
1471                          * setmsf.
1472                          */
1473                         if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
1474                             inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
1475                             inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
1476                             inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
1477                             inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
1478                                 continue;
1479                         }
1480                 }
1481 
1482                 /* If we're trying to pair with another instruction, check
1483                  * that they're compatible.
1484                  */
1485                 if (prev_inst) {
1486                         /* Don't pair up a thread switch signal -- we'll
1487                          * handle pairing it when we pick it on its own.
1488                          */
1489                         if (inst->sig.thrsw)
1490                                 continue;
1491 
1492                         if (prev_inst->inst->uniform != -1 &&
1493                             n->inst->uniform != -1)
1494                                 continue;
1495 
1496                         /* Simulator complains if we have two uniforms loaded in
1497                          * the same instruction, which could happen if we
1498                          * have a ldunif or sideband uniform and we pair that
1499                          * with ldunifa.
1500                          */
1501                         if (vir_has_uniform(prev_inst->inst) &&
1502                             (inst->sig.ldunifa || inst->sig.ldunifarf)) {
1503                                 continue;
1504                         }
1505 
1506                         if ((prev_inst->inst->qpu.sig.ldunifa ||
1507                              prev_inst->inst->qpu.sig.ldunifarf) &&
1508                             vir_has_uniform(n->inst)) {
1509                                 continue;
1510                         }
1511 
1512                         /* Don't merge TLB instructions before we have acquired
1513                          * the scoreboard lock.
1514                          */
1515                         if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1516                                 continue;
1517 
1518                         /* When we successfully pair up an ldvary we then try
1519                          * to merge it into the previous instruction if
1520                          * possible to improve pipelining. Don't pick up the
1521                          * ldvary now if the follow-up fixup would place
1522                          * it in the delay slots of a thrsw, which is not
1523                          * allowed and would prevent the fixup from being
1524                          * successful. In V3D 7.x we can allow this to happen
1525                          * as long as it is not the last delay slot.
1526                          */
1527                         if (inst->sig.ldvary) {
1528                                 if (c->devinfo->ver == 42 &&
1529                                     scoreboard->last_thrsw_tick + 2 >=
1530                                     scoreboard->tick - 1) {
1531                                         continue;
1532                                 }
1533                                 if (c->devinfo->ver >= 71 &&
1534                                     scoreboard->last_thrsw_tick + 2 ==
1535                                     scoreboard->tick - 1) {
1536                                         continue;
1537                                 }
1538                         }
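
                        /* Tick arithmetic sketch (illustrative values): the
                         * fixup would move the merged ldvary to tick - 1.
                         * With last_thrsw_tick = 10 the delay slots are
                         * ticks 11-12, so on v42 we reject any tick <= 13,
                         * while on v71 we only reject tick == 13, where the
                         * ldvary would land on the last delay slot (12).
                         */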
1539 
1540                         /* We can emit a new tmu lookup with a previous ldtmu
1541                          * if doing this would free just enough space in the
1542                          * TMU output fifo so we don't overflow, however, this
1543                          * is only safe if the ldtmu cannot stall.
1544                          *
1545                          * A ldtmu can stall if it is not the first following a
1546                          * thread switch and corresponds to the first word of a
1547                          * read request.
1548                          *
1549                          * FIXME: For now we forbid pairing up a new lookup
1550                          * with a previous ldtmu that is not the first after a
1551                          * thrsw if that could overflow the TMU output fifo
1552                          * regardless of whether the ldtmu is reading the first
1553                          * word of a TMU result or not, since we don't track
1554                          * this aspect in the compiler yet.
1555                          */
1556                         if (prev_inst->inst->qpu.sig.ldtmu &&
1557                             is_tmu_sequence_terminator(n->inst) &&
1558                             !scoreboard->first_ldtmu_after_thrsw &&
1559                             (scoreboard->pending_ldtmu_count +
1560                              n->inst->ldtmu_count > 16 / c->threads)) {
1561                                 continue;
1562                         }
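
                        /* Worked example (hypothetical counts): the TMU
                         * output fifo holds 16 words shared across threads,
                         * so a 4-way threaded program gets 16 / 4 = 4. With
                         * pending_ldtmu_count = 3, pairing a sequence with
                         * ldtmu_count = 2 gives 3 + 2 > 4 and the pair is
                         * skipped, unless the pending ldtmu is the first
                         * after a thrsw and thus cannot stall.
                         */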
1563 
1564                         struct v3d_qpu_instr merged_inst;
1565                         if (!qpu_merge_inst(c->devinfo, &merged_inst,
1566                                             &prev_inst->inst->qpu, inst)) {
1567                                 continue;
1568                         }
1569                 }
1570 
1571                 int prio = get_instruction_priority(c->devinfo, inst);
1572 
1573                 if (read_stalls(c->devinfo, scoreboard, inst)) {
1574                         /* Don't merge an instruction that stalls */
1575                         if (prev_inst)
1576                                 continue;
1577                         else {
1578                                 /* Any instruction that doesn't stall will
1579                                  * have a higher scheduling priority */
1580                                 prio -= MAX_SCHEDULE_PRIORITY;
1581                                 assert(prio < 0);
1582                         }
1583                 }
1584 
1585                 /* Found a valid instruction.  If nothing better comes along,
1586                  * this one works.
1587                  */
1588                 if (!chosen) {
1589                         chosen = n;
1590                         chosen_prio = prio;
1591                         continue;
1592                 }
1593 
1594                 if (prio > chosen_prio) {
1595                         chosen = n;
1596                         chosen_prio = prio;
1597                 } else if (prio < chosen_prio) {
1598                         continue;
1599                 }
1600 
1601                 if (n->delay > chosen->delay) {
1602                         chosen = n;
1603                         chosen_prio = prio;
1604                 } else if (n->delay < chosen->delay) {
1605                         continue;
1606                 }
1607         }
1608 
1609         /* If we did not find any instruction to schedule but we discarded
1610          * some of them to prioritize ldvary pipelining, try again.
1611          */
1612         if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
1613                 skipped_insts_for_ldvary_pipelining = false;
1614                 ldvary_pipelining = false;
1615                 goto retry;
1616         }
1617 
1618         if (chosen && chosen->inst->qpu.sig.ldvary) {
1619                 scoreboard->ldvary_count++;
1620                 /* If we are pairing an ldvary, flag it so we can fix it up for
1621                  * optimal pipelining of ldvary sequences.
1622                  */
1623                 if (prev_inst)
1624                         scoreboard->fixup_ldvary = true;
1625         }
1626 
1627         return chosen;
1628 }
1629 
1630 static void
1631 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
1632                                   enum v3d_qpu_waddr waddr,
1633                                   const struct v3d_device_info *devinfo)
1634 {
1635         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1636                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
1637         else if (waddr == V3D_QPU_WADDR_UNIFA)
1638                 scoreboard->last_unifa_write_tick = scoreboard->tick;
1639 }
1640 
1641 static void
1642 update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
1643                                       const struct v3d_qpu_instr *inst)
1644 {
1645         if (v3d_qpu_instr_is_sfu(inst)) {
1646                 scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
1647                 scoreboard->last_stallable_sfu_tick = scoreboard->tick;
1648         }
1649 }
1650 
1651 static void
1652 update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
1653                                const struct qinst *inst)
1654 {
1655         /* Track if we have seen any ldtmu after the last thread switch */
1656         if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
1657                 scoreboard->first_ldtmu_after_thrsw = true;
1658 
1659         /* Track the number of pending ldtmu instructions for outstanding
1660          * TMU lookups.
1661          */
1662         scoreboard->pending_ldtmu_count += inst->ldtmu_count;
1663         if (inst->qpu.sig.ldtmu) {
1664                 assert(scoreboard->pending_ldtmu_count > 0);
1665                 scoreboard->pending_ldtmu_count--;
1666                 scoreboard->first_ldtmu_after_thrsw = false;
1667         }
1668 }
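
/* Illustrative accounting (hypothetical sequence): an instruction that
 * triggers a lookup needing two ldtmu reads raises pending_ldtmu_count by 2,
 * and each scheduled ldtmu signal then retires one of them, so the counter
 * approximates how many TMU result words are still queued in the output fifo.
 */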
1669 
1670 static void
1671 set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
1672                            const struct v3d_qpu_instr *inst,
1673                            const struct v3d_device_info *devinfo)
1674 {
1675         if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
1676             v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
1677             !inst->sig_magic) {
1678                 scoreboard->has_rf0_flops_conflict = true;
1679         }
1680 }
1681 
1682 static void
1683 update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
1684                                 const struct v3d_qpu_instr *inst,
1685                                 const struct v3d_device_info *devinfo)
1686 {
1687         if (devinfo->ver < 71)
1688                 return;
1689 
1690         /* Thread switch restrictions:
1691          *
1692          * At the point of a thread switch or thread end (when the actual
1693          * thread switch or thread end happens, not when the signalling
1694          * instruction is processed):
1695          *
1696          *    - If the most recent write to rf0 was from a ldunif, ldunifa, or
1697          *      ldvary instruction in which another signal also wrote to the
1698          *      register file, and the final instruction of the thread section
1699          *      contained a signal which wrote to the register file, then the
1700          *      value of rf0 is undefined at the start of the new section
1701          *
1702          * Here we use the scoreboard to track if our last rf0 implicit write
1703          * happens at the same time that another signal writes the register
1704          * file (has_rf0_flops_conflict). We will use that information when
1705          * scheduling thrsw instructions to avoid putting anything in their
1706          * last delay slot which has a signal that writes to the register file.
1707          */
1708 
1709         /* Reset tracking if we have an explicit rf0 write or we are starting
1710          * a new thread section.
1711          */
1712         if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
1713             scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
1714                 scoreboard->last_implicit_rf0_write_tick = -10;
1715                 scoreboard->has_rf0_flops_conflict = false;
1716         }
1717 
1718         if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
1719                 scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
1720                         scoreboard->tick + 1 : scoreboard->tick;
1721         }
1722 
1723         set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
1724 }
1725 
1726 static void
1727 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
1728                              const struct qinst *qinst,
1729                              const struct v3d_device_info *devinfo)
1730 {
1731         const struct v3d_qpu_instr *inst = &qinst->qpu;
1732 
1733         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1734                 return;
1735 
1736         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1737 
1738         if (inst->alu.add.op != V3D_QPU_A_NOP)  {
1739                 if (inst->alu.add.magic_write) {
1740                         update_scoreboard_for_magic_waddr(scoreboard,
1741                                                           inst->alu.add.waddr,
1742                                                           devinfo);
1743                 } else {
1744                         update_scoreboard_for_sfu_stall_waddr(scoreboard,
1745                                                               inst);
1746                 }
1747 
1748                 if (inst->alu.add.op == V3D_QPU_A_SETMSF)
1749                         scoreboard->last_setmsf_tick = scoreboard->tick;
1750         }
1751 
1752         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
1753                 if (inst->alu.mul.magic_write) {
1754                         update_scoreboard_for_magic_waddr(scoreboard,
1755                                                           inst->alu.mul.waddr,
1756                                                           devinfo);
1757                 }
1758         }
1759 
1760         if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
1761                 update_scoreboard_for_magic_waddr(scoreboard,
1762                                                   inst->sig_addr,
1763                                                   devinfo);
1764         }
1765 
1766         if (inst->sig.ldvary)
1767                 scoreboard->last_ldvary_tick = scoreboard->tick;
1768 
1769         update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
1770 
1771         update_scoreboard_tmu_tracking(scoreboard, qinst);
1772 }
1773 
1774 static void
1775 dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
1776 {
1777         list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
1778                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
1779                 v3d_qpu_dump(devinfo, &n->inst->qpu);
1780                 fprintf(stderr, "\n");
1781 
1782                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1783                         struct schedule_node *child =
1784                                 (struct schedule_node *)edge->child;
1785                         if (!child)
1786                                 continue;
1787 
1788                         fprintf(stderr, "                 - ");
1789                         v3d_qpu_dump(devinfo, &child->inst->qpu);
1790                         fprintf(stderr, " (%d parents, %c)\n",
1791                                 child->dag.parent_count,
1792                                 edge->data ? 'w' : 'r');
1793                 }
1794         }
1795 }
1796 
1797 static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1798                                     enum v3d_qpu_waddr waddr,
1799                                     const struct v3d_qpu_instr *after)
1800 {
1801         /* Apply some huge latency between texture fetch requests and getting
1802          * their results back.
1803          *
1804          * FIXME: This is actually pretty bogus.  If we do:
1805          *
1806          * mov tmu0_s, a
1807          * <a bit of math>
1808          * mov tmu0_s, b
1809          * load_tmu0
1810          * <more math>
1811          * load_tmu0
1812          *
1813          * we count that as worse than
1814          *
1815          * mov tmu0_s, a
1816          * mov tmu0_s, b
1817          * <lots of math>
1818          * load_tmu0
1819          * <more math>
1820          * load_tmu0
1821          *
1822          * because we associate the first load_tmu0 with the *second* tmu0_s.
1823          */
1824         if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1825             v3d_qpu_waits_on_tmu(after)) {
1826                 return 100;
1827         }
1828 
1829         /* Assume that anything depending on us is consuming the SFU result. */
1830         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1831                 return 3;
1832 
1833         return 1;
1834 }
1835 
1836 static uint32_t
1837 instruction_latency(const struct v3d_device_info *devinfo,
1838                     struct schedule_node *before, struct schedule_node *after)
1839 {
1840         const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1841         const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1842         uint32_t latency = 1;
1843 
1844         if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1845             after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1846                 return latency;
1847 
1848         if (v3d_qpu_instr_is_sfu(before_inst))
1849                 return 2;
1850 
1851         if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
1852             before_inst->alu.add.magic_write) {
1853                 latency = MAX2(latency,
1854                                magic_waddr_latency(devinfo,
1855                                                    before_inst->alu.add.waddr,
1856                                                    after_inst));
1857         }
1858 
1859         if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
1860             before_inst->alu.mul.magic_write) {
1861                 latency = MAX2(latency,
1862                                magic_waddr_latency(devinfo,
1863                                                    before_inst->alu.mul.waddr,
1864                                                    after_inst));
1865         }
1866 
1867         return latency;
1868 }
1869 
1870 /** Recursive computation of the delay member of a node. */
1871 static void
1872 compute_delay(struct dag_node *node, void *state)
1873 {
1874         struct schedule_node *n = (struct schedule_node *)node;
1875         struct v3d_compile *c = (struct v3d_compile *) state;
1876 
1877         n->delay = 1;
1878 
1879         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1880                 struct schedule_node *child =
1881                         (struct schedule_node *)edge->child;
1882 
1883                 n->delay = MAX2(n->delay, (child->delay +
1884                                            instruction_latency(c->devinfo, n,
1885                                                                child)));
1886         }
1887 }
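
/* Illustrative values: a node with children of delay 5 and 2, reached through
 * edges whose instruction_latency() is 2 and 1 respectively, ends up with
 * delay = MAX2(1, MAX2(5 + 2, 2 + 1)) = 7, so the scheduler can prefer the
 * head that sits on the longest remaining dependency chain.
 */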
1888 
1889 /* Removes a DAG head, removing only the WAR edges. (dag_prune_head()
1890  * should be called on it later to finish pruning the other edges.)
1891  */
1892 static void
1893 pre_remove_head(struct dag *dag, struct schedule_node *n)
1894 {
1895         list_delinit(&n->dag.link);
1896 
1897         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1898                 if (edge->data)
1899                         dag_remove_edge(dag, edge);
1900         }
1901 }
1902 
1903 static void
1904 mark_instruction_scheduled(const struct v3d_device_info *devinfo,
1905                            struct dag *dag,
1906                            uint32_t time,
1907                            struct schedule_node *node)
1908 {
1909         if (!node)
1910                 return;
1911 
1912         util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1913                 struct schedule_node *child =
1914                         (struct schedule_node *)edge->child;
1915 
1916                 if (!child)
1917                         continue;
1918 
1919                 uint32_t latency = instruction_latency(devinfo, node, child);
1920 
1921                 child->unblocked_time = MAX2(child->unblocked_time,
1922                                              time + latency);
1923         }
1924         dag_prune_head(dag, &node->dag);
1925 }
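
/* Illustrative update: pruning a node scheduled at time 10 whose edge to a
 * child has instruction_latency() 3 raises that child's unblocked_time to at
 * least 13, and any child left without remaining parents becomes a new DAG
 * head available to choose_instruction_to_schedule().
 */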
1926 
1927 static void
1928 insert_scheduled_instruction(struct v3d_compile *c,
1929                              struct qblock *block,
1930                              struct choose_scoreboard *scoreboard,
1931                              struct qinst *inst)
1932 {
1933         list_addtail(&inst->link, &block->instructions);
1934 
1935         update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
1936         c->qpu_inst_count++;
1937         scoreboard->tick++;
1938 }
1939 
1940 static struct qinst *
1941 vir_nop()
1942 {
1943         struct qreg undef = vir_nop_reg();
1944         struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1945 
1946         return qinst;
1947 }
1948 
1949 static void
1950 emit_nop(struct v3d_compile *c, struct qblock *block,
1951          struct choose_scoreboard *scoreboard)
1952 {
1953         insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1954 }
1955 
1956 static bool
1957 qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1958                               const struct qinst *qinst, int slot)
1959 {
1960         const struct v3d_qpu_instr *inst = &qinst->qpu;
1961 
1962         if (slot == 2 && qinst->is_tlb_z_write)
1963                 return false;
1964 
1965         if (slot > 0 && qinst->uniform != ~0)
1966                 return false;
1967 
1968         if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
1969                 return false;
1970 
1971         if (inst->sig.ldvary)
1972                 return false;
1973 
1974         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1975                 /* GFXH-1625: TMUWT not allowed in the final instruction. */
1976                 if (c->devinfo->ver == 42 && slot == 2 &&
1977                     inst->alu.add.op == V3D_QPU_A_TMUWT) {
1978                         return false;
1979                 }
1980 
1981                 if (c->devinfo->ver == 42) {
1982                         /* No writing physical registers at the end. */
1983                         bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
1984                         bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
1985                         if ((!add_is_nop && !inst->alu.add.magic_write) ||
1986                             (!mul_is_nop && !inst->alu.mul.magic_write)) {
1987                                 return false;
1988                         }
1989 
1990                         if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
1991                             !inst->sig_magic) {
1992                                 return false;
1993                         }
1994                 }
1995 
1996                 if (c->devinfo->ver >= 71) {
1997                         /* The thread end instruction must not write to the
1998                          * register file via the add/mul ALUs.
1999                          */
2000                         if (slot == 0 &&
2001                             (!inst->alu.add.magic_write ||
2002                              !inst->alu.mul.magic_write)) {
2003                                 return false;
2004                         }
2005                 }
2006 
2007                 if (c->devinfo->ver == 42) {
2008                         /* RF0-2 might be overwritten during the delay slots by
2009                          * fragment shader setup.
2010                          */
2011                         if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
2012                                 return false;
2013 
2014                         if (inst->raddr_b < 3 &&
2015                             !inst->sig.small_imm_b &&
2016                             v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
2017                                 return false;
2018                         }
2019                 }
2020 
2021                 if (c->devinfo->ver >= 71) {
2022                         /* RF2-3 might be overwritten during the delay slots by
2023                          * fragment shader setup.
2024                          */
2025                         if (v3d71_qpu_reads_raddr(inst, 2) ||
2026                             v3d71_qpu_reads_raddr(inst, 3)) {
2027                                 return false;
2028                         }
2029 
2030                         if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
2031                             v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
2032                                 return false;
2033                         }
2034                 }
2035         }
2036 
2037         return true;
2038 }
2039 
2040 /**
2041  * This is called when trying to merge a thrsw back into the instruction stream
2042  * of instructions that were scheduled *before* the thrsw signal to fill its
2043  * delay slots. Because the actual execution of the thrsw happens after the
2044  * delay slots, it is usually safe to do this, but there are some cases that
2045  * need special care.
2046  */
2047 static bool
2048 qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2049                                           struct choose_scoreboard *scoreboard,
2050                                           const struct qinst *qinst,
2051                                           uint32_t slot)
2052 {
2053         /* No scheduling SFU when the result would land in the other
2054          * thread.  The simulator complains for safety, though it
2055          * would only occur for dead code in our case.
2056          */
2057         if (slot > 0) {
2058                 if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
2059                         return false;
2060                 if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu))
2061                         return false;
2062         }
2063 
2064         if (qinst->qpu.sig.ldvary) {
2065                 if (c->devinfo->ver == 42 && slot > 0)
2066                         return false;
2067                 if (c->devinfo->ver >= 71 && slot == 2)
2068                         return false;
2069         }
2070 
2071         /* unifa and the following 3 instructions can't overlap a
2072          * thread switch/end. The docs further clarify that this means
2073          * the cycle at which the actual thread switch/end happens
2074          * and not when the thrsw instruction is processed, which would
2075          * be after the 2 delay slots following the thrsw instruction.
2076          * This means that we can move a thrsw up to the instruction
2077          * right after unifa:
2078          *
2079          * unifa, r5
2080          * thrsw
2081          * delay slot 1
2082          * delay slot 2
2083          * Thread switch happens here, 4 instructions away from unifa
2084          */
2085         if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
2086                 return false;
2087 
2088         /* See comment when we set has_rf0_flops_conflict for details */
2089         if (c->devinfo->ver >= 71 &&
2090             slot == 2 &&
2091             v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
2092             !qinst->qpu.sig_magic) {
2093                 if (scoreboard->has_rf0_flops_conflict)
2094                         return false;
2095                 if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
2096                         return false;
2097         }
2098 
2099         return true;
2100 }
2101 
2102 /**
2103  * This is called for instructions scheduled *after* a thrsw signal that may
2104  * land in the delay slots of the thrsw. Because these instructions were
2105  * scheduled after the thrsw, we need to be careful when placing them into
2106  * the delay slots, since that means that we are moving them ahead of the
2107  * thread switch and we need to ensure that is not a problem.
2108  */
2109 static bool
2110 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2111                                          struct choose_scoreboard *scoreboard,
2112                                          const struct qinst *qinst)
2113 {
2114         const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
2115         assert(slot <= 2);
2116 
2117         /* We merge thrsw instructions back into the instruction stream
2118          * manually, so any instructions scheduled after a thrsw should be
2119          * in the actual delay slots and not in the same slot as the thrsw.
2120          */
2121         assert(slot >= 1);
2122 
2123         /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
2124         if (qinst->qpu.sig.thrsw)
2125                 return false;
2126 
2127         /* The restrictions for instructions scheduled before the thrsw
2128          * also apply to instructions scheduled after the thrsw that we want
2129          * to place in its delay slots.
2130          */
2131         if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
2132                 return false;
2133 
2134         /* TLB access is disallowed until scoreboard wait is executed, which
2135          * we do on the last thread switch.
2136          */
2137         if (qpu_inst_is_tlb(&qinst->qpu))
2138                 return false;
2139 
2140         /* Instruction sequence restrictions: Branch is not allowed in delay
2141          * slots of a thrsw.
2142          */
2143         if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2144                 return false;
2145 
2146         /* Miscellaneous restrictions: At the point of a thrsw we need to have
2147          * at least one outstanding lookup or TSY wait.
2148          *
2149          * So avoid placing TMU instructions scheduled after the thrsw into
2150          * its delay slots or we may be compromising the integrity of our TMU
2151          * sequences. Also, notice that if we moved these instructions into
2152          * the delay slots of a previous thrsw we could overflow our TMU output
2153          * fifo, since we could be effectively pipelining a lookup scheduled
2154          * after the thrsw into the sequence before the thrsw.
2155          */
2156         if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
2157             qinst->qpu.sig.wrtmuc) {
2158                 return false;
2159         }
2160 
2161         /* Don't move instructions that wait on the TMU before the thread switch
2162          * happens since that would make the current thread stall before the
2163          * switch, which is exactly what we want to avoid with the thrsw
2164          * instruction.
2165          */
2166         if (v3d_qpu_waits_on_tmu(&qinst->qpu))
2167                 return false;
2168 
2169         /* A thread switch invalidates all accumulators, so don't place any
2170          * instructions that write accumulators into the delay slots.
2171          */
2172         if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
2173                 return false;
2174 
2175         /* Multop has an implicit write to the rtop register, which is a
2176          * specialized accumulator that is only used with this instruction.
2177          */
2178         if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
2179                 return false;
2180 
2181         /* Flags are invalidated across a thread switch, so don't place
2182          * instructions that write flags into delay slots.
2183          */
2184         if (v3d_qpu_writes_flags(&qinst->qpu))
2185                 return false;
2186 
2187         /* TSY sync ops materialize at the point of the next thread switch,
2188          * therefore, if we have a TSY sync right after a thread switch, we
2189          * cannot place it in its delay slots, or we would be moving the sync
2190          * to the thrsw before it instead.
2191          */
2192         if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
2193                 return false;
2194 
2195         return true;
2196 }
2197 
2198 static bool
2199 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
2200                      struct qinst *qinst, int instructions_in_sequence,
2201                      bool is_thrend)
2202 {
2203         for (int slot = 0; slot < instructions_in_sequence; slot++) {
2204                 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
2205                                                                qinst, slot)) {
2206                         return false;
2207                 }
2208 
2209                 if (is_thrend &&
2210                     !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
2211                         return false;
2212                 }
2213 
2214                 /* Note that the list is circular, so we can only do this up
2215                  * to instructions_in_sequence.
2216                  */
2217                 qinst = (struct qinst *)qinst->link.next;
2218         }
2219 
2220         return true;
2221 }
2222 
2223 /**
2224  * Emits a THRSW signal in the stream, trying to move it up to pair with
2225  * another instruction.
2226  */
2227 static int
2228 emit_thrsw(struct v3d_compile *c,
2229            struct qblock *block,
2230            struct choose_scoreboard *scoreboard,
2231            struct qinst *inst,
2232            bool is_thrend)
2233 {
2234         int time = 0;
2235 
2236         /* There should be nothing in a thrsw inst being scheduled other than
2237          * the signal bits.
2238          */
2239         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
2240         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
2241         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
2242 
2243         /* Don't try to emit a thrsw in the delay slots of a previous thrsw
2244          * or branch.
2245          */
2246         while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
2247                 emit_nop(c, block, scoreboard);
2248                 time++;
2249         }
2250         while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
2251                 emit_nop(c, block, scoreboard);
2252                 time++;
2253         }
2254 
2255         /* Find how far back into previous instructions we can put the THRSW. */
2256         int slots_filled = 0;
2257         int invalid_sig_count = 0;
2258         int invalid_seq_count = 0;
2259         bool last_thrsw_after_invalid_ok = false;
2260         struct qinst *merge_inst = NULL;
2261         vir_for_each_inst_rev(prev_inst, block) {
2262                 /* No emitting our thrsw while the previous thrsw hasn't
2263                  * happened yet.
2264                  */
2265                 if (scoreboard->last_thrsw_tick + 3 >
2266                     scoreboard->tick - (slots_filled + 1)) {
2267                         break;
2268                 }
2269 
2271                 if (!valid_thrsw_sequence(c, scoreboard,
2272                                           prev_inst, slots_filled + 1,
2273                                           is_thrend)) {
2274                         /* Even if the current sequence isn't valid, we may
2275                          * be able to get a valid sequence by trying to move the
2276                          * thrsw earlier, so keep going.
2277                          */
2278                         invalid_seq_count++;
2279                         goto cont_block;
2280                 }
2281 
2282                 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
2283                 sig.thrsw = true;
2284                 uint32_t packed_sig;
2285                 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
2286                         /* If we can't merge the thrsw here because of signal
2287                          * incompatibility, keep going, we might be able to
2288                          * merge it in an earlier instruction.
2289                          */
2290                         invalid_sig_count++;
2291                         goto cont_block;
2292                 }
2293 
2294                 /* For last thrsw we need 2 consecutive slots that are
2295                  * thrsw compatible, so if we have previously jumped over
2296                  * an incompatible signal, flag that we have found the first
2297                  * valid slot here and keep going.
2298                  */
2299                 if (inst->is_last_thrsw && invalid_sig_count > 0 &&
2300                     !last_thrsw_after_invalid_ok) {
2301                         last_thrsw_after_invalid_ok = true;
2302                         invalid_sig_count++;
2303                         goto cont_block;
2304                 }
2305 
2306                 /* We can merge the thrsw in this instruction */
2307                 last_thrsw_after_invalid_ok = false;
2308                 invalid_sig_count = 0;
2309                 invalid_seq_count = 0;
2310                 merge_inst = prev_inst;
2311 
2312 cont_block:
2313                 if (++slots_filled == 3)
2314                         break;
2315         }
2316 
2317         /* If we jumped over a signal incompatibility and did not manage to
2318          * merge the thrsw in the end, we need to adjust slots filled to match
2319          * the last valid merge point.
2320          */
2321         assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
2322                 slots_filled >= invalid_sig_count + invalid_seq_count);
2323         if (invalid_sig_count > 0)
2324                 slots_filled -= invalid_sig_count;
2325         if (invalid_seq_count > 0)
2326                 slots_filled -= invalid_seq_count;
2327 
2328         bool needs_free = false;
2329         if (merge_inst) {
2330                 merge_inst->qpu.sig.thrsw = true;
2331                 needs_free = true;
2332                 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
2333         } else {
2334                 scoreboard->last_thrsw_tick = scoreboard->tick;
2335                 insert_scheduled_instruction(c, block, scoreboard, inst);
2336                 time++;
2337                 slots_filled++;
2338                 merge_inst = inst;
2339         }
2340 
2341         scoreboard->first_thrsw_emitted = true;
2342 
2343         /* If we're emitting the last THRSW (other than program end), then
2344          * signal that to the HW by emitting two THRSWs in a row.
2345          */
2346         if (inst->is_last_thrsw) {
2347                 if (slots_filled <= 1) {
2348                         emit_nop(c, block, scoreboard);
2349                         time++;
2350                 }
2351                 struct qinst *second_inst =
2352                         (struct qinst *)merge_inst->link.next;
2353                 second_inst->qpu.sig.thrsw = true;
2354                 scoreboard->last_thrsw_emitted = true;
2355         }
2356 
2357         /* Make sure the thread end executes within the program lifespan */
2358         if (is_thrend) {
2359                 for (int i = 0; i < 3 - slots_filled; i++) {
2360                         emit_nop(c, block, scoreboard);
2361                         time++;
2362                 }
2363         }
2364 
2365         /* If we put our THRSW into another instruction, free up the
2366          * instruction that didn't end up scheduled into the list.
2367          */
2368         if (needs_free)
2369                 free(inst);
2370 
2371         return time;
2372 }
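
/* A sketch of what emit_thrsw() aims for (illustrative instructions): rather
 * than appending a bare thrsw and padding its delay slots with nops,
 *
 *   fadd rf3, rf1, rf2
 *   fmul rf4, rf1, rf2
 *   xor  rf5, rf1, rf2
 *   nop                ; thrsw
 *
 * the signal is merged into an instruction already in the stream:
 *
 *   fadd rf3, rf1, rf2 ; thrsw
 *   fmul rf4, rf1, rf2           (delay slot 1)
 *   xor  rf5, rf1, rf2           (delay slot 2)
 *
 * so the thread switch overlaps useful work instead of costing extra nops.
 */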
2373 
2374 static bool
2375 qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
2376 {
2377         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2378                 return false;
2379 
2380         if (inst->qpu.sig.thrsw)
2381                 return false;
2382 
2383         if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
2384                 return false;
2385 
2386         if (vir_has_uniform(inst))
2387                 return false;
2388 
2389         return true;
2390 }
2391 
2392 static void
2393 emit_branch(struct v3d_compile *c,
2394            struct qblock *block,
2395            struct choose_scoreboard *scoreboard,
2396            struct qinst *inst)
2397 {
2398         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2399 
2400         /* We should not have picked up a branch for the delay slots of a
2401          * previous thrsw, branch or unifa write instruction.
2402          */
2403         int branch_tick = scoreboard->tick;
2404         assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
2405         assert(scoreboard->last_branch_tick + 3 < branch_tick);
2406         assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
2407 
2408         /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
2409          * setmsf.
2410          */
2411         bool is_safe_msf_branch =
2412                 c->devinfo->ver >= 71 ||
2413                 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
2414                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
2415                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
2416                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
2417         assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
2418                is_safe_msf_branch);
2419 
2420         /* Insert the branch instruction */
2421         insert_scheduled_instruction(c, block, scoreboard, inst);
2422 
2423         /* Now see if we can move the branch instruction back into the
2424          * instruction stream to fill its delay slots
2425          */
2426         int slots_filled = 0;
2427         while (slots_filled < 3 && block->instructions.next != &inst->link) {
2428                 struct qinst *prev_inst = (struct qinst *) inst->link.prev;
2429                 assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
2430 
2431                 /* Can't move the branch instruction if that would place it
2432                  * in the delay slots of other instructions.
2433                  */
2434                 if (scoreboard->last_branch_tick + 3 >=
2435                     branch_tick - slots_filled - 1) {
2436                         break;
2437                 }
2438 
2439                 if (scoreboard->last_thrsw_tick + 2 >=
2440                     branch_tick - slots_filled - 1) {
2441                         break;
2442                 }
2443 
2444                 if (scoreboard->last_unifa_write_tick + 3 >=
2445                     branch_tick - slots_filled - 1) {
2446                         break;
2447                 }
2448 
2449                 /* Do not move up a branch if it can disrupt an ldvary sequence
2450                  * as that can cause stomping of the r5 register.
2451                  */
2452                 if (scoreboard->last_ldvary_tick + 2 >=
2453                     branch_tick - slots_filled) {
2454                        break;
2455                 }
2456 
2457                 /* Can't move a conditional branch before the instruction
2458                  * that writes the flags for its condition.
2459                  */
2460                 if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
2461                     inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
2462                         break;
2463                 }
2464 
2465                 if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
2466                         break;
2467 
2468                 if (!is_safe_msf_branch) {
2469                         struct qinst *prev_prev_inst =
2470                                 (struct qinst *) prev_inst->link.prev;
2471                         if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
2472                             prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
2473                                 break;
2474                         }
2475                 }
2476 
2477                 list_del(&prev_inst->link);
2478                 list_add(&prev_inst->link, &inst->link);
2479                 slots_filled++;
2480         }
2481 
2482         block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
2483         scoreboard->last_branch_tick = branch_tick - slots_filled;
2484 
2485         /* Fill any remaining delay slots.
2486          *
2487          * For unconditional branches we'll try to fill these with the
2488          * first instructions in the successor block after scheduling
2489          * all blocks when setting up branch targets.
2490          */
2491         for (int i = 0; i < 3 - slots_filled; i++)
2492                 emit_nop(c, block, scoreboard);
2493 }
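
/* Sketch of the intended result (illustrative instructions): if the two
 * instructions before the branch can legally move into its shadow,
 *
 *   fadd rf1, ...
 *   fmul rf2, ...
 *   branch
 *
 * becomes
 *
 *   branch
 *   fadd rf1, ...                (delay slot 1)
 *   fmul rf2, ...                (delay slot 2)
 *   nop                          (delay slot 3)
 *
 * with only one nop needed for the remaining slot.
 */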
2494 
2495 static bool
2496 alu_reads_register(const struct v3d_device_info *devinfo,
2497                    struct v3d_qpu_instr *inst,
2498                    bool add, bool magic, uint32_t index)
2499 {
2500         uint32_t num_src;
2501         if (add)
2502                 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
2503         else
2504                 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
2505 
        if (devinfo->ver == 42) {
                enum v3d_qpu_mux mux_a, mux_b;
                if (add) {
                        mux_a = inst->alu.add.a.mux;
                        mux_b = inst->alu.add.b.mux;
                } else {
                        mux_a = inst->alu.mul.a.mux;
                        mux_b = inst->alu.mul.b.mux;
                }

                for (int i = 0; i < num_src; i++) {
                        if (magic) {
                                if (i == 0 && mux_a == index)
                                        return true;
                                if (i == 1 && mux_b == index)
                                        return true;
                        } else {
                                if (i == 0 && mux_a == V3D_QPU_MUX_A &&
                                    inst->raddr_a == index) {
                                        return true;
                                }
                                if (i == 0 && mux_a == V3D_QPU_MUX_B &&
                                    inst->raddr_b == index) {
                                        return true;
                                }
                                if (i == 1 && mux_b == V3D_QPU_MUX_A &&
                                    inst->raddr_a == index) {
                                        return true;
                                }
                                if (i == 1 && mux_b == V3D_QPU_MUX_B &&
                                    inst->raddr_b == index) {
                                        return true;
                                }
                        }
                }

                return false;
        }

        assert(devinfo->ver >= 71);
        assert(!magic);

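        /* v71 dropped the accumulators and the mux indirection: each ALU
         * source carries its own register file address, so only the
         * non-magic case can occur here.
         */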
        uint32_t raddr_a, raddr_b;
        if (add) {
                raddr_a = inst->alu.add.a.raddr;
                raddr_b = inst->alu.add.b.raddr;
        } else {
                raddr_a = inst->alu.mul.a.raddr;
                raddr_b = inst->alu.mul.b.raddr;
        }

        for (int i = 0; i < num_src; i++) {
                if (i == 0 && raddr_a == index)
                        return true;
                if (i == 1 && raddr_b == index)
                        return true;
        }

        return false;
}

/**
 * This takes an ldvary signal merged into 'inst' and tries to move it up to
 * the previous instruction to get good pipelining of ldvary sequences,
 * transforming this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ;
 * fadd  rf13, r0, r5   ; nop               ; ldvary.r1  <-- inst
 *
 * into:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; nop               ;            <-- inst
 *
 * If we manage to do this successfully (we return true here), then flagging
 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
 * we will be able to pick up to merge into 'inst', leading to code like this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
 */
static bool
fixup_pipelined_ldvary(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
                       struct qblock *block,
                       struct v3d_qpu_instr *inst)
{
        const struct v3d_device_info *devinfo = c->devinfo;

        /* We only call this if we have successfully merged an ldvary into a
         * previous instruction.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->sig.ldvary);
        uint32_t ldvary_magic = inst->sig_magic;
        uint32_t ldvary_index = inst->sig_addr;

        /* The instruction into which we merged the ldvary cannot read the
         * ldvary destination: if it did, moving the ldvary to the previous
         * instruction would clobber the value before it is read.
         */
        if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
                return false;
        if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
                return false;

        /* The implicit ldvary destination may not be written to by a signal
         * in the instruction following ldvary. Since we are planning to move
         * ldvary to the previous instruction, this means we need to check if
         * the current instruction has any other signal that could create this
         * conflict. The only other signal that can write to the implicit
         * ldvary destination and that is compatible with ldvary in the same
         * instruction is ldunif.
         */
        if (inst->sig.ldunif)
                return false;

        /* The previous instruction can't write to the same destination as the
         * ldvary.
         */
        struct qinst *prev = (struct qinst *) block->instructions.prev;
        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
                    prev->qpu.alu.add.waddr == ldvary_index) {
                        return false;
                }
        }

        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
                    prev->qpu.alu.mul.waddr == ldvary_index) {
                        return false;
                }
        }

        /* The previous instruction cannot have a conflicting signal. */
        if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
                return false;

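        /* Adding ldvary must still leave the previous instruction with an
         * encodable signal set.
         */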
        uint32_t sig;
        struct v3d_qpu_sig new_sig = prev->qpu.sig;
        new_sig.ldvary = true;
        if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
                return false;

        /* The previous instruction cannot use flags since ldvary uses the
         * 'cond' instruction field to store the destination.
         */
        if (v3d_qpu_writes_flags(&prev->qpu))
                return false;
        if (v3d_qpu_reads_flags(&prev->qpu))
                return false;

        /* We can't put an ldvary in the delay slots of a thrsw. We should've
         * prevented this when pairing up the ldvary with another instruction
         * and flagging it for a fixup. In V3D 7.x this restriction applies
         * only to the second delay slot.
         */
        assert((devinfo->ver == 42 &&
                scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
               (devinfo->ver >= 71 &&
                scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));

        /* Move the ldvary to the previous instruction and remove it from the
         * current one.
         */
        prev->qpu.sig.ldvary = true;
        prev->qpu.sig_magic = ldvary_magic;
        prev->qpu.sig_addr = ldvary_index;
        scoreboard->last_ldvary_tick = scoreboard->tick - 1;

        inst->sig.ldvary = false;
        inst->sig_magic = false;
        inst->sig_addr = 0;

        /* Update rf0 flops tracking for the new, delayed rf0 write tick of
         * the moved ldvary.
         */
        if (devinfo->ver >= 71) {
                scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
                set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
        }

        /* By moving ldvary to the previous instruction we make it update r5
         * (rf0 for ver >= 71) in the current one, so nothing else in it
         * should write this register.
         *
         * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
         * writes r5/rf0 (since our dependency tracking doesn't know that the
         * ldvary write to r5/rf0 happens in the next instruction).
         */
        assert(!v3d_qpu_writes_r5(devinfo, inst));
        assert(devinfo->ver == 42 ||
               (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
                !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));

        return true;
}

static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(c, scoreboard, NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose:   ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* We can't mark_instruction_scheduled() the chosen inst until
                 * we're done identifying instructions to merge, so put the
                 * merged instructions on a list for a moment.
                 */
                struct list_head merged_list;
                list_inithead(&merged_list);

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
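                                /* The pairing was validated when the merge
                                 * candidate was chosen, so merging cannot
                                 * fail at this point and we can ignore the
                                 * result.
                                 */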
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                chosen->inst->ldtmu_count +=
                                        merge->inst->ldtmu_count;

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "         result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge the
                                                 * follow-up instruction in the
                                                 * ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (read_stalls(c->devinfo, scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);

                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        scoreboard->dag = dag_create(mem_ctx);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        while (!list_is_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

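        /* Walk the DAG bottom-up to fill in each node's delay estimate,
         * which the scheduler uses to favor long dependency chains when
         * picking among ready instructions.
         */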
        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);
        scoreboard->dag = NULL;

        return cycles;
}

static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
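                /* While walking back, also record how many trailing NOPs sit
                 * in the delay slots and where they start, so that
                 * unconditional branches can later fill them with successor
                 * instructions.
                 */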
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

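                /* Branch offsets are relative to the instruction four slots
                 * past the branch (the branch plus its three delay slots),
                 * in bytes: a branch at ip N targeting ip T encodes
                 * (T - (N + 4)) * sizeof(uint64_t).
                 */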
                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any remaining
                 * delay slots with the initial instructions of the successor
                 * block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));
                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
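                        /* The instructions copied into the delay slots
                         * execute before the jump lands, so move the branch
                         * target past their copies in the successor block.
                         */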
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}

uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block = list_last_entry(&c->blocks,
                                                   struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
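        /* Start all the hazard tick trackers far enough in the past that
         * nothing can appear to conflict with the first instructions of the
         * program.
         */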
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_unifa_write_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
        scoreboard.last_thrsw_tick = -10;
        scoreboard.last_branch_tick = -10;
        scoreboard.last_setmsf_tick = -10;
        scoreboard.last_stallable_sfu_tick = -10;
        scoreboard.first_ldtmu_after_thrsw = true;
        scoreboard.last_implicit_rf0_write_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *thrsw = vir_nop();
        thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, end_block, &scoreboard, thrsw, true);

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}
3063