1 /*
2 * Copyright © 2010 Intel Corporation
3 * Copyright © 2014-2017 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 /**
26 * @file
27 *
28 * The basic model of the list scheduler is to take a basic block, compute a
29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
30 * pick a DAG head, then put all the children that are now DAG heads into the
31 * list of things to schedule.
32 *
33 * The goal of scheduling here is to pack pairs of operations together in a
34 * single QPU instruction.
35 */
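
/* A rough sketch of that loop (illustrative only -- the real logic lives in
 * the scheduling routines below, which also handle instruction pairing,
 * priorities and hardware hazards; pick_head_heuristically() and emit() are
 * placeholders, dag_prune_head() is the util/dag.h helper):
 *
 *    while (!list_is_empty(&dag->heads)) {
 *            n = pick_head_heuristically();
 *            emit(n);
 *            dag_prune_head(dag, &n->dag); // unblocked children become heads
 *    }
 */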
36
37 #include "qpu/qpu_disasm.h"
38 #include "v3d_compiler.h"
39 #include "util/ralloc.h"
40 #include "util/dag.h"
41
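/* Manual debug switch: set by hand to dump scheduler state to stderr. */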
42 static bool debug;
43
44 struct schedule_node_child;
45
46 struct schedule_node {
47 struct dag_node dag;
48 struct list_head link;
49 struct qinst *inst;
50
51 /* Longest (cycles + instruction_latency()) of any parent of this node. */
52 uint32_t unblocked_time;
53
54 /**
55 * Minimum number of cycles from scheduling this instruction until the
56 * end of the program, based on the slowest dependency chain through
57 * the children.
58 */
59 uint32_t delay;
60
61 /**
62 * Cycles between this instruction being scheduled and when its result
63 * can be consumed.
64 */
65 uint32_t latency;
66 };
67
68 /* When walking the instructions in reverse, we need to swap before/after in
69 * add_dep().
70 */
71 enum direction { F, R };
72
73 struct schedule_state {
74 const struct v3d_device_info *devinfo;
75 struct dag *dag;
76 struct schedule_node *last_r[6];
77 struct schedule_node *last_rf[64];
78 struct schedule_node *last_sf;
79 struct schedule_node *last_vpm_read;
80 struct schedule_node *last_tmu_write;
81 struct schedule_node *last_tmu_config;
82 struct schedule_node *last_tmu_read;
83 struct schedule_node *last_tlb;
84 struct schedule_node *last_vpm;
85 struct schedule_node *last_unif;
86 struct schedule_node *last_rtop;
87 struct schedule_node *last_unifa;
88 struct schedule_node *last_setmsf;
89 enum direction dir;
90 /* Estimated cycle when the current instruction would start. */
91 uint32_t time;
92 };
93
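/* Adds a dependency edge between two schedule nodes, oriented according to
 * the direction of the current walk. The edge data records whether this is
 * a write-after-read dependency found on the reverse walk.
 */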
94 static void
95 add_dep(struct schedule_state *state,
96 struct schedule_node *before,
97 struct schedule_node *after,
98 bool write)
99 {
100 bool write_after_read = !write && state->dir == R;
101 uintptr_t edge_data = write_after_read;
102
103 if (!before || !after)
104 return;
105
106 assert(before != after);
107
108 if (state->dir == F)
109 dag_add_edge(&before->dag, &after->dag, edge_data);
110 else
111 dag_add_edge(&after->dag, &before->dag, edge_data);
112 }
113
114 static void
115 add_read_dep(struct schedule_state *state,
116 struct schedule_node *before,
117 struct schedule_node *after)
118 {
119 add_dep(state, before, after, false);
120 }
121
122 static void
123 add_write_dep(struct schedule_state *state,
124 struct schedule_node **before,
125 struct schedule_node *after)
126 {
127 add_dep(state, *before, after, true);
128 *before = after;
129 }
130
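/* Returns true if the instruction accesses the TLB, either through the
 * ldtlb/ldtlbu signals or through a magic TLB/TLBU write from the add or
 * mul ALU.
 */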
131 static bool
132 qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
133 {
134 if (inst->sig.ldtlb || inst->sig.ldtlbu)
135 return true;
136
137 if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
138 return false;
139
140 if (inst->alu.add.op != V3D_QPU_A_NOP &&
141 inst->alu.add.magic_write &&
142 (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
143 inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
144 return true;
145
146 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
147 inst->alu.mul.magic_write &&
148 (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
149 inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
150 return true;
151
152 return false;
153 }
154
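/* Adds the read dependencies implied by one ALU input mux on V3D 4.x:
 * regfile reads through mux A/B (unless B encodes a small immediate) and
 * accumulator reads through r0-r5.
 */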
155 static void
156 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
157 enum v3d_qpu_mux mux)
158 {
159 assert(state->devinfo->ver < 71);
160 switch (mux) {
161 case V3D_QPU_MUX_A:
162 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
163 break;
164 case V3D_QPU_MUX_B:
165 if (!n->inst->qpu.sig.small_imm_b) {
166 add_read_dep(state,
167 state->last_rf[n->inst->qpu.raddr_b], n);
168 }
169 break;
170 default:
171 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
172 break;
173 }
174 }
175
176
177 static void
178 process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
179 uint8_t raddr, bool is_small_imm)
180 {
181 assert(state->devinfo->ver >= 71);
182
183 if (!is_small_imm)
184 add_read_dep(state, state->last_rf[raddr], n);
185 }
186
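/* Returns true if a write to this magic waddr is the final write of a TMU
 * lookup sequence, i.e. the coordinate/address write that triggers the
 * lookup.
 */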
187 static bool
188 tmu_write_is_sequence_terminator(uint32_t waddr)
189 {
190 switch (waddr) {
191 case V3D_QPU_WADDR_TMUS:
192 case V3D_QPU_WADDR_TMUSCM:
193 case V3D_QPU_WADDR_TMUSF:
194 case V3D_QPU_WADDR_TMUSLOD:
195 case V3D_QPU_WADDR_TMUA:
196 case V3D_QPU_WADDR_TMUAU:
197 return true;
198 default:
199 return false;
200 }
201 }
202
203 static bool
204 is_tmu_sequence_terminator(struct qinst *inst)
205 {
206 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
207 return false;
208
209 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) {
210 if (!inst->qpu.alu.add.magic_write)
211 return false;
212 return tmu_write_is_sequence_terminator(inst->qpu.alu.add.waddr);
213 }
214
215 if (inst->qpu.alu.mul.op != V3D_QPU_M_NOP) {
216 if (!inst->qpu.alu.mul.magic_write)
217 return false;
218 return tmu_write_is_sequence_terminator(inst->qpu.alu.mul.waddr);
219 }
220
221 return false;
222 }
223
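/* Returns true if this TMU register write can be reordered with respect to
 * other TMU writes: anything that is neither a sequence terminator nor a
 * TMUD data write.
 */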
224 static bool
225 can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
226 {
227 if (tmu_write_is_sequence_terminator(waddr))
228 return false;
229
230 if (waddr == V3D_QPU_WADDR_TMUD)
231 return false;
232
233 return true;
234 }
235
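/* Adds the write dependencies implied by an instruction's write address:
 * the physical register file, TMU and TMU config, accumulators r0-r2, VPM,
 * TLB, barrier syncs and unifa.
 */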
236 static void
237 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
238 uint32_t waddr, bool magic)
239 {
240 if (!magic) {
241 add_write_dep(state, &state->last_rf[waddr], n);
242 } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
243 if (can_reorder_tmu_write(state->devinfo, waddr))
244 add_read_dep(state, state->last_tmu_write, n);
245 else
246 add_write_dep(state, &state->last_tmu_write, n);
247
248 if (tmu_write_is_sequence_terminator(waddr))
249 add_write_dep(state, &state->last_tmu_config, n);
250 } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
251 /* Handled by v3d_qpu_writes_r4() check. */
252 } else {
253 switch (waddr) {
254 case V3D_QPU_WADDR_R0:
255 case V3D_QPU_WADDR_R1:
256 case V3D_QPU_WADDR_R2:
257 add_write_dep(state,
258 &state->last_r[waddr - V3D_QPU_WADDR_R0],
259 n);
260 break;
261 case V3D_QPU_WADDR_R3:
262 case V3D_QPU_WADDR_R4:
263 case V3D_QPU_WADDR_R5:
264 /* Handled by v3d_qpu_writes_r*() checks below. */
265 break;
266
267 case V3D_QPU_WADDR_VPM:
268 case V3D_QPU_WADDR_VPMU:
269 add_write_dep(state, &state->last_vpm, n);
270 break;
271
272 case V3D_QPU_WADDR_TLB:
273 case V3D_QPU_WADDR_TLBU:
274 add_write_dep(state, &state->last_tlb, n);
275 break;
276
277 case V3D_QPU_WADDR_SYNC:
278 case V3D_QPU_WADDR_SYNCB:
279 case V3D_QPU_WADDR_SYNCU:
280 /* For CS barrier(): Sync against any other memory
281 * accesses. There doesn't appear to be any need for
282 * barriers to affect ALU operations.
283 */
284 add_write_dep(state, &state->last_tmu_write, n);
285 add_write_dep(state, &state->last_tmu_read, n);
286 break;
287
288 case V3D_QPU_WADDR_UNIFA:
289 add_write_dep(state, &state->last_unifa, n);
290 break;
291
292 case V3D_QPU_WADDR_NOP:
293 break;
294
295 default:
296 fprintf(stderr, "Unknown waddr %d\n", waddr);
297 abort();
298 }
299 }
300 }
301
302 /**
303 * Common code for dependencies that need to be tracked both forward and
304 * backward.
305 *
306 * This is for things like "all reads of r4 have to happen between the r4
307 * writes that surround them".
308 */
309 static void
310 calculate_deps(struct schedule_state *state, struct schedule_node *n)
311 {
312 const struct v3d_device_info *devinfo = state->devinfo;
313 struct qinst *qinst = n->inst;
314 struct v3d_qpu_instr *inst = &qinst->qpu;
315 /* If the input and output segments are shared, then all VPM reads to
316 * a location need to happen before all writes. We handle this by
317 * serializing all VPM operations for now.
318 *
319 * FIXME: we are assuming that the segments are shared. That is
320 * correct right now as we are only using shared, but technically you
321 * can choose.
322 */
323 bool separate_vpm_segment = false;
324
325 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
326 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
327 add_read_dep(state, state->last_sf, n);
328
329 /* XXX: BDI */
330 /* XXX: BDU */
331 /* XXX: ub */
332 /* XXX: raddr_a */
333
334 add_write_dep(state, &state->last_unif, n);
335 return;
336 }
337
338 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
339
340 /* XXX: LOAD_IMM */
341
342 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
343 if (devinfo->ver < 71) {
344 process_mux_deps(state, n, inst->alu.add.a.mux);
345 } else {
346 process_raddr_deps(state, n, inst->alu.add.a.raddr,
347 inst->sig.small_imm_a);
348 }
349 }
350 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
351 if (devinfo->ver < 71) {
352 process_mux_deps(state, n, inst->alu.add.b.mux);
353 } else {
354 process_raddr_deps(state, n, inst->alu.add.b.raddr,
355 inst->sig.small_imm_b);
356 }
357 }
358
359 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
360 if (devinfo->ver < 71) {
361 process_mux_deps(state, n, inst->alu.mul.a.mux);
362 } else {
363 process_raddr_deps(state, n, inst->alu.mul.a.raddr,
364 inst->sig.small_imm_c);
365 }
366 }
367 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
368 if (devinfo->ver < 71) {
369 process_mux_deps(state, n, inst->alu.mul.b.mux);
370 } else {
371 process_raddr_deps(state, n, inst->alu.mul.b.raddr,
372 inst->sig.small_imm_d);
373 }
374 }
375
376 switch (inst->alu.add.op) {
377 case V3D_QPU_A_VPMSETUP:
378 /* Could distinguish read/write by unpacking the uniform. */
379 add_write_dep(state, &state->last_vpm, n);
380 add_write_dep(state, &state->last_vpm_read, n);
381 break;
382
383 case V3D_QPU_A_STVPMV:
384 case V3D_QPU_A_STVPMD:
385 case V3D_QPU_A_STVPMP:
386 add_write_dep(state, &state->last_vpm, n);
387 break;
388
389 case V3D_QPU_A_LDVPMV_IN:
390 case V3D_QPU_A_LDVPMD_IN:
391 case V3D_QPU_A_LDVPMG_IN:
392 case V3D_QPU_A_LDVPMP:
393 if (!separate_vpm_segment)
394 add_write_dep(state, &state->last_vpm, n);
395 break;
396
397 case V3D_QPU_A_VPMWT:
398 add_read_dep(state, state->last_vpm, n);
399 break;
400
401 case V3D_QPU_A_MSF:
402 add_read_dep(state, state->last_tlb, n);
403 add_read_dep(state, state->last_setmsf, n);
404 break;
405
406 case V3D_QPU_A_SETMSF:
407 add_write_dep(state, &state->last_setmsf, n);
408 add_write_dep(state, &state->last_tmu_write, n);
409 FALLTHROUGH;
410 case V3D_QPU_A_SETREVF:
411 add_write_dep(state, &state->last_tlb, n);
412 break;
413
414 case V3D_QPU_A_BALLOT:
415 case V3D_QPU_A_BCASTF:
416 case V3D_QPU_A_ALLEQ:
417 case V3D_QPU_A_ALLFEQ:
418 add_read_dep(state, state->last_setmsf, n);
419 break;
420
421 default:
422 break;
423 }
424
425 switch (inst->alu.mul.op) {
426 case V3D_QPU_M_MULTOP:
427 case V3D_QPU_M_UMUL24:
428 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
429 * resets it to 0. We could possibly reorder umul24s relative
430 * to each other, but for now just keep all the MUL parts in
431 * order.
432 */
433 add_write_dep(state, &state->last_rtop, n);
434 break;
435 default:
436 break;
437 }
438
439 if (inst->alu.add.op != V3D_QPU_A_NOP) {
440 process_waddr_deps(state, n, inst->alu.add.waddr,
441 inst->alu.add.magic_write);
442 }
443 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
444 process_waddr_deps(state, n, inst->alu.mul.waddr,
445 inst->alu.mul.magic_write);
446 }
447 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
448 process_waddr_deps(state, n, inst->sig_addr,
449 inst->sig_magic);
450 }
451
452 if (v3d_qpu_writes_r3(devinfo, inst))
453 add_write_dep(state, &state->last_r[3], n);
454 if (v3d_qpu_writes_r4(devinfo, inst))
455 add_write_dep(state, &state->last_r[4], n);
456 if (v3d_qpu_writes_r5(devinfo, inst))
457 add_write_dep(state, &state->last_r[5], n);
458 if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
459 add_write_dep(state, &state->last_rf[0], n);
460
461 /* If we add any more dependencies here we should consider whether we
462 * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
463 */
464 if (inst->sig.thrsw) {
465 /* All accumulator contents and flags are undefined after the
466 * switch.
467 */
468 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
469 add_write_dep(state, &state->last_r[i], n);
470 add_write_dep(state, &state->last_sf, n);
471 add_write_dep(state, &state->last_rtop, n);
472
473 /* Scoreboard-locking operations have to stay after the last
474 * thread switch.
475 */
476 add_write_dep(state, &state->last_tlb, n);
477
478 add_write_dep(state, &state->last_tmu_write, n);
479 add_write_dep(state, &state->last_tmu_config, n);
480 }
481
482 if (v3d_qpu_waits_on_tmu(inst)) {
483 /* TMU loads are coming from a FIFO, so ordering is important.
484 */
485 add_write_dep(state, &state->last_tmu_read, n);
486 /* Keep TMU loads after their TMU lookup terminator */
487 add_read_dep(state, state->last_tmu_config, n);
488 }
489
490 /* Allow wrtmuc to be reordered with other instructions in the
491 * same TMU sequence by using a read dependency on the last TMU
492 * sequence terminator.
493 */
494 if (inst->sig.wrtmuc)
495 add_read_dep(state, state->last_tmu_config, n);
496
497 if (inst->sig.ldtlb | inst->sig.ldtlbu)
498 add_write_dep(state, &state->last_tlb, n);
499
500 if (inst->sig.ldvpm) {
501 add_write_dep(state, &state->last_vpm_read, n);
502
503 /* At least for now, we're doing shared I/O segments, so queue
504 * all writes after all reads.
505 */
506 if (!separate_vpm_segment)
507 add_write_dep(state, &state->last_vpm, n);
508 }
509
510 /* inst->sig.ldunif or sideband uniform read */
511 if (vir_has_uniform(qinst))
512 add_write_dep(state, &state->last_unif, n);
513
514 /* Both unifa and ldunifa must preserve ordering */
515 if (inst->sig.ldunifa || inst->sig.ldunifarf)
516 add_write_dep(state, &state->last_unifa, n);
517
518 if (v3d_qpu_reads_flags(inst))
519 add_read_dep(state, state->last_sf, n);
520 if (v3d_qpu_writes_flags(inst))
521 add_write_dep(state, &state->last_sf, n);
522 }
523
524 static void
525 calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
526 struct list_head *schedule_list)
527 {
528 struct schedule_state state;
529
530 memset(&state, 0, sizeof(state));
531 state.dag = dag;
532 state.devinfo = c->devinfo;
533 state.dir = F;
534
535 list_for_each_entry(struct schedule_node, node, schedule_list, link)
536 calculate_deps(&state, node);
537 }
538
539 static void
540 calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
541 struct list_head *schedule_list)
542 {
543 struct schedule_state state;
544
545 memset(&state, 0, sizeof(state));
546 state.dag = dag;
547 state.devinfo = c->devinfo;
548 state.dir = R;
549
550 list_for_each_entry_rev(struct schedule_node, node, schedule_list,
551 link) {
552 calculate_deps(&state, (struct schedule_node *)node);
553 }
554 }
555
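/* State tracked while emitting instructions, recording on which tick the
 * last writes to various units happened so that instruction choice and
 * pairing can respect the hardware's latency and hazard rules.
 */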
556 struct choose_scoreboard {
557 struct dag *dag;
558 int tick;
559 int last_magic_sfu_write_tick;
560 int last_stallable_sfu_reg;
561 int last_stallable_sfu_tick;
562 int last_ldvary_tick;
563 int last_unifa_write_tick;
564 int last_uniforms_reset_tick;
565 int last_thrsw_tick;
566 int last_branch_tick;
567 int last_setmsf_tick;
568 bool first_thrsw_emitted;
569 bool last_thrsw_emitted;
570 bool fixup_ldvary;
571 int ldvary_count;
572 int pending_ldtmu_count;
573 bool first_ldtmu_after_thrsw;
574
575 /* V3D 7.x */
576 int last_implicit_rf0_write_tick;
577 bool has_rf0_flops_conflict;
578 };
579
580 static bool
581 mux_reads_too_soon(struct choose_scoreboard *scoreboard,
582 const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
583 {
584 switch (mux) {
585 case V3D_QPU_MUX_R4:
586 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
587 return true;
588 break;
589
590 case V3D_QPU_MUX_R5:
591 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
592 return true;
593 break;
594 default:
595 break;
596 }
597
598 return false;
599 }
600
601 static bool
602 reads_too_soon(struct choose_scoreboard *scoreboard,
603 const struct v3d_qpu_instr *inst, uint8_t raddr)
604 {
605 switch (raddr) {
606 case 0: /* ldvary delayed write of C coefficient to rf0 */
607 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
608 return true;
609 break;
610 default:
611 break;
612 }
613
614 return false;
615 }
616
617 static bool
618 reads_too_soon_after_write(const struct v3d_device_info *devinfo,
619 struct choose_scoreboard *scoreboard,
620 struct qinst *qinst)
621 {
622 const struct v3d_qpu_instr *inst = &qinst->qpu;
623
624 /* XXX: Branching off of raddr. */
625 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
626 return false;
627
628 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
629
630 if (inst->alu.add.op != V3D_QPU_A_NOP) {
631 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
632 if (devinfo->ver < 71) {
633 if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
634 return true;
635 } else {
636 if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
637 return true;
638 }
639 }
640 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
641 if (devinfo->ver < 71) {
642 if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
643 return true;
644 } else {
645 if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
646 return true;
647 }
648 }
649 }
650
651 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
652 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
653 if (devinfo->ver < 71) {
654 if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
655 return true;
656 } else {
657 if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
658 return true;
659 }
660 }
661 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
662 if (devinfo->ver < 71) {
663 if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
664 return true;
665 } else {
666 if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
667 return true;
668 }
669 }
670 }
671
672 /* XXX: imm */
673
674 return false;
675 }
676
677 static bool
678 writes_too_soon_after_write(const struct v3d_device_info *devinfo,
679 struct choose_scoreboard *scoreboard,
680 struct qinst *qinst)
681 {
682 const struct v3d_qpu_instr *inst = &qinst->qpu;
683
684 /* Don't schedule any other r4 write too soon after an SFU write.
685 * This would normally be prevented by dependency tracking, but might
686 * occur if a dead SFU computation makes it to scheduling.
687 */
688 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
689 v3d_qpu_writes_r4(devinfo, inst))
690 return true;
691
692 if (devinfo->ver == 42)
693 return false;
694
695 /* Don't schedule anything that writes rf0 right after ldvary, since
696 * that would clash with the ldvary's delayed rf0 write (the exception
697 * is another ldvary, since its implicit rf0 write would also have
698 * one cycle of delay and would not clash).
699 */
700 if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
701 (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
702 (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
703 !inst->sig.ldvary))) {
704 return true;
705 }
706
707 return false;
708 }
709
710 static bool
711 scoreboard_is_locked(struct choose_scoreboard *scoreboard,
712 bool lock_scoreboard_on_first_thrsw)
713 {
714 if (lock_scoreboard_on_first_thrsw) {
715 return scoreboard->first_thrsw_emitted &&
716 scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
717 }
718
719 return scoreboard->last_thrsw_emitted &&
720 scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
721 }
722
723 static bool
724 pixel_scoreboard_too_soon(struct v3d_compile *c,
725 struct choose_scoreboard *scoreboard,
726 const struct v3d_qpu_instr *inst)
727 {
728 return qpu_inst_is_tlb(inst) &&
729 !scoreboard_is_locked(scoreboard,
730 c->lock_scoreboard_on_first_thrsw);
731 }
732
733 static bool
734 qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
735 const struct v3d_qpu_instr *inst,
736 uint32_t waddr)
737 {
738 if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
739 return false;
740
741 if (devinfo->ver < 71) {
742 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
743 inst->raddr_a == waddr)
744 return true;
745
746 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
747 !inst->sig.small_imm_b && (inst->raddr_b == waddr))
748 return true;
749 } else {
750 if (v3d71_qpu_reads_raddr(inst, waddr))
751 return true;
752 }
753
754 return false;
755 }
756
757 static bool
758 read_stalls(const struct v3d_device_info *devinfo,
759 struct choose_scoreboard *scoreboard,
760 const struct v3d_qpu_instr *inst)
761 {
762 return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
763 qpu_instruction_uses_rf(devinfo, inst,
764 scoreboard->last_stallable_sfu_reg);
765 }
766
767 /* We define a max schedule priority to allow negative priorities as a result
768 * of subtracting this max when an instruction stalls, so instructions that
769 * stall have lower priority than regular instructions. */
770 #define MAX_SCHEDULE_PRIORITY 16
771
772 static int
773 get_instruction_priority(const struct v3d_device_info *devinfo,
774 const struct v3d_qpu_instr *inst)
775 {
776 uint32_t baseline_score;
777 uint32_t next_score = 0;
778
779 /* Schedule TLB operations as late as possible, to get more
780 * parallelism between shaders.
781 */
782 if (qpu_inst_is_tlb(inst))
783 return next_score;
784 next_score++;
785
786 /* Empirical testing shows that using priorities to hide latency of
787 * TMU operations when scheduling QPU leads to slightly worse
788 * performance, even at 2 threads. We think this is because the thread
789 * switching is already quite effective at hiding latency and NIR
790 * scheduling (and possibly TMU pipelining too) are sufficient to hide
791 * TMU latency, so piling up on that here doesn't provide any benefits
792 * and instead may cause us to postpone critical paths that depend on
793 * the TMU results.
794 */
795 #if 0
796 /* Schedule texture read results collection late to hide latency. */
797 if (v3d_qpu_waits_on_tmu(inst))
798 return next_score;
799 next_score++;
800 #endif
801
802 /* Default score for things that aren't otherwise special. */
803 baseline_score = next_score;
804 next_score++;
805
806 #if 0
807 /* Schedule texture read setup early to hide their latency better. */
808 if (v3d_qpu_writes_tmu(devinfo, inst))
809 return next_score;
810 next_score++;
811 #endif
812
813 /* We should increase the maximum if we assert here */
814 assert(next_score < MAX_SCHEDULE_PRIORITY);
815
816 return baseline_score;
817 }
818
819 enum {
820 V3D_PERIPHERAL_VPM_READ = (1 << 0),
821 V3D_PERIPHERAL_VPM_WRITE = (1 << 1),
822 V3D_PERIPHERAL_VPM_WAIT = (1 << 2),
823 V3D_PERIPHERAL_SFU = (1 << 3),
824 V3D_PERIPHERAL_TMU_WRITE = (1 << 4),
825 V3D_PERIPHERAL_TMU_READ = (1 << 5),
826 V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
827 V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
828 V3D_PERIPHERAL_TSY = (1 << 8),
829 V3D_PERIPHERAL_TLB_READ = (1 << 9),
830 V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
831 };
832
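/* Returns a bitmask of the peripheral units (VPM, TMU, SFU, TSY, TLB)
 * accessed by the instruction.
 */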
833 static uint32_t
834 qpu_peripherals(const struct v3d_device_info *devinfo,
835 const struct v3d_qpu_instr *inst)
836 {
837 uint32_t result = 0;
838 if (v3d_qpu_reads_vpm(inst))
839 result |= V3D_PERIPHERAL_VPM_READ;
840 if (v3d_qpu_writes_vpm(inst))
841 result |= V3D_PERIPHERAL_VPM_WRITE;
842 if (v3d_qpu_waits_vpm(inst))
843 result |= V3D_PERIPHERAL_VPM_WAIT;
844
845 if (v3d_qpu_writes_tmu(devinfo, inst))
846 result |= V3D_PERIPHERAL_TMU_WRITE;
847 if (inst->sig.ldtmu)
848 result |= V3D_PERIPHERAL_TMU_READ;
849 if (inst->sig.wrtmuc)
850 result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
851
852 if (v3d_qpu_uses_sfu(inst))
853 result |= V3D_PERIPHERAL_SFU;
854
855 if (v3d_qpu_reads_tlb(inst))
856 result |= V3D_PERIPHERAL_TLB_READ;
857 if (v3d_qpu_writes_tlb(inst))
858 result |= V3D_PERIPHERAL_TLB_WRITE;
859
860 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
861 if (inst->alu.add.op != V3D_QPU_A_NOP &&
862 inst->alu.add.magic_write &&
863 v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
864 result |= V3D_PERIPHERAL_TSY;
865 }
866
867 if (inst->alu.add.op == V3D_QPU_A_TMUWT)
868 result |= V3D_PERIPHERAL_TMU_WAIT;
869 }
870
871 return result;
872 }
873
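/* Returns true if instructions a and b could be merged without exceeding
 * the per-instruction limits on peripheral accesses for this hardware
 * version.
 */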
874 static bool
875 qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
876 const struct v3d_qpu_instr *a,
877 const struct v3d_qpu_instr *b)
878 {
879 const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
880 const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
881
882 /* We can always do one peripheral access per instruction. */
883 if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
884 return true;
885
886 /* V3D 4.x can't do more than one peripheral access except in a
887 * few cases:
888 */
889 if (devinfo->ver == 42) {
890 /* WRTMUC signal with TMU register write (other than tmuc). */
891 if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
892 b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
893 return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
894 }
895 if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
896 a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
897 return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
898 }
899
900 /* TMU read with VPM read/write. */
901 if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
902 (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
903 b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
904 return true;
905 }
906 if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
907 (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
908 a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
909 return true;
910 }
911
912 return false;
913 }
914
915 /* V3D 7.x can't have more than one of these restricted peripherals */
916 const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
917 V3D_PERIPHERAL_TMU_WRTMUC_SIG |
918 V3D_PERIPHERAL_TSY |
919 V3D_PERIPHERAL_TLB_READ |
920 V3D_PERIPHERAL_SFU |
921 V3D_PERIPHERAL_VPM_READ |
922 V3D_PERIPHERAL_VPM_WRITE;
923
924 const uint32_t a_restricted = a_peripherals & restricted;
925 const uint32_t b_restricted = b_peripherals & restricted;
926 if (a_restricted && b_restricted) {
927 /* WRTMUC signal with TMU register write (other than tmuc) is
928 * allowed though.
929 */
930 if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
931 b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
932 v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
933 (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
934 a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
935 v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
936 return false;
937 }
938 }
939
940 /* Only one TMU read per instruction */
941 if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
942 (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
943 return false;
944 }
945
946 /* Only one TLB access per instruction */
947 if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
948 V3D_PERIPHERAL_TLB_READ)) &&
949 (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
950 V3D_PERIPHERAL_TLB_READ))) {
951 return false;
952 }
953
954 return true;
955 }
956
957 /* Compute a bitmask of which rf registers are used between
958 * the two instructions.
959 */
960 static uint64_t
961 qpu_raddrs_used(const struct v3d_qpu_instr *a,
962 const struct v3d_qpu_instr *b)
963 {
964 assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
965 assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
966
967 uint64_t raddrs_used = 0;
968 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
969 raddrs_used |= (UINT64_C(1) << a->raddr_a);
970 if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
971 raddrs_used |= (UINT64_C(1) << a->raddr_b);
972 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
973 raddrs_used |= (UINT64_C(1) << b->raddr_a);
974 if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
975 raddrs_used |= (UINT64_C(1) << b->raddr_b);
976
977 return raddrs_used;
978 }
979
980 /* Takes two instructions and attempts to merge their raddr fields (including
981 * small immediates) into one merged instruction. For V3D 4.x, returns false
982 * if the two instructions access more than two different rf registers between
983 * them, or more than one rf register and one small immediate. For 7.x returns
984 * false if both instructions use small immediates.
985 */
986 static bool
987 qpu_merge_raddrs(struct v3d_qpu_instr *result,
988 const struct v3d_qpu_instr *add_instr,
989 const struct v3d_qpu_instr *mul_instr,
990 const struct v3d_device_info *devinfo)
991 {
992 if (devinfo->ver >= 71) {
993 assert(add_instr->sig.small_imm_a +
994 add_instr->sig.small_imm_b <= 1);
995 assert(add_instr->sig.small_imm_c +
996 add_instr->sig.small_imm_d == 0);
997 assert(mul_instr->sig.small_imm_a +
998 mul_instr->sig.small_imm_b == 0);
999 assert(mul_instr->sig.small_imm_c +
1000 mul_instr->sig.small_imm_d <= 1);
1001
1002 result->sig.small_imm_a = add_instr->sig.small_imm_a;
1003 result->sig.small_imm_b = add_instr->sig.small_imm_b;
1004 result->sig.small_imm_c = mul_instr->sig.small_imm_c;
1005 result->sig.small_imm_d = mul_instr->sig.small_imm_d;
1006
1007 return (result->sig.small_imm_a +
1008 result->sig.small_imm_b +
1009 result->sig.small_imm_c +
1010 result->sig.small_imm_d) <= 1;
1011 }
1012
1013 assert(devinfo->ver == 42);
1014
1015 uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
1016 int naddrs = util_bitcount64(raddrs_used);
1017
1018 if (naddrs > 2)
1019 return false;
1020
1021 if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
1022 if (naddrs > 1)
1023 return false;
1024
1025 if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
1026 if (add_instr->raddr_b != mul_instr->raddr_b)
1027 return false;
1028
1029 result->sig.small_imm_b = true;
1030 result->raddr_b = add_instr->sig.small_imm_b ?
1031 add_instr->raddr_b : mul_instr->raddr_b;
1032 }
1033
1034 if (naddrs == 0)
1035 return true;
1036
1037 int raddr_a = ffsll(raddrs_used) - 1;
1038 raddrs_used &= ~(UINT64_C(1) << raddr_a);
1039 result->raddr_a = raddr_a;
1040
1041 if (!result->sig.small_imm_b) {
1042 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
1043 raddr_a == add_instr->raddr_b) {
1044 if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
1045 result->alu.add.a.mux = V3D_QPU_MUX_A;
1046 if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
1047 v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
1048 result->alu.add.b.mux = V3D_QPU_MUX_A;
1049 }
1050 }
1051 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
1052 raddr_a == mul_instr->raddr_b) {
1053 if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
1054 result->alu.mul.a.mux = V3D_QPU_MUX_A;
1055 if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
1056 v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
1057 result->alu.mul.b.mux = V3D_QPU_MUX_A;
1058 }
1059 }
1060 }
1061 if (!raddrs_used)
1062 return true;
1063
1064 int raddr_b = ffsll(raddrs_used) - 1;
1065 result->raddr_b = raddr_b;
1066 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
1067 raddr_b == add_instr->raddr_a) {
1068 if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
1069 result->alu.add.a.mux = V3D_QPU_MUX_B;
1070 if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
1071 v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
1072 result->alu.add.b.mux = V3D_QPU_MUX_B;
1073 }
1074 }
1075 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
1076 raddr_b == mul_instr->raddr_a) {
1077 if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
1078 result->alu.mul.a.mux = V3D_QPU_MUX_B;
1079 if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
1080 v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
1081 result->alu.mul.b.mux = V3D_QPU_MUX_B;
1082 }
1083 }
1084
1085 return true;
1086 }
1087
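/* Returns true if this add ALU op has an equivalent mul ALU op, so that the
 * operation can be moved to the mul slot to enable pairing.
 */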
1088 static bool
1089 can_do_add_as_mul(enum v3d_qpu_add_op op)
1090 {
1091 switch (op) {
1092 case V3D_QPU_A_ADD:
1093 case V3D_QPU_A_SUB:
1094 return true;
1095 default:
1096 return false;
1097 }
1098 }
1099
1100 static enum v3d_qpu_mul_op
1101 add_op_as_mul_op(enum v3d_qpu_add_op op)
1102 {
1103 switch (op) {
1104 case V3D_QPU_A_ADD:
1105 return V3D_QPU_M_ADD;
1106 case V3D_QPU_A_SUB:
1107 return V3D_QPU_M_SUB;
1108 default:
1109 unreachable("unexpected add opcode");
1110 }
1111 }
1112
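/* Rewrites an instruction that only uses the add ALU so that the operation
 * runs on the mul ALU instead, moving over its flags, pack/unpack modifiers
 * and (on V3D 7.x) small immediates.
 */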
1113 static void
1114 qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
1115 struct v3d_qpu_instr *inst)
1116 {
1117 STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
1118 assert(inst->alu.add.op != V3D_QPU_A_NOP);
1119 assert(inst->alu.mul.op == V3D_QPU_M_NOP);
1120
1121 memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
1122 inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
1123 inst->alu.add.op = V3D_QPU_A_NOP;
1124
1125 inst->flags.mc = inst->flags.ac;
1126 inst->flags.mpf = inst->flags.apf;
1127 inst->flags.muf = inst->flags.auf;
1128 inst->flags.ac = V3D_QPU_COND_NONE;
1129 inst->flags.apf = V3D_QPU_PF_NONE;
1130 inst->flags.auf = V3D_QPU_UF_NONE;
1131
1132 inst->alu.mul.output_pack = inst->alu.add.output_pack;
1133
1134 inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
1135 inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
1136 inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
1137 inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
1138 inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
1139
1140 if (devinfo->ver >= 71) {
1141 assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
1142 assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
1143 if (inst->sig.small_imm_a) {
1144 inst->sig.small_imm_c = true;
1145 inst->sig.small_imm_a = false;
1146 } else if (inst->sig.small_imm_b) {
1147 inst->sig.small_imm_d = true;
1148 inst->sig.small_imm_b = false;
1149 }
1150 }
1151 }
1152
1153 static bool
1154 can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
1155 {
1156 switch (op) {
1157 case V3D_QPU_M_MOV:
1158 case V3D_QPU_M_FMOV:
1159 return devinfo->ver >= 71;
1160 default:
1161 return false;
1162 }
1163 }
1164
1165 static enum v3d_qpu_mul_op
1166 mul_op_as_add_op(enum v3d_qpu_mul_op op)
1167 {
1168 switch (op) {
1169 case V3D_QPU_M_MOV:
1170 return V3D_QPU_A_MOV;
1171 case V3D_QPU_M_FMOV:
1172 return V3D_QPU_A_FMOV;
1173 default:
1174 unreachable("unexpected mov opcode");
1175 }
1176 }
1177
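/* Inverse of qpu_convert_add_to_mul(): moves a mul ALU operation with an
 * add ALU equivalent over to the add slot.
 */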
1178 static void
1179 qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
1180 {
1181 STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
1182 assert(inst->alu.mul.op != V3D_QPU_M_NOP);
1183 assert(inst->alu.add.op == V3D_QPU_A_NOP);
1184
1185 memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
1186 inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
1187 inst->alu.mul.op = V3D_QPU_M_NOP;
1188
1189 inst->flags.ac = inst->flags.mc;
1190 inst->flags.apf = inst->flags.mpf;
1191 inst->flags.auf = inst->flags.muf;
1192 inst->flags.mc = V3D_QPU_COND_NONE;
1193 inst->flags.mpf = V3D_QPU_PF_NONE;
1194 inst->flags.muf = V3D_QPU_UF_NONE;
1195
1196 inst->alu.add.output_pack = inst->alu.mul.output_pack;
1197 inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
1198 inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
1199 inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
1200 inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
1201 inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
1202
1203 assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
1204 assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
1205 if (inst->sig.small_imm_c) {
1206 inst->sig.small_imm_a = true;
1207 inst->sig.small_imm_c = false;
1208 } else if (inst->sig.small_imm_d) {
1209 inst->sig.small_imm_b = true;
1210 inst->sig.small_imm_d = false;
1211 }
1212 }
1213
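/* Tries to merge two ALU instructions into a single dual-issue QPU
 * instruction, converting ops between the add and mul ALUs where possible,
 * merging raddrs and signals, and finally checking that the result still
 * packs into a valid instruction.
 */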
1214 static bool
1215 qpu_merge_inst(const struct v3d_device_info *devinfo,
1216 struct v3d_qpu_instr *result,
1217 const struct v3d_qpu_instr *a,
1218 const struct v3d_qpu_instr *b)
1219 {
1220 if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
1221 b->type != V3D_QPU_INSTR_TYPE_ALU) {
1222 return false;
1223 }
1224
1225 if (!qpu_compatible_peripheral_access(devinfo, a, b))
1226 return false;
1227
1228 struct v3d_qpu_instr merge = *a;
1229 const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
1230
1231 struct v3d_qpu_instr mul_inst;
1232 if (b->alu.add.op != V3D_QPU_A_NOP) {
1233 if (a->alu.add.op == V3D_QPU_A_NOP) {
1234 merge.alu.add = b->alu.add;
1235
1236 merge.flags.ac = b->flags.ac;
1237 merge.flags.apf = b->flags.apf;
1238 merge.flags.auf = b->flags.auf;
1239
1240 add_instr = b;
1241 mul_instr = a;
1242 }
1243 /* If a's add op is used but its mul op is not, then see if we
1244 * can convert either a's add op or b's add op to a mul op
1245 * so we can merge.
1246 */
1247 else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1248 can_do_add_as_mul(b->alu.add.op)) {
1249 mul_inst = *b;
1250 qpu_convert_add_to_mul(devinfo, &mul_inst);
1251
1252 merge.alu.mul = mul_inst.alu.mul;
1253
1254 merge.flags.mc = mul_inst.flags.mc;
1255 merge.flags.mpf = mul_inst.flags.mpf;
1256 merge.flags.muf = mul_inst.flags.muf;
1257
1258 add_instr = a;
1259 mul_instr = &mul_inst;
1260 } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1261 can_do_add_as_mul(a->alu.add.op)) {
1262 mul_inst = *a;
1263 qpu_convert_add_to_mul(devinfo, &mul_inst);
1264
1265 merge = mul_inst;
1266 merge.alu.add = b->alu.add;
1267
1268 merge.flags.ac = b->flags.ac;
1269 merge.flags.apf = b->flags.apf;
1270 merge.flags.auf = b->flags.auf;
1271
1272 add_instr = b;
1273 mul_instr = &mul_inst;
1274 } else {
1275 return false;
1276 }
1277 }
1278
1279 struct v3d_qpu_instr add_inst;
1280 if (b->alu.mul.op != V3D_QPU_M_NOP) {
1281 if (a->alu.mul.op == V3D_QPU_M_NOP) {
1282 merge.alu.mul = b->alu.mul;
1283
1284 merge.flags.mc = b->flags.mc;
1285 merge.flags.mpf = b->flags.mpf;
1286 merge.flags.muf = b->flags.muf;
1287
1288 mul_instr = b;
1289 add_instr = a;
1290 }
1291 /* If a's mul op is used but its add op is not, then see if we
1292 * can convert either a's mul op or b's mul op to an add op
1293 * so we can merge.
1294 */
1295 else if (a->alu.add.op == V3D_QPU_A_NOP &&
1296 can_do_mul_as_add(devinfo, b->alu.mul.op)) {
1297 add_inst = *b;
1298 qpu_convert_mul_to_add(&add_inst);
1299
1300 merge.alu.add = add_inst.alu.add;
1301
1302 merge.flags.ac = add_inst.flags.ac;
1303 merge.flags.apf = add_inst.flags.apf;
1304 merge.flags.auf = add_inst.flags.auf;
1305
1306 mul_instr = a;
1307 add_instr = &add_inst;
1308 } else if (a->alu.add.op == V3D_QPU_A_NOP &&
1309 can_do_mul_as_add(devinfo, a->alu.mul.op)) {
1310 add_inst = *a;
1311 qpu_convert_mul_to_add(&add_inst);
1312
1313 merge = add_inst;
1314 merge.alu.mul = b->alu.mul;
1315
1316 merge.flags.mc = b->flags.mc;
1317 merge.flags.mpf = b->flags.mpf;
1318 merge.flags.muf = b->flags.muf;
1319
1320 mul_instr = b;
1321 add_instr = &add_inst;
1322 } else {
1323 return false;
1324 }
1325 }
1326
1327 /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
1328 * they have restrictions on the number of raddrs that can be addressed
1329 * in a single instruction. In V3D 7.x, we don't have that restriction,
1330 * but we are still limited to a single small immediate per instruction.
1331 */
1332 if (add_instr && mul_instr &&
1333 !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
1334 return false;
1335 }
1336
1337 merge.sig.thrsw |= b->sig.thrsw;
1338 merge.sig.ldunif |= b->sig.ldunif;
1339 merge.sig.ldunifrf |= b->sig.ldunifrf;
1340 merge.sig.ldunifa |= b->sig.ldunifa;
1341 merge.sig.ldunifarf |= b->sig.ldunifarf;
1342 merge.sig.ldtmu |= b->sig.ldtmu;
1343 merge.sig.ldvary |= b->sig.ldvary;
1344 merge.sig.ldvpm |= b->sig.ldvpm;
1345 merge.sig.ldtlb |= b->sig.ldtlb;
1346 merge.sig.ldtlbu |= b->sig.ldtlbu;
1347 merge.sig.ucb |= b->sig.ucb;
1348 merge.sig.rotate |= b->sig.rotate;
1349 merge.sig.wrtmuc |= b->sig.wrtmuc;
1350
1351 if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
1352 v3d_qpu_sig_writes_address(devinfo, &b->sig))
1353 return false;
1354 merge.sig_addr |= b->sig_addr;
1355 merge.sig_magic |= b->sig_magic;
1356
1357 uint64_t packed;
1358 bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
1359
1360 *result = merge;
1361 /* No modifying the real instructions on failure. */
1362 assert(ok || (a != result && b != result));
1363
1364 return ok;
1365 }
1366
1367 static inline bool
1368 try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
1369 {
1370 return inst->sig.ldunif || inst->sig.ldunifrf;
1371 }
1372
1373 static bool
1374 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1375 struct choose_scoreboard *scoreboard,
1376 const struct qinst *qinst);
1377
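/* Picks the next DAG head to schedule, or an instruction to pair with
 * prev_inst when it is non-NULL, skipping candidates that would violate
 * hardware hazards and preferring higher-priority instructions with the
 * longest remaining dependency chains.
 */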
1378 static struct schedule_node *
1379 choose_instruction_to_schedule(struct v3d_compile *c,
1380 struct choose_scoreboard *scoreboard,
1381 struct schedule_node *prev_inst)
1382 {
1383 struct schedule_node *chosen = NULL;
1384 int chosen_prio = 0;
1385
1386 /* Don't pair up anything with a thread switch signal -- emit_thrsw()
1387 * will handle pairing it along with filling the delay slots.
1388 */
1389 if (prev_inst) {
1390 if (prev_inst->inst->qpu.sig.thrsw)
1391 return NULL;
1392 }
1393
1394 bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
1395 scoreboard->ldvary_count < c->num_inputs;
1396 bool skipped_insts_for_ldvary_pipelining = false;
1397 retry:
1398 list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
1399 dag.link) {
1400 const struct v3d_qpu_instr *inst = &n->inst->qpu;
1401
1402 if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
1403 skipped_insts_for_ldvary_pipelining = true;
1404 continue;
1405 }
1406
1407 /* Don't choose the branch instruction until it's the last one
1408 * left. We'll move it up to fit its delay slots after we
1409 * choose it.
1410 */
1411 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
1412 !list_is_singular(&scoreboard->dag->heads)) {
1413 continue;
1414 }
1415
1416 /* We need to have 3 delay slots between a write to unifa and
1417 * a follow-up ldunifa.
1418 */
1419 if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
1420 scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
1421 continue;
1422
1423 /* "An instruction must not read from a location in physical
1424 * regfile A or B that was written to by the previous
1425 * instruction."
1426 */
1427 if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1428 continue;
1429
1430 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1431 continue;
1432
1433 /* "Before doing a TLB access a scoreboard wait must have been
1434 * done. This happens either on the first or last thread
1435 * switch, depending on a setting (scb_wait_on_first_thrsw) in
1436 * the shader state."
1437 */
1438 if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1439 continue;
1440
1441 /* ldunif and ldvary both write the same register (r5 for v42
1442 * and below, rf0 for v71), but ldunif does so a tick sooner.
1443 * If the ldvary's register wasn't used, then ldunif might
1444 * otherwise get scheduled so ldunif and ldvary try to update
1445 * the register in the same tick.
1446 */
1447 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
1448 scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
1449 continue;
1450 }
1451
1452 /* If we are in a thrsw delay slot, check that this instruction
1453 * is valid for that.
1454 */
1455 if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
1456 !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
1457 n->inst)) {
1458 continue;
1459 }
1460
1461 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1462 /* Don't try to put a branch in the delay slots of another
1463 * branch or a unifa write.
1464 */
1465 if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
1466 continue;
1467 if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
1468 continue;
1469
1470 /* No branch with cond != 0,2,3 and msfign != 0 after
1471 * setmsf.
1472 */
1473 if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
1474 inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
1475 inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
1476 inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
1477 inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
1478 continue;
1479 }
1480 }
1481
1482 /* If we're trying to pair with another instruction, check
1483 * that they're compatible.
1484 */
1485 if (prev_inst) {
1486 /* Don't pair up a thread switch signal -- we'll
1487 * handle pairing it when we pick it on its own.
1488 */
1489 if (inst->sig.thrsw)
1490 continue;
1491
1492 if (prev_inst->inst->uniform != -1 &&
1493 n->inst->uniform != -1)
1494 continue;
1495
1496 /* Simulator complains if we have two uniforms loaded in
1497 * the same instruction, which could happen if we
1498 * have a ldunif or sideband uniform and we pair that
1499 * with ldunifa.
1500 */
1501 if (vir_has_uniform(prev_inst->inst) &&
1502 (inst->sig.ldunifa || inst->sig.ldunifarf)) {
1503 continue;
1504 }
1505
1506 if ((prev_inst->inst->qpu.sig.ldunifa ||
1507 prev_inst->inst->qpu.sig.ldunifarf) &&
1508 vir_has_uniform(n->inst)) {
1509 continue;
1510 }
1511
1512 /* Don't merge TLB instructions before we have acquired
1513 * the scoreboard lock.
1514 */
1515 if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1516 continue;
1517
1518 /* When we successfully pair up an ldvary we then try
1519 * to merge it into the previous instruction if
1520 * possible to improve pipelining. Don't pick up the
1521 * ldvary now if the follow-up fixup would place
1522 * it in the delay slots of a thrsw, which is not
1523 * allowed and would prevent the fixup from being
1524 * successful. In V3D 7.x we can allow this to happen
1525 * as long as it is not the last delay slot.
1526 */
1527 if (inst->sig.ldvary) {
1528 if (c->devinfo->ver == 42 &&
1529 scoreboard->last_thrsw_tick + 2 >=
1530 scoreboard->tick - 1) {
1531 continue;
1532 }
1533 if (c->devinfo->ver >= 71 &&
1534 scoreboard->last_thrsw_tick + 2 ==
1535 scoreboard->tick - 1) {
1536 continue;
1537 }
1538 }
1539
1540 /* We can emit a new tmu lookup with a previous ldtmu
1541 * if doing this would free just enough space in the
1542 * TMU output fifo so we don't overflow, however, this
1543 * is only safe if the ldtmu cannot stall.
1544 *
1545 * A ldtmu can stall if it is not the first following a
1546 * thread switch and corresponds to the first word of a
1547 * read request.
1548 *
1549 * FIXME: For now we forbid pairing up a new lookup
1550 * with a previous ldtmu that is not the first after a
1551 * thrsw if that could overflow the TMU output fifo
1552 * regardless of whether the ldtmu is reading the first
1553 * word of a TMU result or not, since we don't track
1554 * this aspect in the compiler yet.
1555 */
1556 if (prev_inst->inst->qpu.sig.ldtmu &&
1557 is_tmu_sequence_terminator(n->inst) &&
1558 !scoreboard->first_ldtmu_after_thrsw &&
1559 (scoreboard->pending_ldtmu_count +
1560 n->inst->ldtmu_count > 16 / c->threads)) {
1561 continue;
1562 }
1563
1564 struct v3d_qpu_instr merged_inst;
1565 if (!qpu_merge_inst(c->devinfo, &merged_inst,
1566 &prev_inst->inst->qpu, inst)) {
1567 continue;
1568 }
1569 }
1570
1571 int prio = get_instruction_priority(c->devinfo, inst);
1572
1573 if (read_stalls(c->devinfo, scoreboard, inst)) {
1574 /* Don't merge an instruction that stalls */
1575 if (prev_inst)
1576 continue;
1577 else {
1578 /* Any instruction that doesn't stall will have
1579 * higher scheduling priority */
1580 prio -= MAX_SCHEDULE_PRIORITY;
1581 assert(prio < 0);
1582 }
1583 }
1584
1585 /* Found a valid instruction. If nothing better comes along,
1586 * this one works.
1587 */
1588 if (!chosen) {
1589 chosen = n;
1590 chosen_prio = prio;
1591 continue;
1592 }
1593
1594 if (prio > chosen_prio) {
1595 chosen = n;
1596 chosen_prio = prio;
1597 } else if (prio < chosen_prio) {
1598 continue;
1599 }
1600
1601 if (n->delay > chosen->delay) {
1602 chosen = n;
1603 chosen_prio = prio;
1604 } else if (n->delay < chosen->delay) {
1605 continue;
1606 }
1607 }
1608
1609 /* If we did not find any instruction to schedule but we discarded
1610 * some of them to prioritize ldvary pipelining, try again.
1611 */
1612 if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
1613 skipped_insts_for_ldvary_pipelining = false;
1614 ldvary_pipelining = false;
1615 goto retry;
1616 }
1617
1618 if (chosen && chosen->inst->qpu.sig.ldvary) {
1619 scoreboard->ldvary_count++;
1620 /* If we are pairing an ldvary, flag it so we can fix it up for
1621 * optimal pipelining of ldvary sequences.
1622 */
1623 if (prev_inst)
1624 scoreboard->fixup_ldvary = true;
1625 }
1626
1627 return chosen;
1628 }
1629
1630 static void
1631 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
1632 enum v3d_qpu_waddr waddr,
1633 const struct v3d_device_info *devinfo)
1634 {
1635 if (v3d_qpu_magic_waddr_is_sfu(waddr))
1636 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
1637 else if (waddr == V3D_QPU_WADDR_UNIFA)
1638 scoreboard->last_unifa_write_tick = scoreboard->tick;
1639 }
1640
1641 static void
1642 update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
1643 const struct v3d_qpu_instr *inst)
1644 {
1645 if (v3d_qpu_instr_is_sfu(inst)) {
1646 scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
1647 scoreboard->last_stallable_sfu_tick = scoreboard->tick;
1648 }
1649 }
1650
1651 static void
1652 update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
1653 const struct qinst *inst)
1654 {
1655 /* Track if we have seen any ldtmu after the last thread switch */
1656 if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
1657 scoreboard->first_ldtmu_after_thrsw = true;
1658
1659 /* Track the number of pending ldtmu instructions for outstanding
1660 * TMU lookups.
1661 */
1662 scoreboard->pending_ldtmu_count += inst->ldtmu_count;
1663 if (inst->qpu.sig.ldtmu) {
1664 assert(scoreboard->pending_ldtmu_count > 0);
1665 scoreboard->pending_ldtmu_count--;
1666 scoreboard->first_ldtmu_after_thrsw = false;
1667 }
1668 }
1669
1670 static void
1671 set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
1672 const struct v3d_qpu_instr *inst,
1673 const struct v3d_device_info *devinfo)
1674 {
1675 if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
1676 v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
1677 !inst->sig_magic) {
1678 scoreboard->has_rf0_flops_conflict = true;
1679 }
1680 }
1681
1682 static void
1683 update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
1684 const struct v3d_qpu_instr *inst,
1685 const struct v3d_device_info *devinfo)
1686 {
1687 if (devinfo->ver < 71)
1688 return;
1689
1690 /* Thread switch restrictions:
1691 *
1692 * At the point of a thread switch or thread end (when the actual
1693 * thread switch or thread end happens, not when the signalling
1694 * instruction is processed):
1695 *
1696 * - If the most recent write to rf0 was from a ldunif, ldunifa, or
1697 * ldvary instruction in which another signal also wrote to the
1698 * register file, and the final instruction of the thread section
1699 * contained a signal which wrote to the register file, then the
1700 * value of rf0 is undefined at the start of the new section
1701 *
1702 * Here we use the scoreboard to track if our last rf0 implicit write
1703 * happens at the same time that another signal writes the register
1704 * file (has_rf0_flops_conflict). We will use that information when
1705 * scheduling thrsw instructions to avoid putting anything in their
1706 * last delay slot which has a signal that writes to the register file.
1707 */
1708
1709 /* Reset tracking if we have an explicit rf0 write or we are starting
1710 * a new thread section.
1711 */
1712 if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
1713 scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
1714 scoreboard->last_implicit_rf0_write_tick = -10;
1715 scoreboard->has_rf0_flops_conflict = false;
1716 }
1717
1718 if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
1719 scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
1720 scoreboard->tick + 1 : scoreboard->tick;
1721 }
1722
1723 set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
1724 }
1725
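/* Updates the scoreboard after committing an instruction: records the ticks
 * of magic waddr writes, stallable SFU writes, setmsf, ldvary and implicit
 * rf0 writes, and updates the TMU output FIFO tracking.
 */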
1726 static void
1727 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
1728 const struct qinst *qinst,
1729 const struct v3d_device_info *devinfo)
1730 {
1731 const struct v3d_qpu_instr *inst = &qinst->qpu;
1732
1733 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1734 return;
1735
1736 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1737
1738 if (inst->alu.add.op != V3D_QPU_A_NOP) {
1739 if (inst->alu.add.magic_write) {
1740 update_scoreboard_for_magic_waddr(scoreboard,
1741 inst->alu.add.waddr,
1742 devinfo);
1743 } else {
1744 update_scoreboard_for_sfu_stall_waddr(scoreboard,
1745 inst);
1746 }
1747
1748 if (inst->alu.add.op == V3D_QPU_A_SETMSF)
1749 scoreboard->last_setmsf_tick = scoreboard->tick;
1750 }
1751
1752 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
1753 if (inst->alu.mul.magic_write) {
1754 update_scoreboard_for_magic_waddr(scoreboard,
1755 inst->alu.mul.waddr,
1756 devinfo);
1757 }
1758 }
1759
1760 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
1761 update_scoreboard_for_magic_waddr(scoreboard,
1762 inst->sig_addr,
1763 devinfo);
1764 }
1765
1766 if (inst->sig.ldvary)
1767 scoreboard->last_ldvary_tick = scoreboard->tick;
1768
1769 update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
1770
1771 update_scoreboard_tmu_tracking(scoreboard, qinst);
1772 }
1773
1774 static void
1775 dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
1776 {
1777 list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
1778 fprintf(stderr, " t=%4d: ", n->unblocked_time);
1779 v3d_qpu_dump(devinfo, &n->inst->qpu);
1780 fprintf(stderr, "\n");
1781
1782 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1783 struct schedule_node *child =
1784 (struct schedule_node *)edge->child;
1785 if (!child)
1786 continue;
1787
1788 fprintf(stderr, " - ");
1789 v3d_qpu_dump(devinfo, &child->inst->qpu);
1790 fprintf(stderr, " (%d parents, %c)\n",
1791 child->dag.parent_count,
1792 edge->data ? 'w' : 'r');
1793 }
1794 }
1795 }
1796
1797 static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1798 enum v3d_qpu_waddr waddr,
1799 const struct v3d_qpu_instr *after)
1800 {
1801 /* Apply some huge latency between texture fetch requests and getting
1802 * their results back.
1803 *
1804 * FIXME: This is actually pretty bogus. If we do:
1805 *
1806 * mov tmu0_s, a
1807 * <a bit of math>
1808 * mov tmu0_s, b
1809 * load_tmu0
1810 * <more math>
1811 * load_tmu0
1812 *
1813 * we count that as worse than
1814 *
1815 * mov tmu0_s, a
1816 * mov tmu0_s, b
1817 * <lots of math>
1818 * load_tmu0
1819 * <more math>
1820 * load_tmu0
1821 *
1822 * because we associate the first load_tmu0 with the *second* tmu0_s.
1823 */
1824 if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1825 v3d_qpu_waits_on_tmu(after)) {
1826 return 100;
1827 }
1828
1829 /* Assume that anything depending on us is consuming the SFU result. */
1830 if (v3d_qpu_magic_waddr_is_sfu(waddr))
1831 return 3;
1832
1833 return 1;
1834 }
1835
1836 static uint32_t
1837 instruction_latency(const struct v3d_device_info *devinfo,
1838 struct schedule_node *before, struct schedule_node *after)
1839 {
1840 const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1841 const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1842 uint32_t latency = 1;
1843
1844 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1845 after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1846 return latency;
1847
1848 if (v3d_qpu_instr_is_sfu(before_inst))
1849 return 2;
1850
1851 if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
1852 before_inst->alu.add.magic_write) {
1853 latency = MAX2(latency,
1854 magic_waddr_latency(devinfo,
1855 before_inst->alu.add.waddr,
1856 after_inst));
1857 }
1858
1859 if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
1860 before_inst->alu.mul.magic_write) {
1861 latency = MAX2(latency,
1862 magic_waddr_latency(devinfo,
1863 before_inst->alu.mul.waddr,
1864 after_inst));
1865 }
1866
1867 return latency;
1868 }
1869
1870 /** Recursive computation of the delay member of a node. */
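/* For illustration (hypothetical chain A -> B -> C where A is an SFU op):
 * delay(C) = 1, delay(B) = delay(C) + 1 = 2 and delay(A) = delay(B) + 2 = 4,
 * since instruction_latency() reports 2 cycles after an SFU instruction.
 */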
1871 static void
1872 compute_delay(struct dag_node *node, void *state)
1873 {
1874 struct schedule_node *n = (struct schedule_node *)node;
1875 struct v3d_compile *c = (struct v3d_compile *) state;
1876
1877 n->delay = 1;
1878
1879 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1880 struct schedule_node *child =
1881 (struct schedule_node *)edge->child;
1882
1883 n->delay = MAX2(n->delay, (child->delay +
1884 instruction_latency(c->devinfo, n,
1885 child)));
1886 }
1887 }
1888
1889 /* Removes a DAG head, removing only the WAR edges. (dag_prune_head()
1890 * should be called on it later to finish pruning the other edges.)
1891 */
1892 static void
1893 pre_remove_head(struct dag *dag, struct schedule_node *n)
1894 {
1895 list_delinit(&n->dag.link);
1896
1897 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1898 if (edge->data)
1899 dag_remove_edge(dag, edge);
1900 }
1901 }
1902
1903 static void
1904 mark_instruction_scheduled(const struct v3d_device_info *devinfo,
1905 struct dag *dag,
1906 uint32_t time,
1907 struct schedule_node *node)
1908 {
1909 if (!node)
1910 return;
1911
1912 util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1913 struct schedule_node *child =
1914 (struct schedule_node *)edge->child;
1915
1916 if (!child)
1917 continue;
1918
1919 uint32_t latency = instruction_latency(devinfo, node, child);
1920
1921 child->unblocked_time = MAX2(child->unblocked_time,
1922 time + latency);
1923 }
1924 dag_prune_head(dag, &node->dag);
1925 }
1926
1927 static void
1928 insert_scheduled_instruction(struct v3d_compile *c,
1929 struct qblock *block,
1930 struct choose_scoreboard *scoreboard,
1931 struct qinst *inst)
1932 {
1933 list_addtail(&inst->link, &block->instructions);
1934
1935 update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
1936 c->qpu_inst_count++;
1937 scoreboard->tick++;
1938 }
1939
1940 static struct qinst *
1941 vir_nop()
1942 {
1943 struct qreg undef = vir_nop_reg();
1944 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1945
1946 return qinst;
1947 }
1948
1949 static void
1950 emit_nop(struct v3d_compile *c, struct qblock *block,
1951 struct choose_scoreboard *scoreboard)
1952 {
1953 insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1954 }
1955
1956 static bool
1957 qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1958 const struct qinst *qinst, int slot)
1959 {
1960 const struct v3d_qpu_instr *inst = &qinst->qpu;
1961
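/* 'slot' counts forward from the instruction that carries the thrsw
 * signal, so in a full 3-instruction thread-end sequence slot 2 is the
 * final instruction of the program (see the GFXH-1625 check below).
 */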
1962 if (slot == 2 && qinst->is_tlb_z_write)
1963 return false;
1964
1965 if (slot > 0 && qinst->uniform != ~0)
1966 return false;
1967
1968 if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
1969 return false;
1970
1971 if (inst->sig.ldvary)
1972 return false;
1973
1974 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1975 /* GFXH-1625: TMUWT not allowed in the final instruction. */
1976 if (c->devinfo->ver == 42 && slot == 2 &&
1977 inst->alu.add.op == V3D_QPU_A_TMUWT) {
1978 return false;
1979 }
1980
1981 if (c->devinfo->ver == 42) {
1982 /* No writing physical registers at the end. */
1983 bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
1984 bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
1985 if ((!add_is_nop && !inst->alu.add.magic_write) ||
1986 (!mul_is_nop && !inst->alu.mul.magic_write)) {
1987 return false;
1988 }
1989
1990 if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
1991 !inst->sig_magic) {
1992 return false;
1993 }
1994 }
1995
1996 if (c->devinfo->ver >= 71) {
1997 /* The thread end instruction must not write to the
1998 * register file via the add/mul ALUs.
1999 */
2000 if (slot == 0 &&
2001 (!inst->alu.add.magic_write ||
2002 !inst->alu.mul.magic_write)) {
2003 return false;
2004 }
2005 }
2006
2007 if (c->devinfo->ver == 42) {
2008 /* RF0-2 might be overwritten during the delay slots by
2009 * fragment shader setup.
2010 */
2011 if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
2012 return false;
2013
2014 if (inst->raddr_b < 3 &&
2015 !inst->sig.small_imm_b &&
2016 v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
2017 return false;
2018 }
2019 }
2020
2021 if (c->devinfo->ver >= 71) {
2022 /* RF2-3 might be overwritten during the delay slots by
2023 * fragment shader setup.
2024 */
2025 if (v3d71_qpu_reads_raddr(inst, 2) ||
2026 v3d71_qpu_reads_raddr(inst, 3)) {
2027 return false;
2028 }
2029
2030 if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
2031 v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
2032 return false;
2033 }
2034 }
2035 }
2036
2037 return true;
2038 }
2039
2040 /**
2041 * This is called when trying to merge a thrsw back into the stream of
2042 * instructions that were scheduled *before* the thrsw signal, to fill its
2043 * delay slots. Because the actual execution of the thrsw happens after the
2044 * delay slots, it is usually safe to do this, but there are some cases that
2045 * need special care.
2046 */
2047 static bool
2048 qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2049 struct choose_scoreboard *scoreboard,
2050 const struct qinst *qinst,
2051 uint32_t slot)
2052 {
2053 /* No scheduling SFU when the result would land in the other
2054 * thread. The simulator complains for safety, though it
2055 * would only occur for dead code in our case.
2056 */
2057 if (slot > 0) {
2058 if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
2059 return false;
2060 if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu))
2061 return false;
2062 }
2063
2064 if (qinst->qpu.sig.ldvary) {
2065 if (c->devinfo->ver == 42 && slot > 0)
2066 return false;
2067 if (c->devinfo->ver >= 71 && slot == 2)
2068 return false;
2069 }
2070
2071 /* unifa and the following 3 instructions can't overlap a
2072 * thread switch/end. The docs further clarify that this means
2073 * the cycle at which the actual thread switch/end happens
2074 * and not when the thrsw instruction is processed, which would
2075 * be after the 2 delay slots following the thrsw instruction.
2076 * This means that we can move a thrsw up to the instruction
2077 * right after unifa:
2078 *
2079 * unifa, r5
2080 * thrsw
2081 * delay slot 1
2082 * delay slot 2
2083 * Thread switch happens here, 4 instructions away from unifa
2084 */
2085 if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
2086 return false;
2087
2088 /* See comment when we set has_rf0_flops_conflict for details */
2089 if (c->devinfo->ver >= 71 &&
2090 slot == 2 &&
2091 v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
2092 !qinst->qpu.sig_magic) {
2093 if (scoreboard->has_rf0_flops_conflict)
2094 return false;
2095 if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
2096 return false;
2097 }
2098
2099 return true;
2100 }
2101
2102 /**
2103 * This is called for instructions scheduled *after* a thrsw signal that may
2104 * land in the delay slots of the thrsw. Because these instructions were
2105 * scheduled after the thrsw, we need to be careful when placing them into
2106 * the delay slots, since that means that we are moving them ahead of the
2107 * thread switch and we need to ensure that is not a problem.
2108 */
2109 static bool
2110 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2111 struct choose_scoreboard *scoreboard,
2112 const struct qinst *qinst)
2113 {
2114 const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
2115 assert(slot <= 2);
2116
2117 /* We merge thrsw instructions back into the instruction stream
2118 * manually, so any instructions scheduled after a thrsw should be
2119 * in the actual delay slots and not in the same slot as the thrsw.
2120 */
2121 assert(slot >= 1);
2122
2123 /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
2124 if (qinst->qpu.sig.thrsw)
2125 return false;
2126
2127 /* The restrictions for instructions scheduled before the thrsw
2128 * also apply to instructions scheduled after the thrsw that we want
2129 * to place in its delay slots.
2130 */
2131 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
2132 return false;
2133
2134 /* TLB access is disallowed until scoreboard wait is executed, which
2135 * we do on the last thread switch.
2136 */
2137 if (qpu_inst_is_tlb(&qinst->qpu))
2138 return false;
2139
2140 /* Instruction sequence restrictions: Branch is not allowed in delay
2141 * slots of a thrsw.
2142 */
2143 if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2144 return false;
2145
2146 /* Miscellaneous restrictions: At the point of a thrsw we need to have
2147 * at least one outstanding lookup or TSY wait.
2148 *
2149 * So avoid placing TMU instructions scheduled after the thrsw into
2150 * its delay slots or we may be compromising the integrity of our TMU
2151 * sequences. Also, notice that if we moved these instructions into
2152 * the delay slots of a previous thrsw we could overflow our TMU output
2153 * fifo, since we could be effectively pipelining a lookup scheduled
2154 * after the thrsw into the sequence before the thrsw.
2155 */
2156 if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
2157 qinst->qpu.sig.wrtmuc) {
2158 return false;
2159 }
2160
2161 /* Don't move instructions that wait on the TMU before the thread switch
2162 * happens since that would make the current thread stall before the
2163 * switch, which is exactly what we want to avoid with the thrsw
2164 * instruction.
2165 */
2166 if (v3d_qpu_waits_on_tmu(&qinst->qpu))
2167 return false;
2168
2169 /* A thread switch invalidates all accumulators, so don't place any
2170 * instructions that write accumulators into the delay slots.
2171 */
2172 if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
2173 return false;
2174
2175 /* Multop has an implicit write to the rtop register, which is a
2176 * specialized accumulator that is only used with this instruction.
2177 */
2178 if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
2179 return false;
2180
2181 /* Flags are invalidated across a thread switch, so don't place
2182 * instructions that write flags into delay slots.
2183 */
2184 if (v3d_qpu_writes_flags(&qinst->qpu))
2185 return false;
2186
2187 /* TSY sync ops materialize at the point of the next thread switch,
2188 * therefore, if we have a TSY sync right after a thread switch, we
2189 * cannot place it in its delay slots, or we would be moving the sync
2190 * to the thrsw before it instead.
2191 */
2192 if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
2193 return false;
2194
2195 return true;
2196 }
2197
2198 static bool
2199 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
2200 struct qinst *qinst, int instructions_in_sequence,
2201 bool is_thrend)
2202 {
2203 for (int slot = 0; slot < instructions_in_sequence; slot++) {
2204 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
2205 qinst, slot)) {
2206 return false;
2207 }
2208
2209 if (is_thrend &&
2210 !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
2211 return false;
2212 }
2213
2214 /* Note that the list is circular, so we can only do this up
2215 * to instructions_in_sequence.
2216 */
2217 qinst = (struct qinst *)qinst->link.next;
2218 }
2219
2220 return true;
2221 }
2222
2223 /**
2224 * Emits a THRSW signal in the stream, trying to move it up to pair with
2225 * another instruction.
2226 */
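/* For illustration (hypothetical): with three already-scheduled ALU
 * instructions
 *
 *   add rf1, rf2, rf3
 *   mul rf4, rf5, rf6
 *   xor rf7, rf8, rf9
 *
 * the thrsw signal can be merged into the first one, turning the other two
 * into its delay slots, provided the sequence passes valid_thrsw_sequence()
 * and the combined signals pack with v3d_qpu_sig_pack().
 */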
2227 static int
2228 emit_thrsw(struct v3d_compile *c,
2229 struct qblock *block,
2230 struct choose_scoreboard *scoreboard,
2231 struct qinst *inst,
2232 bool is_thrend)
2233 {
2234 int time = 0;
2235
2236 /* There should be nothing in a thrsw inst being scheduled other than
2237 * the signal bits.
2238 */
2239 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
2240 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
2241 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
2242
2243 /* Don't try to emit a thrsw in the delay slots of a previous thrsw
2244 * or branch.
2245 */
2246 while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
2247 emit_nop(c, block, scoreboard);
2248 time++;
2249 }
2250 while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
2251 emit_nop(c, block, scoreboard);
2252 time++;
2253 }
2254
2255 /* Find how far back into previous instructions we can put the THRSW. */
2256 int slots_filled = 0;
2257 int invalid_sig_count = 0;
2258 int invalid_seq_count = 0;
2259 bool last_thrsw_after_invalid_ok = false;
2260 struct qinst *merge_inst = NULL;
2261 vir_for_each_inst_rev(prev_inst, block) {
2262 /* No emitting our thrsw while the previous thrsw hasn't
2263 * happened yet.
2264 */
2265 if (scoreboard->last_thrsw_tick + 3 >
2266 scoreboard->tick - (slots_filled + 1)) {
2267 break;
2268 }
2269
2270
2271 if (!valid_thrsw_sequence(c, scoreboard,
2272 prev_inst, slots_filled + 1,
2273 is_thrend)) {
2274 /* Even if the current sequence isn't valid, we may
2275 * be able to get a valid sequence by trying to move the
2276 * thrsw earlier, so keep going.
2277 */
2278 invalid_seq_count++;
2279 goto cont_block;
2280 }
2281
2282 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
2283 sig.thrsw = true;
2284 uint32_t packed_sig;
2285 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
2286 /* If we can't merge the thrsw here because of signal
2287 * incompatibility, keep going, we might be able to
2288 * merge it in an earlier instruction.
2289 */
2290 invalid_sig_count++;
2291 goto cont_block;
2292 }
2293
2294 /* For the last thrsw we need 2 consecutive slots that are
2295 * thrsw compatible, so if we have previously jumped over
2296 * an incompatible signal, flag that we have found the first
2297 * valid slot here and keep going.
2298 */
2299 if (inst->is_last_thrsw && invalid_sig_count > 0 &&
2300 !last_thrsw_after_invalid_ok) {
2301 last_thrsw_after_invalid_ok = true;
2302 invalid_sig_count++;
2303 goto cont_block;
2304 }
2305
2306 /* We can merge the thrsw in this instruction */
2307 last_thrsw_after_invalid_ok = false;
2308 invalid_sig_count = 0;
2309 invalid_seq_count = 0;
2310 merge_inst = prev_inst;
2311
2312 cont_block:
2313 if (++slots_filled == 3)
2314 break;
2315 }
2316
2317 /* If we jumped over a signal incompatibility and did not manage to
2318 * merge the thrsw in the end, we need to adjust slots filled to match
2319 * the last valid merge point.
2320 */
2321 assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
2322 slots_filled >= invalid_sig_count + invalid_seq_count);
2323 if (invalid_sig_count > 0)
2324 slots_filled -= invalid_sig_count;
2325 if (invalid_seq_count > 0)
2326 slots_filled -= invalid_seq_count;
2327
2328 bool needs_free = false;
2329 if (merge_inst) {
2330 merge_inst->qpu.sig.thrsw = true;
2331 needs_free = true;
2332 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
2333 } else {
2334 scoreboard->last_thrsw_tick = scoreboard->tick;
2335 insert_scheduled_instruction(c, block, scoreboard, inst);
2336 time++;
2337 slots_filled++;
2338 merge_inst = inst;
2339 }
2340
2341 scoreboard->first_thrsw_emitted = true;
2342
2343 /* If we're emitting the last THRSW (other than program end), then
2344 * signal that to the HW by emitting two THRSWs in a row.
2345 */
2346 if (inst->is_last_thrsw) {
2347 if (slots_filled <= 1) {
2348 emit_nop(c, block, scoreboard);
2349 time++;
2350 }
2351 struct qinst *second_inst =
2352 (struct qinst *)merge_inst->link.next;
2353 second_inst->qpu.sig.thrsw = true;
2354 scoreboard->last_thrsw_emitted = true;
2355 }
2356
2357 /* Make sure the thread end executes within the program lifespan */
2358 if (is_thrend) {
2359 for (int i = 0; i < 3 - slots_filled; i++) {
2360 emit_nop(c, block, scoreboard);
2361 time++;
2362 }
2363 }
2364
2365 /* If we put our THRSW into another instruction, free up the
2366 * instruction that didn't end up scheduled into the list.
2367 */
2368 if (needs_free)
2369 free(inst);
2370
2371 return time;
2372 }
2373
2374 static bool
2375 qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
2376 {
2377 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2378 return false;
2379
2380 if (inst->qpu.sig.thrsw)
2381 return false;
2382
2383 if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
2384 return false;
2385
2386 if (vir_has_uniform(inst))
2387 return false;
2388
2389 return true;
2390 }
2391
2392 static void
2393 emit_branch(struct v3d_compile *c,
2394 struct qblock *block,
2395 struct choose_scoreboard *scoreboard,
2396 struct qinst *inst)
2397 {
2398 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2399
2400 /* We shouldn't have picked up a branch for the delay slots of a previous
2401 * thrsw, branch or unifa write instruction.
2402 */
2403 int branch_tick = scoreboard->tick;
2404 assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
2405 assert(scoreboard->last_branch_tick + 3 < branch_tick);
2406 assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
2407
2408 /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
2409 * setmsf.
2410 */
2411 bool is_safe_msf_branch =
2412 c->devinfo->ver >= 71 ||
2413 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
2414 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
2415 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
2416 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
2417 assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
2418 is_safe_msf_branch);
2419
2420 /* Insert the branch instruction */
2421 insert_scheduled_instruction(c, block, scoreboard, inst);
2422
2423 /* Now see if we can move the branch instruction back into the
2424 * instruction stream to fill its delay slots
2425 */
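/* For illustration (hypothetical): if the three instructions immediately
 * before the branch pass the checks below, each is moved to just after the
 * branch in turn, effectively hoisting the branch three instructions
 * earlier and reusing them as its delay slots.
 */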
2426 int slots_filled = 0;
2427 while (slots_filled < 3 && block->instructions.next != &inst->link) {
2428 struct qinst *prev_inst = (struct qinst *) inst->link.prev;
2429 assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
2430
2431 /* Can't move the branch instruction if that would place it
2432 * in the delay slots of other instructions.
2433 */
2434 if (scoreboard->last_branch_tick + 3 >=
2435 branch_tick - slots_filled - 1) {
2436 break;
2437 }
2438
2439 if (scoreboard->last_thrsw_tick + 2 >=
2440 branch_tick - slots_filled - 1) {
2441 break;
2442 }
2443
2444 if (scoreboard->last_unifa_write_tick + 3 >=
2445 branch_tick - slots_filled - 1) {
2446 break;
2447 }
2448
2449 /* Do not move up a branch if it can disrupt an ldvary sequence
2450 * as that can cause stomping of the r5 register.
2451 */
2452 if (scoreboard->last_ldvary_tick + 2 >=
2453 branch_tick - slots_filled) {
2454 break;
2455 }
2456
2457 /* Can't move a conditional branch before the instruction
2458 * that writes the flags for its condition.
2459 */
2460 if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
2461 inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
2462 break;
2463 }
2464
2465 if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
2466 break;
2467
2468 if (!is_safe_msf_branch) {
2469 struct qinst *prev_prev_inst =
2470 (struct qinst *) prev_inst->link.prev;
2471 if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
2472 prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
2473 break;
2474 }
2475 }
2476
2477 list_del(&prev_inst->link);
2478 list_add(&prev_inst->link, &inst->link);
2479 slots_filled++;
2480 }
2481
2482 block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
2483 scoreboard->last_branch_tick = branch_tick - slots_filled;
2484
2485 /* Fill any remaining delay slots.
2486 *
2487 * For unconditional branches we'll try to fill these with the
2488 * first instructions in the successor block after scheduling
2489 * all blocks when setting up branch targets.
2490 */
2491 for (int i = 0; i < 3 - slots_filled; i++)
2492 emit_nop(c, block, scoreboard);
2493 }
2494
2495 static bool
2496 alu_reads_register(const struct v3d_device_info *devinfo,
2497 struct v3d_qpu_instr *inst,
2498 bool add, bool magic, uint32_t index)
2499 {
2500 uint32_t num_src;
2501 if (add)
2502 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
2503 else
2504 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
2505
2506 if (devinfo->ver == 42) {
2507 enum v3d_qpu_mux mux_a, mux_b;
2508 if (add) {
2509 mux_a = inst->alu.add.a.mux;
2510 mux_b = inst->alu.add.b.mux;
2511 } else {
2512 mux_a = inst->alu.mul.a.mux;
2513 mux_b = inst->alu.mul.b.mux;
2514 }
2515
2516 for (int i = 0; i < num_src; i++) {
2517 if (magic) {
2518 if (i == 0 && mux_a == index)
2519 return true;
2520 if (i == 1 && mux_b == index)
2521 return true;
2522 } else {
2523 if (i == 0 && mux_a == V3D_QPU_MUX_A &&
2524 inst->raddr_a == index) {
2525 return true;
2526 }
2527 if (i == 0 && mux_a == V3D_QPU_MUX_B &&
2528 inst->raddr_b == index) {
2529 return true;
2530 }
2531 if (i == 1 && mux_b == V3D_QPU_MUX_A &&
2532 inst->raddr_a == index) {
2533 return true;
2534 }
2535 if (i == 1 && mux_b == V3D_QPU_MUX_B &&
2536 inst->raddr_b == index) {
2537 return true;
2538 }
2539 }
2540 }
2541
2542 return false;
2543 }
2544
2545 assert(devinfo->ver >= 71);
2546 assert(!magic);
2547
2548 uint32_t raddr_a, raddr_b;
2549 if (add) {
2550 raddr_a = inst->alu.add.a.raddr;
2551 raddr_b = inst->alu.add.b.raddr;
2552 } else {
2553 raddr_a = inst->alu.mul.a.raddr;
2554 raddr_b = inst->alu.mul.b.raddr;
2555 }
2556
2557 for (int i = 0; i < num_src; i++) {
2558 if (i == 0 && raddr_a == index)
2559 return true;
2560 if (i == 1 && raddr_b == index)
2561 return true;
2562 }
2563
2564 return false;
2565 }
2566
2567 /**
2568 * This takes an ldvary signal merged into 'inst' and tries to move it up to
2569 * the previous instruction to get good pipelining of ldvary sequences,
2570 * transforming this:
2571 *
2572 * nop ; nop ; ldvary.r4
2573 * nop ; fmul r0, r4, rf0 ;
2574 * fadd rf13, r0, r5 ; nop; ; ldvary.r1 <-- inst
2575 *
2576 * into:
2577 *
2578 * nop ; nop ; ldvary.r4
2579 * nop ; fmul r0, r4, rf0 ; ldvary.r1
2580 * fadd rf13, r0, r5 ; nop; ; <-- inst
2581 *
2582 * If we manage to do this successfully (we return true here), then flagging
2583 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
2584 * we will be able to pick up to merge into 'inst', leading to code like this:
2585 *
2586 * nop ; nop ; ldvary.r4
2587 * nop ; fmul r0, r4, rf0 ; ldvary.r1
2588 * fadd rf13, r0, r5 ; fmul r2, r1, rf0 ; <-- inst
2589 */
2590 static bool
2591 fixup_pipelined_ldvary(struct v3d_compile *c,
2592 struct choose_scoreboard *scoreboard,
2593 struct qblock *block,
2594 struct v3d_qpu_instr *inst)
2595 {
2596 const struct v3d_device_info *devinfo = c->devinfo;
2597
2598 /* We only call this if we have successfully merged an ldvary into a
2599 * previous instruction.
2600 */
2601 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
2602 assert(inst->sig.ldvary);
2603 uint32_t ldvary_magic = inst->sig_magic;
2604 uint32_t ldvary_index = inst->sig_addr;
2605
2606 /* The instruction in which we merged the ldvary cannot read
2607 * the ldvary destination: if it did, moving the ldvary before
2608 * it would overwrite the value it reads.
2609 */
2610 if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
2611 return false;
2612 if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
2613 return false;
2614
2615 /* The implicit ldvary destination may not be written to by a signal
2616 * in the instruction following ldvary. Since we are planning to move
2617 * ldvary to the previous instruction, this means we need to check if
2618 * the current instruction has any other signal that could create this
2619 * conflict. The only other signal that can write to the implicit
2620 * ldvary destination that is compatible with ldvary in the same
2621 * instruction is ldunif.
2622 */
2623 if (inst->sig.ldunif)
2624 return false;
2625
2626 /* The previous instruction can't write to the same destination as the
2627 * ldvary.
2628 */
2629 struct qinst *prev = (struct qinst *) block->instructions.prev;
2630 if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
2631 return false;
2632
2633 if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
2634 if (prev->qpu.alu.add.magic_write == ldvary_magic &&
2635 prev->qpu.alu.add.waddr == ldvary_index) {
2636 return false;
2637 }
2638 }
2639
2640 if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
2641 if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
2642 prev->qpu.alu.mul.waddr == ldvary_index) {
2643 return false;
2644 }
2645 }
2646
2647 /* The previous instruction cannot have a conflicting signal */
2648 if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
2649 return false;
2650
2651 uint32_t sig;
2652 struct v3d_qpu_sig new_sig = prev->qpu.sig;
2653 new_sig.ldvary = true;
2654 if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
2655 return false;
2656
2657 /* The previous instruction cannot use flags since ldvary uses the
2658 * 'cond' instruction field to store the destination.
2659 */
2660 if (v3d_qpu_writes_flags(&prev->qpu))
2661 return false;
2662 if (v3d_qpu_reads_flags(&prev->qpu))
2663 return false;
2664
2665 /* We can't put an ldvary in the delay slots of a thrsw. We should've
2666 * prevented this when pairing up the ldvary with another instruction
2667 * and flagging it for a fixup. In V3D 7.x this is limited only to the
2668 * second delay slot.
2669 */
2670 assert((devinfo->ver == 42 &&
2671 scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
2672 (devinfo->ver >= 71 &&
2673 scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
2674
2675 /* Move the ldvary to the previous instruction and remove it from the
2676 * current one.
2677 */
2678 prev->qpu.sig.ldvary = true;
2679 prev->qpu.sig_magic = ldvary_magic;
2680 prev->qpu.sig_addr = ldvary_index;
2681 scoreboard->last_ldvary_tick = scoreboard->tick - 1;
2682
2683 inst->sig.ldvary = false;
2684 inst->sig_magic = false;
2685 inst->sig_addr = 0;
2686
2687 /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
2688 if (devinfo->ver >= 71) {
2689 scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
2690 set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
2691 }
2692
2693 /* By moving ldvary to the previous instruction we make it update r5
2694 * (rf0 for ver >= 71) in the current one, so nothing else in it
2695 * should write this register.
2696 *
2697 * This should've been prevented by our dependency tracking, which
2698 * would not allow ldvary to be paired up with an instruction that
2699 * writes r5/rf0 (since our dependency tracking doesn't know that the
2700 * ldvary write to r5/rf0 happens in the next instruction).
2701 */
2702 assert(!v3d_qpu_writes_r5(devinfo, inst));
2703 assert(devinfo->ver == 42 ||
2704 (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
2705 !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
2706
2707 return true;
2708 }
2709
2710 static uint32_t
2711 schedule_instructions(struct v3d_compile *c,
2712 struct choose_scoreboard *scoreboard,
2713 struct qblock *block,
2714 enum quniform_contents *orig_uniform_contents,
2715 uint32_t *orig_uniform_data,
2716 uint32_t *next_uniform)
2717 {
2718 const struct v3d_device_info *devinfo = c->devinfo;
2719 uint32_t time = 0;
2720
2721 while (!list_is_empty(&scoreboard->dag->heads)) {
2722 struct schedule_node *chosen =
2723 choose_instruction_to_schedule(c, scoreboard, NULL);
2724 struct schedule_node *merge = NULL;
2725
2726 /* If there are no valid instructions to schedule, drop a NOP
2727 * in.
2728 */
2729 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
2730 struct v3d_qpu_instr *inst = &qinst->qpu;
2731
2732 if (debug) {
2733 fprintf(stderr, "t=%4d: current list:\n",
2734 time);
2735 dump_state(devinfo, scoreboard->dag);
2736 fprintf(stderr, "t=%4d: chose: ", time);
2737 v3d_qpu_dump(devinfo, inst);
2738 fprintf(stderr, "\n");
2739 }
2740
2741 /* We can't mark_instruction_scheduled() the chosen inst until
2742 * we're done identifying instructions to merge, so put the
2743 * merged instructions on a list for a moment.
2744 */
2745 struct list_head merged_list;
2746 list_inithead(&merged_list);
2747
2748 /* Schedule this instruction onto the QPU list. Also try to
2749 * find an instruction to pair with it.
2750 */
2751 if (chosen) {
2752 time = MAX2(chosen->unblocked_time, time);
2753 pre_remove_head(scoreboard->dag, chosen);
2754
2755 while ((merge =
2756 choose_instruction_to_schedule(c, scoreboard,
2757 chosen))) {
2758 time = MAX2(merge->unblocked_time, time);
2759 pre_remove_head(scoreboard->dag, merge);
2760 list_addtail(&merge->link, &merged_list);
2761 (void)qpu_merge_inst(devinfo, inst,
2762 inst, &merge->inst->qpu);
2763 if (merge->inst->uniform != -1) {
2764 chosen->inst->uniform =
2765 merge->inst->uniform;
2766 }
2767
2768 chosen->inst->ldtmu_count +=
2769 merge->inst->ldtmu_count;
2770
2771 if (debug) {
2772 fprintf(stderr, "t=%4d: merging: ",
2773 time);
2774 v3d_qpu_dump(devinfo, &merge->inst->qpu);
2775 fprintf(stderr, "\n");
2776 fprintf(stderr, " result: ");
2777 v3d_qpu_dump(devinfo, inst);
2778 fprintf(stderr, "\n");
2779 }
2780
2781 if (scoreboard->fixup_ldvary) {
2782 scoreboard->fixup_ldvary = false;
2783 if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
2784 /* Flag the ldvary as scheduled
2785 * now so we can try to merge the
2786 * follow-up instruction in the
2787 * ldvary sequence into the
2788 * current instruction.
2789 */
2790 mark_instruction_scheduled(
2791 devinfo, scoreboard->dag,
2792 time, merge);
2793 }
2794 }
2795 }
2796 if (read_stalls(c->devinfo, scoreboard, inst))
2797 c->qpu_inst_stalled_count++;
2798 }
2799
2800 /* Update the uniform index for the rewritten location --
2801 * branch target updating will still need to change
2802 * c->uniform_data[] using this index.
2803 */
2804 if (qinst->uniform != -1) {
2805 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
2806 block->branch_uniform = *next_uniform;
2807
2808 c->uniform_data[*next_uniform] =
2809 orig_uniform_data[qinst->uniform];
2810 c->uniform_contents[*next_uniform] =
2811 orig_uniform_contents[qinst->uniform];
2812 qinst->uniform = *next_uniform;
2813 (*next_uniform)++;
2814 }
2815
2816 if (debug) {
2817 fprintf(stderr, "\n");
2818 }
2819
2820 /* Now that we've scheduled a new instruction, some of its
2821 * children can be promoted to the list of instructions ready to
2822 * be scheduled. Update the children's unblocked time for this
2823 * DAG edge as we do so.
2824 */
2825 mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
2826 list_for_each_entry(struct schedule_node, merge, &merged_list,
2827 link) {
2828 mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
2829
2830 /* The merged VIR instruction doesn't get re-added to the
2831 * block, so free it now.
2832 */
2833 free(merge->inst);
2834 }
2835
2836 if (inst->sig.thrsw) {
2837 time += emit_thrsw(c, block, scoreboard, qinst, false);
2838 } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
2839 emit_branch(c, block, scoreboard, qinst);
2840 } else {
2841 insert_scheduled_instruction(c, block,
2842 scoreboard, qinst);
2843 }
2844 }
2845
2846 return time;
2847 }
2848
2849 static uint32_t
2850 qpu_schedule_instructions_block(struct v3d_compile *c,
2851 struct choose_scoreboard *scoreboard,
2852 struct qblock *block,
2853 enum quniform_contents *orig_uniform_contents,
2854 uint32_t *orig_uniform_data,
2855 uint32_t *next_uniform)
2856 {
2857 void *mem_ctx = ralloc_context(NULL);
2858 scoreboard->dag = dag_create(mem_ctx);
2859 struct list_head setup_list;
2860
2861 list_inithead(&setup_list);
2862
2863 /* Wrap each instruction in a scheduler structure. */
2864 while (!list_is_empty(&block->instructions)) {
2865 struct qinst *qinst = (struct qinst *)block->instructions.next;
2866 struct schedule_node *n =
2867 rzalloc(mem_ctx, struct schedule_node);
2868
2869 dag_init_node(scoreboard->dag, &n->dag);
2870 n->inst = qinst;
2871
2872 list_del(&qinst->link);
2873 list_addtail(&n->link, &setup_list);
2874 }
2875
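/* Dependencies are added in both walk directions and then compute_delay()
 * runs bottom-up over the DAG, so every node knows the longest latency
 * chain through its children before scheduling starts.
 */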
2876 calculate_forward_deps(c, scoreboard->dag, &setup_list);
2877 calculate_reverse_deps(c, scoreboard->dag, &setup_list);
2878
2879 dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
2880
2881 uint32_t cycles = schedule_instructions(c, scoreboard, block,
2882 orig_uniform_contents,
2883 orig_uniform_data,
2884 next_uniform);
2885
2886 ralloc_free(mem_ctx);
2887 scoreboard->dag = NULL;
2888
2889 return cycles;
2890 }
2891
2892 static void
2893 qpu_set_branch_targets(struct v3d_compile *c)
2894 {
2895 vir_for_each_block(block, c) {
2896 /* The end block of the program has no branch. */
2897 if (!block->successors[0])
2898 continue;
2899
2900 /* If there was no branch instruction, then the successor
2901 * block must follow immediately after this one.
2902 */
2903 if (block->branch_qpu_ip == ~0) {
2904 assert(block->end_qpu_ip + 1 ==
2905 block->successors[0]->start_qpu_ip);
2906 continue;
2907 }
2908
2909 /* Walk back through the delay slots to find the branch
2910 * instr.
2911 */
2912 struct qinst *branch = NULL;
2913 struct list_head *entry = block->instructions.prev;
2914 int32_t delay_slot_count = -1;
2915 struct qinst *delay_slots_start = NULL;
2916 for (int i = 0; i < 3; i++) {
2917 entry = entry->prev;
2918 struct qinst *inst =
2919 container_of(entry, struct qinst, link);
2920
2921 if (delay_slot_count == -1) {
2922 if (!v3d_qpu_is_nop(&inst->qpu))
2923 delay_slot_count = i;
2924 else
2925 delay_slots_start = inst;
2926 }
2927
2928 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
2929 branch = inst;
2930 break;
2931 }
2932 }
2933 assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2934 assert(delay_slot_count >= 0 && delay_slot_count <= 3);
2935 assert(delay_slot_count == 0 || delay_slots_start != NULL);
2936
2937 /* Make sure that the if-we-don't-jump
2938 * successor was scheduled just after the
2939 * delay slots.
2940 */
2941 assert(!block->successors[1] ||
2942 block->successors[1]->start_qpu_ip ==
2943 block->branch_qpu_ip + 4);
2944
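/* For illustration (hypothetical numbers): a branch at QPU ip 10 whose
 * successor starts at ip 20 gets offset (20 - (10 + 4)) * 8 = 48 bytes;
 * the +4 accounts for the branch itself plus its three delay slots.
 */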
2945 branch->qpu.branch.offset =
2946 ((block->successors[0]->start_qpu_ip -
2947 (block->branch_qpu_ip + 4)) *
2948 sizeof(uint64_t));
2949
2950 /* Set up the relative offset to jump in the
2951 * uniform stream.
2952 *
2953 * Use a temporary here, because
2954 * uniform_data[inst->uniform] may be shared
2955 * between multiple instructions.
2956 */
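/* For illustration (hypothetical numbers): with branch_uniform == 5 and a
 * successor block starting at uniform 9, the encoded value is
 * (9 - (5 + 1)) * 4 = 12 bytes into the uniform stream.
 */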
2957 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
2958 c->uniform_data[branch->uniform] =
2959 (block->successors[0]->start_uniform -
2960 (block->branch_uniform + 1)) * 4;
2961
2962 /* If this is an unconditional branch, try to fill any remaining
2963 * delay slots with the initial instructions of the successor
2964 * block.
2965 *
2966 * FIXME: we can do the same for conditional branches if we
2967 * predicate the instructions to match the branch condition.
2968 */
2969 if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
2970 struct list_head *successor_insts =
2971 &block->successors[0]->instructions;
2972 delay_slot_count = MIN2(delay_slot_count,
2973 list_length(successor_insts));
2974 struct qinst *s_inst =
2975 (struct qinst *) successor_insts->next;
2976 struct qinst *slot = delay_slots_start;
2977 int slots_filled = 0;
2978 while (slots_filled < delay_slot_count &&
2979 qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
2980 memcpy(&slot->qpu, &s_inst->qpu,
2981 sizeof(slot->qpu));
2982 s_inst = (struct qinst *) s_inst->link.next;
2983 slot = (struct qinst *) slot->link.next;
2984 slots_filled++;
2985 }
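/* The copied successor instructions now also run in the delay slots, so
 * advance the branch target past them to avoid executing them twice.
 */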
2986 branch->qpu.branch.offset +=
2987 slots_filled * sizeof(uint64_t);
2988 }
2989 }
2990 }
2991
2992 uint32_t
2993 v3d_qpu_schedule_instructions(struct v3d_compile *c)
2994 {
2995 const struct v3d_device_info *devinfo = c->devinfo;
2996 struct qblock *end_block = list_last_entry(&c->blocks,
2997 struct qblock, link);
2998
2999 /* We reorder the uniforms as we schedule instructions, so save the
3000 * old data off and replace it.
3001 */
3002 uint32_t *uniform_data = c->uniform_data;
3003 enum quniform_contents *uniform_contents = c->uniform_contents;
3004 c->uniform_contents = ralloc_array(c, enum quniform_contents,
3005 c->num_uniforms);
3006 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
3007 c->uniform_array_size = c->num_uniforms;
3008 uint32_t next_uniform = 0;
3009
3010 struct choose_scoreboard scoreboard;
3011 memset(&scoreboard, 0, sizeof(scoreboard));
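/* The -10 initializers put every "last X tick" far enough in the past
 * that none of the proximity checks (e.g. last_thrsw_tick + 2 >= tick)
 * can trigger at the start of the program.
 */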
3012 scoreboard.last_ldvary_tick = -10;
3013 scoreboard.last_unifa_write_tick = -10;
3014 scoreboard.last_magic_sfu_write_tick = -10;
3015 scoreboard.last_uniforms_reset_tick = -10;
3016 scoreboard.last_thrsw_tick = -10;
3017 scoreboard.last_branch_tick = -10;
3018 scoreboard.last_setmsf_tick = -10;
3019 scoreboard.last_stallable_sfu_tick = -10;
3020 scoreboard.first_ldtmu_after_thrsw = true;
3021 scoreboard.last_implicit_rf0_write_tick = -10;
3022
3023 if (debug) {
3024 fprintf(stderr, "Pre-schedule instructions\n");
3025 vir_for_each_block(block, c) {
3026 fprintf(stderr, "BLOCK %d\n", block->index);
3027 list_for_each_entry(struct qinst, qinst,
3028 &block->instructions, link) {
3029 v3d_qpu_dump(devinfo, &qinst->qpu);
3030 fprintf(stderr, "\n");
3031 }
3032 }
3033 fprintf(stderr, "\n");
3034 }
3035
3036 uint32_t cycles = 0;
3037 vir_for_each_block(block, c) {
3038 block->start_qpu_ip = c->qpu_inst_count;
3039 block->branch_qpu_ip = ~0;
3040 block->start_uniform = next_uniform;
3041
3042 cycles += qpu_schedule_instructions_block(c,
3043 &scoreboard,
3044 block,
3045 uniform_contents,
3046 uniform_data,
3047 &next_uniform);
3048
3049 block->end_qpu_ip = c->qpu_inst_count - 1;
3050 }
3051
3052 /* Emit the program-end THRSW instruction. */
3053 struct qinst *thrsw = vir_nop();
3054 thrsw->qpu.sig.thrsw = true;
3055 emit_thrsw(c, end_block, &scoreboard, thrsw, true);
3056
3057 qpu_set_branch_targets(c);
3058
3059 assert(next_uniform == c->num_uniforms);
3060
3061 return cycles;
3062 }
3063