/* * Copyright © 2010 Intel Corporation * Copyright © 2014-2017 Broadcom * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ /** * @file * * The basic model of the list scheduler is to take a basic block, compute a * DAG of the dependencies, and make a list of the DAG heads. Heuristically * pick a DAG head, then put all the children that are now DAG heads into the * list of things to schedule. * * The goal of scheduling here is to pack pairs of operations together in a * single QPU instruction. */ #include "qpu/qpu_disasm.h" #include "v3d_compiler.h" #include "util/ralloc.h" #include "util/dag.h" static bool debug; struct schedule_node_child; struct schedule_node { struct dag_node dag; struct list_head link; struct qinst *inst; /* Longest cycles + instruction_latency() of any parent of this node. */ uint32_t unblocked_time; /** * Minimum number of cycles from scheduling this instruction until the * end of the program, based on the slowest dependency chain through * the children. */ uint32_t delay; /** * cycles between this instruction being scheduled and when its result * can be consumed. */ uint32_t latency; }; /* When walking the instructions in reverse, we need to swap before/after in * add_dep(). */ enum direction { F, R }; struct schedule_state { const struct v3d_device_info *devinfo; struct dag *dag; struct schedule_node *last_r[6]; struct schedule_node *last_rf[64]; struct schedule_node *last_sf; struct schedule_node *last_vpm_read; struct schedule_node *last_tmu_write; struct schedule_node *last_tmu_config; struct schedule_node *last_tmu_read; struct schedule_node *last_tlb; struct schedule_node *last_vpm; struct schedule_node *last_unif; struct schedule_node *last_rtop; struct schedule_node *last_unifa; enum direction dir; /* Estimated cycle when the current instruction would start. 
*/ uint32_t time; }; static void add_dep(struct schedule_state *state, struct schedule_node *before, struct schedule_node *after, bool write) { bool write_after_read = !write && state->dir == R; uintptr_t edge_data = write_after_read; if (!before || !after) return; assert(before != after); if (state->dir == F) dag_add_edge(&before->dag, &after->dag, edge_data); else dag_add_edge(&after->dag, &before->dag, edge_data); } static void add_read_dep(struct schedule_state *state, struct schedule_node *before, struct schedule_node *after) { add_dep(state, before, after, false); } static void add_write_dep(struct schedule_state *state, struct schedule_node **before, struct schedule_node *after) { add_dep(state, *before, after, true); *before = after; } static bool qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) { if (inst->sig.ldtlb || inst->sig.ldtlbu) return true; if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; if (inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) return true; if (inst->alu.mul.magic_write && (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) return true; return false; } static void process_mux_deps(struct schedule_state *state, struct schedule_node *n, enum v3d_qpu_mux mux) { switch (mux) { case V3D_QPU_MUX_A: add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: if (!n->inst->qpu.sig.small_imm) { add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); } break; default: add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); break; } } static bool tmu_write_is_sequence_terminator(uint32_t waddr) { switch (waddr) { case V3D_QPU_WADDR_TMUS: case V3D_QPU_WADDR_TMUSCM: case V3D_QPU_WADDR_TMUSF: case V3D_QPU_WADDR_TMUSLOD: case V3D_QPU_WADDR_TMUA: case V3D_QPU_WADDR_TMUAU: return true; default: return false; } } static bool can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr) { if (devinfo->ver < 40) return false; if (tmu_write_is_sequence_terminator(waddr)) return false; if (waddr == V3D_QPU_WADDR_TMUD) return false; return true; } static void process_waddr_deps(struct schedule_state *state, struct schedule_node *n, uint32_t waddr, bool magic) { if (!magic) { add_write_dep(state, &state->last_rf[waddr], n); } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) { if (can_reorder_tmu_write(state->devinfo, waddr)) add_read_dep(state, state->last_tmu_write, n); else add_write_dep(state, &state->last_tmu_write, n); if (tmu_write_is_sequence_terminator(waddr)) add_write_dep(state, &state->last_tmu_config, n); } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { /* Handled by v3d_qpu_writes_r4() check. */ } else { switch (waddr) { case V3D_QPU_WADDR_R0: case V3D_QPU_WADDR_R1: case V3D_QPU_WADDR_R2: add_write_dep(state, &state->last_r[waddr - V3D_QPU_WADDR_R0], n); break; case V3D_QPU_WADDR_R3: case V3D_QPU_WADDR_R4: case V3D_QPU_WADDR_R5: /* Handled by v3d_qpu_writes_r*() checks below. */ break; case V3D_QPU_WADDR_VPM: case V3D_QPU_WADDR_VPMU: add_write_dep(state, &state->last_vpm, n); break; case V3D_QPU_WADDR_TLB: case V3D_QPU_WADDR_TLBU: add_write_dep(state, &state->last_tlb, n); break; case V3D_QPU_WADDR_SYNC: case V3D_QPU_WADDR_SYNCB: case V3D_QPU_WADDR_SYNCU: /* For CS barrier(): Sync against any other memory * accesses. There doesn't appear to be any need for * barriers to affect ALU operations. 
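                 * We serialize the barrier against both outstanding TMU writes
                 * and TMU reads below, so memory traffic stays on its side of
                 * the barrier.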
*/ add_write_dep(state, &state->last_tmu_write, n); add_write_dep(state, &state->last_tmu_read, n); break; case V3D_QPU_WADDR_UNIFA: if (state->devinfo->ver >= 40) add_write_dep(state, &state->last_unifa, n); break; case V3D_QPU_WADDR_NOP: break; default: fprintf(stderr, "Unknown waddr %d\n", waddr); abort(); } } } /** * Common code for dependencies that need to be tracked both forward and * backward. * * This is for things like "all reads of r4 have to happen between the r4 * writes that surround them". */ static void calculate_deps(struct schedule_state *state, struct schedule_node *n) { const struct v3d_device_info *devinfo = state->devinfo; struct qinst *qinst = n->inst; struct v3d_qpu_instr *inst = &qinst->qpu; /* If the input and output segments are shared, then all VPM reads to * a location need to happen before all writes. We handle this by * serializing all VPM operations for now. */ bool separate_vpm_segment = false; if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) add_read_dep(state, state->last_sf, n); /* XXX: BDI */ /* XXX: BDU */ /* XXX: ub */ /* XXX: raddr_a */ add_write_dep(state, &state->last_unif, n); return; } assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); /* XXX: LOAD_IMM */ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) process_mux_deps(state, n, inst->alu.add.a); if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) process_mux_deps(state, n, inst->alu.add.b); if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) process_mux_deps(state, n, inst->alu.mul.a); if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) process_mux_deps(state, n, inst->alu.mul.b); switch (inst->alu.add.op) { case V3D_QPU_A_VPMSETUP: /* Could distinguish read/write by unpacking the uniform. */ add_write_dep(state, &state->last_vpm, n); add_write_dep(state, &state->last_vpm_read, n); break; case V3D_QPU_A_STVPMV: case V3D_QPU_A_STVPMD: case V3D_QPU_A_STVPMP: add_write_dep(state, &state->last_vpm, n); break; case V3D_QPU_A_LDVPMV_IN: case V3D_QPU_A_LDVPMD_IN: case V3D_QPU_A_LDVPMG_IN: case V3D_QPU_A_LDVPMP: if (!separate_vpm_segment) add_write_dep(state, &state->last_vpm, n); break; case V3D_QPU_A_VPMWT: add_read_dep(state, state->last_vpm, n); break; case V3D_QPU_A_MSF: add_read_dep(state, state->last_tlb, n); break; case V3D_QPU_A_SETMSF: case V3D_QPU_A_SETREVF: add_write_dep(state, &state->last_tlb, n); break; default: break; } switch (inst->alu.mul.op) { case V3D_QPU_M_MULTOP: case V3D_QPU_M_UMUL24: /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and * resets it to 0. We could possibly reorder umul24s relative * to each other, but for now just keep all the MUL parts in * order. */ add_write_dep(state, &state->last_rtop, n); break; default: break; } if (inst->alu.add.op != V3D_QPU_A_NOP) { process_waddr_deps(state, n, inst->alu.add.waddr, inst->alu.add.magic_write); } if (inst->alu.mul.op != V3D_QPU_M_NOP) { process_waddr_deps(state, n, inst->alu.mul.waddr, inst->alu.mul.magic_write); } if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { process_waddr_deps(state, n, inst->sig_addr, inst->sig_magic); } if (v3d_qpu_writes_r3(devinfo, inst)) add_write_dep(state, &state->last_r[3], n); if (v3d_qpu_writes_r4(devinfo, inst)) add_write_dep(state, &state->last_r[4], n); if (v3d_qpu_writes_r5(devinfo, inst)) add_write_dep(state, &state->last_r[5], n); /* If we add any more dependencies here we should consider whether we * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
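         * since that function re-checks many of these hazards when
         * instructions scheduled after a thrsw get moved into its delay slots.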
*/ if (inst->sig.thrsw) { /* All accumulator contents and flags are undefined after the * switch. */ for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) add_write_dep(state, &state->last_r[i], n); add_write_dep(state, &state->last_sf, n); add_write_dep(state, &state->last_rtop, n); /* Scoreboard-locking operations have to stay after the last * thread switch. */ add_write_dep(state, &state->last_tlb, n); add_write_dep(state, &state->last_tmu_write, n); add_write_dep(state, &state->last_tmu_config, n); } if (v3d_qpu_waits_on_tmu(inst)) { /* TMU loads are coming from a FIFO, so ordering is important. */ add_write_dep(state, &state->last_tmu_read, n); /* Keep TMU loads after their TMU lookup terminator */ add_read_dep(state, state->last_tmu_config, n); } /* Allow wrtmuc to be reordered with other instructions in the * same TMU sequence by using a read dependency on the last TMU * sequence terminator. */ if (inst->sig.wrtmuc) add_read_dep(state, state->last_tmu_config, n); if (inst->sig.ldtlb | inst->sig.ldtlbu) add_write_dep(state, &state->last_tlb, n); if (inst->sig.ldvpm) { add_write_dep(state, &state->last_vpm_read, n); /* At least for now, we're doing shared I/O segments, so queue * all writes after all reads. */ if (!separate_vpm_segment) add_write_dep(state, &state->last_vpm, n); } /* inst->sig.ldunif or sideband uniform read */ if (vir_has_uniform(qinst)) add_write_dep(state, &state->last_unif, n); /* Both unifa and ldunifa must preserve ordering */ if (inst->sig.ldunifa || inst->sig.ldunifarf) add_write_dep(state, &state->last_unifa, n); if (v3d_qpu_reads_flags(inst)) add_read_dep(state, state->last_sf, n); if (v3d_qpu_writes_flags(inst)) add_write_dep(state, &state->last_sf, n); } static void calculate_forward_deps(struct v3d_compile *c, struct dag *dag, struct list_head *schedule_list) { struct schedule_state state; memset(&state, 0, sizeof(state)); state.dag = dag; state.devinfo = c->devinfo; state.dir = F; list_for_each_entry(struct schedule_node, node, schedule_list, link) calculate_deps(&state, node); } static void calculate_reverse_deps(struct v3d_compile *c, struct dag *dag, struct list_head *schedule_list) { struct schedule_state state; memset(&state, 0, sizeof(state)); state.dag = dag; state.devinfo = c->devinfo; state.dir = R; list_for_each_entry_rev(struct schedule_node, node, schedule_list, link) { calculate_deps(&state, (struct schedule_node *)node); } } struct choose_scoreboard { struct dag *dag; int tick; int last_magic_sfu_write_tick; int last_stallable_sfu_reg; int last_stallable_sfu_tick; int last_ldvary_tick; int last_unifa_write_tick; int last_uniforms_reset_tick; int last_thrsw_tick; int last_branch_tick; int last_setmsf_tick; bool first_thrsw_emitted; bool last_thrsw_emitted; bool fixup_ldvary; int ldvary_count; }; static bool mux_reads_too_soon(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) { switch (mux) { case V3D_QPU_MUX_R4: if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2) return true; break; case V3D_QPU_MUX_R5: if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) return true; break; default: break; } return false; } static bool reads_too_soon_after_write(struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; /* XXX: Branching off of raddr. 
*/ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return false; assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { return true; } if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { return true; } } if (inst->alu.mul.op != V3D_QPU_M_NOP) { if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { return true; } if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { return true; } } /* XXX: imm */ return false; } static bool writes_too_soon_after_write(const struct v3d_device_info *devinfo, struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; /* Don't schedule any other r4 write too soon after an SFU write. * This would normally be prevented by dependency tracking, but might * occur if a dead SFU computation makes it to scheduling. */ if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 && v3d_qpu_writes_r4(devinfo, inst)) return true; return false; } static bool scoreboard_is_locked(struct choose_scoreboard *scoreboard, bool lock_scoreboard_on_first_thrsw) { if (lock_scoreboard_on_first_thrsw) { return scoreboard->first_thrsw_emitted && scoreboard->tick - scoreboard->last_thrsw_tick >= 3; } return scoreboard->last_thrsw_emitted && scoreboard->tick - scoreboard->last_thrsw_tick >= 3; } static bool pixel_scoreboard_too_soon(struct v3d_compile *c, struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { return qpu_inst_is_tlb(inst) && !scoreboard_is_locked(scoreboard, c->lock_scoreboard_on_first_thrsw); } static bool qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, uint32_t waddr) { if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && inst->raddr_a == waddr) return true; if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && !inst->sig.small_imm && (inst->raddr_b == waddr)) return true; return false; } static bool mux_read_stalls(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && qpu_instruction_uses_rf(inst, scoreboard->last_stallable_sfu_reg); } /* We define a max schedule priority to allow negative priorities as result of * substracting this max when an instruction stalls. So instructions that * stall have lower priority than regular instructions. */ #define MAX_SCHEDULE_PRIORITY 16 static int get_instruction_priority(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { uint32_t baseline_score; uint32_t next_score = 0; /* Schedule TLB operations as late as possible, to get more * parallelism between shaders. */ if (qpu_inst_is_tlb(inst)) return next_score; next_score++; /* Empirical testing shows that using priorities to hide latency of * TMU operations when scheduling QPU leads to slightly worse * performance, even at 2 threads. We think this is because the thread * switching is already quite effective at hiding latency and NIR * scheduling (and possibly TMU pipelining too) are sufficient to hide * TMU latency, so piling up on that here doesn't provide any benefits * and instead may cause us to postpone critical paths that depend on * the TMU results. */ #if 0 /* Schedule texture read results collection late to hide latency. 
*/ if (v3d_qpu_waits_on_tmu(inst)) return next_score; next_score++; #endif /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; #if 0 /* Schedule texture read setup early to hide their latency better. */ if (v3d_qpu_writes_tmu(devinfo, inst)) return next_score; next_score++; #endif /* We should increase the maximum if we assert here */ assert(next_score < MAX_SCHEDULE_PRIORITY); return baseline_score; } enum { V3D_PERIPHERAL_VPM_READ = (1 << 0), V3D_PERIPHERAL_VPM_WRITE = (1 << 1), V3D_PERIPHERAL_VPM_WAIT = (1 << 2), V3D_PERIPHERAL_SFU = (1 << 3), V3D_PERIPHERAL_TMU_WRITE = (1 << 4), V3D_PERIPHERAL_TMU_READ = (1 << 5), V3D_PERIPHERAL_TMU_WAIT = (1 << 6), V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), V3D_PERIPHERAL_TSY = (1 << 8), V3D_PERIPHERAL_TLB = (1 << 9), }; static uint32_t qpu_peripherals(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { uint32_t result = 0; if (v3d_qpu_reads_vpm(inst)) result |= V3D_PERIPHERAL_VPM_READ; if (v3d_qpu_writes_vpm(inst)) result |= V3D_PERIPHERAL_VPM_WRITE; if (v3d_qpu_waits_vpm(inst)) result |= V3D_PERIPHERAL_VPM_WAIT; if (v3d_qpu_writes_tmu(devinfo, inst)) result |= V3D_PERIPHERAL_TMU_WRITE; if (inst->sig.ldtmu) result |= V3D_PERIPHERAL_TMU_READ; if (inst->sig.wrtmuc) result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG; if (v3d_qpu_uses_sfu(inst)) result |= V3D_PERIPHERAL_SFU; if (v3d_qpu_uses_tlb(inst)) result |= V3D_PERIPHERAL_TLB; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) { result |= V3D_PERIPHERAL_TSY; } if (inst->alu.add.op == V3D_QPU_A_TMUWT) result |= V3D_PERIPHERAL_TMU_WAIT; } return result; } static bool qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { const uint32_t a_peripherals = qpu_peripherals(devinfo, a); const uint32_t b_peripherals = qpu_peripherals(devinfo, b); /* We can always do one peripheral access per instruction. */ if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1) return true; if (devinfo->ver < 41) return false; /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than * tmuc). */ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); } if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE && b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) { return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); } /* V3D 4.1+ allows TMU read with VPM read/write. */ if (a_peripherals == V3D_PERIPHERAL_TMU_READ && (b_peripherals == V3D_PERIPHERAL_VPM_READ || b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { return true; } if (b_peripherals == V3D_PERIPHERAL_TMU_READ && (a_peripherals == V3D_PERIPHERAL_VPM_READ || a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { return true; } return false; } /* Compute a bitmask of which rf registers are used between * the two instructions. 
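 * qpu_merge_raddrs() uses this to check that the merged instruction can still
 * express all the reads with the single raddr_a/raddr_b pair available in the
 * encoding.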
*/ static uint64_t qpu_raddrs_used(const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { assert(a->type == V3D_QPU_INSTR_TYPE_ALU); assert(b->type == V3D_QPU_INSTR_TYPE_ALU); uint64_t raddrs_used = 0; if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) raddrs_used |= (1ll << a->raddr_a); if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) raddrs_used |= (1ll << a->raddr_b); if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) raddrs_used |= (1ll << b->raddr_a); if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) raddrs_used |= (1ll << b->raddr_b); return raddrs_used; } /* Take two instructions and attempt to merge their raddr fields * into one merged instruction. Returns false if the two instructions * access more than two different rf registers between them, or more * than one rf register and one small immediate. */ static bool qpu_merge_raddrs(struct v3d_qpu_instr *result, const struct v3d_qpu_instr *add_instr, const struct v3d_qpu_instr *mul_instr) { uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); int naddrs = util_bitcount64(raddrs_used); if (naddrs > 2) return false; if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { if (naddrs > 1) return false; if (add_instr->sig.small_imm && mul_instr->sig.small_imm) if (add_instr->raddr_b != mul_instr->raddr_b) return false; result->sig.small_imm = true; result->raddr_b = add_instr->sig.small_imm ? add_instr->raddr_b : mul_instr->raddr_b; } if (naddrs == 0) return true; int raddr_a = ffsll(raddrs_used) - 1; raddrs_used &= ~(1ll << raddr_a); result->raddr_a = raddr_a; if (!result->sig.small_imm) { if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && raddr_a == add_instr->raddr_b) { if (add_instr->alu.add.a == V3D_QPU_MUX_B) result->alu.add.a = V3D_QPU_MUX_A; if (add_instr->alu.add.b == V3D_QPU_MUX_B && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { result->alu.add.b = V3D_QPU_MUX_A; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && raddr_a == mul_instr->raddr_b) { if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) result->alu.mul.a = V3D_QPU_MUX_A; if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { result->alu.mul.b = V3D_QPU_MUX_A; } } } if (!raddrs_used) return true; int raddr_b = ffsll(raddrs_used) - 1; result->raddr_b = raddr_b; if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && raddr_b == add_instr->raddr_a) { if (add_instr->alu.add.a == V3D_QPU_MUX_A) result->alu.add.a = V3D_QPU_MUX_B; if (add_instr->alu.add.b == V3D_QPU_MUX_A && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { result->alu.add.b = V3D_QPU_MUX_B; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && raddr_b == mul_instr->raddr_a) { if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) result->alu.mul.a = V3D_QPU_MUX_B; if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { result->alu.mul.b = V3D_QPU_MUX_B; } } return true; } static bool can_do_add_as_mul(enum v3d_qpu_add_op op) { switch (op) { case V3D_QPU_A_ADD: case V3D_QPU_A_SUB: return true; default: return false; } } static enum v3d_qpu_mul_op add_op_as_mul_op(enum v3d_qpu_add_op op) { switch (op) { case V3D_QPU_A_ADD: return V3D_QPU_M_ADD; case V3D_QPU_A_SUB: return V3D_QPU_M_SUB; default: unreachable("unexpected add opcode"); } } static void qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) { STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); assert(inst->alu.add.op != V3D_QPU_A_NOP); assert(inst->alu.mul.op == V3D_QPU_M_NOP); memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul)); 
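        /* The struct copy above carries over the operands, waddr and
         * magic_write from the add half; the opcode, condition flags and
         * pack/unpack modifiers are rewritten explicitly below.
         */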
inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op); inst->alu.add.op = V3D_QPU_A_NOP; inst->flags.mc = inst->flags.ac; inst->flags.mpf = inst->flags.apf; inst->flags.muf = inst->flags.auf; inst->flags.ac = V3D_QPU_COND_NONE; inst->flags.apf = V3D_QPU_PF_NONE; inst->flags.auf = V3D_QPU_UF_NONE; inst->alu.mul.output_pack = inst->alu.add.output_pack; inst->alu.mul.a_unpack = inst->alu.add.a_unpack; inst->alu.mul.b_unpack = inst->alu.add.b_unpack; inst->alu.add.output_pack = V3D_QPU_PACK_NONE; inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; } static bool qpu_merge_inst(const struct v3d_device_info *devinfo, struct v3d_qpu_instr *result, const struct v3d_qpu_instr *a, const struct v3d_qpu_instr *b) { if (a->type != V3D_QPU_INSTR_TYPE_ALU || b->type != V3D_QPU_INSTR_TYPE_ALU) { return false; } if (!qpu_compatible_peripheral_access(devinfo, a, b)) return false; struct v3d_qpu_instr merge = *a; const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL; struct v3d_qpu_instr mul_inst; if (b->alu.add.op != V3D_QPU_A_NOP) { if (a->alu.add.op == V3D_QPU_A_NOP) { merge.alu.add = b->alu.add; merge.flags.ac = b->flags.ac; merge.flags.apf = b->flags.apf; merge.flags.auf = b->flags.auf; add_instr = b; mul_instr = a; } /* If a's add op is used but its mul op is not, then see if we * can convert either a's add op or b's add op to a mul op * so we can merge. */ else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(b->alu.add.op)) { mul_inst = *b; qpu_convert_add_to_mul(&mul_inst); merge.alu.mul = mul_inst.alu.mul; merge.flags.mc = b->flags.ac; merge.flags.mpf = b->flags.apf; merge.flags.muf = b->flags.auf; add_instr = a; mul_instr = &mul_inst; } else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(a->alu.add.op)) { mul_inst = *a; qpu_convert_add_to_mul(&mul_inst); merge = mul_inst; merge.alu.add = b->alu.add; merge.flags.ac = b->flags.ac; merge.flags.apf = b->flags.apf; merge.flags.auf = b->flags.auf; add_instr = b; mul_instr = &mul_inst; } else { return false; } } if (b->alu.mul.op != V3D_QPU_M_NOP) { if (a->alu.mul.op != V3D_QPU_M_NOP) return false; merge.alu.mul = b->alu.mul; merge.flags.mc = b->flags.mc; merge.flags.mpf = b->flags.mpf; merge.flags.muf = b->flags.muf; mul_instr = b; add_instr = a; } if (add_instr && mul_instr && !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { return false; } merge.sig.thrsw |= b->sig.thrsw; merge.sig.ldunif |= b->sig.ldunif; merge.sig.ldunifrf |= b->sig.ldunifrf; merge.sig.ldunifa |= b->sig.ldunifa; merge.sig.ldunifarf |= b->sig.ldunifarf; merge.sig.ldtmu |= b->sig.ldtmu; merge.sig.ldvary |= b->sig.ldvary; merge.sig.ldvpm |= b->sig.ldvpm; merge.sig.small_imm |= b->sig.small_imm; merge.sig.ldtlb |= b->sig.ldtlb; merge.sig.ldtlbu |= b->sig.ldtlbu; merge.sig.ucb |= b->sig.ucb; merge.sig.rotate |= b->sig.rotate; merge.sig.wrtmuc |= b->sig.wrtmuc; if (v3d_qpu_sig_writes_address(devinfo, &a->sig) && v3d_qpu_sig_writes_address(devinfo, &b->sig)) return false; merge.sig_addr |= b->sig_addr; merge.sig_magic |= b->sig_magic; uint64_t packed; bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); *result = merge; /* No modifying the real instructions on failure. 
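         * We have already stored the merge candidate in *result above, so on
         * a packing failure the caller must not have aliased result with a
         * or b.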
*/ assert(ok || (a != result && b != result)); return ok; } static inline bool try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst) { return inst->sig.ldunif || inst->sig.ldunifrf; } static bool qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, struct choose_scoreboard *scoreboard, const struct qinst *qinst); static struct schedule_node * choose_instruction_to_schedule(struct v3d_compile *c, struct choose_scoreboard *scoreboard, struct schedule_node *prev_inst) { struct schedule_node *chosen = NULL; int chosen_prio = 0; /* Don't pair up anything with a thread switch signal -- emit_thrsw() * will handle pairing it along with filling the delay slots. */ if (prev_inst) { if (prev_inst->inst->qpu.sig.thrsw) return NULL; } bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT && scoreboard->ldvary_count < c->num_inputs; bool skipped_insts_for_ldvary_pipelining = false; retry: list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, dag.link) { const struct v3d_qpu_instr *inst = &n->inst->qpu; if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) { skipped_insts_for_ldvary_pipelining = true; continue; } /* Don't choose the branch instruction until it's the last one * left. We'll move it up to fit its delay slots after we * choose it. */ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && !list_is_singular(&scoreboard->dag->heads)) { continue; } /* We need to have 3 delay slots between a write to unifa and * a follow-up ldunifa. */ if ((inst->sig.ldunifa || inst->sig.ldunifarf) && scoreboard->tick - scoreboard->last_unifa_write_tick <= 3) continue; /* "An instruction must not read from a location in physical * regfile A or B that was written to by the previous * instruction." */ if (reads_too_soon_after_write(scoreboard, n->inst)) continue; if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) continue; /* "Before doing a TLB access a scoreboard wait must have been * done. This happens either on the first or last thread * switch, depending on a setting (scb_wait_on_first_thrsw) in * the shader state." */ if (pixel_scoreboard_too_soon(c, scoreboard, inst)) continue; /* ldunif and ldvary both write r5, but ldunif does so a tick * sooner. If the ldvary's r5 wasn't used, then ldunif might * otherwise get scheduled so ldunif and ldvary try to update * r5 in the same tick. */ if ((inst->sig.ldunif || inst->sig.ldunifa) && scoreboard->tick == scoreboard->last_ldvary_tick + 1) { continue; } /* If we are in a thrsw delay slot check that this instruction * is valid for that. */ if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick && !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard, n->inst)) { continue; } if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { /* Don't try to put a branch in the delay slots of another * branch or a unifa write. */ if (scoreboard->last_branch_tick + 3 >= scoreboard->tick) continue; if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick) continue; /* No branch with cond != 0,2,3 and msfign != 0 after * setmsf. */ if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 && inst->branch.msfign != V3D_QPU_MSFIGN_NONE && inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { continue; } } /* If we're trying to pair with another instruction, check * that they're compatible. */ if (prev_inst) { /* Don't pair up a thread switch signal -- we'll * handle pairing it when we pick it on its own. 
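 * (emit_thrsw() takes care of merging the signal back into earlier
 * instructions when it fills the delay slots).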
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Simulator complains if we have two uniforms loaded
                         * in the same instruction, which could happen if we
                         * have a ldunif or sideband uniform and we pair that
                         * with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge TLB instructions before we have acquired
                         * the scoreboard lock.
                         */
                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining. Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful.
                         */
                        if (inst->sig.ldvary &&
                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
                                continue;
                        }

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (mux_read_stalls(scoreboard, inst)) {
                        /* Don't merge an instruction that stalls */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will have
                                 * a higher scheduling priority */
                                prio -= MAX_SCHEDULE_PRIORITY;
                                assert(prio < 0);
                        }
                }

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;

                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
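                 * (see fixup_pipelined_ldvary(), run from
                 * schedule_instructions() right after the pairing succeeds).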
*/ if (prev_inst) scoreboard->fixup_ldvary = true; } return chosen; } static void update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, enum v3d_qpu_waddr waddr, const struct v3d_device_info *devinfo) { if (v3d_qpu_magic_waddr_is_sfu(waddr)) scoreboard->last_magic_sfu_write_tick = scoreboard->tick; else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA) scoreboard->last_unifa_write_tick = scoreboard->tick; } static void update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { if (v3d_qpu_instr_is_sfu(inst)) { scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr; scoreboard->last_stallable_sfu_tick = scoreboard->tick; } } static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst, const struct v3d_device_info *devinfo) { if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) return; assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { if (inst->alu.add.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, inst->alu.add.waddr, devinfo); } else { update_scoreboard_for_sfu_stall_waddr(scoreboard, inst); } if (inst->alu.add.op == V3D_QPU_A_SETMSF) scoreboard->last_setmsf_tick = scoreboard->tick; } if (inst->alu.mul.op != V3D_QPU_M_NOP) { if (inst->alu.mul.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, inst->alu.mul.waddr, devinfo); } } if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; } static void dump_state(const struct v3d_device_info *devinfo, struct dag *dag) { list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { fprintf(stderr, " t=%4d: ", n->unblocked_time); v3d_qpu_dump(devinfo, &n->inst->qpu); fprintf(stderr, "\n"); util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { struct schedule_node *child = (struct schedule_node *)edge->child; if (!child) continue; fprintf(stderr, " - "); v3d_qpu_dump(devinfo, &child->inst->qpu); fprintf(stderr, " (%d parents, %c)\n", child->dag.parent_count, edge->data ? 'w' : 'r'); } } } static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo, enum v3d_qpu_waddr waddr, const struct v3d_qpu_instr *after) { /* Apply some huge latency between texture fetch requests and getting * their results back. * * FIXME: This is actually pretty bogus. If we do: * * mov tmu0_s, a * * mov tmu0_s, b * load_tmu0 * * load_tmu0 * * we count that as worse than * * mov tmu0_s, a * mov tmu0_s, b * * load_tmu0 * * load_tmu0 * * because we associate the first load_tmu0 with the *second* tmu0_s. */ if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) && v3d_qpu_waits_on_tmu(after)) { return 100; } /* Assume that anything depending on us is consuming the SFU result. 
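         * (SFU writes land in r4; see the v3d_qpu_writes_r4() handling in
         * calculate_deps()).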
*/ if (v3d_qpu_magic_waddr_is_sfu(waddr)) return 3; return 1; } static uint32_t instruction_latency(const struct v3d_device_info *devinfo, struct schedule_node *before, struct schedule_node *after) { const struct v3d_qpu_instr *before_inst = &before->inst->qpu; const struct v3d_qpu_instr *after_inst = &after->inst->qpu; uint32_t latency = 1; if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || after_inst->type != V3D_QPU_INSTR_TYPE_ALU) return latency; if (before_inst->alu.add.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.add.waddr, after_inst)); } if (before_inst->alu.mul.magic_write) { latency = MAX2(latency, magic_waddr_latency(devinfo, before_inst->alu.mul.waddr, after_inst)); } if (v3d_qpu_instr_is_sfu(before_inst)) return 2; return latency; } /** Recursive computation of the delay member of a node. */ static void compute_delay(struct dag_node *node, void *state) { struct schedule_node *n = (struct schedule_node *)node; struct v3d_compile *c = (struct v3d_compile *) state; n->delay = 1; util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { struct schedule_node *child = (struct schedule_node *)edge->child; n->delay = MAX2(n->delay, (child->delay + instruction_latency(c->devinfo, n, child))); } } /* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() * should be called on it later to finish pruning the other edges). */ static void pre_remove_head(struct dag *dag, struct schedule_node *n) { list_delinit(&n->dag.link); util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { if (edge->data) dag_remove_edge(dag, edge); } } static void mark_instruction_scheduled(const struct v3d_device_info *devinfo, struct dag *dag, uint32_t time, struct schedule_node *node) { if (!node) return; util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { struct schedule_node *child = (struct schedule_node *)edge->child; if (!child) continue; uint32_t latency = instruction_latency(devinfo, node, child); child->unblocked_time = MAX2(child->unblocked_time, time + latency); } dag_prune_head(dag, &node->dag); } static void insert_scheduled_instruction(struct v3d_compile *c, struct qblock *block, struct choose_scoreboard *scoreboard, struct qinst *inst) { list_addtail(&inst->link, &block->instructions); update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo); c->qpu_inst_count++; scoreboard->tick++; } static struct qinst * vir_nop() { struct qreg undef = vir_nop_reg(); struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); return qinst; } static void emit_nop(struct v3d_compile *c, struct qblock *block, struct choose_scoreboard *scoreboard) { insert_scheduled_instruction(c, block, scoreboard, vir_nop()); } static bool qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, const struct qinst *qinst, int slot) { const struct v3d_qpu_instr *inst = &qinst->qpu; if (slot == 2 && qinst->is_tlb_z_write) return false; if (slot > 0 && qinst->uniform != ~0) return false; if (v3d_qpu_waits_vpm(inst)) return false; if (inst->sig.ldvary) return false; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { /* GFXH-1625: TMUWT not allowed in the final instruction. */ if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) return false; /* No writing physical registers at the end. 
                 */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
                    !inst->sig_magic) {
                        return false;
                }

                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}

/**
 * This is called when trying to merge a thrsw back into the instruction stream
 * of instructions that were scheduled *before* the thrsw signal to fill its
 * delay slots. Because the actual execution of the thrsw happens after the
 * delay slots, it is usually safe to do this, but there are some cases that
 * need special care.
 */
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                          const struct qinst *qinst,
                                          uint32_t slot)
{
        /* No scheduling SFU when the result would land in the other
         * thread. The simulator complains for safety, though it
         * would only occur for dead code in our case.
         */
        if (slot > 0 && qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                return false;
        }

        if (slot > 0 && qinst->qpu.sig.ldvary)
                return false;

        /* unifa and the following 3 instructions can't overlap a
         * thread switch/end. The docs further clarify that this means
         * the cycle at which the actual thread switch/end happens
         * and not when the thrsw instruction is processed, which would
         * be after the 2 delay slots following the thrsw instruction.
         * This means that we can move a thrsw up to the instruction
         * right after unifa:
         *
         *    unifa, r5
         *    thrsw
         *    delay slot 1
         *    delay slot 2
         *    Thread switch happens here, 4 instructions away from unifa
         */
        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
                return false;

        return true;
}

/**
 * This is called for instructions scheduled *after* a thrsw signal that may
 * land in the delay slots of the thrsw. Because these instructions were
 * scheduled after the thrsw, we need to be careful when placing them into
 * the delay slots, since that means that we are moving them ahead of the
 * thread switch and we need to ensure that is not a problem.
 */
static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst)
{
        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
        assert(slot <= 2);

        /* We merge thrsw instructions back into the instruction stream
         * manually, so any instructions scheduled after a thrsw should be
         * in the actual delay slots and not in the same slot as the thrsw.
         */
        assert(slot >= 1);

        /* No emitting a thrsw while the previous thrsw hasn't happened yet.
         */
        if (qinst->qpu.sig.thrsw)
                return false;

        /* The restrictions for instructions scheduled before the thrsw
         * also apply to instructions scheduled after the thrsw that we want
         * to place in its delay slots.
         */
        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                return false;

        /* TLB access is disallowed until scoreboard wait is executed, which
         * we do on the last thread switch.
         */
        if (qpu_inst_is_tlb(&qinst->qpu))
                return false;

        /* Instruction sequence restrictions: Branch is not allowed in delay
         * slots of a thrsw.
         */
        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        /* Miscellaneous restrictions: At the point of a thrsw we need to have
         * at least one outstanding lookup or TSY wait.
         *
         * So avoid placing TMU instructions scheduled after the thrsw into
         * its delay slots or we may be compromising the integrity of our TMU
         * sequences. Also, notice that if we moved these instructions into
         * the delay slots of a previous thrsw we could overflow our TMU output
         * fifo, since we could be effectively pipelining a lookup scheduled
         * after the thrsw into the sequence before the thrsw.
         */
        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
            qinst->qpu.sig.wrtmuc) {
                return false;
        }

        /* Don't move instructions that wait on the TMU before the thread
         * switch happens since that would make the current thread stall
         * before the switch, which is exactly what we want to avoid with
         * the thrsw instruction.
         */
        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
                return false;

        /* A thread switch invalidates all accumulators, so don't place any
         * instructions that write accumulators into the delay slots.
         */
        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
                return false;

        /* Multop has an implicit write to the rtop register which is a
         * specialized accumulator that is only used with this instruction.
         */
        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
                return false;

        /* Flags are invalidated across a thread switch, so don't place
         * instructions that write flags into delay slots.
         */
        if (v3d_qpu_writes_flags(&qinst->qpu))
                return false;

        /* TSY sync ops materialize at the point of the next thread switch,
         * therefore, if we have a TSY sync right after a thread switch, we
         * cannot place it in its delay slots, or we would be moving the sync
         * to the thrsw before it instead.
         */
        if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
                return false;

        return true;
}

static bool
valid_thrsw_sequence(struct v3d_compile *c,
                     struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        /* No emitting our thrsw while the previous thrsw hasn't happened yet.
         */
        if (scoreboard->last_thrsw_tick + 3 >
            scoreboard->tick - instructions_in_sequence) {
                return false;
        }

        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                        return false;

                if (is_thrend &&
                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}

/**
 * Emits a THRSW signal in the stream, trying to move it up to pair with
 * another instruction.
 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);

        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
         * or branch.
         */
        while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }
        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }

        /* Find how far back into previous instructions we can put the THRSW.
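         * We can merge the signal at most 3 instructions back;
         * valid_thrsw_sequence() checks that everything we skip over can
         * live in the resulting delay slots.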
*/ int slots_filled = 0; int invalid_sig_count = 0; bool last_thrsw_after_invalid_ok = false; struct qinst *merge_inst = NULL; vir_for_each_inst_rev(prev_inst, block) { if (!valid_thrsw_sequence(c, scoreboard, prev_inst, slots_filled + 1, is_thrend)) { break; } struct v3d_qpu_sig sig = prev_inst->qpu.sig; sig.thrsw = true; uint32_t packed_sig; if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) { /* If we can't merge the thrsw here because of signal * incompatibility, keep going, we might be able to * merge it in an earlier instruction. */ invalid_sig_count++; goto cont_block; } /* For last thrsw we need 2 consecutive slots that are * thrsw compatible, so if we have previously jumped over * an incompatible signal, flag that we have found the first * valid slot here and keep going. */ if (inst->is_last_thrsw && invalid_sig_count > 0 && !last_thrsw_after_invalid_ok) { last_thrsw_after_invalid_ok = true; invalid_sig_count++; goto cont_block; } last_thrsw_after_invalid_ok = false; invalid_sig_count = 0; merge_inst = prev_inst; cont_block: if (++slots_filled == 3) break; } /* If we jumped over a signal incompatibility and did not manage to * merge the thrsw in the end, we need to adjust slots filled to match * the last valid merge point. */ assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count); if (invalid_sig_count > 0) slots_filled -= invalid_sig_count; bool needs_free = false; if (merge_inst) { merge_inst->qpu.sig.thrsw = true; needs_free = true; scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled; } else { scoreboard->last_thrsw_tick = scoreboard->tick; insert_scheduled_instruction(c, block, scoreboard, inst); time++; slots_filled++; merge_inst = inst; } scoreboard->first_thrsw_emitted = true; /* If we're emitting the last THRSW (other than program end), then * signal that to the HW by emitting two THRSWs in a row. */ if (inst->is_last_thrsw) { if (slots_filled <= 1) { emit_nop(c, block, scoreboard); time++; } struct qinst *second_inst = (struct qinst *)merge_inst->link.next; second_inst->qpu.sig.thrsw = true; scoreboard->last_thrsw_emitted = true; } /* Make sure the thread end executes within the program lifespan */ if (is_thrend) { for (int i = 0; i < 3 - slots_filled; i++) { emit_nop(c, block, scoreboard); time++; } } /* If we put our THRSW into another instruction, free up the * instruction that didn't end up scheduled into the list. */ if (needs_free) free(inst); return time; } static bool qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst) { if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) return false; if (inst->qpu.sig.thrsw) return false; if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu)) return false; if (vir_has_uniform(inst)) return false; return true; } static void emit_branch(struct v3d_compile *c, struct qblock *block, struct choose_scoreboard *scoreboard, struct qinst *inst) { assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); /* We should've not picked up a branch for the delay slots of a previous * thrsw, branch or unifa write instruction. */ int branch_tick = scoreboard->tick; assert(scoreboard->last_thrsw_tick + 2 < branch_tick); assert(scoreboard->last_branch_tick + 3 < branch_tick); assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); /* Can't place a branch with msfign != 0 and cond != 0,2,3 after * setmsf. 
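         * (the "safe" conditions are ALWAYS, A0 and NA0, matching the
         * is_safe_msf_branch check below).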
*/ bool is_safe_msf_branch = inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0; assert(scoreboard->last_setmsf_tick != branch_tick - 1 || is_safe_msf_branch); /* Insert the branch instruction */ insert_scheduled_instruction(c, block, scoreboard, inst); /* Now see if we can move the branch instruction back into the * instruction stream to fill its delay slots */ int slots_filled = 0; while (slots_filled < 3 && block->instructions.next != &inst->link) { struct qinst *prev_inst = (struct qinst *) inst->link.prev; assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH); /* Can't move the branch instruction if that would place it * in the delay slots of other instructions. */ if (scoreboard->last_branch_tick + 3 >= branch_tick - slots_filled - 1) { break; } if (scoreboard->last_thrsw_tick + 2 >= branch_tick - slots_filled - 1) { break; } if (scoreboard->last_unifa_write_tick + 3 >= branch_tick - slots_filled - 1) { break; } /* Do not move up a branch if it can disrupt an ldvary sequence * as that can cause stomping of the r5 register. */ if (scoreboard->last_ldvary_tick + 2 >= branch_tick - slots_filled) { break; } /* Can't move a conditional branch before the instruction * that writes the flags for its condition. */ if (v3d_qpu_writes_flags(&prev_inst->qpu) && inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) { break; } if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst)) break; if (!is_safe_msf_branch) { struct qinst *prev_prev_inst = (struct qinst *) prev_inst->link.prev; if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) { break; } } list_del(&prev_inst->link); list_add(&prev_inst->link, &inst->link); slots_filled++; } block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled; scoreboard->last_branch_tick = branch_tick - slots_filled; /* Fill any remaining delay slots. * * For unconditional branches we'll try to fill these with the * first instructions in the successor block after scheduling * all blocks when setting up branch targets. 
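         * (see qpu_set_branch_targets()).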
         */
        for (int i = 0; i < 3 - slots_filled; i++)
                emit_nop(c, block, scoreboard);
}

static bool
alu_reads_register(struct v3d_qpu_instr *inst,
                   bool add, bool magic, uint32_t index)
{
        uint32_t num_src;
        enum v3d_qpu_mux mux_a, mux_b;

        if (add) {
                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
                mux_a = inst->alu.add.a;
                mux_b = inst->alu.add.b;
        } else {
                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
                mux_a = inst->alu.mul.a;
                mux_b = inst->alu.mul.b;
        }

        for (int i = 0; i < num_src; i++) {
                if (magic) {
                        if (i == 0 && mux_a == index)
                                return true;
                        if (i == 1 && mux_b == index)
                                return true;
                } else {
                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                }
        }

        return false;
}

/**
 * This takes an ldvary signal merged into 'inst' and tries to move it up to
 * the previous instruction to get good pipelining of ldvary sequences,
 * transforming this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ;
 * fadd  rf13, r0, r5   ; nop;              ; ldvary.r1  <-- inst
 *
 * into:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; nop;              ;            <-- inst
 *
 * If we manage to do this successfully (we return true here), then flagging
 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
 * we will be able to pick up to merge into 'inst', leading to code like this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
 */
static bool
fixup_pipelined_ldvary(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
                       struct qblock *block,
                       struct v3d_qpu_instr *inst)
{
        /* We only call this if we have successfully merged an ldvary into a
         * previous instruction.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->sig.ldvary);
        uint32_t ldvary_magic = inst->sig_magic;
        uint32_t ldvary_index = inst->sig_addr;

        /* The instruction in which we merged the ldvary cannot read
         * the ldvary destination, if it does, then moving the ldvary before
         * it would overwrite it.
         */
        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
                return false;
        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
                return false;

        /* The implicit ldvary destination may not be written to by a signal
         * in the instruction following ldvary. Since we are planning to move
         * ldvary to the previous instruction, this means we need to check if
         * the current instruction has any other signal that could create this
         * conflict. The only other signal that can write to the implicit
         * ldvary destination that is compatible with ldvary in the same
         * instruction is ldunif.
         */
        if (inst->sig.ldunif)
                return false;

        /* The previous instruction can't write to the same destination as the
         * ldvary.
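         * If it did, moving the ldvary up would make both writes land in the
         * same instruction.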
*/ struct qinst *prev = (struct qinst *) block->instructions.prev; if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU) return false; if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) { if (prev->qpu.alu.add.magic_write == ldvary_magic && prev->qpu.alu.add.waddr == ldvary_index) { return false; } } if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) { if (prev->qpu.alu.mul.magic_write == ldvary_magic && prev->qpu.alu.mul.waddr == ldvary_index) { return false; } } /* The previous instruction cannot have a conflicting signal */ if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) return false; uint32_t sig; struct v3d_qpu_sig new_sig = prev->qpu.sig; new_sig.ldvary = true; if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig)) return false; /* The previous instruction cannot use flags since ldvary uses the * 'cond' instruction field to store the destination. */ if (v3d_qpu_writes_flags(&prev->qpu)) return false; if (v3d_qpu_reads_flags(&prev->qpu)) return false; /* We can't put an ldvary in the delay slots of a thrsw. We should've * prevented this when pairing up the ldvary with another instruction * and flagging it for a fixup. */ assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); /* Move the ldvary to the previous instruction and remove it from the * current one. */ prev->qpu.sig.ldvary = true; prev->qpu.sig_magic = ldvary_magic; prev->qpu.sig_addr = ldvary_index; scoreboard->last_ldvary_tick = scoreboard->tick - 1; inst->sig.ldvary = false; inst->sig_magic = false; inst->sig_addr = 0; /* By moving ldvary to the previous instruction we make it update * r5 in the current one, so nothing else in it should write r5. * This should've been prevented by our depedency tracking, which * would not allow ldvary to be paired up with an instruction that * writes r5 (since our dependency tracking doesn't know that the * ldvary write r5 happens in the next instruction). */ assert(!v3d_qpu_writes_r5(c->devinfo, inst)); return true; } static uint32_t schedule_instructions(struct v3d_compile *c, struct choose_scoreboard *scoreboard, struct qblock *block, enum quniform_contents *orig_uniform_contents, uint32_t *orig_uniform_data, uint32_t *next_uniform) { const struct v3d_device_info *devinfo = c->devinfo; uint32_t time = 0; while (!list_is_empty(&scoreboard->dag->heads)) { struct schedule_node *chosen = choose_instruction_to_schedule(c, scoreboard, NULL); struct schedule_node *merge = NULL; /* If there are no valid instructions to schedule, drop a NOP * in. */ struct qinst *qinst = chosen ? chosen->inst : vir_nop(); struct v3d_qpu_instr *inst = &qinst->qpu; if (debug) { fprintf(stderr, "t=%4d: current list:\n", time); dump_state(devinfo, scoreboard->dag); fprintf(stderr, "t=%4d: chose: ", time); v3d_qpu_dump(devinfo, inst); fprintf(stderr, "\n"); } /* We can't mark_instruction_scheduled() the chosen inst until * we're done identifying instructions to merge, so put the * merged instructions on a list for a moment. */ struct list_head merged_list; list_inithead(&merged_list); /* Schedule this instruction onto the QPU list. Also try to * find an instruction to pair with it. 
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "         result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard,
                                                                   block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge
                                                 * the follow-up instruction in
                                                 * the ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (mux_read_stalls(scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled. Update the children's unblocked time for
                 * this DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time,
                                           chosen);
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag,
                                                   time, merge);

                        /* The merged VIR instruction doesn't get re-added to
                         * the block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block, scoreboard,
                                                     qinst);
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        scoreboard->dag = dag_create(mem_ctx);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure.
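         * Each qinst is pulled off the block's instruction list here and
         * becomes a DAG node; the scheduling loop then re-adds instructions
         * to the (now empty) block as it emits them.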
         */
        while (!list_is_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);
        scoreboard->dag = NULL;

        return cycles;
}

static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) * sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] ==
                       QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any
                 * remaining delay slots with the initial instructions of the
                 * successor block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
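                 *
                 * (Illustrative example: if two of the three delay slots are
                 * still NOPs, we copy the successor's first two instructions
                 * into them and then add 2 * sizeof(uint64_t) = 16 bytes to
                 * the branch offset so the branch target skips the copies.)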
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));

                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}

uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block =
                list_last_entry(&c->blocks, struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_unifa_write_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
        scoreboard.last_thrsw_tick = -10;
        scoreboard.last_branch_tick = -10;
        scoreboard.last_setmsf_tick = -10;
        scoreboard.last_stallable_sfu_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c, &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *thrsw = vir_nop();
        thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, end_block, &scoreboard, thrsw, true);

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}
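
/* A minimal, self-contained sketch (not part of the scheduler, and unused;
 * the name is just illustrative) of the branch offset arithmetic in
 * qpu_set_branch_targets() above: the offset is stored in bytes, relative to
 * the instruction that follows the branch's three delay slots
 * (branch_ip + 4), and it grows by one 64-bit instruction for every delay
 * slot we filled with a copy of the successor's code, so the branch target
 * skips those copies.
 */
static inline int64_t
example_branch_offset_bytes(uint32_t target_start_ip, uint32_t branch_ip,
                            uint32_t slots_filled)
{
        /* e.g. branch at ip 10, target at ip 20, 2 slots filled:
         * (20 - 14 + 2) * 8 = 64 bytes.
         */
        return ((int64_t)target_start_ip - ((int64_t)branch_ip + 4) +
                slots_filled) * (int64_t)sizeof(uint64_t);
}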