/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <algorithm>
#include <map>
#include <vector>

namespace aco {
namespace {

/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
 * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
 * that we should switch to a different wave and contains info on dependencies as to
 * when we can switch back.
 *
 * This seems to apply only for ALU->ALU dependencies as other instructions have better
 * integration with the frontend.
 *
 * Note that if we do not emit s_delay_alu things will still be correct, but the wave
 * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
 * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
 * instructions can take a different number of cycles based on the exec mask)
 */
struct alu_delay_info {
   /* These are the values directly above the max representable value, i.e. the wait
    * would turn into a no-op when we try to wait for something further back than
    * this.
    */
   static constexpr int8_t valu_nop = 5;
   static constexpr int8_t trans_nop = 4;

   /* How many VALU instructions ago this value was written */
   int8_t valu_instrs = valu_nop;
   /* Cycles until the writing VALU instruction is finished */
   int8_t valu_cycles = 0;

   /* How many Transcendental instructions ago this value was written */
   int8_t trans_instrs = trans_nop;
   /* Cycles until the writing Transcendental instruction is finished */
   int8_t trans_cycles = 0;

   /* Cycles until the writing SALU instruction is finished */
   int8_t salu_cycles = 0;

   /* VALU wrote this as lane mask. */
   bool lane_mask_forwarding = true;

   /* Merge `other` into this delay: keep the nearest (smallest) instruction
    * distances and the largest remaining cycle counts. Returns true if any
    * field became stricter, i.e. the merge actually changed this entry. */
   bool combine(const alu_delay_info& other)
   {
      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
                     other.trans_cycles > trans_cycles;
      valu_instrs = std::min(valu_instrs, other.valu_instrs);
      trans_instrs = std::min(trans_instrs, other.trans_instrs);
      salu_cycles = std::max(salu_cycles, other.salu_cycles);
      valu_cycles = std::max(valu_cycles, other.valu_cycles);
      trans_cycles = std::max(trans_cycles, other.trans_cycles);
      lane_mask_forwarding &= other.lane_mask_forwarding;
      return changed;
   }

   /* Needs to be called after any change to keep the data consistent:
    * saturates out-of-range/expired dependencies to their no-op values.
    * Returns empty(). */
   bool fixup()
   {
      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
         valu_instrs = valu_nop;
         valu_cycles = 0;
      }

      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
         trans_instrs = trans_nop;
         trans_cycles = 0;
      }

      salu_cycles = std::max<int8_t>(salu_cycles, 0);

      return empty();
   }

   /* Returns true if a wait would be a no-op */
   bool empty() const
   {
      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
   }

   UNUSED void print(FILE* output) const
   {
      if (valu_instrs != valu_nop)
         fprintf(output, "valu_instrs: %u\n", valu_instrs);
      if (valu_cycles)
         fprintf(output, "valu_cycles: %u\n", valu_cycles);
      if (trans_instrs != trans_nop)
         fprintf(output, "trans_instrs: %u\n", trans_instrs);
      if (trans_cycles)
         fprintf(output, "trans_cycles: %u\n", trans_cycles);
      if (salu_cycles)
         fprintf(output, "salu_cycles: %u\n", salu_cycles);
   }
};

/* Per-block tracking state: for each physical register, the outstanding ALU
 * dependency info of its most recent writer(s). */
struct delay_ctx {
   Program* program = nullptr;
   std::map<PhysReg, alu_delay_info> gpr_map;

   delay_ctx() {}
   delay_ctx(Program* program_) : program(program_) {}

   UNUSED void print(FILE* output) const
   {
      for (const auto& entry : gpr_map) {
         /* regs >= 256 are VGPRs, below are SGPRs */
         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
                 entry.first.reg() & 0xff);
         entry.second.print(output);
         fprintf(output, "}\n");
      }
   }
};

/* Gather into `delay` the dependencies of every register `instr` reads.
 * Lane-mask operands that the hardware can forward (the third cndmask operand)
 * are skipped because they don't need a wait. */
void
check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
{
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      const Operand op = instr->operands[i];
      alu_delay_info op_delay;
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         std::map<PhysReg, alu_delay_info>::iterator it =
            ctx.gpr_map.find(PhysReg{op.physReg() + j});
         if (it != ctx.gpr_map.end())
            op_delay.combine(it->second);
      }

      bool fast_forward = (instr->opcode == aco_opcode::v_cndmask_b32 ||
                           instr->opcode == aco_opcode::v_cndmask_b16 ||
                           instr->opcode == aco_opcode::v_dual_cndmask_b32) &&
                          i == 2;
      fast_forward |= instr->isVOPD() && instr->vopd().opy == aco_opcode::v_dual_cndmask_b32 &&
                      i + 1 == instr->operands.size();

      if (!op_delay.lane_mask_forwarding || !fast_forward)
         delay.combine(op_delay);
   }
}

/* Advance time by `cycles` (and by one VALU/Transcendental instruction slot if
 * applicable) for every tracked register, dropping entries whose dependencies
 * have fully expired. */
void
update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
{
   std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
   while (it != ctx.gpr_map.end()) {
      alu_delay_info& entry = it->second;
      entry.valu_instrs += is_valu ? 1 : 0;
      entry.trans_instrs += is_trans ? 1 : 0;
      entry.salu_cycles -= cycles;
      entry.valu_cycles -= cycles;
      entry.trans_cycles -= cycles;
      it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
   }
}

/* Accumulate into `delay` the wait `instr` needs, and clear from the context
 * everything the wait (explicit or implicit through frontend counters) covers. */
void
kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
{
   /* Consider frontend waits first. These are automatically done by the hardware,
    * so we don't need to insert s_delay_alu.
    * They are also lower granularity, waiting for accesses of a counter instead
    * of only the real per register dependencies.
    */
   depctr_wait wait = parse_depctr_wait(instr);
   int8_t implicit_cycles = 0;
   if (!wait.va_vdst || !wait.va_sdst || !wait.va_vcc || !wait.sa_sdst || !wait.sa_exec ||
       !wait.va_exec) {
      std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         alu_delay_info& entry = it->second;
         /* Which registers the frontend VALU-write counters cover depends on the
          * register class: VGPR writes, SGPR writes, vcc and exec each have
          * their own counter. */
         bool wait_valu = !wait.va_vdst || (it->first < vcc && !wait.va_sdst) ||
                          (it->first >= vcc && it->first <= vcc_hi && !wait.va_vcc) ||
                          (it->first >= exec && it->first <= exec_hi && !wait.va_exec);
         if (wait_valu) {
            implicit_cycles = MAX3(implicit_cycles, entry.valu_cycles, entry.trans_cycles);
            entry.valu_cycles = 0;
            entry.trans_cycles = 0;
         }

         bool wait_salu = ((it->first <= vcc_hi || it->first == scc) && !wait.sa_sdst) ||
                          (it->first >= exec && it->first <= exec_hi && !wait.sa_exec);
         if (wait_salu) {
            implicit_cycles = MAX2(implicit_cycles, entry.salu_cycles);
            entry.salu_cycles = 0;
         }

         it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
      }
   }

   /* Previous alu progresses as usual while the frontend waits. */
   if (implicit_cycles != 0)
      update_alu(ctx, false, false, implicit_cycles);

   if (instr->isVALU() || instr->isSALU())
      check_alu(ctx, delay, instr);

   if (!delay.empty()) {
      update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));

      /* remove all gprs with higher counter from map */
      std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         if (delay.valu_instrs <= it->second.valu_instrs)
            it->second.valu_instrs = alu_delay_info::valu_nop;
         if (delay.trans_instrs <= it->second.trans_instrs)
            it->second.trans_instrs = alu_delay_info::trans_nop;
         it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
      }
   }
}

/* Record the dependencies created by `instr`'s definitions and advance time by
 * its issue cycles. */
void
gen_alu(Instruction* instr, delay_ctx& ctx)
{
   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
   bool is_valu = instr->isVALU();
   bool is_trans = instr->isTrans();

   if (is_trans || is_valu || instr->isSALU()) {
      alu_delay_info delay;
      delay.lane_mask_forwarding = false;
      if (is_trans) {
         delay.trans_instrs = 0;
         delay.trans_cycles = cycle_info.latency;
      } else if (is_valu) {
         delay.valu_instrs = 0;
         delay.valu_cycles = cycle_info.latency;
      } else if (instr->isSALU()) {
         delay.salu_cycles = cycle_info.latency;
      }

      for (Definition& def : instr->definitions) {
         if (is_valu && def.regClass() == ctx.program->lane_mask) {
            /* readlane/readfirstlane results are not forwardable lane masks */
            delay.lane_mask_forwarding = instr->opcode != aco_opcode::v_readlane_b32_e64 &&
                                         instr->opcode != aco_opcode::v_readfirstlane_b32;
         }
         for (unsigned j = 0; j < def.size(); j++) {
            auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + j}, delay);
            if (!it.second)
               it.first->second.combine(delay);
         }
      }
   }

   /* WMMA doesn't count towards the VALU instruction distance. */
   update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma,
              is_trans, cycle_info.issue_cycles);
}

/* Encode `delay` into an s_delay_alu instruction appended to `instructions`,
 * then reset `delay`. */
void
emit_delay_alu(delay_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
               alu_delay_info& delay)
{
   uint32_t imm = 0;
   if (delay.trans_instrs != delay.trans_nop) {
      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
   }
   if (delay.valu_instrs != delay.valu_nop) {
      /* The second wait condition goes into the upper half of the immediate. */
      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
   }

   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
    * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
    * getting this wrong isn't an issue. */
   if (delay.salu_cycles && imm <= 0xf) {
      unsigned cycles = std::min<int>(3, delay.salu_cycles);
      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
   }

   Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
   inst->salu().imm = imm;
   instructions.emplace_back(inst);
   delay = alu_delay_info();
}

/* Insert s_delay_alu instructions in front of dependent instructions of `block`. */
void
handle_block(Program* program, Block& block, delay_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;
   alu_delay_info queued_delay;

   for (size_t i = 0; i < block.instructions.size(); i++) {
      aco_ptr<Instruction>& instr = block.instructions[i];
      assert(instr->opcode != aco_opcode::s_delay_alu);

      kill_alu(queued_delay, instr.get(), ctx);
      gen_alu(instr.get(), ctx);

      if (!queued_delay.empty())
         emit_delay_alu(ctx, new_instructions, queued_delay);
      new_instructions.emplace_back(std::move(instr));
   }

   block.instructions.swap(new_instructions);
}

} /* end namespace */

void
insert_delay_alu(Program* program)
{
   /* per BB ctx */
   delay_ctx ctx(program);

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& current = program->blocks[i];

      if (current.instructions.empty())
         continue;

      handle_block(program, current, ctx);

      /* Reset ctx if there is a jump, assuming ALU will be done
       * because branch latency is pretty high.
       */
      if (current.linear_succs.empty() ||
          current.instructions.back()->opcode == aco_opcode::s_branch) {
         ctx = delay_ctx(program);
      }
   }
}

void
combine_delay_alu(Program* program)
{
   /* Combine s_delay_alu using the skip field. */
   for (Block& block : program->blocks) {
      int i = 0;
      int prev_delay_alu = -1;
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (instr->opcode != aco_opcode::s_delay_alu) {
            block.instructions[i++] = std::move(instr);
            continue;
         }

         uint16_t imm = instr->salu().imm;
         int skip = i - prev_delay_alu - 1;

         /* An s_delay_alu that already uses both condition slots (imm >> 7),
          * has no predecessor to merge into, or is too far away (skip field is
          * 3 bits, values 0-5 valid) stays and becomes the new merge target
          * if it has a free slot. */
         if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
            if (imm >> 7 == 0)
               prev_delay_alu = i;

            block.instructions[i++] = std::move(instr);
            continue;
         }

         /* Fold this wait into the previous s_delay_alu's second slot and drop
          * the current instruction. */
         block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
         prev_delay_alu = -1;
      }

      block.instructions.resize(i);
   }
}

} // namespace aco