/* * Copyright © 2024 Valve Corporation * * SPDX-License-Identifier: MIT */ #include "aco_builder.h" #include "aco_ir.h" namespace aco { namespace { struct branch_ctx { Program* program; std::vector blocks_incoming_exec_used; branch_ctx(Program* program_) : program(program_), blocks_incoming_exec_used(program_->blocks.size(), true) {} }; void remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index) { Block& succ = ctx.program->blocks[succ_index]; ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index); assert(std::next(it) == succ.linear_preds.end()); succ.linear_preds.pop_back(); it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index); assert(std::next(it) == block.linear_succs.end()); block.linear_succs.pop_back(); if (succ.linear_preds.empty()) { /* This block became unreachable - Recursively remove successors. */ succ.instructions.clear(); for (unsigned i : succ.linear_succs) remove_linear_successor(ctx, succ, i); } } void try_remove_simple_block(branch_ctx& ctx, Block& block) { if (!block.instructions.empty() && block.instructions.front()->opcode != aco_opcode::s_branch) return; /* Don't remove the preheader as it might be needed as convergence point * in order to insert code (e.g. for loop alignment, wait states, etc.). */ if (block.kind & block_kind_loop_preheader) return; unsigned succ_idx = block.linear_succs[0]; Block& succ = ctx.program->blocks[succ_idx]; for (unsigned pred_idx : block.linear_preds) { Block& pred = ctx.program->blocks[pred_idx]; assert(pred.index < block.index); assert(!pred.instructions.empty() && pred.instructions.back()->isBranch()); Instruction* branch = pred.instructions.back().get(); if (branch->opcode == aco_opcode::p_branch) { /* The predecessor unconditionally jumps to this block. Redirect to successor. */ pred.linear_succs[0] = succ_idx; succ.linear_preds.push_back(pred_idx); } else if (pred.linear_succs[0] == succ_idx || pred.linear_succs[1] == succ_idx) { /* The predecessor's alternative target is this block's successor. */ pred.linear_succs[0] = succ_idx; pred.linear_succs[1] = pred.linear_succs.back(); /* In case of discard */ pred.linear_succs.pop_back(); branch->opcode = aco_opcode::p_branch; } else if (pred.linear_succs[1] == block.index) { /* The predecessor jumps to this block. Redirect to successor. */ pred.linear_succs[1] = succ_idx; succ.linear_preds.push_back(pred_idx); } else { /* This block is the fall-through target of the predecessor. */ assert(pred_idx == block.index - 1); if (block.instructions.empty()) { /* If this block is empty, just fall-through to the successor. */ pred.linear_succs[0] = succ_idx; succ.linear_preds.push_back(pred_idx); continue; } /* Otherwise, check if there is a fall-through path for the jump target. */ if (block.index >= pred.linear_succs[1]) return; for (unsigned j = block.index + 1; j < pred.linear_succs[1]; j++) { if (!ctx.program->blocks[j].instructions.empty()) return; } pred.linear_succs[0] = pred.linear_succs[1]; pred.linear_succs[1] = succ_idx; succ.linear_preds.push_back(pred_idx); /* Invert the condition. This branch now falls through to its original target. * However, we don't update the fall-through target since this instruction * gets lowered in the next step, anyway. */ if (branch->opcode == aco_opcode::p_cbranch_nz) branch->opcode = aco_opcode::p_cbranch_z; else branch->opcode = aco_opcode::p_cbranch_nz; } /* Update the branch target. */ branch->branch().target[0] = succ_idx; } /* If this block is part of the logical CFG, also connect pre- and successors. */ if (!block.logical_succs.empty()) { assert(block.logical_succs.size() == 1); unsigned logical_succ_idx = block.logical_succs[0]; Block& logical_succ = ctx.program->blocks[logical_succ_idx]; ASSERTED auto it = std::remove(logical_succ.logical_preds.begin(), logical_succ.logical_preds.end(), block.index); assert(std::next(it) == logical_succ.logical_preds.end()); logical_succ.logical_preds.pop_back(); for (unsigned pred_idx : block.logical_preds) { Block& pred = ctx.program->blocks[pred_idx]; std::replace(pred.logical_succs.begin(), pred.logical_succs.end(), block.index, logical_succ_idx); if (pred.logical_succs.size() == 2 && pred.logical_succs[0] == pred.logical_succs[1]) pred.logical_succs.pop_back(); /* This should have been optimized in NIR! */ else logical_succ.logical_preds.push_back(pred_idx); } block.logical_succs.clear(); block.logical_preds.clear(); } remove_linear_successor(ctx, block, succ_idx); block.linear_preds.clear(); block.instructions.clear(); } void try_merge_break_with_continue(branch_ctx& ctx, Block& block) { /* Look for this: * BB1: * ... * p_branch_z exec BB3, BB2 * BB2: * ... * s[0:1], scc = s_andn2 s[0:1], exec * s_cbranch_scc0 BB4 * BB3: * exec = s_mov_b64 s[0:1] * s_branch BB1 * BB4: * ... * * And turn it into this: * BB1: * ... * p_branch_z exec BB3, BB2 * BB2: * ... * BB3: * s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec * s_cbranch_scc1 BB1, BB4 * BB4: * ... */ if (block.linear_succs.size() != 2 || block.instructions.size() < 2) return; Instruction* branch = block.instructions.back().get(); if (branch->opcode != aco_opcode::s_cbranch_scc0) return; Block& merge = ctx.program->blocks[block.linear_succs[0]]; Block& loopexit = ctx.program->blocks[block.linear_succs[1]]; /* Just a jump to the loop header. */ if (merge.linear_succs.size() != 1) return; for (unsigned merge_pred : merge.linear_preds) { if (merge_pred == block.index) continue; Block& pred = ctx.program->blocks[merge_pred]; Instruction* pred_branch = pred.instructions.back().get(); /* The branch needs to be exec zero only, otherwise we corrupt exec. */ if (pred_branch->opcode != aco_opcode::p_cbranch_z || pred_branch->operands[0].physReg() != exec) return; } /* merge block: copy to exec, branch */ if (merge.instructions.size() != 2 || merge.instructions.back()->opcode != aco_opcode::s_branch) return; Builder bld(ctx.program); Instruction* execwrite = merge.instructions[0].get(); if (execwrite->opcode != bld.w64or32(Builder::s_mov) || !execwrite->writes_exec()) return; Instruction* execsrc = block.instructions[block.instructions.size() - 2].get(); if (execsrc->opcode != bld.w64or32(Builder::s_andn2) || execsrc->definitions[0].physReg() != execwrite->operands[0].physReg() || execsrc->operands[0].physReg() != execwrite->operands[0].physReg() || execsrc->operands[1].physReg() != exec) return; /* Use conditional branch in merge block. */ merge.instructions.back()->opcode = aco_opcode::s_cbranch_scc1; block.linear_succs.pop_back(); block.linear_succs[0] = merge.index; merge.linear_succs.push_back(loopexit.index); std::swap(merge.linear_succs[0], merge.linear_succs[1]); std::replace(loopexit.linear_preds.begin(), loopexit.linear_preds.end(), block.index, merge.index); /* Check if we can use the loopexit as the fallthrough block. * Otherwise, we'll need an extra branch instruction. */ for (unsigned i = merge.index + 1; i < loopexit.index; i++) { if (!ctx.program->blocks[i].instructions.empty()) { branch->opcode = aco_opcode::s_branch; merge.instructions.emplace_back(std::move(block.instructions.back())); break; } } block.instructions.pop_back(); if (ctx.program->gfx_level >= GFX9) { /* Combine s_andn2 and copy to exec to s_andn2_wrexec. */ Instruction* wr_exec = bld.sop1(Builder::s_andn2_wrexec, execsrc->definitions[0], execsrc->definitions[1], Definition(exec, bld.lm), execsrc->operands[0], execsrc->operands[1]); merge.instructions[0].reset(wr_exec); } else { /* Move s_andn2 to the merge block. */ merge.instructions.emplace(merge.instructions.begin(), std::move(block.instructions.back())); } block.instructions.pop_back(); ctx.blocks_incoming_exec_used[merge.index] = true; } void eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block) { /* Check if any successor needs the outgoing exec mask from the current block. */ bool exec_write_used; if (block.kind & block_kind_end_with_regs) { /* Last block of a program with succeed shader part should respect final exec write. */ exec_write_used = true; } else if (block.linear_succs.empty() && !block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_setpc_b64) { /* This block ends in a long jump and exec might be needed for the next shader part. */ exec_write_used = true; } else { /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */ exec_write_used = std::any_of(block.linear_succs.begin(), block.linear_succs.end(), [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; }); } /* Go through all instructions and eliminate useless exec writes. */ for (int i = block.instructions.size() - 1; i >= 0; --i) { aco_ptr& instr = block.instructions[i]; /* See if the current instruction needs or writes exec. */ bool needs_exec = needs_exec_mask(instr.get()); bool writes_exec = instr->writes_exec() && instr->definitions[0].regClass() == ctx.program->lane_mask; /* See if we found an unused exec write. */ if (writes_exec && !exec_write_used) { /* Don't eliminate an instruction that writes registers other than exec and scc. * It is possible that this is eg. an s_and_saveexec and the saved value is * used by a later branch. */ bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition& def) -> bool { return def.physReg() != exec && def.physReg() != scc; }); if (!writes_other) { instr.reset(); continue; } } /* For a newly encountered exec write, clear the used flag. */ if (writes_exec) exec_write_used = false; /* If the current instruction needs exec, mark it as used. */ exec_write_used |= needs_exec; } /* Remember if the current block needs an incoming exec mask from its predecessors. */ ctx.blocks_incoming_exec_used[block.index] = exec_write_used; /* Cleanup: remove deleted instructions from the vector. */ auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr); block.instructions.resize(new_end - block.instructions.begin()); } /** * Check if the branch instruction can be removed: * This is beneficial when executing the next block with an empty exec mask * is faster than the branch instruction itself. * * Override this judgement when: * - The application prefers to remove control flow * - The compiler stack knows that it's a divergent branch never taken */ bool can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch) { const uint32_t target = branch->target[0]; const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) && branch->operands[0].physReg() == exec); if (branch->never_taken) { assert(!uniform_branch || std::all_of(std::next(ctx.program->blocks.begin(), block.index + 1), std::next(ctx.program->blocks.begin(), target), [](Block& b) { return b.instructions.empty(); })); return true; } /* Cannot remove back-edges. */ if (block.index >= target) return false; const bool prefer_remove = branch->rarely_taken; unsigned num_scalar = 0; unsigned num_vector = 0; /* Check the instructions between branch and target */ for (unsigned i = block.index + 1; i < target; i++) { /* Uniform conditional branches must not be ignored if they * are about to jump over actual instructions */ if (uniform_branch && !ctx.program->blocks[i].instructions.empty()) return false; for (aco_ptr& instr : ctx.program->blocks[i].instructions) { if (instr->isSOPP()) { /* Discard early exits and loop breaks and continues should work fine with * an empty exec mask. */ if (instr->opcode == aco_opcode::s_cbranch_scc0 || instr->opcode == aco_opcode::s_cbranch_scc1 || instr->opcode == aco_opcode::s_cbranch_execz || instr->opcode == aco_opcode::s_cbranch_execnz) { bool is_break_continue = ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue); bool discard_early_exit = ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit; if (is_break_continue || discard_early_exit) continue; } return false; } else if (instr->isSALU()) { num_scalar++; } else if (instr->isVALU() || instr->isVINTRP()) { if (instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { /* writelane ignores exec, writing inactive lanes results in UB. */ return false; } num_vector++; /* VALU which writes SGPRs are always executed on GFX10+ */ if (ctx.program->gfx_level >= GFX10) { for (Definition& def : instr->definitions) { if (def.regClass().type() == RegType::sgpr) num_scalar++; } } } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) { /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs), * SMEM might be an invalid access, and barriers are probably expensive. */ return false; } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) { // TODO: GFX6-9 can use vskip if (!prefer_remove) return false; } else if (instr->opcode != aco_opcode::p_debug_info) { assert(false && "Pseudo instructions should be lowered by this point."); return false; } if (!prefer_remove) { /* Under these conditions, we shouldn't remove the branch. * Don't care about the estimated cycles when the shader prefers flattening. */ unsigned est_cycles; if (ctx.program->gfx_level >= GFX10) est_cycles = num_scalar * 2 + num_vector; else est_cycles = num_scalar * 4 + num_vector * 4; if (est_cycles > 16) return false; } } } return true; } void lower_branch_instruction(branch_ctx& ctx, Block& block) { if (block.instructions.empty() || !block.instructions.back()->isBranch()) return; aco_ptr branch = std::move(block.instructions.back()); const uint32_t target = branch->branch().target[0]; block.instructions.pop_back(); if (can_remove_branch(ctx, block, &branch->branch())) { if (branch->opcode != aco_opcode::p_branch) remove_linear_successor(ctx, block, target); return; } /* emit branch instruction */ Builder bld(ctx.program, &block.instructions); switch (branch->opcode) { case aco_opcode::p_branch: assert(block.linear_succs[0] == target); bld.sopp(aco_opcode::s_branch, target); break; case aco_opcode::p_cbranch_nz: assert(block.linear_succs[1] == target); if (branch->operands[0].physReg() == exec) bld.sopp(aco_opcode::s_cbranch_execnz, target); else if (branch->operands[0].physReg() == vcc) bld.sopp(aco_opcode::s_cbranch_vccnz, target); else { assert(branch->operands[0].physReg() == scc); bld.sopp(aco_opcode::s_cbranch_scc1, target); } break; case aco_opcode::p_cbranch_z: assert(block.linear_succs[1] == target); if (branch->operands[0].physReg() == exec) bld.sopp(aco_opcode::s_cbranch_execz, target); else if (branch->operands[0].physReg() == vcc) bld.sopp(aco_opcode::s_cbranch_vccz, target); else { assert(branch->operands[0].physReg() == scc); bld.sopp(aco_opcode::s_cbranch_scc0, target); } break; default: unreachable("Unknown Pseudo branch instruction!"); } } void try_stitch_linear_block(branch_ctx& ctx, Block& block) { /* Don't stitch blocks that are part of the logical CFG. */ if (block.linear_preds.empty() || block.linear_succs.empty() || !block.logical_preds.empty()) return; /* Try to stitch this block with the predecessor: * This block must have exactly one predecessor and * the predecessor must have exactly one successor. */ Block& pred = ctx.program->blocks[block.linear_preds[0]]; if (block.linear_preds.size() == 1 && pred.linear_succs.size() == 1 && (pred.instructions.empty() || !pred.instructions.back()->isSOPP())) { /* Insert the instructions at the end of the predecessor and fixup edges. */ pred.instructions.insert(pred.instructions.end(), std::move_iterator(block.instructions.begin()), std::move_iterator(block.instructions.end())); for (unsigned succ_idx : block.linear_succs) { Block& s = ctx.program->blocks[succ_idx]; std::replace(s.linear_preds.begin(), s.linear_preds.end(), block.index, pred.index); } pred.linear_succs = std::move(block.linear_succs); block.instructions.clear(); block.linear_preds.clear(); block.linear_succs.clear(); return; } /* Try to stitch this block with the successor: * This block must have exactly one successor and * the successor must have exactly one predecessor. */ Block& succ = ctx.program->blocks[block.linear_succs[0]]; if (block.linear_succs.size() == 1 && succ.linear_preds.size() == 1 && (block.instructions.empty() || !block.instructions.back()->isSOPP())) { /* Insert the instructions at the beginning of the successor. */ succ.instructions.insert(succ.instructions.begin(), std::move_iterator(block.instructions.begin()), std::move_iterator(block.instructions.end())); for (unsigned pred_idx : block.linear_preds) { Block& p = ctx.program->blocks[pred_idx]; if (!p.instructions.empty() && instr_info.classes[(int)p.instructions.back()->opcode] == instr_class::branch && p.instructions.back()->salu().imm == block.index) { p.instructions.back()->salu().imm = succ.index; } std::replace(p.linear_succs.begin(), p.linear_succs.end(), block.index, succ.index); } succ.linear_preds = std::move(block.linear_preds); block.instructions.clear(); block.linear_preds.clear(); block.linear_succs.clear(); } } } /* end namespace */ void lower_branches(Program* program) { branch_ctx ctx(program); for (int i = program->blocks.size() - 1; i >= 0; i--) { Block& block = program->blocks[i]; lower_branch_instruction(ctx, block); eliminate_useless_exec_writes_in_block(ctx, block); if (block.kind & block_kind_break) try_merge_break_with_continue(ctx, block); if (block.linear_succs.size() == 1) try_remove_simple_block(ctx, block); } for (Block& block : program->blocks) { try_stitch_linear_block(ctx, block); } } } // namespace aco