/* * Copyright © 2019 Valve Corporation * * SPDX-License-Identifier: MIT */ #include "aco_builder.h" #include "aco_ir.h" #include #include #include #include /* * Implements an algorithm to lower to Conventional SSA Form (CSSA). * After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency" * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon, * * By lowering the IR to CSSA, the insertion of parallelcopies is separated from * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling. * The algorithm coalesces non-interfering phi-resources while taking value-equality * into account. Re-indexes the SSA-defs. */ namespace aco { namespace { typedef std::vector merge_set; struct copy { Definition def; Operand op; }; struct merge_node { Operand value = Operand(); /* original value: can be an SSA-def or constant value */ uint32_t index = -1u; /* index into the vector of merge sets */ uint32_t defined_at = -1u; /* defining block */ /* We also remember two closest equal intersecting ancestors. Because they intersect with this * merge node, they must dominate it (intersection isn't possible otherwise) and have the same * value (or else they would not be allowed to be in the same merge set). */ Temp equal_anc_in = Temp(); /* within the same merge set */ Temp equal_anc_out = Temp(); /* from the other set we're currently trying to merge with */ }; struct cssa_ctx { Program* program; std::vector> parallelcopies; /* copies per block */ std::vector merge_sets; /* each vector is one (ordered) merge set */ std::unordered_map merge_node_table; /* tempid -> merge node */ }; /* create (virtual) parallelcopies for each phi instruction and * already merge copy-definitions with phi-defs into merge sets */ void collect_parallelcopies(cssa_ctx& ctx) { ctx.parallelcopies.resize(ctx.program->blocks.size()); Builder bld(ctx.program); for (Block& block : ctx.program->blocks) { for (aco_ptr& phi : block.instructions) { if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; const Definition& def = phi->definitions[0]; /* if the definition is not temp, it is the exec mask. * We can reload the exec mask directly from the spill slot. */ if (!def.isTemp() || def.isKill()) continue; Block::edge_vec& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; uint32_t index = ctx.merge_sets.size(); merge_set set; bool has_preheader_copy = false; for (unsigned i = 0; i < phi->operands.size(); i++) { Operand op = phi->operands[i]; if (op.isUndefined()) continue; if (def.regClass().type() == RegType::sgpr && !op.isTemp()) { /* SGPR inline constants and literals on GFX10+ can be spilled * and reloaded directly (without intermediate register) */ if (op.isConstant()) { if (ctx.program->gfx_level >= GFX10) continue; if (op.size() == 1 && !op.isLiteral()) continue; } else { assert(op.isFixed() && op.physReg() == exec); continue; } } /* create new temporary and rename operands */ Temp tmp = bld.tmp(def.regClass()); ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op}); phi->operands[i] = Operand(tmp); phi->operands[i].setKill(true); /* place the new operands in the same merge set */ set.emplace_back(tmp); ctx.merge_node_table[tmp.id()] = {op, index, preds[i]}; has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header; } if (set.empty()) continue; /* place the definition in dominance-order */ if (def.isTemp()) { if (has_preheader_copy) set.emplace(std::next(set.begin()), def.getTemp()); else if (block.kind & block_kind_loop_header) set.emplace(set.begin(), def.getTemp()); else set.emplace_back(def.getTemp()); ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index}; } ctx.merge_sets.emplace_back(set); } } } /* check whether the definition of a comes after b. */ inline bool defined_after(cssa_ctx& ctx, Temp a, Temp b) { merge_node& node_a = ctx.merge_node_table[a.id()]; merge_node& node_b = ctx.merge_node_table[b.id()]; if (node_a.defined_at == node_b.defined_at) return a.id() > b.id(); return node_a.defined_at > node_b.defined_at; } /* check whether a dominates b where b is defined after a */ inline bool dominates(cssa_ctx& ctx, Temp a, Temp b) { assert(defined_after(ctx, b, a)); Block& parent = ctx.program->blocks[ctx.merge_node_table[a.id()].defined_at]; Block& child = ctx.program->blocks[ctx.merge_node_table[b.id()].defined_at]; if (b.regClass().type() == RegType::vgpr) return dominates_logical(parent, child); else return dominates_linear(parent, child); } /* Checks whether some variable is live-out, not considering any phi-uses. */ inline bool is_live_out(cssa_ctx& ctx, Temp var, uint32_t block_idx) { Block::edge_vec& succs = var.is_linear() ? ctx.program->blocks[block_idx].linear_succs : ctx.program->blocks[block_idx].logical_succs; return std::any_of(succs.begin(), succs.end(), [&](unsigned succ) { return ctx.program->live.live_in[succ].count(var.id()); }); } /* check intersection between var and parent: * We already know that parent dominates var. */ inline bool intersects(cssa_ctx& ctx, Temp var, Temp parent) { merge_node& node_var = ctx.merge_node_table[var.id()]; merge_node& node_parent = ctx.merge_node_table[parent.id()]; assert(node_var.index != node_parent.index); uint32_t block_idx = node_var.defined_at; /* if parent is defined in a different block than var */ if (node_parent.defined_at < node_var.defined_at) { /* if the parent is not live-in, they don't interfere */ if (!ctx.program->live.live_in[block_idx].count(parent.id())) return false; } /* if the parent is live-out at the definition block of var, they intersect */ bool parent_live = is_live_out(ctx, parent, block_idx); if (parent_live) return true; for (const copy& cp : ctx.parallelcopies[block_idx]) { /* if var is defined at the edge, they don't intersect */ if (cp.def.getTemp() == var) return false; if (cp.op.isTemp() && cp.op.getTemp() == parent) parent_live = true; } /* if the parent is live at the edge, they intersect */ if (parent_live) return true; /* both, parent and var, are present in the same block */ const Block& block = ctx.program->blocks[block_idx]; for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) { /* if the parent was not encountered yet, it can only be used by a phi */ if (is_phi(it->get())) break; for (const Definition& def : (*it)->definitions) { if (!def.isTemp()) continue; /* if parent was not found yet, they don't intersect */ if (def.getTemp() == var) return false; } for (const Operand& op : (*it)->operands) { if (!op.isTemp()) continue; /* if the var was defined before this point, they intersect */ if (op.getTemp() == parent) return true; } } return false; } /* check interference between var and parent: * i.e. they have different values and intersect. * If parent and var intersect and share the same value, also updates the equal ancestor. */ inline bool interference(cssa_ctx& ctx, Temp var, Temp parent) { assert(var != parent); merge_node& node_var = ctx.merge_node_table[var.id()]; node_var.equal_anc_out = Temp(); if (node_var.index == ctx.merge_node_table[parent.id()].index) { /* Check/update in other set. equal_anc_out is only present if it intersects with 'parent', * but that's fine since it has to for it to intersect with 'var'. */ parent = ctx.merge_node_table[parent.id()].equal_anc_out; } Temp tmp = parent; /* Check if 'var' intersects with 'parent' or any ancestors which might intersect too. */ while (tmp != Temp() && !intersects(ctx, var, tmp)) { merge_node& node_tmp = ctx.merge_node_table[tmp.id()]; tmp = node_tmp.equal_anc_in; } /* no intersection found */ if (tmp == Temp()) return false; /* var and parent, same value and intersect, but in different sets */ if (node_var.value == ctx.merge_node_table[parent.id()].value) { node_var.equal_anc_out = tmp; return false; } /* var and parent, different values and intersect */ return true; } /* tries to merge set_b into set_a of given temporary and * drops that temporary as it is being coalesced */ bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b) { auto def_node_it = ctx.merge_node_table.find(dst.id()); uint32_t index = def_node_it->second.index; merge_set& set_a = ctx.merge_sets[index]; std::vector dom; /* stack of the traversal */ merge_set union_set; /* the new merged merge-set */ uint32_t i_a = 0; uint32_t i_b = 0; while (i_a < set_a.size() || i_b < set_b.size()) { Temp current; if (i_a == set_a.size()) current = set_b[i_b++]; else if (i_b == set_b.size()) current = set_a[i_a++]; /* else pick the one defined first */ else if (defined_after(ctx, set_a[i_a], set_b[i_b])) current = set_b[i_b++]; else current = set_a[i_a++]; while (!dom.empty() && !dominates(ctx, dom.back(), current)) dom.pop_back(); /* not the desired parent, remove */ if (!dom.empty() && interference(ctx, current, dom.back())) { for (Temp t : union_set) ctx.merge_node_table[t.id()].equal_anc_out = Temp(); return false; /* intersection detected */ } dom.emplace_back(current); /* otherwise, keep checking */ if (current != dst) union_set.emplace_back(current); /* maintain the new merge-set sorted */ } /* update hashmap */ for (Temp t : union_set) { merge_node& node = ctx.merge_node_table[t.id()]; /* update the equal ancestors: * i.e. the 'closest' dominating def which intersects */ Temp in = node.equal_anc_in; Temp out = node.equal_anc_out; if (in == Temp() || (out != Temp() && defined_after(ctx, out, in))) node.equal_anc_in = out; node.equal_anc_out = Temp(); /* update merge-set index */ node.index = index; } set_b = merge_set(); /* free the old set_b */ ctx.merge_sets[index] = union_set; ctx.merge_node_table.erase(dst.id()); /* remove the temporary */ return true; } /* returns true if the copy can safely be omitted */ bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx) { /* we can only coalesce temporaries */ if (!copy.op.isTemp() || !copy.op.isKill()) return false; /* we can only coalesce copies of the same register class */ if (copy.op.regClass() != copy.def.regClass()) return false; /* try emplace a merge_node for the copy operand */ merge_node& op_node = ctx.merge_node_table[copy.op.tempId()]; if (op_node.defined_at == -1u) { /* find defining block of operand */ while (ctx.program->live.live_in[block_idx].count(copy.op.tempId())) block_idx = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[block_idx].logical_idom : ctx.program->blocks[block_idx].linear_idom; op_node.defined_at = block_idx; op_node.value = copy.op; } /* check if this operand has not yet been coalesced */ if (op_node.index == -1u) { merge_set op_set = merge_set{copy.op.getTemp()}; return try_merge_merge_set(ctx, copy.def.getTemp(), op_set); } /* check if this operand has been coalesced into the same set */ assert(ctx.merge_node_table.count(copy.def.tempId())); if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index) return true; /* otherwise, try to coalesce both merge sets */ return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]); } /* node in the location-transfer-graph */ struct ltg_node { copy* cp; uint32_t read_idx; uint32_t num_uses = 0; }; /* emit the copies in an order that does not * create interferences within a merge-set */ void emit_copies_block(Builder& bld, std::map& ltg, RegType type) { RegisterDemand live_changes; RegisterDemand reg_demand = bld.it->get()->register_demand - get_temp_registers(bld.it->get()) - get_live_changes(bld.it->get()); auto&& it = ltg.begin(); while (it != ltg.end()) { copy& cp = *it->second.cp; /* wrong regclass or still needed as operand */ if (cp.def.regClass().type() != type || it->second.num_uses > 0) { ++it; continue; } /* update the location transfer graph */ if (it->second.read_idx != -1u) { auto&& other = ltg.find(it->second.read_idx); if (other != ltg.end()) other->second.num_uses--; } ltg.erase(it); /* Remove the kill flag if we still need this operand for other copies. */ if (cp.op.isKill() && std::any_of(ltg.begin(), ltg.end(), [&](auto& other) { return other.second.cp->op == cp.op; })) cp.op.setKill(false); /* emit the copy */ Instruction* instr = bld.copy(cp.def, cp.op); live_changes += get_live_changes(instr); RegisterDemand temps = get_temp_registers(instr); instr->register_demand = reg_demand + live_changes + temps; it = ltg.begin(); } /* count the number of remaining circular dependencies */ unsigned num = std::count_if( ltg.begin(), ltg.end(), [&](auto& n) { return n.second.cp->def.regClass().type() == type; }); /* if there are circular dependencies, we just emit them as single parallelcopy */ if (num) { // TODO: this should be restricted to a feasible number of registers // and otherwise use a temporary to avoid having to reload more (spilled) // variables than we have registers. aco_ptr copy{ create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)}; it = ltg.begin(); for (unsigned i = 0; i < num; i++) { while (it->second.cp->def.regClass().type() != type) ++it; copy->definitions[i] = it->second.cp->def; copy->operands[i] = it->second.cp->op; it = ltg.erase(it); } live_changes += get_live_changes(copy.get()); RegisterDemand temps = get_temp_registers(copy.get()); copy->register_demand = reg_demand + live_changes + temps; bld.insert(std::move(copy)); } /* Update RegisterDemand after inserted copies */ for (auto instr_it = bld.it; instr_it != bld.instructions->end(); ++instr_it) { instr_it->get()->register_demand += live_changes; } } /* either emits or coalesces all parallelcopies and * renames the phi-operands accordingly. */ void emit_parallelcopies(cssa_ctx& ctx) { std::unordered_map renames; /* we iterate backwards to prioritize coalescing in else-blocks */ for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) { if (ctx.parallelcopies[i].empty()) continue; std::map ltg; bool has_vgpr_copy = false; bool has_sgpr_copy = false; /* first, try to coalesce all parallelcopies */ for (copy& cp : ctx.parallelcopies[i]) { if (try_coalesce_copy(ctx, cp, i)) { assert(cp.op.isTemp() && cp.op.isKill()); /* As this temp will be used as phi operand and becomes live-out, * remove the kill flag from any other copy of this same temp. */ for (copy& other : ctx.parallelcopies[i]) { if (&other != &cp && other.op.isTemp() && other.op.getTemp() == cp.op.getTemp()) other.op.setKill(false); } renames.emplace(cp.def.tempId(), cp.op); } else { uint32_t read_idx = -1u; if (cp.op.isTemp()) { read_idx = ctx.merge_node_table[cp.op.tempId()].index; /* In case the original phi-operand was killed, it might still be live-out * if the logical successor is not the same as linear successors. * Thus, re-check whether the temp is live-out. */ cp.op.setKill(cp.op.isKill() && !is_live_out(ctx, cp.op.getTemp(), i)); cp.op.setFirstKill(cp.op.isKill()); } uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index; assert(write_idx != -1u); ltg[write_idx] = {&cp, read_idx}; bool is_vgpr = cp.def.regClass().type() == RegType::vgpr; has_vgpr_copy |= is_vgpr; has_sgpr_copy |= !is_vgpr; } } /* build location-transfer-graph */ for (auto& pair : ltg) { if (pair.second.read_idx == -1u) continue; auto&& it = ltg.find(pair.second.read_idx); if (it != ltg.end()) it->second.num_uses++; } /* emit parallelcopies ordered */ Builder bld(ctx.program); Block& block = ctx.program->blocks[i]; if (has_vgpr_copy) { /* emit VGPR copies */ auto IsLogicalEnd = [](const aco_ptr& inst) -> bool { return inst->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd); bld.reset(&block.instructions, std::prev(it.base())); emit_copies_block(bld, ltg, RegType::vgpr); } if (has_sgpr_copy) { /* emit SGPR copies */ bld.reset(&block.instructions, std::prev(block.instructions.end())); emit_copies_block(bld, ltg, RegType::sgpr); } } RegisterDemand new_demand; for (Block& block : ctx.program->blocks) { /* Finally, rename coalesced phi operands */ for (aco_ptr& phi : block.instructions) { if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) break; for (Operand& op : phi->operands) { if (!op.isTemp()) continue; auto&& it = renames.find(op.tempId()); if (it != renames.end()) { op = it->second; renames.erase(it); } } } /* Resummarize the block's register demand */ block.register_demand = block.live_in_demand; for (const aco_ptr& instr : block.instructions) block.register_demand.update(instr->register_demand); new_demand.update(block.register_demand); } /* Update max_reg_demand and num_waves */ update_vgpr_sgpr_demand(ctx.program, new_demand); assert(renames.empty()); } } /* end namespace */ void lower_to_cssa(Program* program) { reindex_ssa(program); cssa_ctx ctx = {program}; collect_parallelcopies(ctx); emit_parallelcopies(ctx); /* Validate live variable information */ if (!validate_live_vars(program)) abort(); } } // namespace aco