//===- subzero/src/IceVariableSplitting.cpp - Local variable splitting ----===//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Aggressive block-local variable splitting to improve linear-scan
/// register allocation.
///
//===----------------------------------------------------------------------===//

#include "IceVariableSplitting.h"

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceInst.h"
#include "IceOperand.h"
#include "IceTargetLowering.h"

namespace Ice {

namespace {

/// A Variable is "allocable" if it is a register allocation candidate but
/// doesn't already have a register.
bool isAllocable(const Variable *Var) {
  if (Var == nullptr)
    return false;
  return !Var->hasReg() && Var->mayHaveReg();
}

/// A Variable is "inf" if it already has a register or is infinite-weight.
bool isInf(const Variable *Var) {
  if (Var == nullptr)
    return false;
  return Var->hasReg() || Var->mustHaveReg();
}

/// VariableMap is a simple helper class that keeps track of the latest split
/// version of the original Variables, as well as the instruction containing the
/// last use of the Variable within the current block.  For each entry, the
/// Variable is tagged with the CfgNode that it is valid in, so that we don't
/// need to clear the entire Map[] vector for each block.
class VariableMap {
private:
  VariableMap() = delete;
  VariableMap(const VariableMap &) = delete;
  VariableMap &operator=(const VariableMap &) = delete;

  struct VarInfo {
    /// MappedVar is the latest mapped/split version of the Variable.
    Variable *MappedVar = nullptr;
    /// MappedVarNode is the block in which MappedVar is valid.
    const CfgNode *MappedVarNode = nullptr;
    /// LastUseInst is the last instruction in the block that uses the Variable
    /// as a source operand.
    const Inst *LastUseInst = nullptr;
    /// LastUseNode is the block in which LastUseInst is valid.
    const CfgNode *LastUseNode = nullptr;
    VarInfo() = default;

  private:
    VarInfo(const VarInfo &) = delete;
    VarInfo &operator=(const VarInfo &) = delete;
  };

public:
  explicit VariableMap(Cfg *Func)
      : Func(Func), NumVars(Func->getNumVariables()), Map(NumVars) {}
  /// Reset the mappings at the start of a block.
  void reset(const CfgNode *CurNode) {
    Node = CurNode;
    // Do a prepass through all the instructions, marking which instruction is
    // the last use of each Variable within the block.
    for (const Inst &Instr : Node->getInsts()) {
      if (Instr.isDeleted())
        continue;
      for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
        if (auto *SrcVar = llvm::dyn_cast<Variable>(Instr.getSrc(i))) {
          const SizeT VarNum = getVarNum(SrcVar);
          Map[VarNum].LastUseInst = &Instr;
          Map[VarNum].LastUseNode = Node;
        }
      }
    }
  }
  /// Get Var's current mapping (or Var itself if it has no mapping yet).
  Variable *get(Variable *Var) const {
    const SizeT VarNum = getVarNum(Var);
    Variable *MappedVar = Map[VarNum].MappedVar;
    if (MappedVar == nullptr)
      return Var;
    if (Map[VarNum].MappedVarNode != Node)
      return Var;
    return MappedVar;
  }
  /// Create a new linked Variable in the LinkedTo chain, and set it as Var's
  /// latest mapping.
  Variable *makeLinked(Variable *Var) {
    Variable *NewVar = Func->makeVariable(Var->getType());
    NewVar->setRegClass(Var->getRegClass());
    NewVar->setLinkedTo(get(Var));
    const SizeT VarNum = getVarNum(Var);
    Map[VarNum].MappedVar = NewVar;
    Map[VarNum].MappedVarNode = Node;
    return NewVar;
  }
  /// Given Var that is LinkedTo some other variable, re-splice it into the
  /// LinkedTo chain so that the chain is ordered by Variable::getIndex().
  void spliceBlockLocalLinkedToChain(Variable *Var) {
    Variable *LinkedTo = Var->getLinkedTo();
    assert(LinkedTo != nullptr);
    assert(Var->getIndex() > LinkedTo->getIndex());
    const SizeT VarNum = getVarNum(LinkedTo);
    Variable *Link = Map[VarNum].MappedVar;
    if (Link == nullptr || Map[VarNum].MappedVarNode != Node)
      return;
    Variable *LinkParent = Link->getLinkedTo();
    while (LinkParent != nullptr && LinkParent->getIndex() >= Var->getIndex()) {
      Link = LinkParent;
      LinkParent = Link->getLinkedTo();
    }
    Var->setLinkedTo(LinkParent);
    Link->setLinkedTo(Var);
  }
  /// Return whether the given Variable has any uses as a source operand within
  /// the current block.  If it has no source operand uses, but is assigned as a
  /// dest variable in some instruction in the block, then we needn't bother
  /// splitting it.
  bool isDestUsedInBlock(const Variable *Dest) const {
    return Map[getVarNum(Dest)].LastUseNode == Node;
  }
  /// Return whether the given instruction is the last use of the given Variable
  /// within the current block.  If it is, then we needn't bother splitting the
  /// Variable at this instruction.
  bool isInstLastUseOfVar(const Variable *Var, const Inst *Instr) {
    return Map[getVarNum(Var)].LastUseInst == Instr;
  }

private:
  Cfg *const Func;
  // NumVars is for the size of the Map array.  It can be const because any new
  // Variables created during the splitting pass don't need to be mapped.
  const SizeT NumVars;
  CfgVector<VarInfo> Map;
  const CfgNode *Node = nullptr;
  /// Get Var's VarNum, and do some validation.
  SizeT getVarNum(const Variable *Var) const {
    const SizeT VarNum = Var->getIndex();
    assert(VarNum < NumVars);
    return VarNum;
  }
};

/// LocalVariableSplitter tracks the necessary splitting state across
/// instructions.
class LocalVariableSplitter {
  LocalVariableSplitter() = delete;
  LocalVariableSplitter(const LocalVariableSplitter &) = delete;
  LocalVariableSplitter &operator=(const LocalVariableSplitter &) = delete;

public:
  explicit LocalVariableSplitter(Cfg *Func)
      : Target(Func->getTarget()), VarMap(Func) {}
  /// setNode() is called before processing the instructions of a block.
  void setNode(CfgNode *CurNode) {
    Node = CurNode;
    VarMap.reset(Node);
    LinkedToFixups.clear();
  }
  /// finalizeNode() is called after all instructions in the block are
  /// processed.
  void finalizeNode() {
    // Splice in any preexisting LinkedTo links into the single chain.  These
    // are the ones that were recorded during setInst().
    for (Variable *Var : LinkedToFixups) {
      VarMap.spliceBlockLocalLinkedToChain(Var);
    }
  }
  /// setInst() is called before processing the next instruction.  The iterators
  /// are the insertion points for a new instructions, depending on whether the
  /// new instruction should be inserted before or after the current
  /// instruction.
  void setInst(Inst *CurInst, InstList::iterator Cur, InstList::iterator Next) {
    Instr = CurInst;
    Dest = Instr->getDest();
    IterCur = Cur;
    IterNext = Next;
    ShouldSkipRemainingInstructions = false;
    // Note any preexisting LinkedTo relationships that were created during
    // target lowering.  Record them in LinkedToFixups which is then processed
    // in finalizeNode().
    if (Dest != nullptr && Dest->getLinkedTo() != nullptr) {
      LinkedToFixups.emplace_back(Dest);
    }
  }
  bool shouldSkipRemainingInstructions() const {
    return ShouldSkipRemainingInstructions;
  }
  bool isUnconditionallyExecuted() const { return WaitingForLabel == nullptr; }

  /// Note: the handle*() functions return true to indicate that the instruction
  /// has now been handled and that the instruction loop should continue to the
  /// next instruction in the block (and return false otherwise).  In addition,
  /// they set the ShouldSkipRemainingInstructions flag to indicate that no more
  /// instructions in the block should be processed.

  /// Handle an "unwanted" instruction by returning true;
  bool handleUnwantedInstruction() {
    // We can limit the splitting to an arbitrary subset of the instructions,
    // and still expect correct code.  As such, we can do instruction-subset
    // bisection to help debug any problems in this pass.
    static constexpr char AnInstructionHasNoName[] = "";
    if (!BuildDefs::minimal() &&
        !getFlags().matchSplitInsts(AnInstructionHasNoName,
                                    Instr->getNumber())) {
      return true;
    }
    if (!llvm::isa<InstTarget>(Instr)) {
      // Ignore non-lowered instructions like FakeDef/FakeUse.
      return true;
    }
    return false;
  }

  /// Process a potential label instruction.
  bool handleLabel() {
    if (!Instr->isLabel())
      return false;
    // A Label instruction shouldn't have any operands, so it can be handled
    // right here and then move on.
    assert(Dest == nullptr);
    assert(Instr->getSrcSize() == 0);
    if (Instr == WaitingForLabel) {
      // If we found the forward-branch-target Label instruction we're waiting
      // for, then clear the WaitingForLabel state.
      WaitingForLabel = nullptr;
    } else if (WaitingForLabel == nullptr && WaitingForBranchTo == nullptr) {
      // If we found a new Label instruction while the WaitingFor* state is
      // clear, then set things up for this being a backward branch target.
      WaitingForBranchTo = Instr;
    } else {
      // We see something we don't understand, so skip to the next block.
      ShouldSkipRemainingInstructions = true;
    }
    return true;
  }

  /// Process a potential intra-block branch instruction.
  bool handleIntraBlockBranch() {
    const Inst *Label = Instr->getIntraBlockBranchTarget();
    if (Label == nullptr)
      return false;
    // An intra-block branch instruction shouldn't have any operands, so it can
    // be handled right here and then move on.
    assert(Dest == nullptr);
    assert(Instr->getSrcSize() == 0);
    if (WaitingForBranchTo == Label && WaitingForLabel == nullptr) {
      WaitingForBranchTo = nullptr;
    } else if (WaitingForBranchTo == nullptr &&
               (WaitingForLabel == nullptr || WaitingForLabel == Label)) {
      WaitingForLabel = Label;
    } else {
      // We see something we don't understand, so skip to the next block.
      ShouldSkipRemainingInstructions = true;
    }
    return true;
  }

  /// Specially process a potential "Variable=Variable" assignment instruction,
  /// when it conforms to certain patterns.
  bool handleSimpleVarAssign() {
    if (!Instr->isVarAssign())
      return false;
    const bool DestIsInf = isInf(Dest);
    const bool DestIsAllocable = isAllocable(Dest);
    auto *SrcVar = llvm::cast<Variable>(Instr->getSrc(0));
    const bool SrcIsInf = isInf(SrcVar);
    const bool SrcIsAllocable = isAllocable(SrcVar);
    if (DestIsInf && SrcIsInf) {
      // The instruction:
      //   t:inf = u:inf
      // No transformation is needed.
      return true;
    }
    if (DestIsInf && SrcIsAllocable && Dest->getType() == SrcVar->getType()) {
      // The instruction:
      //   t:inf = v
      // gets transformed to:
      //   t:inf = v1
      //   v2 = t:inf
      // where:
      //   v1 := map[v]
      //   v2 := linkTo(v)
      //   map[v] := v2
      //
      // If both v2 and its linkedToStackRoot get a stack slot, then "v2=t:inf"
      // is recognized as a redundant assignment and elided.
      //
      // Note that if the dest and src types are different, then this is
      // actually a truncation operation, which would make "v2=t:inf" an invalid
      // instruction.  In this case, the type test will make it fall through to
      // the general case below.
      Variable *OldMapped = VarMap.get(SrcVar);
      Instr->replaceSource(0, OldMapped);
      if (isUnconditionallyExecuted()) {
        // Only create new mapping state if the instruction is unconditionally
        // executed.
        if (!VarMap.isInstLastUseOfVar(SrcVar, Instr)) {
          Variable *NewMapped = VarMap.makeLinked(SrcVar);
          Inst *Mov = Target->createLoweredMove(NewMapped, Dest);
          Node->getInsts().insert(IterNext, Mov);
        }
      }
      return true;
    }
    if (DestIsAllocable && SrcIsInf) {
      if (!VarMap.isDestUsedInBlock(Dest)) {
        return true;
      }
      // The instruction:
      //   v = t:inf
      // gets transformed to:
      //   v = t:inf
      //   v2 = t:inf
      // where:
      //   v2 := linkTo(v)
      //   map[v] := v2
      //
      // If both v2 and v get a stack slot, then "v2=t:inf" is recognized as a
      // redundant assignment and elided.
      if (isUnconditionallyExecuted()) {
        // Only create new mapping state if the instruction is unconditionally
        // executed.
        Variable *NewMapped = VarMap.makeLinked(Dest);
        Inst *Mov = Target->createLoweredMove(NewMapped, SrcVar);
        Node->getInsts().insert(IterNext, Mov);
      } else {
        // For a conditionally executed instruction, add a redefinition of the
        // original Dest mapping, without creating a new linked variable.
        Variable *OldMapped = VarMap.get(Dest);
        Inst *Mov = Target->createLoweredMove(OldMapped, SrcVar);
        Mov->setDestRedefined();
        Node->getInsts().insert(IterNext, Mov);
      }
      return true;
    }
    assert(!ShouldSkipRemainingInstructions);
    return false;
  }

  /// Process the dest Variable of a Phi instruction.
  bool handlePhi() {
    assert(llvm::isa<InstPhi>(Instr));
    const bool DestIsAllocable = isAllocable(Dest);
    if (!DestIsAllocable)
      return true;
    if (!VarMap.isDestUsedInBlock(Dest))
      return true;
    Variable *NewMapped = VarMap.makeLinked(Dest);
    Inst *Mov = Target->createLoweredMove(NewMapped, Dest);
    Node->getInsts().insert(IterCur, Mov);
    return true;
  }

  /// Process an arbitrary instruction.
  bool handleGeneralInst() {
    const bool DestIsAllocable = isAllocable(Dest);
    // The (non-variable-assignment) instruction:
    //   ... = F(v)
    // where v is not infinite-weight, gets transformed to:
    //   v2 = v1
    //   ... = F(v1)
    // where:
    //   v1 := map[v]
    //   v2 := linkTo(v)
    //   map[v] := v2
    // After that, if the "..." dest=u is not infinite-weight, append:
    //   u2 = u
    // where:
    //   u2 := linkTo(u)
    //   map[u] := u2
    for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
      // Iterate over the top-level src vars.  Don't bother to dig into
      // e.g. MemOperands because their vars should all be infinite-weight.
      // (This assumption would need to change if the pass were done
      // pre-lowering.)
      if (auto *SrcVar = llvm::dyn_cast<Variable>(Instr->getSrc(i))) {
        const bool SrcIsAllocable = isAllocable(SrcVar);
        if (SrcIsAllocable) {
          Variable *OldMapped = VarMap.get(SrcVar);
          if (isUnconditionallyExecuted()) {
            if (!VarMap.isInstLastUseOfVar(SrcVar, Instr)) {
              Variable *NewMapped = VarMap.makeLinked(SrcVar);
              Inst *Mov = Target->createLoweredMove(NewMapped, OldMapped);
              Node->getInsts().insert(IterCur, Mov);
            }
          }
          Instr->replaceSource(i, OldMapped);
        }
      }
    }
    // Transformation of Dest is the same as the "v=t:inf" case above.
    if (DestIsAllocable && VarMap.isDestUsedInBlock(Dest)) {
      if (isUnconditionallyExecuted()) {
        Variable *NewMapped = VarMap.makeLinked(Dest);
        Inst *Mov = Target->createLoweredMove(NewMapped, Dest);
        Node->getInsts().insert(IterNext, Mov);
      } else {
        Variable *OldMapped = VarMap.get(Dest);
        Inst *Mov = Target->createLoweredMove(OldMapped, Dest);
        Mov->setDestRedefined();
        Node->getInsts().insert(IterNext, Mov);
      }
    }
    return true;
  }

private:
  TargetLowering *Target;
  CfgNode *Node = nullptr;
  Inst *Instr = nullptr;
  Variable *Dest = nullptr;
  InstList::iterator IterCur;
  InstList::iterator IterNext;
  bool ShouldSkipRemainingInstructions = false;
  VariableMap VarMap;
  CfgVector<Variable *> LinkedToFixups;
  /// WaitingForLabel and WaitingForBranchTo are for tracking intra-block
  /// control flow.
  const Inst *WaitingForLabel = nullptr;
  const Inst *WaitingForBranchTo = nullptr;
};

} // end of anonymous namespace

/// Within each basic block, rewrite Variable references in terms of chained
/// copies of the original Variable.  For example:
///   A = B + C
/// might be rewritten as:
///   B1 = B
///   C1 = C
///   A = B + C
///   A1 = A
/// and then:
///   D = A + B
/// might be rewritten as:
///   A2 = A1
///   B2 = B1
///   D = A1 + B1
///   D1 = D
///
/// The purpose is to present the linear-scan register allocator with smaller
/// live ranges, to help mitigate its "all or nothing" allocation strategy,
/// while counting on its preference mechanism to keep the split versions in the
/// same register when possible.
///
/// When creating new Variables, A2 is linked to A1 which is linked to A, and
/// similar for the other Variable linked-to chains.  Rewrites apply only to
/// Variables where mayHaveReg() is true.
///
/// At code emission time, redundant linked-to stack assignments will be
/// recognized and elided.  To illustrate using the above example, if A1 gets a
/// register but A and A2 are on the stack, the "A2=A1" store instruction is
/// redundant since A and A2 share the same stack slot and A1 originated from A.
///
/// Simple assignment instructions are rewritten slightly differently, to take
/// maximal advantage of Variables known to have registers.
///
/// In general, there may be several valid ways to rewrite an instruction: add
/// the new assignment instruction either before or after the original
/// instruction, and rewrite the original instruction with either the old or the
/// new variable mapping.  We try to pick a strategy most likely to avoid
/// potential performance problems.  For example, try to avoid storing to the
/// stack and then immediately reloading from the same location.  One
/// consequence is that code might be generated that loads a register from a
/// stack location, followed almost immediately by another use of the same stack
/// location, despite its value already being available in a register as a
/// result of the first instruction.  However, the performance impact here is
/// likely to be negligible, and a simple availability peephole optimization
/// could clean it up.
///
/// This pass potentially adds a lot of new instructions and variables, and as
/// such there are compile-time performance concerns, particularly with liveness
/// analysis and register allocation.  Note that for liveness analysis, the new
/// variables have single-block liveness, so they don't increase the size of the
/// liveness bit vectors that need to be merged across blocks.  As a result, the
/// performance impact is likely to be linearly related to the number of new
/// instructions, rather than number of new variables times number of blocks
/// which would be the case if they were multi-block variables.
void splitBlockLocalVariables(Cfg *Func) {
  if (!getFlags().getSplitLocalVars())
    return;
  TimerMarker _(TimerStack::TT_splitLocalVars, Func);
  LocalVariableSplitter Splitter(Func);
  // TODO(stichnot): Fix this mechanism for LinkedTo variables and stack slot
  // assignment.
  //
  // To work around shortcomings with stack frame mapping, we want to arrange
  // LinkedTo structure such that within one block, the LinkedTo structure
  // leading to a root forms a list, not a tree.  A LinkedTo root can have
  // multiple children linking to it, but only one per block.  Furthermore,
  // because stack slot mapping processes variables in numerical order, the
  // LinkedTo chain needs to be ordered such that when A->getLinkedTo() == B,
  // then A->getIndex() > B->getIndex().
  //
  // To effect this, while processing a block we keep track of preexisting
  // LinkedTo relationships via the LinkedToFixups vector, and at the end of the
  // block we splice them in such that the block has a single chain for each
  // root, ordered by getIndex() value.
  CfgVector<Variable *> LinkedToFixups;
  for (CfgNode *Node : Func->getNodes()) {
    // Clear the VarMap and LinkedToFixups at the start of every block.
    LinkedToFixups.clear();
    Splitter.setNode(Node);
    auto &Insts = Node->getInsts();
    auto Iter = Insts.begin();
    auto IterEnd = Insts.end();
    // TODO(stichnot): Figure out why Phi processing usually degrades
    // performance.  Disable for now.
    static constexpr bool ProcessPhis = false;
    if (ProcessPhis) {
      for (Inst &Instr : Node->getPhis()) {
        if (Instr.isDeleted())
          continue;
        Splitter.setInst(&Instr, Iter, Iter);
        Splitter.handlePhi();
      }
    }
    InstList::iterator NextIter;
    for (; Iter != IterEnd && !Splitter.shouldSkipRemainingInstructions();
         Iter = NextIter) {
      NextIter = Iter;
      ++NextIter;
      Inst *Instr = iteratorToInst(Iter);
      if (Instr->isDeleted())
        continue;
      Splitter.setInst(Instr, Iter, NextIter);

      // Before doing any transformations, take care of the bookkeeping for
      // intra-block branching.
      //
      // This is tricky because the transformation for one instruction may
      // depend on a transformation for a previous instruction, but if that
      // previous instruction is not dynamically executed due to intra-block
      // control flow, it may lead to an inconsistent state and incorrect code.
      //
      // We want to handle some simple cases, and reject some others:
      //
      // 1. For something like a select instruction, we could have:
      //   test cond
      //   dest = src_false
      //   branch conditionally to label
      //   dest = src_true
      //   label:
      //
      // Between the conditional branch and the label, we need to treat dest and
      // src variables specially, specifically not creating any new state.
      //
      // 2. Some 64-bit atomic instructions may be lowered to a loop:
      //   label:
      //   ...
      //   branch conditionally to label
      //
      // No special treatment is needed, but it's worth tracking so that case #1
      // above can also be handled.
      //
      // 3. Advanced switch lowering can create really complex intra-block
      // control flow, so when we recognize this, we should just stop splitting
      // for the remainder of the block (which isn't much since a switch
      // instruction is a terminator).
      //
      // 4. Other complex lowering, e.g. an i64 icmp on a 32-bit architecture,
      // can result in an if/then/else like structure with two labels.  One
      // possibility would be to suspect splitting for the remainder of the
      // lowered instruction, and then resume for the remainder of the block,
      // but since we don't have high-level instruction markers, we might as
      // well just stop splitting for the remainder of the block.
      if (Splitter.handleLabel())
        continue;
      if (Splitter.handleIntraBlockBranch())
        continue;
      if (Splitter.handleUnwantedInstruction())
        continue;

      // Intra-block bookkeeping is complete, now do the transformations.

      // Determine the transformation based on the kind of instruction, and
      // whether its Variables are infinite-weight.  New instructions can be
      // inserted before the current instruction via Iter, or after the current
      // instruction via NextIter.
      if (Splitter.handleSimpleVarAssign())
        continue;
      if (Splitter.handleGeneralInst())
        continue;
    }
    Splitter.finalizeNode();
  }

  Func->dump("After splitting local variables");
}

} // end of namespace Ice