Target/WebAssembly/Relooper.cpp

//===-- Relooper.cpp - Top-level interface for WebAssembly  ----*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
///
/// \file
/// \brief This implements the Relooper algorithm. This implementation includes
/// optimizations added since the original academic paper [1] was published.
///
/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In
/// Proceedings of the ACM international conference companion on Object
/// oriented programming systems languages and applications companion
/// (SPLASH '11). ACM, New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224
/// http://doi.acm.org/10.1145/2048147.2048224
///
//===-------------------------------------------------------------------===//

#include "Relooper.h"
#include "WebAssembly.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#include <cstring>
#include <cstdlib>
#include <functional>
#include <list>
#include <stack>
#include <string>

#define DEBUG_TYPE "relooper"

using namespace llvm;
using namespace Relooper;

static cl::opt<int> RelooperSplittingFactor(
    "relooper-splitting-factor",
    cl::desc(
        "How much to discount code size when deciding whether to split a node"),
    cl::init(5));

static cl::opt<unsigned> RelooperMultipleSwitchThreshold(
    "relooper-multiple-switch-threshold",
    cl::desc(
        "How many entries to allow in a multiple before we use a switch"),
    cl::init(10));

static cl::opt<unsigned> RelooperNestingLimit(
    "relooper-nesting-limit",
    cl::desc(
        "How much nesting is acceptable"),
    cl::init(20));


namespace {
///
/// Implements the relooper algorithm for a function's blocks.
///
/// Implementation details: The Relooper instance has
/// ownership of the blocks and shapes, and frees them when done.
///
struct RelooperAlgorithm {
  std::deque<Block *> Blocks;
  std::deque<Shape *> Shapes;
  Shape *Root;
  bool MinSize;
  int BlockIdCounter;
  int ShapeIdCounter;

  RelooperAlgorithm();
  ~RelooperAlgorithm();

  void AddBlock(Block *New, int Id = -1);

  // Calculates the shapes
  void Calculate(Block *Entry);

  // Sets us to try to minimize size
  void SetMinSize(bool MinSize_) { MinSize = MinSize_; }
};

struct RelooperAnalysis final : public FunctionPass {
  static char ID;
  RelooperAnalysis() : FunctionPass(ID) {}
  const char *getPassName() const override { return "relooper"; }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
  bool runOnFunction(Function &F) override;
};
}

// RelooperAnalysis

char RelooperAnalysis::ID = 0;
FunctionPass *llvm::createWebAssemblyRelooper() {
  return new RelooperAnalysis();
}

bool RelooperAnalysis::runOnFunction(Function &F) {
  DEBUG(dbgs() << "Relooping function '" << F.getName() << "'\n");
  RelooperAlgorithm R;
  // FIXME: remove duplication between relooper's and LLVM's BBs.
  std::map<const BasicBlock *, Block *> BB2B;
  std::map<const Block *, const BasicBlock *> B2BB;
  for (const BasicBlock &BB : F) {
    // FIXME: getName is wrong here, Code is meant to represent amount of code.
    // FIXME: use BranchVarInit for switch.
    Block *B = new Block(BB.getName().str().data(), /*BranchVarInit=*/nullptr);
    R.AddBlock(B);
    assert(BB2B.find(&BB) == BB2B.end() && "Inserting the same block twice");
    assert(B2BB.find(B) == B2BB.end() && "Inserting the same block twice");
    BB2B[&BB] = B;
    B2BB[B] = &BB;
  }
  for (Block *B : R.Blocks) {
    const BasicBlock *BB = B2BB[B];
    for (const BasicBlock *Successor : successors(BB))
      // FIXME: add branch's Condition and Code below.
      B->AddBranchTo(BB2B[Successor], /*Condition=*/nullptr, /*Code=*/nullptr);
  }
  R.Calculate(BB2B[&F.getEntryBlock()]);
  return false; // Analysis passes don't modify anything.
}

// Helpers

typedef MapVector<Block *, BlockSet> BlockBlockSetMap;
typedef std::list<Block *> BlockList;

template <class T, class U>
static bool contains(const T &container, const U &contained) {
  return container.count(contained);
}


// Branch

Branch::Branch(const char *ConditionInit, const char *CodeInit)
    : Ancestor(nullptr), Labeled(true) {
  // FIXME: move from char* to LLVM data structures
  Condition = ConditionInit ? strdup(ConditionInit) : nullptr;
  Code = CodeInit ? strdup(CodeInit) : nullptr;
}

Branch::~Branch() {
  // FIXME: move from char* to LLVM data structures
  free(static_cast<void *>(const_cast<char *>(Condition)));
  free(static_cast<void *>(const_cast<char *>(Code)));
}

// Block

Block::Block(const char *CodeInit, const char *BranchVarInit)
    : Parent(nullptr), Id(-1), IsCheckedMultipleEntry(false) {
  // FIXME: move from char* to LLVM data structures
  Code = strdup(CodeInit);
  BranchVar = BranchVarInit ? strdup(BranchVarInit) : nullptr;
}

Block::~Block() {
  // FIXME: move from char* to LLVM data structures
  free(static_cast<void *>(const_cast<char *>(Code)));
  free(static_cast<void *>(const_cast<char *>(BranchVar)));
}

void Block::AddBranchTo(Block *Target, const char *Condition,
                        const char *Code) {
  assert(!contains(BranchesOut, Target) &&
         "cannot add more than one branch to the same target");
  BranchesOut[Target] = make_unique<Branch>(Condition, Code);
}

// Relooper

RelooperAlgorithm::RelooperAlgorithm()
    : Root(nullptr), MinSize(false), BlockIdCounter(1),
      ShapeIdCounter(0) { // block ID 0 is reserved for clearings
}

RelooperAlgorithm::~RelooperAlgorithm() {
  for (auto Curr : Blocks)
    delete Curr;
  for (auto Curr : Shapes)
    delete Curr;
}

void RelooperAlgorithm::AddBlock(Block *New, int Id) {
  New->Id = Id == -1 ? BlockIdCounter++ : Id;
  Blocks.push_back(New);
}

struct RelooperRecursor {
  RelooperAlgorithm *Parent;
  RelooperRecursor(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}
};

void RelooperAlgorithm::Calculate(Block *Entry) {
  // Scan and optimize the input
  struct PreOptimizer : public RelooperRecursor {
    PreOptimizer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}
    BlockSet Live;

    void FindLive(Block *Root) {
      BlockList ToInvestigate;
      ToInvestigate.push_back(Root);
      while (!ToInvestigate.empty()) {
        Block *Curr = ToInvestigate.front();
        ToInvestigate.pop_front();
        if (contains(Live, Curr))
          continue;
        Live.insert(Curr);
        for (const auto &iter : Curr->BranchesOut)
          ToInvestigate.push_back(iter.first);
      }
    }

    // If a block has multiple entries but no exits, and it is small enough, it
    // is useful to split it. A common example is a C++ function where
    // everything ends up at a final exit block and does some RAII cleanup.
    // Without splitting, we will be forced to introduce labelled loops to
    // allow reaching the final block
    void SplitDeadEnds() {
      unsigned TotalCodeSize = 0;
      for (const auto &Curr : Live) {
        TotalCodeSize += strlen(Curr->Code);
      }
      BlockSet Splits;
      BlockSet Removed;
      for (const auto &Original : Live) {
        if (Original->BranchesIn.size() <= 1 ||
            !Original->BranchesOut.empty())
          continue; // only dead ends, for now
        if (contains(Original->BranchesOut, Original))
          continue; // cannot split a looping node
        if (strlen(Original->Code) * (Original->BranchesIn.size() - 1) >
            TotalCodeSize / RelooperSplittingFactor)
          continue; // if splitting increases raw code size by a significant
                    // amount, abort
        // Split the node (for simplicity, we replace all the blocks, even
        // though we could have reused the original)
        DEBUG(dbgs() << "  Splitting '" << Original->Code << "'\n");
        for (const auto &Prior : Original->BranchesIn) {
          Block *Split = new Block(Original->Code, Original->BranchVar);
          Parent->AddBlock(Split, Original->Id);
          Split->BranchesIn.insert(Prior);
          std::unique_ptr<Branch> Details;
          Details.swap(Prior->BranchesOut[Original]);
          Prior->BranchesOut[Split] = make_unique<Branch>(Details->Condition,
                                                          Details->Code);
          for (const auto &iter : Original->BranchesOut) {
            Block *Post = iter.first;
            Branch *Details = iter.second.get();
            Split->BranchesOut[Post] = make_unique<Branch>(Details->Condition,
                                                           Details->Code);
            Post->BranchesIn.insert(Split);
          }
          Splits.insert(Split);
          Removed.insert(Original);
        }
        for (const auto &iter : Original->BranchesOut) {
          Block *Post = iter.first;
          Post->BranchesIn.remove(Original);
        }
      }
      for (const auto &iter : Splits)
        Live.insert(iter);
      for (const auto &iter : Removed)
        Live.remove(iter);
    }
  };
  PreOptimizer Pre(this);
  Pre.FindLive(Entry);

  // Add incoming branches from live blocks, ignoring dead code
  for (unsigned i = 0; i < Blocks.size(); i++) {
    Block *Curr = Blocks[i];
    if (!contains(Pre.Live, Curr))
      continue;
    for (const auto &iter : Curr->BranchesOut)
      iter.first->BranchesIn.insert(Curr);
  }

  if (!MinSize)
    Pre.SplitDeadEnds();

  // Recursively process the graph

  struct Analyzer : public RelooperRecursor {
    Analyzer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {}

    // Add a shape to the list of shapes in this Relooper calculation
    void Notice(Shape *New) {
      New->Id = Parent->ShapeIdCounter++;
      Parent->Shapes.push_back(New);
    }

    // Create a list of entries from a block. If LimitTo is provided, only
    // results in that set will appear
    void GetBlocksOut(Block *Source, BlockSet &Entries,
                      BlockSet *LimitTo = nullptr) {
      for (const auto &iter : Source->BranchesOut)
        if (!LimitTo || contains(*LimitTo, iter.first))
          Entries.insert(iter.first);
    }

    // Converts/processes all branchings to a specific target
    void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor,
                   BlockSet &From) {
      DEBUG(dbgs() << "  Solipsize '" << Target->Code << "' type " << Type
                   << "\n");
      for (auto iter = Target->BranchesIn.begin();
           iter != Target->BranchesIn.end();) {
        Block *Prior = *iter;
        if (!contains(From, Prior)) {
          iter++;
          continue;
        }
        std::unique_ptr<Branch> PriorOut;
        PriorOut.swap(Prior->BranchesOut[Target]);
        PriorOut->Ancestor = Ancestor;
        PriorOut->Type = Type;
        if (MultipleShape *Multiple = dyn_cast<MultipleShape>(Ancestor))
          Multiple->Breaks++; // We are breaking out of this Multiple, so need a
                              // loop
        iter++; // carefully increment iter before erasing
        Target->BranchesIn.remove(Prior);
        Target->ProcessedBranchesIn.insert(Prior);
        Prior->ProcessedBranchesOut[Target].swap(PriorOut);
      }
    }

    Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) {
      DEBUG(dbgs() << "  MakeSimple inner block '" << Inner->Code << "'\n");
      SimpleShape *Simple = new SimpleShape;
      Notice(Simple);
      Simple->Inner = Inner;
      Inner->Parent = Simple;
      if (Blocks.size() > 1) {
        Blocks.remove(Inner);
        GetBlocksOut(Inner, NextEntries, &Blocks);
        BlockSet JustInner;
        JustInner.insert(Inner);
        for (const auto &iter : NextEntries)
          Solipsize(iter, Branch::Direct, Simple, JustInner);
      }
      return Simple;
    }

    Shape *MakeLoop(BlockSet &Blocks, BlockSet &Entries,
                    BlockSet &NextEntries) {
      // Find the inner blocks in this loop. Proceed backwards from the entries
      // until
      // you reach a seen block, collecting as you go.
      BlockSet InnerBlocks;
      BlockSet Queue = Entries;
      while (!Queue.empty()) {
        Block *Curr = *(Queue.begin());
        Queue.remove(*Queue.begin());
        if (!contains(InnerBlocks, Curr)) {
          // This element is new, mark it as inner and remove from outer
          InnerBlocks.insert(Curr);
          Blocks.remove(Curr);
          // Add the elements prior to it
          for (const auto &iter : Curr->BranchesIn)
            Queue.insert(iter);
        }
      }
      assert(!InnerBlocks.empty());

      for (const auto &Curr : InnerBlocks) {
        for (const auto &iter : Curr->BranchesOut) {
          Block *Possible = iter.first;
          if (!contains(InnerBlocks, Possible))
            NextEntries.insert(Possible);
        }
      }

      LoopShape *Loop = new LoopShape();
      Notice(Loop);

      // Solipsize the loop, replacing with break/continue and marking branches
      // as Processed (will not affect later calculations)
      // A. Branches to the loop entries become a continue to this shape
      for (const auto &iter : Entries)
        Solipsize(iter, Branch::Continue, Loop, InnerBlocks);
      // B. Branches to outside the loop (a next entry) become breaks on this
      // shape
      for (const auto &iter : NextEntries)
        Solipsize(iter, Branch::Break, Loop, InnerBlocks);
      // Finish up
      Shape *Inner = Process(InnerBlocks, Entries, nullptr);
      Loop->Inner = Inner;
      return Loop;
    }

    // For each entry, find the independent group reachable by it. The
    // independent group is the entry itself, plus all the blocks it can
    // reach that cannot be directly reached by another entry. Note that we
    // ignore directly reaching the entry itself by another entry.
    //   @param Ignore - previous blocks that are irrelevant
    void FindIndependentGroups(BlockSet &Entries,
                               BlockBlockSetMap &IndependentGroups,
                               BlockSet *Ignore = nullptr) {
      typedef std::map<Block *, Block *> BlockBlockMap;

      struct HelperClass {
        BlockBlockSetMap &IndependentGroups;
        BlockBlockMap Ownership; // For each block, which entry it belongs to.
                                 // We have reached it from there.

        HelperClass(BlockBlockSetMap &IndependentGroupsInit)
            : IndependentGroups(IndependentGroupsInit) {}
        void InvalidateWithChildren(Block *New) {
          // Being in the list means you need to be invalidated
          BlockList ToInvalidate;
          ToInvalidate.push_back(New);
          while (!ToInvalidate.empty()) {
            Block *Invalidatee = ToInvalidate.front();
            ToInvalidate.pop_front();
            Block *Owner = Ownership[Invalidatee];
            // Owner may have been invalidated, do not add to
            // IndependentGroups!
            if (contains(IndependentGroups, Owner))
              IndependentGroups[Owner].remove(Invalidatee);
            if (Ownership[Invalidatee]) { // may have been seen before and
                                          // invalidated already
              Ownership[Invalidatee] = nullptr;
              for (const auto &iter : Invalidatee->BranchesOut) {
                Block *Target = iter.first;
                BlockBlockMap::iterator Known = Ownership.find(Target);
                if (Known != Ownership.end()) {
                  Block *TargetOwner = Known->second;
                  if (TargetOwner)
                    ToInvalidate.push_back(Target);
                }
              }
            }
          }
        }
      };
      HelperClass Helper(IndependentGroups);

      // We flow out from each of the entries, simultaneously.
      // When we reach a new block, we add it as belonging to the one we got to
      // it from.
      // If we reach a new block that is already marked as belonging to someone,
      // it is reachable by two entries and is not valid for any of them.
      // Remove it and all it can reach that have been visited.

      // Being in the queue means we just added this item, and
      // we need to add its children
      BlockList Queue;
      for (const auto &Entry : Entries) {
        Helper.Ownership[Entry] = Entry;
        IndependentGroups[Entry].insert(Entry);
        Queue.push_back(Entry);
      }
      while (!Queue.empty()) {
        Block *Curr = Queue.front();
        Queue.pop_front();
        Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership
                                               // map if we are in the queue
        if (!Owner)
          continue; // we have been invalidated meanwhile after being reached
                    // from two entries
        // Add all children
        for (const auto &iter : Curr->BranchesOut) {
          Block *New = iter.first;
          BlockBlockMap::iterator Known = Helper.Ownership.find(New);
          if (Known == Helper.Ownership.end()) {
            // New node. Add it, and put it in the queue
            Helper.Ownership[New] = Owner;
            IndependentGroups[Owner].insert(New);
            Queue.push_back(New);
            continue;
          }
          Block *NewOwner = Known->second;
          if (!NewOwner)
            continue; // We reached an invalidated node
          if (NewOwner != Owner)
            // Invalidate this and all reachable that we have seen - we reached
            // this from two locations
            Helper.InvalidateWithChildren(New);
          // otherwise, we have the same owner, so do nothing
        }
      }

      // Having processed all the interesting blocks, we remain with just one
      // potential issue:
      // If a->b, and a was invalidated, but then b was later reached by
      // someone else, we must invalidate b. To check for this, we go over all
      // elements in the independent groups, if an element has a parent which
      // does *not* have the same owner, we/ must remove it and all its
      // children.

      for (const auto &iter : Entries) {
        BlockSet &CurrGroup = IndependentGroups[iter];
        BlockList ToInvalidate;
        for (const auto &iter : CurrGroup) {
          Block *Child = iter;
          for (const auto &iter : Child->BranchesIn) {
            Block *Parent = iter;
            if (Ignore && contains(*Ignore, Parent))
              continue;
            if (Helper.Ownership[Parent] != Helper.Ownership[Child])
              ToInvalidate.push_back(Child);
          }
        }
        while (!ToInvalidate.empty()) {
          Block *Invalidatee = ToInvalidate.front();
          ToInvalidate.pop_front();
          Helper.InvalidateWithChildren(Invalidatee);
        }
      }

      // Remove empty groups
      for (const auto &iter : Entries)
        if (IndependentGroups[iter].empty())
          IndependentGroups.erase(iter);
    }

    Shape *MakeMultiple(BlockSet &Blocks, BlockSet &Entries,
                        BlockBlockSetMap &IndependentGroups, Shape *Prev,
                        BlockSet &NextEntries) {
      bool Fused = isa<SimpleShape>(Prev);
      MultipleShape *Multiple = new MultipleShape();
      Notice(Multiple);
      BlockSet CurrEntries;
      for (auto &iter : IndependentGroups) {
        Block *CurrEntry = iter.first;
        BlockSet &CurrBlocks = iter.second;
        // Create inner block
        CurrEntries.clear();
        CurrEntries.insert(CurrEntry);
        for (const auto &CurrInner : CurrBlocks) {
          // Remove the block from the remaining blocks
          Blocks.remove(CurrInner);
          // Find new next entries and fix branches to them
          for (auto iter = CurrInner->BranchesOut.begin();
               iter != CurrInner->BranchesOut.end();) {
            Block *CurrTarget = iter->first;
            auto Next = iter;
            Next++;
            if (!contains(CurrBlocks, CurrTarget)) {
              NextEntries.insert(CurrTarget);
              Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks);
            }
            iter = Next; // increment carefully because Solipsize can remove us
          }
        }
        Multiple->InnerMap[CurrEntry->Id] =
            Process(CurrBlocks, CurrEntries, nullptr);
        // If we are not fused, then our entries will actually be checked
        if (!Fused)
          CurrEntry->IsCheckedMultipleEntry = true;
      }
      // Add entries not handled as next entries, they are deferred
      for (const auto &Entry : Entries)
        if (!contains(IndependentGroups, Entry))
          NextEntries.insert(Entry);
      // The multiple has been created, we can decide how to implement it
      if (Multiple->InnerMap.size() >= RelooperMultipleSwitchThreshold) {
        Multiple->UseSwitch = true;
        Multiple->Breaks++; // switch captures breaks
      }
      return Multiple;
    }

    // Main function.
    // Process a set of blocks with specified entries, returns a shape
    // The Make* functions receive a NextEntries. If they fill it with data,
    // those are the entries for the ->Next block on them, and the blocks
    // are what remains in Blocks (which Make* modify). In this way
    // we avoid recursing on Next (imagine a long chain of Simples, if we
    // recursed we could blow the stack).
    Shape *Process(BlockSet &Blocks, BlockSet &InitialEntries, Shape *Prev) {
      BlockSet *Entries = &InitialEntries;
      BlockSet TempEntries[2];
      int CurrTempIndex = 0;
      BlockSet *NextEntries;
      Shape *Ret = nullptr;

      auto Make = [&](Shape *Temp) {
        if (Prev)
          Prev->Next = Temp;
        if (!Ret)
          Ret = Temp;
        Prev = Temp;
        Entries = NextEntries;
      };

      while (1) {
        CurrTempIndex = 1 - CurrTempIndex;
        NextEntries = &TempEntries[CurrTempIndex];
        NextEntries->clear();

        if (Entries->empty())
          return Ret;
        if (Entries->size() == 1) {
          Block *Curr = *(Entries->begin());
          if (Curr->BranchesIn.empty()) {
            // One entry, no looping ==> Simple
            Make(MakeSimple(Blocks, Curr, *NextEntries));
            if (NextEntries->empty())
              return Ret;
            continue;
          }
          // One entry, looping ==> Loop
          Make(MakeLoop(Blocks, *Entries, *NextEntries));
          if (NextEntries->empty())
            return Ret;
          continue;
        }

        // More than one entry, try to eliminate through a Multiple groups of
        // independent blocks from an entry/ies. It is important to remove
        // through multiples as opposed to looping since the former is more
        // performant.
        BlockBlockSetMap IndependentGroups;
        FindIndependentGroups(*Entries, IndependentGroups);

        if (!IndependentGroups.empty()) {
          // We can handle a group in a multiple if its entry cannot be reached
          // by another group.
          // Note that it might be reachable by itself - a loop. But that is
          // fine, we will create a loop inside the multiple block (which
          // is the performant order to do it).
          for (auto iter = IndependentGroups.begin();
               iter != IndependentGroups.end();) {
            Block *Entry = iter->first;
            BlockSet &Group = iter->second;
            auto curr = iter++; // iterate carefully, we may delete
            for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin();
                 iterBranch != Entry->BranchesIn.end(); iterBranch++) {
              Block *Origin = *iterBranch;
              if (!contains(Group, Origin)) {
                // Reached from outside the group, so we cannot handle this
                IndependentGroups.erase(curr);
                break;
              }
            }
          }

          // As an optimization, if we have 2 independent groups, and one is a
          // small dead end, we can handle only that dead end.
          // The other then becomes a Next - without nesting in the code and
          // recursion in the analysis.
          // TODO: if the larger is the only dead end, handle that too
          // TODO: handle >2 groups
          // TODO: handle not just dead ends, but also that do not branch to the
          // NextEntries. However, must be careful there since we create a
          // Next, and that Next can prevent eliminating a break (since we no
          // longer naturally reach the same place), which may necessitate a
          // one-time loop, which makes the unnesting pointless.
          if (IndependentGroups.size() == 2) {
            // Find the smaller one
            auto iter = IndependentGroups.begin();
            Block *SmallEntry = iter->first;
            auto SmallSize = iter->second.size();
            iter++;
            Block *LargeEntry = iter->first;
            auto LargeSize = iter->second.size();
            if (SmallSize != LargeSize) { // ignore the case where they are
                                          // identical - keep things symmetrical
                                          // there
              if (SmallSize > LargeSize) {
                Block *Temp = SmallEntry;
                SmallEntry = LargeEntry;
                LargeEntry = Temp; // Note: we did not flip the Sizes too, they
                                   // are now invalid. TODO: use the smaller
                                   // size as a limit?
              }
              // Check if dead end
              bool DeadEnd = true;
              BlockSet &SmallGroup = IndependentGroups[SmallEntry];
              for (const auto &Curr : SmallGroup) {
                for (const auto &iter : Curr->BranchesOut) {
                  Block *Target = iter.first;
                  if (!contains(SmallGroup, Target)) {
                    DeadEnd = false;
                    break;
                  }
                }
                if (!DeadEnd)
                  break;
              }
              if (DeadEnd)
                IndependentGroups.erase(LargeEntry);
            }
          }

          if (!IndependentGroups.empty())
            // Some groups removable ==> Multiple
            Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev,
                              *NextEntries));
            if (NextEntries->empty())
              return Ret;
            continue;
        }
        // No independent groups, must be loopable ==> Loop
        Make(MakeLoop(Blocks, *Entries, *NextEntries));
        if (NextEntries->empty())
          return Ret;
        continue;
      }
    }
  };

  // Main

  BlockSet AllBlocks;
  for (const auto &Curr : Pre.Live) {
    AllBlocks.insert(Curr);
  }

  BlockSet Entries;
  Entries.insert(Entry);
  Root = Analyzer(this).Process(AllBlocks, Entries, nullptr);
  assert(Root);

  ///
  /// Relooper post-optimizer
  ///
  struct PostOptimizer {
    RelooperAlgorithm *Parent;
    std::stack<Shape *> LoopStack;

    PostOptimizer(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {}

    void ShapeSwitch(Shape* var,
                     std::function<void (SimpleShape*)> simple,
                     std::function<void (MultipleShape*)> multiple,
                     std::function<void (LoopShape*)> loop) {
      switch (var->getKind()) {
        case Shape::SK_Simple: {
          simple(cast<SimpleShape>(var));
          break;
        }
        case Shape::SK_Multiple: {
          multiple(cast<MultipleShape>(var));
          break;
        }
        case Shape::SK_Loop: {
          loop(cast<LoopShape>(var));
          break;
        }
      }
    }

    // Find the blocks that natural control flow can get us directly to, or
    // through a multiple that we ignore
    void FollowNaturalFlow(Shape *S, BlockSet &Out) {
      ShapeSwitch(S, [&](SimpleShape* Simple) {
        Out.insert(Simple->Inner);
      }, [&](MultipleShape* Multiple) {
        for (const auto &iter : Multiple->InnerMap) {
          FollowNaturalFlow(iter.second, Out);
        }
        FollowNaturalFlow(Multiple->Next, Out);
      }, [&](LoopShape* Loop) {
        FollowNaturalFlow(Loop->Inner, Out);
      });
    }

    void FindNaturals(Shape *Root, Shape *Otherwise = nullptr) {
      if (Root->Next) {
        Root->Natural = Root->Next;
        FindNaturals(Root->Next, Otherwise);
      } else {
        Root->Natural = Otherwise;
      }

      ShapeSwitch(Root, [](SimpleShape* Simple) {
      }, [&](MultipleShape* Multiple) {
        for (const auto &iter : Multiple->InnerMap) {
          FindNaturals(iter.second, Root->Natural);
        }
      }, [&](LoopShape* Loop){
        FindNaturals(Loop->Inner, Loop->Inner);
      });
    }

    // Remove unneeded breaks and continues.
    // A flow operation is trivially unneeded if the shape we naturally get to
    // by normal code execution is the same as the flow forces us to.
    void RemoveUnneededFlows(Shape *Root, Shape *Natural = nullptr,
                             LoopShape *LastLoop = nullptr,
                             unsigned Depth = 0) {
      BlockSet NaturalBlocks;
      FollowNaturalFlow(Natural, NaturalBlocks);
      Shape *Next = Root;
      while (Next) {
        Root = Next;
        Next = nullptr;
        ShapeSwitch(
            Root,
            [&](SimpleShape* Simple) {
              if (Simple->Inner->BranchVar)
                LastLoop =
                    nullptr; // a switch clears out the loop (TODO: only for
                             // breaks, not continue)

              if (Simple->Next) {
                if (!Simple->Inner->BranchVar &&
                    Simple->Inner->ProcessedBranchesOut.size() == 2 &&
                    Depth < RelooperNestingLimit) {
                  // If there is a next block, we already know at Simple
                  // creation time to make direct branches, and we can do
                  // nothing more in general. But, we try to optimize the
                  // case of a break and a direct: This would normally be
                  //   if (break?) { break; } ..
                  // but if we make sure to nest the else, we can save the
                  // break,
                  //   if (!break?) { .. }
                  // This is also better because the more canonical nested
                  // form is easier to further optimize later. The
                  // downside is more nesting, which adds to size in builds with
                  // whitespace.
                  // Note that we avoid switches, as it complicates control flow
                  // and is not relevant for the common case we optimize here.
                  bool Found = false;
                  bool Abort = false;
                  for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
                    Block *Target = iter.first;
                    Branch *Details = iter.second.get();
                    if (Details->Type == Branch::Break) {
                      Found = true;
                      if (!contains(NaturalBlocks, Target))
                        Abort = true;
                    } else if (Details->Type != Branch::Direct)
                      Abort = true;
                  }
                  if (Found && !Abort) {
                    for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
                      Branch *Details = iter.second.get();
                      if (Details->Type == Branch::Break) {
                        Details->Type = Branch::Direct;
                        if (MultipleShape *Multiple =
                                dyn_cast<MultipleShape>(Details->Ancestor))
                          Multiple->Breaks--;
                      } else {
                        assert(Details->Type == Branch::Direct);
                        Details->Type = Branch::Nested;
                      }
                    }
                  }
                  Depth++; // this optimization increases depth, for us and all
                           // our next chain (i.e., until this call returns)
                }
                Next = Simple->Next;
              } else {
                // If there is no next then Natural is where we will
                // go to by doing nothing, so we can potentially optimize some
                // branches to direct.
                for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
                  Block *Target = iter.first;
                  Branch *Details = iter.second.get();
                  if (Details->Type != Branch::Direct &&
                      contains(NaturalBlocks,
                               Target)) { // note: cannot handle split blocks
                    Details->Type = Branch::Direct;
                    if (MultipleShape *Multiple =
                            dyn_cast<MultipleShape>(Details->Ancestor))
                      Multiple->Breaks--;
                  } else if (Details->Type == Branch::Break && LastLoop &&
                             LastLoop->Natural == Details->Ancestor->Natural) {
                    // it is important to simplify breaks, as simpler breaks
                    // enable other optimizations
                    Details->Labeled = false;
                    if (MultipleShape *Multiple =
                            dyn_cast<MultipleShape>(Details->Ancestor))
                      Multiple->Breaks--;
                  }
                }
              }
            }, [&](MultipleShape* Multiple)
            {
              for (const auto &iter : Multiple->InnerMap) {
                RemoveUnneededFlows(iter.second, Multiple->Next,
                                    Multiple->Breaks ? nullptr : LastLoop,
                                    Depth + 1);
              }
              Next = Multiple->Next;
            }, [&](LoopShape* Loop)
            {
              RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth + 1);
              Next = Loop->Next;
            });
      }
    }

    // After we know which loops exist, we can calculate which need to be
    // labeled
    void FindLabeledLoops(Shape *Root) {
      Shape *Next = Root;
      while (Next) {
        Root = Next;
        Next = nullptr;

        ShapeSwitch(
            Root,
            [&](SimpleShape *Simple) {
          MultipleShape *Fused = dyn_cast<MultipleShape>(Root->Next);
          // If we are fusing a Multiple with a loop into this Simple, then
          // visit it now
          if (Fused && Fused->Breaks)
            LoopStack.push(Fused);
          if (Simple->Inner->BranchVar)
            LoopStack.push(nullptr); // a switch means breaks are now useless,
                                     // push a dummy
          if (Fused) {
            if (Fused->UseSwitch)
              LoopStack.push(nullptr); // a switch means breaks are now
                                       // useless, push a dummy
            for (const auto &iter : Fused->InnerMap) {
              FindLabeledLoops(iter.second);
            }
          }
          for (const auto &iter : Simple->Inner->ProcessedBranchesOut) {
            Branch *Details = iter.second.get();
            if (Details->Type == Branch::Break ||
                Details->Type == Branch::Continue) {
              assert(!LoopStack.empty());
              if (Details->Ancestor != LoopStack.top() && Details->Labeled) {
                if (MultipleShape *Multiple =
                        dyn_cast<MultipleShape>(Details->Ancestor)) {
                  Multiple->Labeled = true;
                } else {
                  LoopShape *Loop = cast<LoopShape>(Details->Ancestor);
                  Loop->Labeled = true;
                }
              } else {
                Details->Labeled = false;
              }
            }
            if (Fused && Fused->UseSwitch)
              LoopStack.pop();
            if (Simple->Inner->BranchVar)
              LoopStack.pop();
            if (Fused && Fused->Breaks)
              LoopStack.pop();
            if (Fused)
              Next = Fused->Next;
            else
              Next = Root->Next;
          }
          }
          , [&](MultipleShape* Multiple) {
            if (Multiple->Breaks)
              LoopStack.push(Multiple);
            for (const auto &iter : Multiple->InnerMap)
              FindLabeledLoops(iter.second);
            if (Multiple->Breaks)
              LoopStack.pop();
            Next = Root->Next;
          }
          , [&](LoopShape* Loop) {
            LoopStack.push(Loop);
            FindLabeledLoops(Loop->Inner);
            LoopStack.pop();
            Next = Root->Next;
          });
      }
    }

    void Process(Shape * Root) {
      FindNaturals(Root);
      RemoveUnneededFlows(Root);
      FindLabeledLoops(Root);
    }
  };

  PostOptimizer(this).Process(Root);
}