clang-p2996/bolt/BinaryFunction.cpp

//===--- BinaryFunction.cpp - Interface for machine-level function --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//


#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "DataReader.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <map>
#include <queue>
#include <stack>
#include <string>
#include <utility>
#include <vector>

#undef  DEBUG_TYPE
#define DEBUG_TYPE "flo"

namespace llvm {
namespace flo {

/// Find the basic block that covers \p Offset.
///
/// \p Offset is compared against the function size and against the blocks in
/// BasicBlocks (presumably ordered by their offset within the function --
/// confirm against BinaryBasicBlock's operator<).
///
/// \returns the covering block, or nullptr when \p Offset is greater than the
/// function size or when no basic blocks exist.
BinaryBasicBlock *
BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) {
  if (Offset > Size || BasicBlocks.empty())
    return nullptr;

  // Find the first block that compares greater than a probe block built from
  // Offset; the block immediately before it is the answer.
  auto NextBB = std::upper_bound(BasicBlocks.begin(), BasicBlocks.end(),
                                 BinaryBasicBlock(Offset));
  assert(NextBB != BasicBlocks.begin() && "first basic block not at offset 0");

  --NextBB;
  return &*NextBB;
}

/// Drop from the block layout every basic block that is not marked as
/// preserved.
///
/// Only BasicBlocksLayout is rewritten here; the relative order of the
/// surviving blocks is kept.
///
/// \param ToPreserve maps a block to true when it must stay in the layout.
///        Blocks that are absent from the map are treated as dead. The map is
///        only read: no entries are added for missing blocks.
/// \returns the number of blocks removed from the layout.
unsigned BinaryFunction::eraseDeadBBs(
    std::map<BinaryBasicBlock *, bool> &ToPreserve) {
  BasicBlockOrderType NewLayout;
  unsigned Count = 0;
  for (auto *BB : BasicBlocksLayout) {
    // Use find() rather than operator[] so that looking up an unknown block
    // does not insert a new (false) entry into the caller's map.
    auto Entry = ToPreserve.find(BB);
    if (Entry != ToPreserve.end() && Entry->second)
      NewLayout.push_back(BB);
    else
      ++Count;
  }
  BasicBlocksLayout = std::move(NewLayout);
  return Count;
}

/// Write a textual description of this function to \p OS.
///
/// A summary header (state, address, sizes, section names, block layout and,
/// when available, image address and execution count) is always emitted and
/// is tagged with \p Annotation. If \p PrintInstructions is set and the
/// context has an instruction printer, the instructions follow: the raw
/// instruction list when no basic blocks exist yet, and then every block of
/// the current layout together with its predecessors, instructions and
/// successors.
void BinaryFunction::print(raw_ostream &OS, std::string Annotation,
                           bool PrintInstructions) const {
  StringRef SectionName;
  Section.getName(SectionName);

  // Summary header.
  OS << "Binary Function \"" << getName() << "\" " << Annotation << " {";
  OS << "\n  State       : "   << CurrentState;
  OS << "\n  Address     : 0x" << Twine::utohexstr(Address);
  OS << "\n  Size        : 0x" << Twine::utohexstr(Size);
  OS << "\n  MaxSize     : 0x" << Twine::utohexstr(MaxSize);
  OS << "\n  Offset      : 0x" << Twine::utohexstr(FileOffset);
  OS << "\n  Section     : "   << SectionName;
  OS << "\n  Orc Section : "   << getCodeSectionName();
  OS << "\n  IsSimple    : "   << IsSimple;
  OS << "\n  BB Count    : "   << BasicBlocksLayout.size();

  if (!BasicBlocksLayout.empty()) {
    OS << "\n  BB Layout   : ";
    const char *Delim = "";
    for (auto *Block : BasicBlocksLayout) {
      OS << Delim << Block->getName();
      Delim = ", ";
    }
  }

  if (ImageAddress)
    OS << "\n  Image       : 0x" << Twine::utohexstr(ImageAddress);
  if (ExecutionCount != COUNT_NO_PROFILE)
    OS << "\n  Exec Count  : " << ExecutionCount;

  OS << "\n}\n";

  // Everything below needs an instruction printer.
  if (!PrintInstructions || !BC.InstPrinter)
    return;

  // No CFG yet: dump the flat instruction list, with labels where known.
  if (BasicBlocks.empty() && !Instructions.empty()) {
    for (const auto &Entry : Instructions) {
      auto InstrOffset = Entry.first;

      auto LabelIt = Labels.find(InstrOffset);
      if (LabelIt != Labels.end())
        OS << LabelIt->second->getName() << ":\n";

      OS << format("    %08" PRIx64 ": ", InstrOffset);
      BC.InstPrinter->printInst(&Entry.second, OS, "", *BC.STI);
      OS << "\n";
    }
  }

  // Running offset of the instruction being printed, recomputed from the
  // encoded size of every instruction in layout order.
  uint64_t CurOffset = 0;

  for (auto *Block : BasicBlocksLayout) {
    OS << Block->getName() << " ("
       << Block->Instructions.size() << " instructions, align : "
       << Block->getAlignment() << ")\n";

    uint64_t BlockCount = Block->getExecutionCount();
    if (BlockCount != BinaryBasicBlock::COUNT_NO_PROFILE)
      OS << "  Exec Count : " << BlockCount << "\n";

    if (!Block->Predecessors.empty()) {
      OS << "  Predecessors: ";
      const char *Delim = "";
      for (auto *Pred : Block->Predecessors) {
        OS << Delim << Pred->getName();
        Delim = ", ";
      }
      OS << '\n';
    }

    CurOffset = RoundUpToAlignment(CurOffset, Block->getAlignment());

    for (auto &Instr : *Block) {
      OS << format("    %08" PRIx64 ": ", CurOffset);
      BC.InstPrinter->printInst(&Instr, OS, "", *BC.STI);
      OS << "\n";

      // In case we need MCInst printer:
      // Instr.dump_pretty(OS, InstructionPrinter.get());

      // Advance by the encoded size of the instruction.
      // Note: this is imprecise since happening prior to relaxation.
      SmallString<256> Encoded;
      SmallVector<MCFixup, 4> Fixups;
      raw_svector_ostream EncodedOS(Encoded);
      BC.MCE->encodeInstruction(Instr, EncodedOS, Fixups, *BC.STI);
      CurOffset += Encoded.size();
    }

    if (!Block->Successors.empty()) {
      OS << "  Successors: ";
      // BranchInfo is walked in lock-step with the successor list.
      auto EdgeInfo = Block->BranchInfo.begin();
      const char *Delim = "";
      for (auto *Succ : Block->Successors) {
        assert(EdgeInfo != Block->BranchInfo.end() &&
               "missing BranchInfo entry");
        OS << Delim << Succ->getName();
        if (ExecutionCount != COUNT_NO_PROFILE) {
          if (EdgeInfo->MispredictedCount !=
              BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) {
            OS << " (mispreds: " << EdgeInfo->MispredictedCount
               << ", count: " << EdgeInfo->Count << ")";
          } else if (EdgeInfo->Count !=
                     BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) {
            OS << " (inferred count: " << EdgeInfo->Count << ")";
          }
        }
        Delim = ", ";
        ++EdgeInfo;
      }
      OS << '\n';
    }

    OS << '\n';
  }

  OS << "End of Function \"" << getName() << "\"\n";
}

/// Disassemble the raw bytes of this function into the instruction list.
///
/// Direct branch/call targets and rip-relative operands are replaced with
/// symbol references, labels are registered for branch targets that fall
/// inside the function, and local branches are recorded in LocalBranches.
/// Disassembly stops at the first construct that is not handled (undecodable
/// or unsupported instruction, indirect branch or call, internal call,
/// rip-relative operand that cannot be evaluated); in that case the function
/// is marked as not simple.
///
/// \param FunctionData raw bytes of the function; must be getSize() long.
/// \returns true in all cases; the simple flag and the state are updated.
bool BinaryFunction::disassemble(ArrayRef<uint8_t> FunctionData) {
  assert(FunctionData.size() == getSize() &&
         "function size does not match raw data size");

  auto &Ctx = BC.Ctx;
  auto &MIA = BC.MIA;

  // The function entry always carries a label; it starts the first basic
  // block.
  Labels[0] = Ctx->createTempSymbol("BB0", false);

  // Cleared as soon as something we cannot process is seen.
  bool Simple = true;
  uint64_t Offset = 0;
  while (Offset < getSize()) {
    MCInst Instruction;
    uint64_t InstrSize;
    if (!BC.DisAsm->getInstruction(Instruction,
                                   InstrSize,
                                   FunctionData.slice(Offset),
                                   getAddress() + Offset,
                                   nulls(),
                                   nulls())) {
      // Ignore this function. Skip to the next one.
      Simple = false;
      break;
    }

    if (MIA->isUnsupported(Instruction)) {
      DEBUG(dbgs() << "FLO: unsupported instruction seen. Skipping function "
                   << getName() << ".\n");
      Simple = false;
      break;
    }

    if (MIA->isIndirectBranch(Instruction)) {
      DEBUG(dbgs() << "FLO: indirect branch seen. Skipping function "
                   << getName() << ".\n");
      Simple = false;
      break;
    }

    uint64_t InstrAddr = getAddress() + Offset;
    if (MIA->isBranch(Instruction) || MIA->isCall(Instruction)) {
      uint64_t Target = 0;
      if (!MIA->evaluateBranch(Instruction, InstrAddr, InstrSize, Target)) {
        // Indirect call
        DEBUG(dbgs() << "FLO: indirect call detected (not yet supported)\n");
        Simple = false;
        break;
      }

      // A target inside this function is a local branch; anything else is a
      // call, possibly a tail call. A target equal to the function address
      // may be either a branch or a recursive call.
      bool IsCall = MIA->isCall(Instruction);
      MCSymbol *TargetSymbol = nullptr;
      uint64_t TargetOffset = 0;

      if (IsCall && containsAddress(Target)) {
        if (Target != getAddress()) {
          // Possibly an old-style PIC code
          DEBUG(dbgs() << "FLO: internal call detected at 0x"
                       << Twine::utohexstr(InstrAddr)
                       << " in function " << getName() << "\n");
          Simple = false;
          break;
        }
        // Recursive call.
        TargetSymbol = Ctx->getOrCreateSymbol(getName());
      }

      if (!TargetSymbol) {
        // Pick either a local label or an external symbol.
        if (containsAddress(Target)) {
          // Reuse the label at this offset if one is already registered.
          TargetOffset = Target - getAddress();
          auto Known = Labels.find(TargetOffset);
          if (Known != Labels.end()) {
            TargetSymbol = Known->second;
          } else {
            TargetSymbol = Ctx->createTempSymbol();
            Labels[TargetOffset] = TargetSymbol;
          }
        } else {
          if (!IsCall && InstrSize == 2) {
            errs() << "FLO-WARNING: relaxed tail call detected at 0x"
                   << Twine::utohexstr(InstrAddr)
                   << ". Code size will be increased.\n";
          }

          // This is a call regardless of the opcode (e.g. tail call).
          IsCall = true;
          TargetSymbol = BC.getOrCreateGlobalSymbol(Target, "FUNCat");
        }
      }

      // Rebuild the instruction operands around the symbolic target.
      Instruction.clear();
      Instruction.addOperand(
          MCOperand::createExpr(
            MCSymbolRefExpr::create(TargetSymbol,
                                    MCSymbolRefExpr::VK_None,
                                    *Ctx)));

      // Add local branch info.
      if (!IsCall)
        LocalBranches.push_back({Offset, TargetOffset});
    } else if (MIA->hasRIPOperand(Instruction)) {
      uint64_t TargetAddress = 0;
      if (!MIA->evaluateRIPOperand(Instruction, InstrAddr,
                                   InstrSize, TargetAddress)) {
        DEBUG(
          dbgs() << "FLO: rip-relative operand could not be evaluated:\n";
          BC.InstPrinter->printInst(&Instruction, dbgs(), "", *BC.STI);
          dbgs() << '\n';
          Instruction.dump_pretty(dbgs(), BC.InstPrinter.get());
          dbgs() << '\n';
        );
        Simple = false;
        break;
      }
      // FIXME: check that the address is in data, not in code.
      MCSymbol *DataSymbol =
          BC.getOrCreateGlobalSymbol(TargetAddress, "DATAat");
      MIA->replaceRIPOperandDisp(
          Instruction,
          MCOperand::createExpr(
            MCSymbolRefExpr::create(DataSymbol,
                                    MCSymbolRefExpr::VK_None,
                                    *Ctx)));
    }

    addInstruction(Offset, std::move(Instruction));
    Offset += InstrSize;
  }

  setSimple(Simple);

  // TODO: clear memory if not simple function?

  // Update state.
  updateState(State::Disassembled);

  return true;
}

/// Build the control-flow graph from the disassembled instruction list.
///
/// The function execution count is taken from the profile data (when branch
/// data exists for this function) before anything else. Then, for simple
/// functions in the Disassembled state:
///   1. basic blocks are created in original instruction order,
///   2. the original order is recorded as the initial layout,
///   3. local branches become CFG edges, annotated with profile counts when
///      available,
///   4. fall-through edges are added between consecutive blocks,
///   5. fall-through counts are inferred when profile data is present,
///   6. the instruction list, labels and local-branch list are released.
///
/// \returns false if the function is not simple or is not in the Disassembled
/// state; true once the CFG has been built.
bool BinaryFunction::buildCFG() {

  auto &MIA = BC.MIA;

  auto BranchDataOrErr = BC.DR.getFuncBranchData(getName());
  if (std::error_code EC = BranchDataOrErr.getError()) {
    DEBUG(dbgs() << "no branch data found for \"" << getName() << "\"\n");
  } else {
    ExecutionCount = BC.DR.countBranchesTo(getName());
  }

  if (!isSimple())
    return false;

  if (!(CurrentState == State::Disassembled))
    return false;

  assert(BasicBlocks.empty() && "basic block list should be empty");
  assert((Labels.find(0) != Labels.end()) &&
         "first instruction should always have a label");

  // Create basic blocks in the original layout order:
  //
  //  * Every instruction with associated label marks
  //    the beginning of a basic block.
  //  * Conditional instruction marks the end of a basic block,
  //    except when the following instruction is an
  //    unconditional branch, and the unconditional branch is not
  //    a destination of another branch. In the latter case, the
  //    basic block will consist of a single unconditional branch
  //    (missed optimization opportunity?).
  //
  // Created basic blocks are sorted in layout order since they are
  // created in the same order as instructions, and instructions are
  // sorted by offsets.
  BinaryBasicBlock *InsertBB{nullptr};
  BinaryBasicBlock *PrevBB{nullptr};
  bool IsLastInstrNop = false;
  MCInst *PrevInstr{nullptr};
  for (auto &InstrInfo : Instructions) {
    auto LI = Labels.find(InstrInfo.first);
    if (LI != Labels.end()) {
      // Always create new BB at branch destination.
      PrevBB = InsertBB;
      InsertBB = addBasicBlock(LI->first, LI->second,
                               /* DeriveAlignment = */ IsLastInstrNop);
    }
    if (!InsertBB) {
      // It must be a fallthrough or unreachable code. Create a new block unless
      // we see an unconditional branch following a conditional one.
      assert(PrevBB && "no previous basic block for a fall through");
      assert(PrevInstr && "no previous instruction for a fall through");
      if (MIA->isUnconditionalBranch(InstrInfo.second) &&
          !MIA->isUnconditionalBranch(*PrevInstr)) {
        // Temporarily restore inserter basic block.
        InsertBB = PrevBB;
      } else {
        InsertBB = addBasicBlock(InstrInfo.first,
                                 BC.Ctx->createTempSymbol("FT", true),
                                 /* DeriveAlignment = */ IsLastInstrNop);
      }
    }

    // Ignore nops. We use nops to derive alignment of the next basic block.
    // It will not always work, as some blocks are naturally aligned, but
    // it's just part of heuristic for block alignment.
    if (MIA->isNoop(InstrInfo.second)) {
      IsLastInstrNop = true;
      continue;
    }

    IsLastInstrNop = false;
    InsertBB->addInstruction(InstrInfo.second);
    PrevInstr = &InstrInfo.second;

    // How well do we detect tail calls here?
    if (MIA->isTerminator(InstrInfo.second)) {
      PrevBB = InsertBB;
      InsertBB = nullptr;
    }
  }

  // Set the basic block layout to the original order
  for (auto &BB : BasicBlocks) {
    BasicBlocksLayout.emplace_back(&BB);
  }

  // Intermediate dump.
  DEBUG(print(dbgs(), "after creating basic blocks"));

  // TODO: handle properly calls to no-return functions,
  // e.g. exit(3), etc. Otherwise we'll see a false fall-through
  // blocks.

  for (auto &Branch : LocalBranches) {

    DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first)
                 << "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n");
    BinaryBasicBlock *FromBB = getBasicBlockContainingOffset(Branch.first);
    assert(FromBB && "cannot find BB containing FROM branch");
    BinaryBasicBlock *ToBB = getBasicBlockAtOffset(Branch.second);
    assert(ToBB && "cannot find BB containing TO branch");

    if (std::error_code EC = BranchDataOrErr.getError()) {
      // No profile for this function: plain edge.
      FromBB->addSuccessor(ToBB);
    } else {
      const FuncBranchData &BranchData = BranchDataOrErr.get();
      auto BranchInfoOrErr = BranchData.getBranch(Branch.first, Branch.second);
      if (std::error_code EC = BranchInfoOrErr.getError()) {
        // Profile exists but has no record for this particular branch.
        FromBB->addSuccessor(ToBB);
      } else {
        const BranchInfo &BInfo = BranchInfoOrErr.get();
        FromBB->addSuccessor(ToBB, BInfo.Branches, BInfo.Mispreds);
      }
    }
  }

  // Add fall-through branches.
  PrevBB = nullptr;
  bool IsPrevFT = false; // Is previous block a fall-through.
  for (auto &BB : BasicBlocks) {
    if (IsPrevFT) {
      PrevBB->addSuccessor(&BB, BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE,
                           BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE);
    }

    if (BB.empty()) {
      // A block can be empty because nops are not added to blocks. Check this
      // before touching BB.back(), which must not be called on an empty block.
      IsPrevFT = true;
    } else {
      MCInst &LastInst = BB.back();
      if (BB.succ_size() == 0) {
        IsPrevFT = !MIA->isTerminator(LastInst);
      } else if (BB.succ_size() == 1) {
        IsPrevFT = MIA->isConditionalBranch(LastInst);
      } else {
        // Either ends with 2 branches, or with an indirect jump.
        IsPrevFT = false;
      }
    }

    PrevBB = &BB;
  }

  if (IsPrevFT) {
    // The last block has nothing to fall through to.
    // Possibly a call that does not return.
    DEBUG(dbgs() << "last block was marked as a fall-through\n");
  }

  // Infer frequency for non-taken branches
  if (ExecutionCount != COUNT_NO_PROFILE && !BranchDataOrErr.getError()) {
    inferFallThroughCounts();
  }

  // Clean-up memory taken by instructions and labels.
  clearInstructions();
  clearLabels();
  clearLocalBranches();

  // Update the state.
  CurrentState = State::CFG;

  return true;
}

void BinaryFunction::inferFallThroughCounts() {
  assert(!BasicBlocks.empty() && "basic block list should not be empty");

  // Compute preliminary execution time for each basic block
  for (auto &CurBB : BasicBlocks) {
    if (&CurBB == &*BasicBlocks.begin()) {
      CurBB.ExecutionCount = ExecutionCount;
      continue;
    }
    CurBB.ExecutionCount = 0;
  }

  for (auto &CurBB : BasicBlocks) {
    auto SuccCount = CurBB.BranchInfo.begin();
    for (auto Succ : CurBB.successors()) {
      // Do not update execution count of the entry block (when we have tail
      // calls). We already accounted for those when computing the func count.
      if (Succ == &*BasicBlocks.begin())
        continue;
      if (SuccCount->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
        Succ->ExecutionCount += SuccCount->Count;
      ++SuccCount;
    }
  }

  // Work on a basic block at a time, propagating frequency information forwards
  // It is important to walk in the layour order
  for (auto &CurBB : BasicBlocks) {
    uint64_t BBExecCount = CurBB.getExecutionCount();

    // Propagate this information to successors, filling in fall-through edges
    // with frequency information
    if (CurBB.succ_size() == 0)
      continue;

    // Calculate frequency of outgoing branches from this node according to
    // LBR data
    uint64_t ReportedBranches = 0;
    for (auto &SuccCount : CurBB.BranchInfo) {
      if (SuccCount.Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
        ReportedBranches += SuccCount.Count;
    }

    // Infer the frequency of the fall-through edge, representing not taking the
    // branch
    uint64_t Inferred = 0;
    if (BBExecCount > ReportedBranches)
      Inferred = BBExecCount - ReportedBranches;
    if (BBExecCount < ReportedBranches)
      errs() << "FLO-WARNING: Fall-through inference is slightly inconsistent. "
                "exec frequency is less than the outgoing edges frequency ("
             << BBExecCount << " < " << ReportedBranches
             << ") for  BB at offset 0x"
             << Twine::utohexstr(getAddress() + CurBB.getOffset()) << '\n';

    // Put this information into the fall-through edge
    if (CurBB.succ_size() == 0)
      continue;
    // If there is a FT, the last successor will be it.
    auto &SuccCount = CurBB.BranchInfo.back();
    auto &Succ = CurBB.Successors.back();
    if (SuccCount.Count == BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE) {
      SuccCount.Count = Inferred;
      Succ->ExecutionCount += Inferred;
    }

  } // end for (CurBB : BasicBlocks)

  return;
}

/// Reorder the basic blocks of this function using profile data.
///
/// Nothing is done when the function has no profile or an empty layout. Small
/// functions (at most FUNC_SIZE_THRESHOLD blocks) are handed to
/// solveOptimalLayout(). Otherwise a greedy heuristic is used: blocks start
/// in singleton clusters, edges are visited from heaviest to lightest, and an
/// edge merges two clusters when its source ends one cluster and its
/// destination starts another. The resulting clusters are then ordered
/// according to \p Priority and flattened into the new layout, after which
/// branches are fixed up to match it.
///
/// \param Priority selects how clusters are ordered: HP_NONE keeps cluster
///        creation order, HP_BRANCH_PREDICTOR uses a topological order over
///        inter-cluster edges, HP_CACHE_UTILIZATION sorts by descending
///        average execution frequency.
void BinaryFunction::optimizeLayout(HeuristicPriority Priority) {
  // Bail if no profiling information or if empty
  if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE ||
      BasicBlocksLayout.empty()) {
    return;
  }

  // Work on optimal solution if problem is small enough
  if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD)
    return solveOptimalLayout();

  DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n");

  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize weight during a path traversing all BBs. In this way, we will
  // convert the hottest branches into fall-throughs.

  // Encode an edge between two basic blocks, source and destination
  typedef std::pair<BinaryBasicBlock *, BinaryBasicBlock *> EdgeTy;
  std::map<EdgeTy, uint64_t> Weight;

  // Define a comparison function to establish SWO between edges
  auto Comp = [&Weight](EdgeTy A, EdgeTy B) { return Weight[A] < Weight[B]; };
  std::priority_queue<EdgeTy, std::vector<EdgeTy>, decltype(Comp)> Queue(Comp);

  typedef std::vector<BinaryBasicBlock *> ClusterTy;
  typedef std::map<BinaryBasicBlock *, int> BBToClusterMapTy;
  std::vector<ClusterTy> Clusters;
  BBToClusterMapTy BBToClusterMap;

  // Encode relative weights between two clusters
  std::vector<std::map<uint32_t, uint64_t>> ClusterEdges;
  ClusterEdges.resize(BasicBlocksLayout.size());

  for (auto BB : BasicBlocksLayout) {
    // Create a cluster for this BB
    uint32_t I = Clusters.size();
    Clusters.emplace_back();
    auto &Cluster = Clusters.back();
    Cluster.push_back(BB);
    BBToClusterMap[BB] = I;
    // Populate priority queue with edges. BranchInfo is walked in lock-step
    // with the successor list.
    auto BI = BB->BranchInfo.begin();
    for (auto &Succ : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
        Weight[std::make_pair(BB, Succ)] = BI->Count;
      Queue.push(std::make_pair(BB, Succ));
      ++BI;
    }
  }

  // Grow clusters in a greedy fashion
  while (!Queue.empty()) {
    auto elmt = Queue.top();
    Queue.pop();

    BinaryBasicBlock *BBSrc = elmt.first;
    BinaryBasicBlock *BBDst = elmt.second;

    // Case 1: BBSrc and BBDst are the same, or the edge targets the first
    // block of the layout (which must stay first). Ignore this edge
    if (BBSrc == BBDst || BBDst == *BasicBlocksLayout.begin())
      continue;

    int I = BBToClusterMap[BBSrc];
    int J = BBToClusterMap[BBDst];

    // Case 2: If they are already allocated at the same cluster, just increase
    // the weight of this cluster
    if (I == J) {
      ClusterEdges[I][I] += Weight[elmt];
      continue;
    }

    auto &ClusterA = Clusters[I];
    auto &ClusterB = Clusters[J];
    if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
      // Case 3: BBSrc is at the end of a cluster and BBDst is at the start,
      // allowing us to merge two clusters
      for (auto BB : ClusterB)
        BBToClusterMap[BB] = I;
      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
      ClusterB.clear();
      // Iterate through all inter-cluster edges and transfer edges targeting
      // cluster B to cluster A.
      // It is bad to have to iterate though all edges when we could have a list
      // of predecessors for cluster B. However, it's not clear if it is worth
      // the added code complexity to create a data structure for clusters that
      // maintains a list of predecessors. Maybe change this if it becomes a
      // deal breaker.
      for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
        ClusterEdges[K][I] += ClusterEdges[K][J];
    } else {
      // Case 4: Both BBSrc and BBDst are allocated in positions we cannot
      // merge them. Annotate the weight of this edge in the weight between
      // clusters to help us decide ordering between these clusters.
      ClusterEdges[I][J] += Weight[elmt];
    }
  }

  std::vector<uint32_t> Order;  // Cluster layout order

  // Here we have 3 conflicting goals as to how to layout clusters. If we want
  // to minimize jump offsets, we should put clusters with heavy inter-cluster
  // dependence as close as possible. If we want to maximize the probability
  // that all inter-cluster edges are predicted as not-taken, we should enforce
  // a topological order to make targets appear after sources, creating forward
  // branches. If we want to separate hot from cold blocks to maximize the
  // probability that unfrequently executed code doesn't pollute the cache, we
  // should put clusters in descending order of hotness.
  std::vector<double> AvgFreq;
  AvgFreq.resize(Clusters.size(), 0.0);
  // Cluster 0 is skipped: it is never reordered by the sorts below.
  for (uint32_t I = 1, E = Clusters.size(); I < E; ++I) {
    double Freq = 0.0;
    for (auto BB : Clusters[I]) {
      // Divide in floating point; an integer division here would truncate the
      // per-instruction frequency (to zero whenever count < size).
      if (!BB->empty())
        Freq += static_cast<double>(BB->getExecutionCount()) / BB->size();
    }
    AvgFreq[I] = Freq;
  }

  switch(Priority) {
  case HP_NONE: {
    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
      if (!Clusters[I].empty())
        Order.push_back(I);
    break;
  }
  case HP_BRANCH_PREDICTOR: {
    // Do a topological sort for clusters, prioritizing frequently-executed BBs
    // during the traversal.
    std::stack<uint32_t> Stack;
    std::vector<uint32_t> Status;
    std::vector<uint32_t> Parent;
    Status.resize(Clusters.size(), 0);
    Parent.resize(Clusters.size(), 0);
    constexpr uint32_t STACKED = 1;
    constexpr uint32_t VISITED = 2;
    Status[0] = STACKED;
    Stack.push(0);
    while (!Stack.empty()) {
      uint32_t I = Stack.top();
      if (!(Status[I] & VISITED)) {
        Status[I] |= VISITED;
        // Order successors by weight
        auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
          return ClusterEdges[I][A] > ClusterEdges[I][B];
        };
        std::priority_queue<uint32_t, std::vector<uint32_t>,
                            decltype(ClusterComp)> SuccQueue(ClusterComp);
        for (auto &Target: ClusterEdges[I]) {
          if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
              !Clusters[Target.first].empty()) {
            Parent[Target.first] = I;
            Status[Target.first] = STACKED;
            SuccQueue.push(Target.first);
          }
        }
        while (!SuccQueue.empty()) {
          Stack.push(SuccQueue.top());
          SuccQueue.pop();
        }
        continue;
      }
      // Already visited this node
      Stack.pop();
      Order.push_back(I);
    }
    std::reverse(Order.begin(), Order.end());
    // Put unreachable clusters at the end
    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
      if (!(Status[I] & VISITED) && !Clusters[I].empty())
        Order.push_back(I);

    // Sort nodes with equal precedence
    auto Beg = Order.begin();
    // Don't reorder the first cluster, which contains the function entry point
    ++Beg;
    std::stable_sort(Beg, Order.end(),
                     [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                       uint32_t P = Parent[A];
                       while (Parent[P] != 0) {
                         if (Parent[P] == B)
                           return false;
                         P = Parent[P];
                       }
                       P = Parent[B];
                       while (Parent[P] != 0) {
                         if (Parent[P] == A)
                           return true;
                         P = Parent[P];
                       }
                       return AvgFreq[A] > AvgFreq[B];
                     });
    break;
  }
  case HP_CACHE_UTILIZATION: {
    // Order clusters based on average instruction execution frequency
    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
      if (!Clusters[I].empty())
        Order.push_back(I);
    auto Beg = Order.begin();
    // Don't reorder the first cluster, which contains the function entry point
    ++Beg;
    std::stable_sort(Beg, Order.end(), [&AvgFreq](uint32_t A, uint32_t B) {
      return AvgFreq[A] > AvgFreq[B];
    });

    break;
  }
  }

  // Flatten the ordered clusters into the new block layout.
  BasicBlocksLayout.clear();
  for (auto I : Order) {
    auto &Cluster = Clusters[I];
    BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(),
                             Cluster.end());
  }

  fixBranches();
}

/// Compute a block layout that maximizes the total weight of consecutive
/// (fall-through) block pairs, using dynamic programming over subsets.
///
/// DP[Set][Last] holds the best total weight of a path that starts at the
/// first block of the current layout, visits exactly the blocks in the
/// bitmask Set and ends at block Last (-1 if no such path exists). The table
/// has 2^N rows, so this is only usable for a small number of blocks N
/// (the visible caller limits N to FUNC_SIZE_THRESHOLD). Blocks that are not
/// part of the best path are appended afterwards in their previous relative
/// order. Branches are fixed up to match the new layout at the end.
void BinaryFunction::solveOptimalLayout() {
  std::vector<std::vector<uint64_t>> Weight;
  std::map<BinaryBasicBlock *, int> BBToIndex;
  std::vector<BinaryBasicBlock *> IndexToBB;

  DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n");

  unsigned N = BasicBlocksLayout.size();
  // Nothing to lay out; the DP table below needs at least one block.
  if (N == 0)
    return;

  // Populating weight map and index map
  for (auto BB : BasicBlocksLayout) {
    BBToIndex[BB] = IndexToBB.size();
    IndexToBB.push_back(BB);
  }
  Weight.resize(N);
  for (auto BB : BasicBlocksLayout) {
    auto BI = BB->BranchInfo.begin();
    Weight[BBToIndex[BB]].resize(N);
    for (auto I : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
        Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
      ++BI;
    }
  }

  std::vector<std::vector<int64_t>> DP;
  DP.resize(1U << N);
  for (auto &Elmt : DP) {
    Elmt.resize(N, -1);
  }
  // Start with the entry basic block being allocated with cost zero
  DP[1][0] = 0;
  // Walk through TSP solutions using a bitmask to represent state (current set
  // of BBs in the layout)
  unsigned BestSet = 1;
  unsigned BestLast = 0;
  int64_t BestWeight = 0;
  for (unsigned Set = 1; Set < (1U << N); ++Set) {
    // Traverse each possibility of Last BB visited in this layout
    for (unsigned Last = 0; Last < N; ++Last) {
      // Case 1: There is no possible layout with this BB as Last
      if (DP[Set][Last] == -1)
        continue;

      // Case 2: There is a layout with this Set and this Last, and we try
      // to expand this set with New
      for (unsigned New = 1; New < N; ++New) {
        // Case 2a: BB "New" is already in this Set
        if ((Set & (1U << New)) != 0)
          continue;

        // Case 2b: BB "New" is not in this set and we add it to this Set and
        // record total weight of this layout with "New" as the last BB.
        // Candidate is never negative, so it also replaces the -1 marker.
        unsigned NewSet = (Set | (1U << New));
        int64_t Candidate =
            DP[Set][Last] + static_cast<int64_t>(Weight[Last][New]);
        if (Candidate > DP[NewSet][New])
          DP[NewSet][New] = Candidate;

        if (DP[NewSet][New] > BestWeight) {
          BestWeight = DP[NewSet][New];
          BestSet = NewSet;
          BestLast = New;
        }
      }
    }
  }

  std::vector<BinaryBasicBlock *> PastLayout = BasicBlocksLayout;

  // Define final function layout based on layout that maximizes weight.
  // The path is rebuilt backwards, from the last block to the entry block.
  BasicBlocksLayout.clear();
  unsigned Last = BestLast;
  unsigned Set = BestSet;
  std::vector<bool> Visited;
  Visited.resize(N);
  Visited[Last] = true;
  BasicBlocksLayout.push_back(IndexToBB[Last]);
  Set = Set & ~(1U << Last);
  while (Set != 0) {
    // Pick the predecessor that actually produced the optimal value for the
    // block placed after it: the weight of the remaining sub-path plus the
    // weight of the edge into that block. Looking at DP[Set][I] alone could
    // select a predecessor that is not on the optimal path.
    int64_t Best = -1;
    unsigned Pred = Last;
    for (unsigned I = 0; I < N; ++I) {
      if (DP[Set][I] == -1)
        continue;
      int64_t Candidate = DP[Set][I] + static_cast<int64_t>(Weight[I][Last]);
      if (Candidate > Best) {
        Pred = I;
        Best = Candidate;
      }
    }
    assert(Best != -1 && "no valid predecessor in layout reconstruction");
    Last = Pred;
    Visited[Last] = true;
    BasicBlocksLayout.push_back(IndexToBB[Last]);
    Set = Set & ~(1U << Last);
  }
  std::reverse(BasicBlocksLayout.begin(), BasicBlocksLayout.end());

  // Finalize layout with BBs that weren't assigned to the layout
  for (auto BB : PastLayout) {
    if (Visited[BBToIndex[BB]] == false)
      BasicBlocksLayout.push_back(BB);
  }

  fixBranches();
}

/// Return the block that follows \p BB in the BasicBlocks container, i.e. the
/// first block that compares greater than \p BB, or nullptr when there is no
/// such block.
const BinaryBasicBlock *
BinaryFunction::getOriginalLayoutSuccessor(const BinaryBasicBlock *BB) const {
  auto NextBB = std::upper_bound(BasicBlocks.begin(), BasicBlocks.end(), *BB);
  assert(NextBB != BasicBlocks.begin() && "first basic block not at offset 0");

  return NextBB == BasicBlocks.end() ? nullptr : &*NextBB;
}

void BinaryFunction::fixBranches() {
  auto &MIA = BC.MIA;

  for (unsigned I = 0, E = BasicBlocksLayout.size(); I != E; ++I) {
    BinaryBasicBlock *BB = BasicBlocksLayout[I];

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    if (!MIA->analyzeBranch(BB->Instructions, TBB, FBB, CondBranch,
                            UncondBranch)) {
      continue;
    }

    // Check if the original fall-through for this block has been moved
    const MCSymbol *FT = nullptr;
    if (I + 1 != BasicBlocksLayout.size())
      FT = BasicBlocksLayout[I + 1]->getLabel();
    const BinaryBasicBlock *OldFTBB = getOriginalLayoutSuccessor(BB);
    const MCSymbol *OldFT = nullptr;
    if (OldFTBB != nullptr)
      OldFT = OldFTBB->getLabel();

    // Case 1: There are no branches in this basic block and it just falls
    // through
    if (CondBranch == nullptr && UncondBranch == nullptr) {
      // Case 1a: Last instruction is a return, so it does *not* fall through to
      // the next block.
      if (!BB->empty() && MIA->isReturn(BB->back()))
        continue;
      // Case 1b: Layout has changed and the fallthrough is not the same. Need
      // to add a new unconditional branch to jump to the old fallthrough.
      if (FT != OldFT && OldFT != nullptr) {
        MCInst NewInst;
        if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get()))
          llvm_unreachable("Target does not support creating new branches");
        BB->Instructions.emplace_back(std::move(NewInst));
      }
      // Case 1c: Layout hasn't changed, nothing to do.
      continue;
    }

    // Case 2: There is a single jump, unconditional, in this basic block
    if (CondBranch == nullptr) {
      // Case 2a: It jumps to the new fall-through, so we can delete it
      if (TBB == FT) {
        BB->eraseInstruction(UncondBranch);
      }
      // Case 2b: If 2a doesn't happen, there is nothing we can do
      continue;
    }

    // Case 3: There is a single jump, conditional, in this basic block
    if (UncondBranch == nullptr) {
      // Case 3a: If the taken branch goes to the next block in the new layout,
      // invert this conditional branch logic so we can make this a fallthrough.
      if (TBB == FT) {
        assert(OldFT != nullptr && "malformed CFG");
        if (!MIA->reverseBranchCondition(*CondBranch, OldFT, BC.Ctx.get()))
          llvm_unreachable("Target does not support reversing branches");
        continue;
      }
      // Case 3b: Need to add a new unconditional branch because layout
      // has changed
      if (FT != OldFT && OldFT != nullptr) {
        MCInst NewInst;
        if (!MIA->createUncondBranch(NewInst, OldFT, BC.Ctx.get()))
          llvm_unreachable("Target does not support creating new branches");
        BB->Instructions.emplace_back(std::move(NewInst));
        continue;
      }
      // Case 3c: Old fall-through is the same as the new one, no need to change
      continue;
    }

    // Case 4: There are two jumps in this basic block, one conditional followed
    // by another unconditional.
    // Case 4a: If the unconditional jump target is the new fall through,
    // delete it.
    if (FBB == FT) {
      BB->eraseInstruction(UncondBranch);
      continue;
    }
    // Case 4b: If the taken branch goes to the next block in the new layout,
    // invert this conditional branch logic so we can make this a fallthrough.
    // Now we don't need the unconditional jump anymore, so we also delete it.
    if (TBB == FT) {
      if (!MIA->reverseBranchCondition(*CondBranch, FBB, BC.Ctx.get()))
        llvm_unreachable("Target does not support reversing branches");
      BB->eraseInstruction(UncondBranch);
      continue;
    }
    // Case 4c: Nothing interesting happening.
  }
}

} // namespace flo
} // namespace llvm