clang-p2996/bolt/lib/Passes/IndirectCallPromotion.cpp
Amir Ayupov 52cf07116b [BOLT][NFC] Log through JournalingStreams (#81524)
Make core BOLT functionality friendlier to use as a library rather
than only through our standalone driver llvm-bolt. To
accomplish this, we augment BinaryContext with journaling streams
that are to be used by most BOLT code whenever something needs to
be logged to the screen. Users of the library can decide whether logs
should be printed to a file, discarded entirely, or written to the
screen as before. To illustrate this, this patch adds a new option
`--log-file` that allows the user to redirect BOLT logging to a
file on disk or completely hide it by using
`--log-file=/dev/null`. Future BOLT code should now use
`BinaryContext::outs()` for printing important messages instead of
`llvm::outs()`. A new test log.test enforces this by verifying that
no strings are printed to the screen once the `--log-file` option is
used.
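
For illustration, a typical invocation that redirects all BOLT logging
(the binary and file names here are hypothetical) might look like:

  llvm-bolt a.out -o a.out.bolt --data perf.fdata --log-file=bolt.log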

In previous patches we also added a new BOLTError class to report
common and fatal errors, so code shouldn't call exit(1) now. To
easily handle problems as before (by quitting with exit(1)),
callers can now use
`BinaryContext::logBOLTErrorsAndQuitOnFatal(Error)` whenever code
needs to deal with BOLT errors. To test this, we have fatal.s
that checks we are correctly quitting and printing a fatal error
to the screen.
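
As a minimal sketch of the intended pattern (the specific pass object
and call site here are made up for illustration):

  // Run a BOLT pass; log any errors and exit(1) if one is fatal.
  if (Error E = Pass.runOnFunctions(BC))
    BC.logBOLTErrorsAndQuitOnFatal(std::move(E));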

Because this is a significant change by itself, not all code has been
ported yet. Code in the Profiler libs (DataAggregator and friends)
still prints errors directly to the screen.

Co-authored-by: Rafael Auler <rafaelauler@fb.com>

Test Plan: NFC
2024-02-12 14:53:53 -08:00

//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the IndirectCallPromotion class.
//
//===----------------------------------------------------------------------===//
#include "bolt/Passes/IndirectCallPromotion.h"
#include "bolt/Passes/BinaryFunctionCallGraph.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/Inliner.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include <iterator>
#define DEBUG_TYPE "ICP"
#define DEBUG_VERBOSE(Level, X) \
if (opts::Verbosity >= (Level)) { \
X; \
}
using namespace llvm;
using namespace bolt;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<IndirectCallPromotionType> ICP;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<unsigned> ExecutionCountThreshold;
static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
"icp-jt-remaining-percent-threshold",
cl::desc("The percentage threshold against remaining unpromoted indirect "
"call count for the promotion for jump tables"),
cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<unsigned> ICPJTTotalPercentThreshold(
"icp-jt-total-percent-threshold",
cl::desc(
"The percentage threshold against total count for the promotion for "
"jump tables"),
cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
"icp-calls-remaining-percent-threshold",
cl::desc("The percentage threshold against remaining unpromoted indirect "
"call count for the promotion for calls"),
cl::init(50), cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
"icp-calls-total-percent-threshold",
cl::desc(
"The percentage threshold against total count for the promotion for "
"calls"),
cl::init(30), cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<unsigned> ICPMispredictThreshold(
"indirect-call-promotion-mispredict-threshold",
cl::desc("misprediction threshold for skipping ICP on an "
"indirect call"),
cl::init(0), cl::cat(BoltOptCategory));
static cl::alias ICPMispredictThresholdAlias(
"icp-mp-threshold",
cl::desc("alias for --indirect-call-promotion-mispredict-threshold"),
cl::aliasopt(ICPMispredictThreshold));
static cl::opt<bool> ICPUseMispredicts(
"indirect-call-promotion-use-mispredicts",
cl::desc("use misprediction frequency for determining whether or not ICP "
"should be applied at a callsite. The "
"-indirect-call-promotion-mispredict-threshold value will be used "
"by this heuristic"),
cl::cat(BoltOptCategory));
static cl::alias ICPUseMispredictsAlias(
"icp-use-mp",
cl::desc("alias for --indirect-call-promotion-use-mispredicts"),
cl::aliasopt(ICPUseMispredicts));
static cl::opt<unsigned>
ICPTopN("indirect-call-promotion-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion. 0 = no limit"),
cl::init(3), cl::cat(BoltOptCategory));
static cl::alias
ICPTopNAlias("icp-topn",
cl::desc("alias for --indirect-call-promotion-topn"),
cl::aliasopt(ICPTopN));
static cl::opt<unsigned> ICPCallsTopN(
"indirect-call-promotion-calls-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion on calls. 0 = no limit"),
cl::init(0), cl::cat(BoltOptCategory));
static cl::alias ICPCallsTopNAlias(
"icp-calls-topn",
cl::desc("alias for --indirect-call-promotion-calls-topn"),
cl::aliasopt(ICPCallsTopN));
static cl::opt<unsigned> ICPJumpTablesTopN(
"indirect-call-promotion-jump-tables-topn",
cl::desc("limit number of targets to consider when doing indirect "
"call promotion on jump tables. 0 = no limit"),
cl::init(0), cl::cat(BoltOptCategory));
static cl::alias ICPJumpTablesTopNAlias(
"icp-jt-topn",
cl::desc("alias for --indirect-call-promotion-jump-tables-topn"),
cl::aliasopt(ICPJumpTablesTopN));
static cl::opt<bool> EliminateLoads(
"icp-eliminate-loads",
cl::desc("enable load elimination using memory profiling data when "
"performing ICP"),
cl::init(true), cl::cat(BoltOptCategory));
static cl::opt<unsigned> ICPTopCallsites(
"icp-top-callsites",
cl::desc("optimize hottest calls until at least this percentage of all "
"indirect calls frequency is covered. 0 = all callsites"),
cl::init(99), cl::Hidden, cl::cat(BoltOptCategory));
static cl::list<std::string>
ICPFuncsList("icp-funcs", cl::CommaSeparated,
cl::desc("list of functions to enable ICP for"),
cl::value_desc("func1,func2,func3,..."), cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<bool>
ICPOldCodeSequence("icp-old-code-sequence",
cl::desc("use old code sequence for promoted calls"),
cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<bool> ICPJumpTablesByTarget(
"icp-jump-tables-targets",
cl::desc(
"for jump tables, optimize indirect jmp targets instead of indices"),
cl::Hidden, cl::cat(BoltOptCategory));
static cl::alias
ICPJumpTablesByTargetAlias("icp-jt-targets",
cl::desc("alias for --icp-jump-tables-targets"),
cl::aliasopt(ICPJumpTablesByTarget));
static cl::opt<bool> ICPPeelForInline(
"icp-inline", cl::desc("only promote call targets eligible for inlining"),
cl::Hidden, cl::cat(BoltOptCategory));
} // namespace opts
#ifndef NDEBUG
static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
bool IsValid = true;
for (auto &BFI : BFs) {
BinaryFunction &BF = BFI.second;
if (!BF.isSimple())
continue;
for (const BinaryBasicBlock &BB : BF) {
auto BI = BB.branch_info_begin();
for (BinaryBasicBlock *SuccBB : BB.successors()) {
if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
if (BB.getKnownExecutionCount() == 0 ||
SuccBB->getKnownExecutionCount() == 0) {
BF.getBinaryContext().errs()
<< "BOLT-WARNING: profile verification failed after ICP for "
"function "
<< BF << '\n';
IsValid = false;
}
}
++BI;
}
}
}
return IsValid;
}
#endif
namespace llvm {
namespace bolt {
IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
const IndirectCallProfile &ICP)
: From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
Branches(ICP.Count) {
if (ICP.Symbol) {
To.Sym = ICP.Symbol;
To.Addr = 0;
}
}
void IndirectCallPromotion::printDecision(
llvm::raw_ostream &OS,
std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
uint64_t TotalCount = 0;
uint64_t TotalMispreds = 0;
for (const Callsite &S : Targets) {
TotalCount += S.Branches;
TotalMispreds += S.Mispreds;
}
if (!TotalCount)
TotalCount = 1;
if (!TotalMispreds)
TotalMispreds = 1;
OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
<< " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
<< "\n";
size_t I = 0;
for (const Callsite &S : Targets) {
OS << "Count = " << S.Branches << ", "
<< format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
<< "Mispreds = " << S.Mispreds << ", "
<< format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
if (I < N)
OS << " * to be optimized *";
if (!S.JTIndices.empty()) {
OS << " Indices:";
for (const uint64_t Idx : S.JTIndices)
OS << " " << Idx;
}
OS << "\n";
I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
}
}
// Get the list of targets for a given call, sorted by most frequently
// called first.
std::vector<IndirectCallPromotion::Callsite>
IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
const MCInst &Inst) const {
BinaryFunction &BF = *BB.getFunction();
const BinaryContext &BC = BF.getBinaryContext();
std::vector<Callsite> Targets;
if (const JumpTable *JT = BF.getJumpTable(Inst)) {
// Don't support PIC jump tables for now
if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
return Targets;
const Location From(BF.getSymbol());
const std::pair<size_t, size_t> Range =
JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
JumpTable::JumpInfo DefaultJI;
const JumpTable::JumpInfo *JI =
JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
assert(JT->Type == JumpTable::JTT_PIC ||
JT->EntrySize == BC.AsmInfo->getCodePointerSize());
for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
MCSymbol *Entry = JT->Entries[I];
const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry);
assert(ToBB || Entry == BF.getFunctionEndLabel() ||
Entry == BF.getFunctionEndLabel(FragmentNum::cold()));
if (Entry == BF.getFunctionEndLabel() ||
Entry == BF.getFunctionEndLabel(FragmentNum::cold()))
continue;
const Location To(Entry);
const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB);
Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
I - Range.first);
}
// Sort by symbol then addr.
llvm::sort(Targets, [](const Callsite &A, const Callsite &B) {
if (A.To.Sym && B.To.Sym)
return A.To.Sym < B.To.Sym;
else if (A.To.Sym && !B.To.Sym)
return true;
else if (!A.To.Sym && B.To.Sym)
return false;
else
return A.To.Addr < B.To.Addr;
});
// Targets may contain multiple entries for the same target, but using
// different indices. Their profile will report the same number of branches
// for different indices if the target is the same. That's because we don't
// profile the index value, but only the target via LBR.
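// For example, entries (A, idx 0), (A, idx 2), (B, idx 1) are compacted
// below, std::unique-style, into (A, {0, 2}) and (B, {1}).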
auto First = Targets.begin();
auto Last = Targets.end();
auto Result = First;
while (++First != Last) {
Callsite &A = *Result;
const Callsite &B = *First;
if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
B.JTIndices.end());
else
*(++Result) = *First;
}
++Result;
LLVM_DEBUG(if (Targets.end() - Result > 0) {
dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
<< " duplicate targets removed\n";
});
Targets.erase(Result, Targets.end());
} else {
// Don't try to optimize PC relative indirect calls.
if (Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
return Targets;
const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
Inst, "CallProfile");
if (ICSP) {
for (const IndirectCallProfile &CSP : ICSP.get()) {
Callsite Site(BF, CSP);
if (Site.isValid())
Targets.emplace_back(std::move(Site));
}
}
}
// Sort by target count, number of indices in case of jump table, and
// mispredicts. We prioritize targets with high count, small number of indices
// and high mispredicts. Break ties by selecting targets with lower addresses.
llvm::stable_sort(Targets, [](const Callsite &A, const Callsite &B) {
if (A.Branches != B.Branches)
return A.Branches > B.Branches;
if (A.JTIndices.size() != B.JTIndices.size())
return A.JTIndices.size() < B.JTIndices.size();
if (A.Mispreds != B.Mispreds)
return A.Mispreds > B.Mispreds;
return A.To.Addr < B.To.Addr;
});
// Remove non-symbol targets
llvm::erase_if(Targets, [](const Callsite &CS) { return !CS.To.Sym; });
LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
uint64_t TotalCount = 0;
uint64_t TotalMispreds = 0;
for (const Callsite &S : Targets) {
TotalCount += S.Branches;
TotalMispreds += S.Mispreds;
}
if (!TotalCount)
TotalCount = 1;
if (!TotalMispreds)
TotalMispreds = 1;
dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
<< ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
<< "\n";
size_t I = 0;
for (const Callsite &S : Targets) {
dbgs() << "Count[" << I << "] = " << S.Branches << ", "
<< format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
<< "Mispreds[" << I << "] = " << S.Mispreds << ", "
<< format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
++I;
}
});
return Targets;
}
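// Use the memory-access profile attached to the jump-table load to recover
// which table indices are hot. Returns (count, index) pairs sorted with the
// highest counts first, or an empty vector if no usable memory profile is
// found; on success, TargetFetchInst is set to the load instruction.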
IndirectCallPromotion::JumpTableInfoType
IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
MCInst &CallInst,
MCInst *&TargetFetchInst,
const JumpTable *JT) const {
assert(JT && "Can't get jump table addrs for non-jump tables.");
BinaryFunction &Function = *BB.getFunction();
BinaryContext &BC = Function.getBinaryContext();
if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
return JumpTableInfoType();
JumpTableInfoType HotTargets;
MCInst *MemLocInstr;
MCInst *PCRelBaseOut;
unsigned BaseReg, IndexReg;
int64_t DispValue;
const MCExpr *DispExpr;
MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);
assert(MemLocInstr && "There should always be a load for jump tables");
if (!MemLocInstr)
return JumpTableInfoType();
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
<< "jump table in " << Function << " at @ "
<< (&CallInst - &BB.front()) << "\n"
<< "BOLT-INFO: ICP target fetch instructions:\n";
BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
if (MemLocInstr != &CallInst)
BC.printInstruction(dbgs(), CallInst, 0, &Function);
});
DEBUG_VERBOSE(1, {
dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
<< "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
<< "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
<< "DispValue = " << Twine::utohexstr(DispValue) << ", "
<< "DispExpr = " << DispExpr << ", "
<< "MemLocInstr = ";
BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
dbgs() << "\n";
});
++TotalIndexBasedCandidates;
auto ErrorOrMemAccessProfile =
BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
"MemoryAccessProfile");
if (!ErrorOrMemAccessProfile) {
DEBUG_VERBOSE(1, dbgs()
<< "BOLT-INFO: ICP no memory profiling data found\n");
return JumpTableInfoType();
}
MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
uint64_t ArrayStart;
if (DispExpr) {
ErrorOr<uint64_t> DispValueOrError =
BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
assert(DispValueOrError && "global symbol needs a value");
ArrayStart = *DispValueOrError;
} else {
ArrayStart = static_cast<uint64_t>(DispValue);
}
if (BaseReg == BC.MRI->getProgramCounter())
ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;
// This is a map of [symbol] -> [count, index] and is used to combine indices
// into the jump table since there may be multiple addresses that all have the
// same entry.
std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);
for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
size_t Index;
// Mem data occasionally includes nullptrs; ignore them.
if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
continue;
if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
return JumpTableInfoType();
if (AccessInfo.MemoryObject) {
// Deal with bad/stale data
if (!AccessInfo.MemoryObject->getName().starts_with(
"JUMP_TABLE/" + Function.getOneName().str()))
return JumpTableInfoType();
Index =
(AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
} else {
Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
}
// If Index is out of range it probably means the memory profiling data is
// wrong for this instruction; bail out.
if (Index >= Range.second) {
LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
<< ", " << Range.second << "\n");
return JumpTableInfoType();
}
// Make sure the hot index points at a legal label corresponding to a BB,
// e.g. not the end of function (unreachable) label.
if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
LLVM_DEBUG({
dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
<< "label " << JT->Entries[Index + Range.first]->getName()
<< " in jump table:\n";
JT->print(dbgs());
dbgs() << "HotTargetMap:\n";
for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
HotTargetMap)
dbgs() << "BOLT-INFO: " << HT.first->getName()
<< " = (count=" << HT.second.first
<< ", index=" << HT.second.second << ")\n";
});
return JumpTableInfoType();
}
std::pair<uint64_t, uint64_t> &HotTarget =
HotTargetMap[JT->Entries[Index + Range.first]];
HotTarget.first += AccessInfo.Count;
HotTarget.second = Index;
}
llvm::copy(llvm::make_second_range(HotTargetMap),
std::back_inserter(HotTargets));
// Sort with highest counts first.
llvm::sort(reverse(HotTargets));
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
<< "Count = " << Target.first << "\n";
});
BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;
TargetFetchInst = MemLocInstr;
return HotTargets;
}
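// Resolve the top-N targets to (symbol, jump-table index) pairs (index 0 for
// plain indirect calls). For jump tables with a memory profile, the hot-index
// information may recompute N and reorder Targets.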
IndirectCallPromotion::SymTargetsType
IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
size_t &N, BinaryBasicBlock &BB,
MCInst &CallInst,
MCInst *&TargetFetchInst) const {
const BinaryContext &BC = BB.getFunction()->getBinaryContext();
const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
SymTargetsType SymTargets;
if (!JT) {
for (size_t I = 0; I < N; ++I) {
assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
assert(Targets[I].JTIndices.empty() &&
"Can't have jump table indices for non-jump tables");
SymTargets.emplace_back(Targets[I].To.Sym, 0);
}
return SymTargets;
}
// Use memory profile to select hot targets.
JumpTableInfoType HotTargets =
maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);
auto findTargetsIndex = [&](uint64_t JTIndex) {
for (size_t I = 0; I < Targets.size(); ++I)
if (llvm::is_contained(Targets[I].JTIndices, JTIndex))
return I;
LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
<< " table entry in " << *BB.getFunction() << "\n");
llvm_unreachable("Hot indices must be referred to by at least one "
"callsite");
};
if (!HotTargets.empty()) {
if (opts::Verbosity >= 1)
for (size_t I = 0; I < HotTargets.size(); ++I)
BC.outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
<< HotTargets[I].first << ", " << HotTargets[I].second
<< ")\n";
// Recompute hottest targets, now discriminating which index is hot
// NOTE: This is a tradeoff. On one hand, we get index information. On the
// other hand, info coming from the memory profile is much less accurate
// than LBRs. So we may actually end up working with more coarse
// profile granularity in exchange for information about indices.
std::vector<Callsite> NewTargets;
std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
uint64_t TotalMemAccesses = 0;
for (size_t I = 0; I < HotTargets.size(); ++I) {
const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
++IndicesPerTarget[Targets[TargetIndex].To.Sym];
TotalMemAccesses += HotTargets[I].first;
}
uint64_t RemainingMemAccesses = TotalMemAccesses;
const size_t TopN =
opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
size_t I = 0;
for (; I < HotTargets.size(); ++I) {
const uint64_t MemAccesses = HotTargets[I].first;
if (100 * MemAccesses <
TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
break;
if (100 * MemAccesses <
RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
break;
if (TopN && I >= TopN)
break;
RemainingMemAccesses -= MemAccesses;
const uint64_t JTIndex = HotTargets[I].second;
Callsite &Target = Targets[findTargetsIndex(JTIndex)];
NewTargets.push_back(Target);
std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
llvm::erase(Target.JTIndices, JTIndex);
// Keep fixCFG counts sane if more indices use this same target later
assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
NewTargets.back().Branches =
Target.Branches / IndicesPerTarget[Target.To.Sym];
NewTargets.back().Mispreds =
Target.Mispreds / IndicesPerTarget[Target.To.Sym];
assert(Target.Branches >= NewTargets.back().Branches);
assert(Target.Mispreds >= NewTargets.back().Mispreds);
Target.Branches -= NewTargets.back().Branches;
Target.Mispreds -= NewTargets.back().Mispreds;
}
llvm::copy(Targets, std::back_inserter(NewTargets));
std::swap(NewTargets, Targets);
N = I;
if (N == 0 && opts::Verbosity >= 1) {
BC.outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
<< BB.getName() << ": failed to meet thresholds after memory "
<< "profile data was loaded.\n";
return SymTargets;
}
}
for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
Callsite &Target = Targets[TgtIdx];
assert(Target.To.Sym && "All ICP targets must be to known symbols");
assert(!Target.JTIndices.empty() && "Jump tables must have indices");
for (uint64_t Idx : Target.JTIndices) {
SymTargets.emplace_back(Target.To.Sym, Idx);
++I;
}
}
return SymTargets;
}
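// For an indirect (virtual method) call, try to pair each target method with
// its vtable (symbol, addend) using the memory profile of the method-load
// instruction. Returns an empty MethodInfoType if the call cannot be
// analyzed, a vtable is missing, or the vtable register may be clobbered
// before the call.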
IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
BinaryBasicBlock &BB, MCInst &Inst,
const SymTargetsType &SymTargets) const {
BinaryFunction &Function = *BB.getFunction();
BinaryContext &BC = Function.getBinaryContext();
std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
std::vector<MCInst *> MethodFetchInsns;
unsigned VtableReg, MethodReg;
uint64_t MethodOffset;
assert(!Function.getJumpTable(Inst) &&
"Can't get vtable addrs for jump tables.");
if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
return MethodInfoType();
MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
MethodFetchInsns, VtableReg, MethodReg,
MethodOffset)) {
DEBUG_VERBOSE(
1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
<< Function << " at @ " << (&Inst - &BB.front()) << "\n");
return MethodInfoType();
}
++TotalMethodLoadEliminationCandidates;
DEBUG_VERBOSE(1, {
dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
<< " at @ " << (&Inst - &BB.front()) << "\n";
dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
for (MCInst *Inst : MethodFetchInsns)
BC.printInstruction(dbgs(), *Inst, 0, &Function);
if (MethodFetchInsns.back() != &Inst)
BC.printInstruction(dbgs(), Inst, 0, &Function);
});
// Try to get value profiling data for the method load instruction.
auto ErrorOrMemAccessProfile =
BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
"MemoryAccessProfile");
if (!ErrorOrMemAccessProfile) {
DEBUG_VERBOSE(1, dbgs()
<< "BOLT-INFO: ICP no memory profiling data found\n");
return MethodInfoType();
}
MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
// Find the vtable that each method belongs to.
std::map<const MCSymbol *, uint64_t> MethodToVtable;
for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
uint64_t Address = AccessInfo.Offset;
if (AccessInfo.MemoryObject)
Address += AccessInfo.MemoryObject->getAddress();
// Ignore bogus data.
if (!Address)
continue;
const uint64_t VtableBase = Address - MethodOffset;
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
<< Twine::utohexstr(VtableBase) << "+"
<< MethodOffset << "/" << AccessInfo.Count << "\n");
if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
if (!MethodBD) // skip unknown methods
continue;
MCSymbol *MethodSym = MethodBD->getSymbol();
MethodToVtable[MethodSym] = VtableBase;
DEBUG_VERBOSE(1, {
const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
dbgs() << "BOLT-INFO: ICP found method = "
<< Twine::utohexstr(MethodAddr.get()) << "/"
<< (Method ? Method->getPrintName() : "") << "\n";
});
}
}
// Find the vtable for each target symbol.
for (size_t I = 0; I < SymTargets.size(); ++I) {
auto Itr = MethodToVtable.find(SymTargets[I].first);
if (Itr != MethodToVtable.end()) {
if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
const uint64_t Addend = Itr->second - BD->getAddress();
VtableSyms.emplace_back(BD->getSymbol(), Addend);
continue;
}
}
// Give up if we can't find the vtable for a method.
DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
<< SymTargets[I].first->getName() << "\n");
return MethodInfoType();
}
// Make sure the vtable reg is not clobbered by the argument passing code
if (VtableReg != MethodReg) {
for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
++CurInst) {
const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
return MethodInfoType();
}
}
return MethodInfoType(VtableSyms, MethodFetchInsns);
}
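// Splice the promoted code produced by the MIB into the function: the first
// (symbol, instructions) fragment of ICPcode replaces the original indirect
// call, each remaining fragment becomes a new basic block, and for a non-tail
// indirect call the instructions that followed the call are moved into the
// last new block, which becomes the merge block.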
std::vector<std::unique_ptr<BinaryBasicBlock>>
IndirectCallPromotion::rewriteCall(
BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
MCPlusBuilder::BlocksVectorTy &&ICPcode,
const std::vector<MCInst *> &MethodFetchInsns) const {
BinaryFunction &Function = *IndCallBlock.getFunction();
MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
// Create new basic blocks with correct code in each one first.
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
const bool IsTailCallOrJT =
(MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));
// If we are tracking the indirect call/jump address, propagate the address to
// the ICP code.
const std::optional<uint32_t> IndirectInstrOffset = MIB->getOffset(CallInst);
if (IndirectInstrOffset) {
for (auto &[Symbol, Instructions] : ICPcode)
for (MCInst &Inst : Instructions)
MIB->setOffset(Inst, *IndirectInstrOffset);
}
// Move instructions from the tail of the original call block
// to the merge block.
// Remember any pseudo instructions following a tail call. These
// must be preserved and moved to the original block.
InstructionListType TailInsts;
const MCInst *TailInst = &CallInst;
if (IsTailCallOrJT)
while (TailInst + 1 < &(*IndCallBlock.end()) &&
MIB->isPseudo(*(TailInst + 1)))
TailInsts.push_back(*++TailInst);
InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
// Link new BBs to the original input offset of the indirect call site or its
// containing BB, so we can map samples recorded in new BBs back to the
// original BB seen in the input binary (if using BAT).
const uint32_t OrigOffset = IndirectInstrOffset
? *IndirectInstrOffset
: IndCallBlock.getInputOffset();
IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
MethodFetchInsns.end());
if (IndCallBlock.empty() ||
(!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
IndCallBlock.addInstructions(ICPcode.front().second.begin(),
ICPcode.front().second.end());
else
IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
ICPcode.front().second);
IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());
for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
MCSymbol *&Sym = Itr->first;
InstructionListType &Insts = Itr->second;
assert(Sym);
std::unique_ptr<BinaryBasicBlock> TBB = Function.createBasicBlock(Sym);
TBB->setOffset(OrigOffset);
for (MCInst &Inst : Insts) // sanitize new instructions.
if (MIB->isCall(Inst))
MIB->removeAnnotation(Inst, "CallProfile");
TBB->addInstructions(Insts.begin(), Insts.end());
NewBBs.emplace_back(std::move(TBB));
}
// Move tail of instructions from after the original call to
// the merge block.
if (!IsTailCallOrJT)
NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());
return NewBBs;
}
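// Patch the CFG after rewriteCall: distribute the original block's execution
// count across the new blocks, add taken/fall-through successor edges, and
// return the merge block where control flow rejoins (non-null only for
// promoted non-tail calls).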
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
const bool IsTailCall, const bool IsJumpTable,
IndirectCallPromotion::BasicBlocksVector &&NewBBs,
const std::vector<Callsite> &Targets) const {
BinaryFunction &Function = *IndCallBlock.getFunction();
using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
BinaryBasicBlock *MergeBlock = nullptr;
// Scale indirect call counts to the execution count of the original
// basic block containing the indirect call.
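// For example, if the block executes 100 times and two targets account for
// 60 and 40 of TotalIndirectBranches = 100, their scaled counts become 60
// and 40; a target reached via multiple jump-table indices splits its counts
// evenly across those entries.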
uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
uint64_t TotalIndirectBranches = 0;
for (const Callsite &Target : Targets)
TotalIndirectBranches += Target.Branches;
if (TotalIndirectBranches == 0)
TotalIndirectBranches = 1;
BinaryBasicBlock::BranchInfoType BBI;
BinaryBasicBlock::BranchInfoType ScaledBBI;
for (const Callsite &Target : Targets) {
const size_t NumEntries =
std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
for (size_t I = 0; I < NumEntries; ++I) {
BBI.push_back(
BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
(Target.Mispreds + NumEntries - 1) / NumEntries});
ScaledBBI.push_back(
BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
(NumEntries * TotalIndirectBranches)),
uint64_t(TotalCount * Target.Mispreds /
(NumEntries * TotalIndirectBranches))});
}
}
if (IsJumpTable) {
BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);
std::vector<MCSymbol *> SymTargets;
for (const Callsite &Target : Targets) {
const size_t NumEntries =
std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
for (size_t I = 0; I < NumEntries; ++I)
SymTargets.push_back(Target.To.Sym);
}
assert(SymTargets.size() > NewBBs.size() - 1 &&
"There must be a target symbol associated with each new BB.");
for (uint64_t I = 0; I < NewBBs.size(); ++I) {
BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
SourceBB->setExecutionCount(TotalCount);
BinaryBasicBlock *TargetBB =
Function.getBasicBlockForLabel(SymTargets[I]);
SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken
TotalCount -= ScaledBBI[I].Count;
SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through
// Update branch info for the indirect jump.
BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
NewIndCallBlock->getBranchInfo(*TargetBB);
if (BranchInfo.Count > BBI[I].Count)
BranchInfo.Count -= BBI[I].Count;
else
BranchInfo.Count = 0;
if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
else
BranchInfo.MispredictedCount = 0;
}
} else {
assert(NewBBs.size() >= 2);
assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
assert(NewBBs.size() % 2 == 1 || IsTailCall);
auto ScaledBI = ScaledBBI.begin();
auto updateCurrentBranchInfo = [&] {
assert(ScaledBI != ScaledBBI.end());
TotalCount -= ScaledBI->Count;
++ScaledBI;
};
if (!IsTailCall) {
MergeBlock = NewBBs.back().get();
IndCallBlock.moveAllSuccessorsTo(MergeBlock);
}
// Fix up successors and execution counts.
updateCurrentBranchInfo();
IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);
const size_t Adj = IsTailCall ? 1 : 2;
for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
assert(TotalCount <= IndCallBlock.getExecutionCount() ||
TotalCount <= uint64_t(TotalIndirectBranches));
uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
if (I % 2 == 0) {
if (MergeBlock)
NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
} else {
assert(I + 2 < NewBBs.size());
updateCurrentBranchInfo();
NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
ExecCount += TotalCount;
}
NewBBs[I]->setExecutionCount(ExecCount);
}
if (MergeBlock) {
// Arrange for the MergeBlock to be the fallthrough for the first
// promoted call block.
std::unique_ptr<BinaryBasicBlock> MBPtr;
std::swap(MBPtr, NewBBs.back());
NewBBs.pop_back();
NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
// TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
}
}
// Update the execution count.
NewBBs.back()->setExecutionCount(TotalCount);
// Update BB and BB layout.
Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
assert(Function.validateCFG());
return MergeBlock;
}
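// Decide whether a callsite is worth optimizing: apply the execution-count,
// total/remaining-percentage, top-N, and (optionally) misprediction-frequency
// thresholds, plus the optional inline-eligibility and -icp-funcs filters.
// Returns the number of hottest targets to promote; zero means skip.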
size_t IndirectCallPromotion::canPromoteCallsite(
const BinaryBasicBlock &BB, const MCInst &Inst,
const std::vector<Callsite> &Targets, uint64_t NumCalls) {
BinaryFunction *BF = BB.getFunction();
const BinaryContext &BC = BF->getBinaryContext();
if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
return 0;
const bool IsJumpTable = BF->getJumpTable(Inst);
auto computeStats = [&](size_t N) {
for (size_t I = 0; I < N; ++I)
if (IsJumpTable)
TotalNumFrequentJmps += Targets[I].Branches;
else
TotalNumFrequentCalls += Targets[I].Branches;
};
// If we have no targets (or no calls), skip this callsite.
if (Targets.empty() || !NumCalls) {
if (opts::Verbosity >= 1) {
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
<< " in " << BB.getName() << ", calls = " << NumCalls
<< ", targets empty or NumCalls == 0.\n";
}
return 0;
}
size_t TopN = opts::ICPTopN;
if (IsJumpTable)
TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
else
TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;
const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();
if (opts::ICPTopCallsites && !BC.MIB->hasAnnotation(Inst, "DoICP"))
return 0;
// Pick the top N targets.
uint64_t TotalMispredictsTopN = 0;
size_t N = 0;
if (opts::ICPUseMispredicts &&
(!IsJumpTable || opts::ICPJumpTablesByTarget)) {
// Count total number of mispredictions for (at most) the top N targets.
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
// is exceeded by fewer targets.
double Threshold = double(opts::ICPMispredictThreshold);
for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
TotalMispredictsTopN += Targets[I].Mispreds;
}
computeStats(N);
// Compute the misprediction frequency of the top N call targets. If this
// frequency is greater than the threshold, we should try ICP on this
// callsite.
const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
if (opts::Verbosity >= 1) {
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
<< " in " << BB.getName() << ", calls = " << NumCalls
<< ", top N mis. frequency " << format("%.1f", TopNFrequency)
<< "% < " << opts::ICPMispredictThreshold << "%\n";
}
return 0;
}
} else {
size_t MaxTargets = 0;
// Count total number of calls for (at most) the top N targets.
// We may choose a smaller N (TrialN vs. N) if the frequency threshold
// is exceeded by fewer targets.
const unsigned TotalThreshold = IsJumpTable
? opts::ICPJTTotalPercentThreshold
: opts::ICPCallsTotalPercentThreshold;
const unsigned RemainingThreshold =
IsJumpTable ? opts::ICPJTRemainingPercentThreshold
: opts::ICPCallsRemainingPercentThreshold;
uint64_t NumRemainingCalls = NumCalls;
for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
break;
if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
break;
if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
TrialN)
break;
TotalMispredictsTopN += Targets[I].Mispreds;
NumRemainingCalls -= Targets[I].Branches;
N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
}
computeStats(MaxTargets);
// Don't check misprediction frequency for jump tables -- we don't really
// care as long as we are saving loads from the jump table.
if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
// Compute the misprediction frequency of the top N call targets. If
// this frequency is less than the threshold, we should skip ICP at
// this callsite.
const double TopNMispredictFrequency =
(100.0 * TotalMispredictsTopN) / NumCalls;
if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
if (opts::Verbosity >= 1) {
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
<< " in " << BB.getName() << ", calls = " << NumCalls
<< ", top N mispredict frequency "
<< format("%.1f", TopNMispredictFrequency) << "% < "
<< opts::ICPMispredictThreshold << "%\n";
}
return 0;
}
}
}
// Filter by inline-ability of target functions, stop at first target that
// can't be inlined.
if (!IsJumpTable && opts::ICPPeelForInline) {
for (size_t I = 0; I < N; ++I) {
const MCSymbol *TargetSym = Targets[I].To.Sym;
const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym);
if (!TargetBF || !BinaryFunctionPass::shouldOptimize(*TargetBF) ||
getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) {
N = I;
break;
}
}
}
// Filter functions that can have ICP applied (for debugging)
if (!opts::ICPFuncsList.empty()) {
for (std::string &Name : opts::ICPFuncsList)
if (BF->hasName(Name))
return N;
return 0;
}
return N;
}
void IndirectCallPromotion::printCallsiteInfo(
const BinaryBasicBlock &BB, const MCInst &Inst,
const std::vector<Callsite> &Targets, const size_t N,
uint64_t NumCalls) const {
BinaryContext &BC = BB.getFunction()->getBinaryContext();
const bool IsTailCall = BC.MIB->isTailCall(Inst);
const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
BC.outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
<< " @ " << InstIdx << " in " << BB.getName()
<< " -> calls = " << NumCalls
<< (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
<< "\n";
for (size_t I = 0; I < N; I++) {
const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
BC.outs() << "BOLT-INFO: ";
if (Targets[I].To.Sym)
BC.outs() << Targets[I].To.Sym->getName();
else
BC.outs() << Targets[I].To.Addr;
BC.outs() << ", calls = " << Targets[I].Branches
<< ", mispreds = " << Targets[I].Mispreds
<< ", taken freq = " << format("%.1f", Frequency) << "%"
<< ", mis. freq = " << format("%.1f", MisFrequency) << "%";
bool First = true;
for (uint64_t JTIndex : Targets[I].JTIndices) {
BC.outs() << (First ? ", indices = " : ", ") << JTIndex;
First = false;
}
BC.outs() << "\n";
}
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP original call instruction:";
BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
});
}
Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
if (opts::ICP == ICP_NONE)
return Error::success();
auto &BFs = BC.getBinaryFunctions();
const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL);
const bool OptimizeJumpTables =
(opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL);
std::unique_ptr<RegAnalysis> RA;
std::unique_ptr<BinaryFunctionCallGraph> CG;
if (OptimizeJumpTables) {
CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
RA.reset(new RegAnalysis(BC, &BFs, &*CG));
}
// If icp-top-callsites is enabled, compute the total number of indirect
// calls and then optimize the hottest callsites that contribute to that
// total.
SetVector<BinaryFunction *> Functions;
if (opts::ICPTopCallsites == 0) {
for (auto &KV : BFs)
Functions.insert(&KV.second);
} else {
using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
std::vector<IndirectCallsite> IndirectCalls;
size_t TotalIndirectCalls = 0;
// Find all the indirect callsites.
for (auto &BFIt : BFs) {
BinaryFunction &Function = BFIt.second;
if (!shouldOptimize(Function))
continue;
const bool HasLayout = !Function.getLayout().block_empty();
for (BinaryBasicBlock &BB : Function) {
if (HasLayout && Function.isSplit() && BB.isCold())
continue;
for (MCInst &Inst : BB) {
const bool IsJumpTable = Function.getJumpTable(Inst);
const bool HasIndirectCallProfile =
BC.MIB->hasAnnotation(Inst, "CallProfile");
const bool IsDirectCall =
(BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0));
if (!IsDirectCall &&
((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
(IsJumpTable && OptimizeJumpTables))) {
uint64_t NumCalls = 0;
for (const Callsite &BInfo : getCallTargets(BB, Inst))
NumCalls += BInfo.Branches;
IndirectCalls.push_back(
std::make_tuple(NumCalls, &Inst, &Function));
TotalIndirectCalls += NumCalls;
}
}
}
}
// Sort callsites by execution count.
llvm::sort(reverse(IndirectCalls));
// Find callsites that contribute to the top "opts::ICPTopCallsites"%
// number of calls.
const float TopPerc = opts::ICPTopCallsites / 100.0f;
int64_t MaxCalls = TotalIndirectCalls * TopPerc;
uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
size_t Num = 0;
for (const IndirectCallsite &IC : IndirectCalls) {
const uint64_t CurFreq = std::get<0>(IC);
// Once we decide to stop, include at least all branches that share the
// same frequency as the last one, to avoid non-deterministic behavior
// (e.g. turning ICP on/off depending on the order of functions).
if (MaxCalls <= 0 && CurFreq != LastFreq)
break;
MaxCalls -= CurFreq;
LastFreq = CurFreq;
BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
Functions.insert(std::get<2>(IC));
++Num;
}
BC.outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
<< ", " << Num << " callsites cover " << opts::ICPTopCallsites
<< "% of all indirect calls\n";
}
for (BinaryFunction *FuncPtr : Functions) {
BinaryFunction &Function = *FuncPtr;
if (!shouldOptimize(Function))
continue;
const bool HasLayout = !Function.getLayout().block_empty();
// Total number of indirect calls issued from the current Function.
// (a fraction of TotalIndirectCalls)
uint64_t FuncTotalIndirectCalls = 0;
uint64_t FuncTotalIndirectJmps = 0;
std::vector<BinaryBasicBlock *> BBs;
for (BinaryBasicBlock &BB : Function) {
// Skip indirect calls in cold blocks.
if (!HasLayout || !Function.isSplit() || !BB.isCold())
BBs.push_back(&BB);
}
if (BBs.empty())
continue;
DataflowInfoManager Info(Function, RA.get(), nullptr);
while (!BBs.empty()) {
BinaryBasicBlock *BB = BBs.back();
BBs.pop_back();
for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
MCInst &Inst = BB->getInstructionAtIndex(Idx);
const ptrdiff_t InstIdx = &Inst - &(*BB->begin());
const bool IsTailCall = BC.MIB->isTailCall(Inst);
const bool HasIndirectCallProfile =
BC.MIB->hasAnnotation(Inst, "CallProfile");
const bool IsJumpTable = Function.getJumpTable(Inst);
if (BC.MIB->isCall(Inst))
TotalCalls += BB->getKnownExecutionCount();
if (IsJumpTable && !OptimizeJumpTables)
continue;
if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
continue;
// Ignore direct calls.
if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0))
continue;
assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) &&
"expected a call or an indirect jump instruction");
if (IsJumpTable)
++TotalJumpTableCallsites;
else
++TotalIndirectCallsites;
std::vector<Callsite> Targets = getCallTargets(*BB, Inst);
// Compute the total number of calls from this particular callsite.
uint64_t NumCalls = 0;
for (const Callsite &BInfo : Targets)
NumCalls += BInfo.Branches;
if (!IsJumpTable)
FuncTotalIndirectCalls += NumCalls;
else
FuncTotalIndirectJmps += NumCalls;
// If the FLAGS register is live after this jump site, do not try
// promoting, because we would clobber FLAGS.
if (IsJumpTable) {
ErrorOr<const BitVector &> State =
Info.getLivenessAnalysis().getStateBefore(Inst);
if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
if (opts::Verbosity >= 1)
BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
<< InstIdx << " in " << BB->getName()
<< ", calls = " << NumCalls
<< (State ? ", cannot clobber flags reg.\n"
: ", no liveness data available.\n");
continue;
}
}
// Should this callsite be optimized? Return the number of targets
// to use when promoting this call. A value of zero means to skip
// this callsite.
size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls);
// If it is a jump table and it failed to meet our initial threshold,
// proceed to findCallTargetSymbols -- it may reevaluate N if
// memory profile is present
if (!N && !IsJumpTable)
continue;
if (opts::Verbosity >= 1)
printCallsiteInfo(*BB, Inst, Targets, N, NumCalls);
// Find MCSymbols or absolute addresses for each call target.
MCInst *TargetFetchInst = nullptr;
const SymTargetsType SymTargets =
findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst);
// findCallTargetSymbols may have changed N if mem profile is available
// for jump tables
if (!N)
continue;
LLVM_DEBUG(printDecision(dbgs(), Targets, N));
// If we can't resolve any of the target symbols, punt on this callsite.
// TODO: can this ever happen?
if (SymTargets.size() < N) {
const size_t LastTarget = SymTargets.size();
if (opts::Verbosity >= 1)
BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
<< InstIdx << " in " << BB->getName()
<< ", calls = " << NumCalls
<< ", ICP failed to find target symbol for "
<< Targets[LastTarget].To.Sym->getName() << "\n";
continue;
}
MethodInfoType MethodInfo;
if (!IsJumpTable) {
MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets);
TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
LLVM_DEBUG(dbgs()
<< "BOLT-INFO: ICP "
<< (!MethodInfo.first.empty() ? "found" : "did not find")
<< " vtables for all methods.\n");
} else if (TargetFetchInst) {
++TotalIndexBasedJumps;
MethodInfo.second.push_back(TargetFetchInst);
}
// Generate new promoted call code for this callsite.
MCPlusBuilder::BlocksVectorTy ICPcode =
(IsJumpTable && !opts::ICPJumpTablesByTarget)
? BC.MIB->jumpTablePromotion(Inst, SymTargets,
MethodInfo.second, BC.Ctx.get())
: BC.MIB->indirectCallPromotion(
Inst, SymTargets, MethodInfo.first, MethodInfo.second,
opts::ICPOldCodeSequence, BC.Ctx.get());
if (ICPcode.empty()) {
if (opts::Verbosity >= 1)
BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
<< InstIdx << " in " << BB->getName()
<< ", calls = " << NumCalls
<< ", unable to generate promoted call code.\n";
continue;
}
LLVM_DEBUG({
uint64_t Offset = Targets[0].From.Addr;
dbgs() << "BOLT-INFO: ICP indirect call code:\n";
for (const auto &entry : ICPcode) {
const MCSymbol *const &Sym = entry.first;
const InstructionListType &Insts = entry.second;
if (Sym)
dbgs() << Sym->getName() << ":\n";
Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(),
Offset);
}
dbgs() << "---------------------------------------------------\n";
});
// Rewrite the CFG with the newly generated ICP code.
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second);
// Fix the CFG after inserting the new basic blocks.
BinaryBasicBlock *MergeBlock =
fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets);
// Since the tail of the original block was split off and it may contain
// additional indirect calls, we must add the merge block to the set of
// blocks to process.
if (MergeBlock)
BBs.push_back(MergeBlock);
if (opts::Verbosity >= 1)
BC.outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
<< InstIdx << " in " << BB->getName()
<< " -> calls = " << NumCalls << "\n";
if (IsJumpTable)
++TotalOptimizedJumpTableCallsites;
else
++TotalOptimizedIndirectCallsites;
Modified.insert(&Function);
}
}
TotalIndirectCalls += FuncTotalIndirectCalls;
TotalIndirectJmps += FuncTotalIndirectJmps;
}
BC.outs()
<< "BOLT-INFO: ICP total indirect callsites with profile = "
<< TotalIndirectCallsites << "\n"
<< "BOLT-INFO: ICP total jump table callsites = "
<< TotalJumpTableCallsites << "\n"
<< "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
<< "BOLT-INFO: ICP percentage of calls that are indirect = "
<< format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
<< "BOLT-INFO: ICP percentage of indirect calls that can be "
"optimized = "
<< format("%.1f", (100.0 * TotalNumFrequentCalls) /
std::max<size_t>(TotalIndirectCalls, 1))
<< "%\n"
<< "BOLT-INFO: ICP percentage of indirect callsites that are "
"optimized = "
<< format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
std::max<uint64_t>(TotalIndirectCallsites, 1))
<< "%\n"
<< "BOLT-INFO: ICP number of method load elimination candidates = "
<< TotalMethodLoadEliminationCandidates << "\n"
<< "BOLT-INFO: ICP percentage of method calls candidates that have "
"loads eliminated = "
<< format("%.1f",
(100.0 * TotalMethodLoadsEliminated) /
std::max<uint64_t>(TotalMethodLoadEliminationCandidates, 1))
<< "%\n"
<< "BOLT-INFO: ICP percentage of indirect branches that are "
"optimized = "
<< format("%.1f", (100.0 * TotalNumFrequentJmps) /
std::max<uint64_t>(TotalIndirectJmps, 1))
<< "%\n"
<< "BOLT-INFO: ICP percentage of jump table callsites that are "
<< "optimized = "
<< format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
std::max<uint64_t>(TotalJumpTableCallsites, 1))
<< "%\n"
<< "BOLT-INFO: ICP number of jump table callsites that can use hot "
<< "indices = " << TotalIndexBasedCandidates << "\n"
<< "BOLT-INFO: ICP percentage of jump table callsites that use hot "
"indices = "
<< format("%.1f", (100.0 * TotalIndexBasedJumps) /
std::max<uint64_t>(TotalIndexBasedCandidates, 1))
<< "%\n";
#ifndef NDEBUG
verifyProfile(BFs);
#endif
return Error::success();
}
} // namespace bolt
} // namespace llvm