clang-p2996/llvm/lib/Transforms/IPO/SampleProfile.cpp
Wei Mi ee35784a90 [SampleFDO] Support enabling -funique-internal-linkage-name.
Now that the -funique-internal-linkage-name flag is available, we want to flip
it on by default, since it is beneficial to have separate sample profiles
for different internal symbols with the same name. As preparation, we
want to avoid regressions caused by the flip.

When we flip -funique-internal-linkage-name on, the profile is collected
from a binary built without -funique-internal-linkage-name, so it has no uniq
suffix, while the IR in the optimized build contains the suffix. This kind of
mismatch may introduce a transient regression.

To avoid such a mismatch, we introduce a NameTable section flag indicating
whether any name in the profile contains a uniq suffix. The compiler
decides whether to keep the uniq suffix during name canonicalization
depending on that flag. The flag is only available for the extbinary
format. For other formats, the compiler keeps the uniq suffix by default,
so they will only experience a transient regression when
-funique-internal-linkage-name is first flipped.

Another type of regression is caused by places where we failed to call
getCanonicalFnName. Those places are fixed.

Differential Revision: https://reviews.llvm.org/D96932
2021-03-09 21:41:40 -08:00
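
A minimal sketch of the name mismatch the flag addresses (the mangled name and
hash below are made up; the uniq suffix marker is assumed to be ".__uniq."):

    IR symbol built with -funique-internal-linkage-name:   _ZL3foov.__uniq.276187634
    profile name collected from a binary without the flag: _ZL3foov

The NameTable section flag records whether profile names carry such a suffix,
so FunctionSamples::getCanonicalFnName() can keep or strip the suffix
accordingly when matching IR names against profile names.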

1920 lines
75 KiB
C++

//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SampleProfileLoader transformation. This pass
// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
// profile information in the given profile.
//
// This pass generates branch weight annotations on the IR:
//
// - prof: Represents branch weights. This annotation is added to branches
// to indicate the weights of each edge coming out of the branch.
// The weight of each edge is the weight of the target block for
// that edge. The weight of a block B is computed as the maximum
// number of samples found in B.
//
//===----------------------------------------------------------------------===//
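// A hedged illustration of the annotation described above (the IR and counts
// are made up): a conditional branch whose targets saw 4000 and 200 samples
// would be annotated roughly as
//   br i1 %cmp, label %if.then, label %if.else, !prof !1
//   !1 = !{!"branch_weights", i32 4001, i32 201}
// (generateMDProfMetadata() below adds one to each edge weight to avoid
// zero weights during propagation).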
#include "llvm/Transforms/IPO/SampleProfile.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ReplayInlineAdvisor.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/SampleContextTracker.h"
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include <system_error>
#include <utility>
#include <vector>
using namespace llvm;
using namespace sampleprof;
using namespace llvm::sampleprofutil;
using ProfileCount = Function::ProfileCount;
#define DEBUG_TYPE "sample-profile"
#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
STATISTIC(NumCSInlined,
"Number of functions inlined with context sensitive profile");
STATISTIC(NumCSNotInlined,
"Number of functions not inlined with context sensitive profile");
STATISTIC(NumMismatchedProfile,
"Number of functions with CFG mismatched profile");
STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
STATISTIC(NumDuplicatedInlinesite,
"Number of inlined callsites with a partial distribution factor");
STATISTIC(NumCSInlinedHitMinLimit,
"Number of functions with FDO inline stopped due to min size limit");
STATISTIC(NumCSInlinedHitMaxLimit,
"Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
NumCSInlinedHitGrowthLimit,
"Number of functions with FDO inline stopped due to growth size limit");
// Command line option to specify the file to read samples from. This is
// mainly used for debugging.
static cl::opt<std::string> SampleProfileFile(
"sample-profile-file", cl::init(""), cl::value_desc("filename"),
cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
// The named file contains a set of transformations that may have been applied
// to the symbol names between the program from which the sample data was
// collected and the current program's symbols.
static cl::opt<std::string> SampleProfileRemappingFile(
"sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
static cl::opt<bool> ProfileSampleAccurate(
"profile-sample-accurate", cl::Hidden, cl::init(false),
cl::desc("If the sample profile is accurate, we will mark all un-sampled "
"callsite and function as having 0 samples. Otherwise, treat "
"un-sampled callsites and functions conservatively as unknown. "));
static cl::opt<bool> ProfileAccurateForSymsInList(
"profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
cl::init(true),
cl::desc("For symbols in profile symbol list, regard their profiles to "
"be accurate. It may be overriden by profile-sample-accurate. "));
static cl::opt<bool> ProfileMergeInlinee(
"sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
cl::desc("Merge past inlinee's profile to outline version if sample "
"profile loader decided not to inline a call site. It will "
"only be enabled when top-down order of profile loading is "
"enabled. "));
static cl::opt<bool> ProfileTopDownLoad(
"sample-profile-top-down-load", cl::Hidden, cl::init(true),
cl::desc("Do profile annotation and inlining for functions in top-down "
"order of call graph during sample profile loading. It only "
"works for new pass manager. "));
static cl::opt<bool> UseProfileIndirectCallEdges(
"use-profile-indirect-call-edges", cl::init(true), cl::Hidden,
cl::desc("Considering indirect call samples from profile when top-down "
"processing functions. Only CSSPGO is supported."));
static cl::opt<bool> UseProfileTopDownOrder(
"use-profile-top-down-order", cl::init(false), cl::Hidden,
cl::desc("Process functions in one SCC in a top-down order "
"based on the input profile."));
static cl::opt<bool> ProfileSizeInline(
"sample-profile-inline-size", cl::Hidden, cl::init(false),
cl::desc("Inline cold call sites in profile loader if it's beneficial "
"for code size."));
static cl::opt<int> ProfileInlineGrowthLimit(
"sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
cl::desc("The size growth ratio limit for proirity-based sample profile "
"loader inlining."));
static cl::opt<int> ProfileInlineLimitMin(
"sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
cl::desc("The lower bound of size growth limit for "
"proirity-based sample profile loader inlining."));
static cl::opt<int> ProfileInlineLimitMax(
"sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
cl::desc("The upper bound of size growth limit for "
"proirity-based sample profile loader inlining."));
static cl::opt<int> ProfileICPThreshold(
"sample-profile-icp-threshold", cl::Hidden, cl::init(5),
cl::desc(
"Relative hotness threshold for indirect "
"call promotion in proirity-based sample profile loader inlining."));
static cl::opt<int> SampleHotCallSiteThreshold(
"sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
cl::desc("Hot callsite threshold for proirity-based sample profile loader "
"inlining."));
static cl::opt<bool> CallsitePrioritizedInline(
"sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore,
cl::init(false),
cl::desc("Use call site prioritized inlining for sample profile loader."
"Currently only CSSPGO is supported."));
static cl::opt<int> SampleColdCallSiteThreshold(
"sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining cold callsites"));
static cl::opt<std::string> ProfileInlineReplayFile(
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
"Optimization remarks file containing inline remarks to be replayed "
"by inlining from sample profile loader."),
cl::Hidden);
static cl::opt<unsigned>
MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
cl::ZeroOrMore,
cl::desc("Max number of promotions for a single indirect "
"call callsite in sample profile loader"));
namespace {
using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
using EdgeWeightMap = DenseMap<Edge, uint64_t>;
using BlockEdgeMap =
DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
class GUIDToFuncNameMapper {
public:
GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
: CurrentReader(Reader), CurrentModule(M),
CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
if (!CurrentReader.useMD5())
return;
for (const auto &F : CurrentModule) {
StringRef OrigName = F.getName();
CurrentGUIDToFuncNameMap.insert(
{Function::getGUID(OrigName), OrigName});
// Local-to-global variable promotion used by optimizations like ThinLTO
// will rename the variable and add a suffix like ".llvm.xxx" to the
// original local name. In the sample profile, the suffixes of function
// names are all stripped. Since it is possible that the mapper is
// built in the post-thin-link phase and var promotion has been done,
// we need to add the substring of the function name without the suffix
// into the GUIDToFuncNameMap.
StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
if (CanonName != OrigName)
CurrentGUIDToFuncNameMap.insert(
{Function::getGUID(CanonName), CanonName});
}
// Update GUIDToFuncNameMap for each function including inlinees.
SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
}
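// Illustrative example (names are made up): after ThinLTO local-to-global
// promotion, a static function "foo" may appear in the module as
// "foo.llvm.123456789". Because the profile stores the suffix-free name,
// the constructor above also records GUID("foo") -> "foo" obtained via
// getCanonicalFnName().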
~GUIDToFuncNameMapper() {
if (!CurrentReader.useMD5())
return;
CurrentGUIDToFuncNameMap.clear();
// Reset GUIDToFuncNameMap for each function as they're no
// longer valid at this point.
SetGUIDToFuncNameMapForAll(nullptr);
}
private:
void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
std::queue<FunctionSamples *> FSToUpdate;
for (auto &IFS : CurrentReader.getProfiles()) {
FSToUpdate.push(&IFS.second);
}
while (!FSToUpdate.empty()) {
FunctionSamples *FS = FSToUpdate.front();
FSToUpdate.pop();
FS->GUIDToFuncNameMap = Map;
for (const auto &ICS : FS->getCallsiteSamples()) {
const FunctionSamplesMap &FSMap = ICS.second;
for (auto &IFS : FSMap) {
FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
FSToUpdate.push(&FS);
}
}
}
}
SampleProfileReader &CurrentReader;
Module &CurrentModule;
DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
};
// Inline candidate used by iterative callsite prioritized inliner
struct InlineCandidate {
CallBase *CallInstr;
const FunctionSamples *CalleeSamples;
// Prorated callsite count, which will be used to guide inlining. For example,
// if a callsite is duplicated in LTO prelink, then in LTO postlink the two
// copies will get their own distribution factors and their prorated counts
// will be used to decide if they should be inlined independently.
uint64_t CallsiteCount;
// Call site distribution factor to prorate the profile samples for a
// duplicated callsite. Default value is 1.0.
float CallsiteDistribution;
};
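// A made-up example of how these fields interact: if a callsite with 100
// profiled samples is duplicated into two copies during LTO prelink, the
// copies may carry distribution factors of, say, 0.7 and 0.3, and their
// prorated CallsiteCounts (70 and 30) are what the priority queue below
// compares when ordering inline candidates.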
// Inline candidate comparer using call site weight
struct CandidateComparer {
bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
if (LHS.CallsiteCount != RHS.CallsiteCount)
return LHS.CallsiteCount < RHS.CallsiteCount;
// Tie breaker using GUID so we have stable/deterministic inlining order
assert(LHS.CalleeSamples && RHS.CalleeSamples &&
"Expect non-null FunctionSamples");
return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) <
RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName());
}
};
using CandidateQueue =
PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
CandidateComparer>;
/// Sample profile pass.
///
/// This pass reads profile data from the file specified by
/// -sample-profile-file and annotates every affected function with the
/// profile information found in that file.
class SampleProfileLoader final
: public SampleProfileLoaderBaseImpl<BasicBlock> {
public:
SampleProfileLoader(
StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
std::function<AssumptionCache &(Function &)> GetAssumptionCache,
std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
std::function<const TargetLibraryInfo &(Function &)> GetTLI)
: SampleProfileLoaderBaseImpl(std::string(Name)),
GetAC(std::move(GetAssumptionCache)),
GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {}
bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
bool runOnModule(Module &M, ModuleAnalysisManager *AM,
ProfileSummaryInfo *_PSI, CallGraph *CG);
protected:
bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
bool emitAnnotations(Function &F);
ErrorOr<uint64_t> getInstWeight(const Instruction &I) override;
ErrorOr<uint64_t> getProbeWeight(const Instruction &I);
const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
const FunctionSamples *
findFunctionSamples(const Instruction &I) const override;
std::vector<const FunctionSamples *>
findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
// Attempt to promote indirect call and also inline the promoted call
bool tryPromoteAndInlineCandidate(
Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
bool inlineHotFunctions(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
bool
tryInlineCandidate(InlineCandidate &Candidate,
SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
bool
inlineHotFunctionsWithPriority(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
// Inline cold/small functions in addition to hot ones
bool shouldInlineColdCallee(CallBase &CallInst);
void emitOptimizationRemarksForInlineCandidates(
const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
bool Hot);
std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples);
void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
void generateMDProfMetadata(Function &F);
/// Map from function name to Function *. Used to find the function from
/// the function name. If the function name contains suffix, additional
/// entry is added to map from the stripped name to the function if there
/// is one-to-one mapping.
StringMap<Function *> SymbolMap;
std::function<AssumptionCache &(Function &)> GetAC;
std::function<TargetTransformInfo &(Function &)> GetTTI;
std::function<const TargetLibraryInfo &(Function &)> GetTLI;
/// Profile tracker for different context.
std::unique_ptr<SampleContextTracker> ContextTracker;
/// Name of the profile remapping file to load.
std::string RemappingFilename;
/// Flag indicating whether the profile input loaded successfully.
bool ProfileIsValid = false;
/// Flag indicating whether input profile is context-sensitive
bool ProfileIsCS = false;
/// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
///
/// We need to know the LTO phase because, for example, in the ThinLTO
/// prelink phase we should not promote indirect calls during annotation.
/// Instead, we mark the GUIDs that need to be annotated on the function.
ThinOrFullLTOPhase LTOPhase;
/// Profile symbol list tells whether a function name appears in the binary
/// used to generate the current profile.
std::unique_ptr<ProfileSymbolList> PSL;
/// Total number of samples collected in this profile.
///
/// This is the sum of all the samples collected in all the functions executed
/// at runtime.
uint64_t TotalCollectedSamples = 0;
// Information recorded when we declined to inline a call site because we
// determined it was too cold. It is accumulated for each callee function.
// Initially this is just the entry count.
struct NotInlinedProfileInfo {
uint64_t entryCount;
};
DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
// GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
// all the function symbols defined or declared in current module.
DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
// All the Names used in FunctionSamples including outline function
// names, inline instance names and call target names.
StringSet<> NamesInProfile;
// For symbols in the profile symbol list, whether to regard their profiles
// as accurate. It is mainly decided by the existence of the profile symbol
// list and the -profile-accurate-for-symsinlist flag, but it can be
// overridden by -profile-sample-accurate or the profile-sample-accurate
// attribute.
bool ProfAccForSymsInList;
// External inline advisor used to replay inline decision from remarks.
std::unique_ptr<ReplayInlineAdvisor> ExternalInlineAdvisor;
// A pseudo probe helper to correlate the imported sample counts.
std::unique_ptr<PseudoProbeManager> ProbeManager;
};
class SampleProfileLoaderLegacyPass : public ModulePass {
public:
// Class identification, replacement for typeinfo
static char ID;
SampleProfileLoaderLegacyPass(
StringRef Name = SampleProfileFile,
ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
: ModulePass(ID), SampleLoader(
Name, SampleProfileRemappingFile, LTOPhase,
[&](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
},
[&](Function &F) -> TargetTransformInfo & {
return TTIWP->getTTI(F);
},
[&](Function &F) -> TargetLibraryInfo & {
return TLIWP->getTLI(F);
}) {
initializeSampleProfileLoaderLegacyPassPass(
*PassRegistry::getPassRegistry());
}
void dump() { SampleLoader.dump(); }
bool doInitialization(Module &M) override {
return SampleLoader.doInitialization(M);
}
StringRef getPassName() const override { return "Sample profile pass"; }
bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
private:
SampleProfileLoader SampleLoader;
AssumptionCacheTracker *ACT = nullptr;
TargetTransformInfoWrapperPass *TTIWP = nullptr;
TargetLibraryInfoWrapperPass *TLIWP = nullptr;
};
} // end anonymous namespace
ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
if (FunctionSamples::ProfileIsProbeBased)
return getProbeWeight(Inst);
const DebugLoc &DLoc = Inst.getDebugLoc();
if (!DLoc)
return std::error_code();
// Ignore all intrinsics, phi nodes and branch instructions.
// Branch and phi node instructions usually contain debug info from sources
// outside of the residing basic block, so we ignore them during annotation.
if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
return std::error_code();
// For non-CS profile, if a direct call/invoke instruction is inlined in
// profile (findCalleeFunctionSamples returns non-empty result), but not
// inlined here, it means that the inlined callsite has no sample, thus the
// call instruction should have 0 count.
// For CS profile, the callsite count of previously inlined callees is
// populated with the entry count of the callees.
if (!ProfileIsCS)
if (const auto *CB = dyn_cast<CallBase>(&Inst))
if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
return 0;
return getInstWeightImpl(Inst);
}
ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
assert(FunctionSamples::ProfileIsProbeBased &&
"Profile is not pseudo probe based");
Optional<PseudoProbe> Probe = extractProbe(Inst);
if (!Probe)
return std::error_code();
// Ignore dangling probes since they are logically deleted and should not
// consume any profile samples.
if (Probe->isDangling())
return std::error_code();
const FunctionSamples *FS = findFunctionSamples(Inst);
if (!FS)
return std::error_code();
// For non-CS profile, if a direct call/invoke instruction is inlined in
// profile (findCalleeFunctionSamples returns non-empty result), but not
// inlined here, it means that the inlined callsite has no sample, thus the
// call instruction should have 0 count.
// For CS profile, the callsite count of previously inlined callees is
// populated with the entry count of the callees.
if (!ProfileIsCS)
if (const auto *CB = dyn_cast<CallBase>(&Inst))
if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
return 0;
const ErrorOr<uint64_t> &R = FS->findSamplesAt(Probe->Id, 0);
if (R) {
uint64_t Samples = R.get() * Probe->Factor;
bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples);
if (FirstMark) {
ORE->emit([&]() {
OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
Remark << "Applied " << ore::NV("NumSamples", Samples);
Remark << " samples from profile (ProbeId=";
Remark << ore::NV("ProbeId", Probe->Id);
Remark << ", Factor=";
Remark << ore::NV("Factor", Probe->Factor);
Remark << ", OriginalSamples=";
Remark << ore::NV("OriginalSamples", R.get());
Remark << ")";
return Remark;
});
}
LLVM_DEBUG(dbgs() << " " << Probe->Id << ":" << Inst
<< " - weight: " << R.get() << " - factor: "
<< format("%0.2f", Probe->Factor) << ")\n");
return Samples;
}
return R;
}
/// Get the FunctionSamples for a call instruction.
///
/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
/// instance that the call instruction is calling into. It contains
/// all samples that reside in the inlined instance. We first find the
/// inlined instance that the call instruction comes from, then we
/// traverse its children to find the callsite with the matching
/// location.
///
/// \param Inst Call/Invoke instruction to query.
///
/// \returns The FunctionSamples pointer to the inlined instance.
const FunctionSamples *
SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
const DILocation *DIL = Inst.getDebugLoc();
if (!DIL) {
return nullptr;
}
StringRef CalleeName;
if (Function *Callee = Inst.getCalledFunction())
CalleeName = Callee->getName();
if (ProfileIsCS)
return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
const FunctionSamples *FS = findFunctionSamples(Inst);
if (FS == nullptr)
return nullptr;
return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
CalleeName, Reader->getRemapper());
}
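// For reference, a sketch of what an "inlined instance" looks like in a
// text-format sample profile (names and counts are made up):
//   main:3000:0
//    1: 100
//    3: _Z3foov:2000
//     1: 1200
//     2: 800
// The nested block under "3: _Z3foov:2000" is the FunctionSamples returned
// above for a call to _Z3foov at line offset 3 of main.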
/// Returns a vector of FunctionSamples that are the indirect call targets
/// of \p Inst. The vector is sorted by the total number of samples. Stores
/// the total call count of the indirect call in \p Sum.
std::vector<const FunctionSamples *>
SampleProfileLoader::findIndirectCallFunctionSamples(
const Instruction &Inst, uint64_t &Sum) const {
const DILocation *DIL = Inst.getDebugLoc();
std::vector<const FunctionSamples *> R;
if (!DIL) {
return R;
}
auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
assert(L && R && "Expect non-null FunctionSamples");
if (L->getEntrySamples() != R->getEntrySamples())
return L->getEntrySamples() > R->getEntrySamples();
return FunctionSamples::getGUID(L->getName()) <
FunctionSamples::getGUID(R->getName());
};
if (ProfileIsCS) {
auto CalleeSamples =
ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
if (CalleeSamples.empty())
return R;
// For CSSPGO, we only use the target context profile's entry count
// as that already includes both inlined and non-inlined callees.
Sum = 0;
for (const auto *const FS : CalleeSamples) {
Sum += FS->getEntrySamples();
R.push_back(FS);
}
llvm::sort(R, FSCompare);
return R;
}
const FunctionSamples *FS = findFunctionSamples(Inst);
if (FS == nullptr)
return R;
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
auto T = FS->findCallTargetMapAt(CallSite);
Sum = 0;
if (T)
for (const auto &T_C : T.get())
Sum += T_C.second;
if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
if (M->empty())
return R;
for (const auto &NameFS : *M) {
Sum += NameFS.second.getEntrySamples();
R.push_back(&NameFS.second);
}
llvm::sort(R, FSCompare);
}
return R;
}
const FunctionSamples *
SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
if (FunctionSamples::ProfileIsProbeBased) {
Optional<PseudoProbe> Probe = extractProbe(Inst);
if (!Probe)
return nullptr;
}
const DILocation *DIL = Inst.getDebugLoc();
if (!DIL)
return Samples;
auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
if (it.second) {
if (ProfileIsCS)
it.first->second = ContextTracker->getContextSamplesFor(DIL);
else
it.first->second =
Samples->findFunctionSamples(DIL, Reader->getRemapper());
}
return it.first->second;
}
/// Check whether the indirect call promotion history of \p Inst allows
/// the promotion for \p Candidate.
/// If the profile count for the promotion candidate \p Candidate is
/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
/// for \p Inst. If we already have at least MaxNumPromotions
/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
/// cannot promote for \p Inst anymore.
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
uint32_t NumVals = 0;
uint64_t TotalCount = 0;
std::unique_ptr<InstrProfValueData[]> ValueData =
std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
bool Valid =
getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
ValueData.get(), NumVals, TotalCount, true);
// No valid value profile so no promoted targets have been recorded
// before. Ok to do ICP.
if (!Valid)
return true;
unsigned NumPromoted = 0;
for (uint32_t I = 0; I < NumVals; I++) {
if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
continue;
// If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
// metadata, it means the candidate has been promoted for this
// indirect call.
if (ValueData[I].Value == Function::getGUID(Candidate))
return false;
NumPromoted++;
// If already have MaxNumPromotions promotion, don't do it anymore.
if (NumPromoted == MaxNumPromotions)
return false;
}
return true;
}
/// Update indirect call target profile metadata for \p Inst.
/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
/// If it is 0, it means updateIDTMetaData is used to mark a
/// certain target as already promoted. If it is not zero,
/// we expect to use it to update the total count in the value profile.
static void
updateIDTMetaData(Instruction &Inst,
const SmallVectorImpl<InstrProfValueData> &CallTargets,
uint64_t Sum) {
assert((Sum != 0 || (CallTargets.size() == 1 &&
CallTargets[0].Count == NOMORE_ICP_MAGICNUM)) &&
"If sum is 0, assume only one element in CallTargets with count "
"being NOMORE_ICP_MAGICNUM");
uint32_t NumVals = 0;
// OldSum is the existing total count in the value profile data.
// It will be replaced by Sum if Sum is not 0.
uint64_t OldSum = 0;
std::unique_ptr<InstrProfValueData[]> ValueData =
std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
bool Valid =
getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
ValueData.get(), NumVals, OldSum, true);
DenseMap<uint64_t, uint64_t> ValueCountMap;
// Initialize ValueCountMap with existing value profile data.
if (Valid) {
for (uint32_t I = 0; I < NumVals; I++)
ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
}
for (const auto &Data : CallTargets) {
auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
if (Pair.second)
continue;
// Whenever the count is NOMORE_ICP_MAGICNUM for a value, keep it
// in the ValueCountMap. If both the count in CallTargets and the
// count in ValueCountMap is not NOMORE_ICP_MAGICNUM, keep the
// count in CallTargets.
if (Pair.first->second != NOMORE_ICP_MAGICNUM &&
Data.Count == NOMORE_ICP_MAGICNUM) {
OldSum -= Pair.first->second;
Pair.first->second = NOMORE_ICP_MAGICNUM;
} else if (Pair.first->second == NOMORE_ICP_MAGICNUM &&
Data.Count != NOMORE_ICP_MAGICNUM) {
assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
Sum -= Data.Count;
} else if (Pair.first->second != NOMORE_ICP_MAGICNUM &&
Data.Count != NOMORE_ICP_MAGICNUM) {
// Sum will be used in this case. Although the existing count
// for the current value in the value profile will be overridden,
// there is no need to update OldSum.
Pair.first->second = Data.Count;
}
}
SmallVector<InstrProfValueData, 8> NewCallTargets;
for (const auto &ValueCount : ValueCountMap) {
NewCallTargets.emplace_back(
InstrProfValueData{ValueCount.first, ValueCount.second});
}
llvm::sort(NewCallTargets,
[](const InstrProfValueData &L, const InstrProfValueData &R) {
if (L.Count != R.Count)
return L.Count > R.Count;
return L.Value > R.Value;
});
uint32_t MaxMDCount =
std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,
NewCallTargets, Sum ? Sum : OldSum, IPVK_IndirectCallTarget,
MaxMDCount);
}
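// A made-up example of the metadata this function maintains: an indirect
// call annotated as
//   !{!"VP", i32 0, i64 1000, i64 <guid of foo>, i64 600, i64 <guid of bar>, i64 400}
// records 1000 total samples with targets foo (600 samples) and bar (400).
// Once foo is promoted, its count is rewritten to NOMORE_ICP_MAGICNUM so
// that doesHistoryAllowICP() above will refuse to promote it again.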
/// Attempt to promote indirect call and also inline the promoted call.
///
/// \param F Caller function.
/// \param Candidate ICP and inline candidate.
/// \param Sum Sum of target counts for indirect call.
/// \param InlinedCallSite Output vector for new call sites exposed after
/// inlining.
bool SampleProfileLoader::tryPromoteAndInlineCandidate(
Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
SmallVector<CallBase *, 8> *InlinedCallSite) {
auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName();
auto R = SymbolMap.find(CalleeFunctionName);
if (R == SymbolMap.end() || !R->getValue())
return false;
auto &CI = *Candidate.CallInstr;
if (!doesHistoryAllowICP(CI, R->getValue()->getName()))
return false;
const char *Reason = "Callee function not available";
// R->getValue() != &F is to prevent promoting a recursive call.
// If it is a recursive call, we do not inline it as it could bloat
// the code exponentially. There is a way to handle this better, e.g.
// clone the caller first, and inline the cloned caller if it is
// recursive. As LLVM does not inline recursive calls, we will
// simply ignore it instead of handling it explicitly.
if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
R->getValue()->hasFnAttribute("use-sample-profile") &&
R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) {
// For promoted target, set its value with NOMORE_ICP_MAGICNUM count
// in the value profile metadata so the target won't be promoted again.
SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
Function::getGUID(R->getValue()->getName()), NOMORE_ICP_MAGICNUM}};
updateIDTMetaData(CI, SortedCallTargets, 0);
auto *DI = &pgo::promoteIndirectCall(
CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE);
if (DI) {
Sum -= Candidate.CallsiteCount;
// Prorate the indirect callsite distribution.
// Do not update the promoted direct callsite distribution at this
// point since the original distribution combined with the callee
// profile will be used to prorate callsites from the callee if
// inlined. Once not inlined, the direct callsite distribution should
// be prorated so that it reflects the real callsite counts.
setProbeDistributionFactor(CI, Candidate.CallsiteDistribution * Sum /
SumOrigin);
Candidate.CallInstr = DI;
if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
if (!Inlined) {
// Prorate the direct callsite distribution so that it reflects real
// callsite counts.
setProbeDistributionFactor(*DI, Candidate.CallsiteDistribution *
Candidate.CallsiteCount /
SumOrigin);
}
return Inlined;
}
}
} else {
LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
<< Candidate.CalleeSamples->getFuncName() << " because "
<< Reason << "\n");
}
return false;
}
bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
if (!ProfileSizeInline)
return false;
Function *Callee = CallInst.getCalledFunction();
if (Callee == nullptr)
return false;
InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
GetAC, GetTLI);
if (Cost.isNever())
return false;
if (Cost.isAlways())
return true;
return Cost.getCost() <= SampleColdCallSiteThreshold;
}
void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
bool Hot) {
for (auto I : Candidates) {
Function *CalledFunction = I->getCalledFunction();
if (CalledFunction) {
ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
I->getDebugLoc(), I->getParent())
<< "previous inlining reattempted for "
<< (Hot ? "hotness: '" : "size: '")
<< ore::NV("Callee", CalledFunction) << "' into '"
<< ore::NV("Caller", &F) << "'");
}
}
}
/// Iteratively inline hot callsites of a function.
///
/// Iteratively traverse all callsites of the function \p F, and find if
/// the corresponding inlined instance exists and is hot in the profile. If
/// it is hot enough, inline the callsite and add the new callsites of the
/// callee into the caller. If the call is an indirect call, first promote
/// it to a direct call. Each indirect call is limited to a single promoted
/// target.
///
/// \param F function to perform iterative inlining.
/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
/// inlined in the profiled binary.
///
/// \returns True if any inlining happened.
bool SampleProfileLoader::inlineHotFunctions(
Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
// ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
// Profile symbol list is ignored when profile-sample-accurate is on.
assert((!ProfAccForSymsInList ||
(!ProfileSampleAccurate &&
!F.hasFnAttribute("profile-sample-accurate"))) &&
"ProfAccForSymsInList should be false when profile-sample-accurate "
"is enabled");
DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
bool Changed = false;
bool LocalChanged = true;
while (LocalChanged) {
LocalChanged = false;
SmallVector<CallBase *, 10> CIS;
for (auto &BB : F) {
bool Hot = false;
SmallVector<CallBase *, 10> AllCandidates;
SmallVector<CallBase *, 10> ColdCandidates;
for (auto &I : BB.getInstList()) {
const FunctionSamples *FS = nullptr;
if (auto *CB = dyn_cast<CallBase>(&I)) {
if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
"GUIDToFuncNameMap has to be populated");
AllCandidates.push_back(CB);
if (FS->getEntrySamples() > 0 || ProfileIsCS)
LocalNotInlinedCallSites.try_emplace(CB, FS);
if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
Hot = true;
else if (shouldInlineColdCallee(*CB))
ColdCandidates.push_back(CB);
}
}
}
if (Hot || ExternalInlineAdvisor) {
CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
} else {
CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
}
}
for (CallBase *I : CIS) {
Function *CalledFunction = I->getCalledFunction();
InlineCandidate Candidate = {
I,
LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I]
: nullptr,
0 /* dummy count */, 1.0 /* dummy distribution factor */};
// Do not inline recursive calls.
if (CalledFunction == &F)
continue;
if (I->isIndirectCall()) {
uint64_t Sum;
for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
uint64_t SumOrigin = Sum;
if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), SymbolMap,
PSI->getOrCompHotCountThreshold());
continue;
}
if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
continue;
Candidate = {I, FS, FS->getEntrySamples(), 1.0};
if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
LocalNotInlinedCallSites.erase(I);
LocalChanged = true;
}
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
!CalledFunction->isDeclaration()) {
if (tryInlineCandidate(Candidate)) {
LocalNotInlinedCallSites.erase(I);
LocalChanged = true;
}
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
findCalleeFunctionSamples(*I)->findInlinedFunctions(
InlinedGUIDs, F.getParent(), SymbolMap,
PSI->getOrCompHotCountThreshold());
}
}
Changed |= LocalChanged;
}
// For CS profile, the profile for a not-inlined context will be merged when
// the base profile is retrieved.
if (ProfileIsCS)
return Changed;
// Accumulate not-inlined callsite information into notInlinedCallInfo.
for (const auto &Pair : LocalNotInlinedCallSites) {
CallBase *I = Pair.getFirst();
Function *Callee = I->getCalledFunction();
if (!Callee || Callee->isDeclaration())
continue;
ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
I->getDebugLoc(), I->getParent())
<< "previous inlining not repeated: '"
<< ore::NV("Callee", Callee) << "' into '"
<< ore::NV("Caller", &F) << "'");
++NumCSNotInlined;
const FunctionSamples *FS = Pair.getSecond();
if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
continue;
}
if (ProfileMergeInlinee) {
// A function call can be replicated by optimizations like callsite
// splitting or jump threading and the replicates end up sharing the
// sample nested callee profile instead of slicing the original inlinee's
// profile. We want to do merge exactly once by filtering out callee
// profiles with a non-zero head sample count.
if (FS->getHeadSamples() == 0) {
// Use entry samples as head samples during the merge, as inlinees
// don't have head samples.
const_cast<FunctionSamples *>(FS)->addHeadSamples(
FS->getEntrySamples());
// Note that we have to do the merge right after processing function.
// This allows OutlineFS's profile to be used for annotation during
// top-down processing of functions' annotation.
FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
OutlineFS->merge(*FS);
}
} else {
auto pair =
notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
pair.first->second.entryCount += FS->getEntrySamples();
}
}
return Changed;
}
bool SampleProfileLoader::tryInlineCandidate(
InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
CallBase &CB = *Candidate.CallInstr;
Function *CalledFunction = CB.getCalledFunction();
assert(CalledFunction && "Expect a callee with definition");
DebugLoc DLoc = CB.getDebugLoc();
BasicBlock *BB = CB.getParent();
InlineCost Cost = shouldInlineCandidate(Candidate);
if (Cost.isNever()) {
ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
<< "incompatible inlining");
return false;
}
if (!Cost)
return false;
InlineFunctionInfo IFI(nullptr, GetAC);
if (InlineFunction(CB, IFI).isSuccess()) {
// The call to InlineFunction erases the call instruction, so we can't pass it here.
emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
true, CSINLINE_DEBUG);
// Now populate the list of newly exposed call sites.
if (InlinedCallSites) {
InlinedCallSites->clear();
for (auto &I : IFI.InlinedCallSites)
InlinedCallSites->push_back(I);
}
if (ProfileIsCS)
ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
++NumCSInlined;
// Prorate inlined probes for a duplicated inlining callsite which probably
// has a distribution less than 100%. Samples for an inlinee should be
// distributed among the copies of the original callsite based on each
// callsite's distribution factor for counts accuracy. Note that an inlined
// probe may come with its own distribution factor if it has been duplicated
// in the inlinee body. The two factors are multiplied to reflect the
// aggregation of duplication.
if (Candidate.CallsiteDistribution < 1) {
for (auto &I : IFI.InlinedCallSites) {
if (Optional<PseudoProbe> Probe = extractProbe(*I))
setProbeDistributionFactor(*I, Probe->Factor *
Candidate.CallsiteDistribution);
}
NumDuplicatedInlinesite++;
}
return true;
}
return false;
}
bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
CallBase *CB) {
assert(CB && "Expect non-null call instruction");
if (isa<IntrinsicInst>(CB))
return false;
// Find the callee's profile. For an indirect call, find the hottest target profile.
const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
if (!CalleeSamples)
return false;
float Factor = 1.0;
if (Optional<PseudoProbe> Probe = extractProbe(*CB))
Factor = Probe->Factor;
uint64_t CallsiteCount = 0;
ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent());
if (Weight)
CallsiteCount = Weight.get();
if (CalleeSamples)
CallsiteCount = std::max(
CallsiteCount, uint64_t(CalleeSamples->getEntrySamples() * Factor));
*NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
return true;
}
InlineCost
SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
std::unique_ptr<InlineAdvice> Advice = nullptr;
if (ExternalInlineAdvisor) {
Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr);
if (!Advice->isInliningRecommended()) {
Advice->recordUnattemptedInlining();
return InlineCost::getNever("not previously inlined");
}
Advice->recordInlining();
return InlineCost::getAlways("previously inlined");
}
// Adjust the threshold based on call site hotness; only do this for the
// callsite-prioritized inliner because otherwise the cost-benefit check is
// done earlier.
int SampleThreshold = SampleColdCallSiteThreshold;
if (CallsitePrioritizedInline) {
if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
SampleThreshold = SampleHotCallSiteThreshold;
else if (!ProfileSizeInline)
return InlineCost::getNever("cold callsite");
}
Function *Callee = Candidate.CallInstr->getCalledFunction();
assert(Callee && "Expect a definition for inline candidate of direct call");
InlineParams Params = getInlineParams();
Params.ComputeFullInlineCost = true;
// Checks if there is anything in the reachable portion of the callee at
// this callsite that makes this inlining potentially illegal. We need to
// set ComputeFullInlineCost, otherwise getInlineCost may return early
// when the cost exceeds the threshold without checking all IR in the callee.
// The actual cost does not matter because we only check isNever() to
// see if it is legal to inline the callsite.
InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
GetTTI(*Callee), GetAC, GetTLI);
// Honor always inline and never inline from call analyzer
if (Cost.isNever() || Cost.isAlways())
return Cost;
// For old FDO inliner, we inline the call site as long as cost is not
// "Never". The cost-benefit check is done earlier.
if (!CallsitePrioritizedInline) {
return InlineCost::get(Cost.getCost(), INT_MAX);
}
// Otherwise only use the cost from the call analyzer, but overwrite the
// threshold with the Sample PGO threshold.
return InlineCost::get(Cost.getCost(), SampleThreshold);
}
bool SampleProfileLoader::inlineHotFunctionsWithPriority(
Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
// ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
// Profile symbol list is ignored when profile-sample-accurate is on.
assert((!ProfAccForSymsInList ||
(!ProfileSampleAccurate &&
!F.hasFnAttribute("profile-sample-accurate"))) &&
"ProfAccForSymsInList should be false when profile-sample-accurate "
"is enabled");
// Populate the worklist with initial call sites from the root inliner, along
// with call site weights.
CandidateQueue CQueue;
InlineCandidate NewCandidate;
for (auto &BB : F) {
for (auto &I : BB.getInstList()) {
auto *CB = dyn_cast<CallBase>(&I);
if (!CB)
continue;
if (getInlineCandidate(&NewCandidate, CB))
CQueue.push(NewCandidate);
}
}
// Cap the size growth from profile guided inlining. This is needed even
// though the cost of each inline candidate already accounts for callee size,
// because with top-down inlining, we can grow the inlined size significantly
// with a large number of smaller inlinees that each pass the cost check.
assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&
"Max inline size limit should not be smaller than min inline size "
"limit.");
unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
if (ExternalInlineAdvisor)
SizeLimit = std::numeric_limits<unsigned>::max();
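// For example (made-up numbers): a 500-instruction function with the default
// growth limit of 12 gets SizeLimit = max(min(500 * 12, 10000), 100) = 6000,
// so priority-based inlining below stops once the function grows to 6000
// instructions, unless a replay advisor lifted the cap above.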
// Perform iterative BFS call site prioritized inlining
bool Changed = false;
while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
InlineCandidate Candidate = CQueue.top();
CQueue.pop();
CallBase *I = Candidate.CallInstr;
Function *CalledFunction = I->getCalledFunction();
if (CalledFunction == &F)
continue;
if (I->isIndirectCall()) {
uint64_t Sum;
auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
uint64_t SumOrigin = Sum;
Sum *= Candidate.CallsiteDistribution;
for (const auto *FS : CalleeSamples) {
// TODO: Consider disabling pre-LTO ICP for MonoLTO as well
if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), SymbolMap,
PSI->getOrCompHotCountThreshold());
continue;
}
uint64_t EntryCountDistributed =
FS->getEntrySamples() * Candidate.CallsiteDistribution;
// In addition to the regular inline cost check, we also need to make sure
// ICP isn't introducing excessive speculative checks even if an individual
// target looks beneficial to promote and inline. That means we should
// only do ICP when there is a small number of dominant targets.
if (EntryCountDistributed < SumOrigin / ProfileICPThreshold)
break;
// TODO: Fix CallAnalyzer to handle all indirect calls.
// For an indirect call, we don't run CallAnalyzer to get the InlineCost
// before the actual inlining. This is because we could see two different
// types from the same definition, which makes CallAnalyzer choke as
// it expects matching parameter types on both the caller and callee
// side. See the example from PR18962 for the triggering cases (the bug was
// fixed, but we generate different types).
if (!PSI->isHotCount(EntryCountDistributed))
break;
SmallVector<CallBase *, 8> InlinedCallSites;
// Attach function profile for promoted indirect callee, and update
// call site count for the promoted inline candidate too.
Candidate = {I, FS, EntryCountDistributed,
Candidate.CallsiteDistribution};
if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
&InlinedCallSites)) {
for (auto *CB : InlinedCallSites) {
if (getInlineCandidate(&NewCandidate, CB))
CQueue.emplace(NewCandidate);
}
Changed = true;
}
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
!CalledFunction->isDeclaration()) {
SmallVector<CallBase *, 8> InlinedCallSites;
if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
for (auto *CB : InlinedCallSites) {
if (getInlineCandidate(&NewCandidate, CB))
CQueue.emplace(NewCandidate);
}
Changed = true;
}
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
findCalleeFunctionSamples(*I)->findInlinedFunctions(
InlinedGUIDs, F.getParent(), SymbolMap,
PSI->getOrCompHotCountThreshold());
}
}
if (!CQueue.empty()) {
if (SizeLimit == (unsigned)ProfileInlineLimitMax)
++NumCSInlinedHitMaxLimit;
else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
++NumCSInlinedHitMinLimit;
else
++NumCSInlinedHitGrowthLimit;
}
return Changed;
}
/// Returns the CallTargetMap \p M sorted by count in descending order.
static SmallVector<InstrProfValueData, 2>
GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) {
SmallVector<InstrProfValueData, 2> R;
for (const auto &I : SampleRecord::SortCallTargets(M)) {
R.emplace_back(
InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
}
return R;
}
// Generate MD_prof metadata for every branch instruction using the
// edge weights computed during propagation.
void SampleProfileLoader::generateMDProfMetadata(Function &F) {
// Generate MD_prof metadata for every branch instruction using the
// edge weights computed during propagation.
LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
LLVMContext &Ctx = F.getContext();
MDBuilder MDB(Ctx);
for (auto &BI : F) {
BasicBlock *BB = &BI;
if (BlockWeights[BB]) {
for (auto &I : BB->getInstList()) {
if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
continue;
if (!cast<CallBase>(I).getCalledFunction()) {
const DebugLoc &DLoc = I.getDebugLoc();
if (!DLoc)
continue;
const DILocation *DIL = DLoc;
const FunctionSamples *FS = findFunctionSamples(I);
if (!FS)
continue;
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
auto T = FS->findCallTargetMapAt(CallSite);
if (!T || T.get().empty())
continue;
// Prorate the callsite counts to reflect what has already been done to the
// callsite, such as ICP or callsite cloning.
if (FunctionSamples::ProfileIsProbeBased) {
if (Optional<PseudoProbe> Probe = extractProbe(I)) {
if (Probe->Factor < 1)
T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
}
}
SmallVector<InstrProfValueData, 2> SortedCallTargets =
GetSortedValueDataFromCallTargets(T.get());
uint64_t Sum = 0;
for (const auto &C : T.get())
Sum += C.second;
// With CSSPGO all indirect call targets are counted towards the
// original indirect call site in the profile, including both
// inlined and non-inlined targets.
if (!FunctionSamples::ProfileIsCS) {
if (const FunctionSamplesMap *M =
FS->findFunctionSamplesMapAt(CallSite)) {
for (const auto &NameFS : *M)
Sum += NameFS.second.getEntrySamples();
}
}
if (!Sum)
continue;
updateIDTMetaData(I, SortedCallTargets, Sum);
} else if (!isa<IntrinsicInst>(&I)) {
I.setMetadata(LLVMContext::MD_prof,
MDB.createBranchWeights(
{static_cast<uint32_t>(BlockWeights[BB])}));
}
}
}
Instruction *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 1)
continue;
if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
continue;
DebugLoc BranchLoc = TI->getDebugLoc();
LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
<< ((BranchLoc) ? Twine(BranchLoc.getLine())
: Twine("<UNKNOWN LOCATION>"))
<< ".\n");
SmallVector<uint32_t, 4> Weights;
uint32_t MaxWeight = 0;
Instruction *MaxDestInst;
for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
BasicBlock *Succ = TI->getSuccessor(I);
Edge E = std::make_pair(BB, Succ);
uint64_t Weight = EdgeWeights[E];
LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
// Use uint32_t saturated arithmetic to adjust the incoming weights,
// if needed. Sample counts in profiles are 64-bit unsigned values,
// but internally branch weights are expressed as 32-bit values.
if (Weight > std::numeric_limits<uint32_t>::max()) {
LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
Weight = std::numeric_limits<uint32_t>::max();
}
// The weight is increased by one to avoid propagation errors introduced by
// 0 weights.
Weights.push_back(static_cast<uint32_t>(Weight + 1));
if (Weight != 0) {
if (Weight > MaxWeight) {
MaxWeight = Weight;
MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
}
}
}
uint64_t TempWeight;
// Only set weights if there is at least one non-zero weight.
// In any other case, let the analyzer set weights.
// Do not set weights if the weights are present. In ThinLTO, the profile
// annotation is done twice. If the first annotation already set the
// weights, the second pass does not need to set it.
if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
TI->setMetadata(LLVMContext::MD_prof,
MDB.createBranchWeights(Weights));
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
<< "most popular destination for conditional branches at "
<< ore::NV("CondBranchesLoc", BranchLoc);
});
} else {
LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
}
}
}
/// Once all the branch weights are computed, we emit the MD_prof
/// metadata on BB using the computed values for each of its branches.
///
/// \param F The function to query.
///
/// \returns true if \p F was modified. Returns false, otherwise.
bool SampleProfileLoader::emitAnnotations(Function &F) {
bool Changed = false;
if (FunctionSamples::ProfileIsProbeBased) {
if (!ProbeManager->profileIsValid(F, *Samples)) {
LLVM_DEBUG(
dbgs() << "Profile is invalid due to CFG mismatch for Function "
<< F.getName());
++NumMismatchedProfile;
return false;
}
++NumMatchedProfile;
} else {
if (getFunctionLoc(F) == 0)
return false;
LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
<< F.getName() << ": " << getFunctionLoc(F) << "\n");
}
DenseSet<GlobalValue::GUID> InlinedGUIDs;
if (ProfileIsCS && CallsitePrioritizedInline)
Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
else
Changed |= inlineHotFunctions(F, InlinedGUIDs);
Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
if (Changed)
generateMDProfMetadata(F);
emitCoverageRemarks(F);
return Changed;
}
char SampleProfileLoaderLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
// Add inlined profile call edges to the call graph.
void SampleProfileLoader::addCallGraphEdges(CallGraph &CG,
const FunctionSamples &Samples) {
Function *Caller = SymbolMap.lookup(Samples.getFuncName());
if (!Caller || Caller->isDeclaration())
return;
// Skip non-inlined call edges, which are not important since top-down
// inlining for a non-CS profile is meant to get more precise profile
// matching, not to enable more inlining.
for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
for (const auto &InlinedSamples : CallsiteSamples.second) {
Function *Callee = SymbolMap.lookup(InlinedSamples.first);
if (Callee && !Callee->isDeclaration())
CG[Caller]->addCalledFunction(nullptr, CG[Callee]);
addCallGraphEdges(CG, InlinedSamples.second);
}
}
}
// Replace call graph edges with dynamic call edges from the profile.
void SampleProfileLoader::replaceCallGraphEdges(
CallGraph &CG, StringMap<Function *> &SymbolMap) {
// Remove static call edges from the call graph except for the ones from the
// root which make the call graph connected.
for (const auto &Node : CG)
if (Node.second.get() != CG.getExternalCallingNode())
Node.second->removeAllCalledFunctions();
// Add profile call edges to the call graph.
if (ProfileIsCS) {
ContextTracker->addCallGraphEdges(CG, SymbolMap);
} else {
for (const auto &Samples : Reader->getProfiles())
addCallGraphEdges(CG, Samples.second);
}
}
std::vector<Function *>
SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
std::vector<Function *> FunctionOrderList;
FunctionOrderList.reserve(M.size());
if (!ProfileTopDownLoad || CG == nullptr) {
if (ProfileMergeInlinee) {
      // Disable ProfileMergeInlinee if the profile is not loaded in top-down
      // order, because the profile for a function may then be used to annotate
      // its outline copy before the profiles of its non-inlined inline
      // instances have been merged into it, which is not how
      // ProfileMergeInlinee is supposed to work.
ProfileMergeInlinee = false;
}
for (Function &F : M)
if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
FunctionOrderList.push_back(&F);
return FunctionOrderList;
}
assert(&CG->getModule() == &M);
// Add indirect call edges from profile to augment the static call graph.
// Functions will be processed in a top-down order defined by the static call
// graph. Adjusting the order by considering indirect call edges from the
// profile (which don't exist in the static call graph) can enable the
// inlining of indirect call targets by processing the caller before them.
  // TODO: enable this for non-CS profiles and fix the counts returning logic
  // to fully support indirect calls.
if (UseProfileIndirectCallEdges && ProfileIsCS) {
for (auto &Entry : *CG) {
const auto *F = Entry.first;
if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
continue;
auto &AllContexts = ContextTracker->getAllContextSamplesFor(F->getName());
if (AllContexts.empty())
continue;
for (const auto &BB : *F) {
for (const auto &I : BB.getInstList()) {
const auto *CB = dyn_cast<CallBase>(&I);
if (!CB || !CB->isIndirectCall())
continue;
const DebugLoc &DLoc = I.getDebugLoc();
if (!DLoc)
continue;
auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc);
for (FunctionSamples *Samples : AllContexts) {
if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) {
for (const auto &Target : CallTargets.get()) {
Function *Callee = SymbolMap.lookup(Target.first());
if (Callee && !Callee->isDeclaration())
Entry.second->addCalledFunction(nullptr, (*CG)[Callee]);
}
}
}
}
}
}
}
  // Compute a top-down order based on the profile, which is used later to
  // sort functions within an SCC. The static processing order computed for an
  // SCC may not reflect the call contexts in the context-sensitive profile and
  // thus may cause potential inlining to be overlooked. The function order
  // within an SCC is therefore adjusted to a profile-based top-down order to
  // favor more inlining.
DenseMap<Function *, uint64_t> ProfileOrderMap;
if (UseProfileTopDownOrder ||
(ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) {
// Create a static call graph. The call edges are not important since they
// will be replaced by dynamic edges from the profile.
CallGraph ProfileCG(M);
replaceCallGraphEdges(ProfileCG, SymbolMap);
scc_iterator<CallGraph *> CGI = scc_begin(&ProfileCG);
uint64_t I = 0;
while (!CGI.isAtEnd()) {
for (CallGraphNode *Node : *CGI) {
if (auto *F = Node->getFunction())
ProfileOrderMap[F] = ++I;
}
++CGI;
}
}
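  // Walk the SCCs of the static call graph (callees before callers); the list
  // is reversed below to produce the final top-down processing order.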
scc_iterator<CallGraph *> CGI = scc_begin(CG);
while (!CGI.isAtEnd()) {
uint64_t Start = FunctionOrderList.size();
for (CallGraphNode *Node : *CGI) {
auto *F = Node->getFunction();
if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
FunctionOrderList.push_back(F);
}
// Sort nodes in SCC based on the profile top-down order.
if (!ProfileOrderMap.empty()) {
std::stable_sort(FunctionOrderList.begin() + Start,
FunctionOrderList.end(),
[&ProfileOrderMap](Function *Left, Function *Right) {
return ProfileOrderMap[Left] < ProfileOrderMap[Right];
});
}
++CGI;
}
LLVM_DEBUG({
dbgs() << "Function processing order:\n";
for (auto F : reverse(FunctionOrderList)) {
dbgs() << F->getName() << "\n";
}
});
std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
return FunctionOrderList;
}
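
/// Create the profile reader and read the sample profile for module \p M.
///
/// This also loads the profile symbol list, sets up the replay inline advisor
/// when a replay file is given, and enables the context-sensitive and
/// pseudo-probe modes when the profile requires them.
///
/// \returns true on success, false if the profile could not be read.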
bool SampleProfileLoader::doInitialization(Module &M,
FunctionAnalysisManager *FAM) {
auto &Ctx = M.getContext();
auto ReaderOrErr =
SampleProfileReader::create(Filename, Ctx, RemappingFilename);
if (std::error_code EC = ReaderOrErr.getError()) {
std::string Msg = "Could not open profile: " + EC.message();
Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
return false;
}
Reader = std::move(ReaderOrErr.get());
Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
  // Set the module before reading the profile so the reader may read only the
  // function profiles which are used by the current module.
Reader->setModule(&M);
if (std::error_code EC = Reader->read()) {
std::string Msg = "profile reading failed: " + EC.message();
Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
return false;
}
PSL = Reader->getProfileSymbolList();
  // When profile-sample-accurate is on, ignore the symbol list.
ProfAccForSymsInList =
ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;
if (ProfAccForSymsInList) {
NamesInProfile.clear();
if (auto NameTable = Reader->getNameTable())
NamesInProfile.insert(NameTable->begin(), NameTable->end());
CoverageTracker.setProfAccForSymsInList(true);
}
if (FAM && !ProfileInlineReplayFile.empty()) {
ExternalInlineAdvisor = std::make_unique<ReplayInlineAdvisor>(
M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, ProfileInlineReplayFile,
/*EmitRemarks=*/false);
if (!ExternalInlineAdvisor->areReplayRemarksLoaded())
ExternalInlineAdvisor.reset();
}
// Apply tweaks if context-sensitive profile is available.
if (Reader->profileIsCS()) {
ProfileIsCS = true;
FunctionSamples::ProfileIsCS = true;
    // Enable the priority-based inliner and size inlining by default for
    // CSSPGO.
if (!ProfileSizeInline.getNumOccurrences())
ProfileSizeInline = true;
if (!CallsitePrioritizedInline.getNumOccurrences())
CallsitePrioritizedInline = true;
    // Tracker for profiles under different contexts.
ContextTracker =
std::make_unique<SampleContextTracker>(Reader->getProfiles());
}
// Load pseudo probe descriptors for probe-based function samples.
if (Reader->profileIsProbeBased()) {
ProbeManager = std::make_unique<PseudoProbeManager>(M);
if (!ProbeManager->moduleIsProbed(M)) {
const char *Msg =
"Pseudo-probe-based profile requires SampleProfileProbePass";
Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
return false;
}
}
return true;
}
ModulePass *llvm::createSampleProfileLoaderPass() {
return new SampleProfileLoaderLegacyPass();
}
ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
return new SampleProfileLoaderLegacyPass(Name);
}
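
/// Annotate module \p M using the sample profile read in doInitialization.
///
/// Installs the sample profile summary on the module if not already present,
/// builds the symbol map from canonicalized (and remapped) names to
/// functions, and runs per-function annotation in the order computed by
/// buildFunctionOrder(). For non-CS profiles, entry counts of callees whose
/// callsites were not inlined are updated at the end.
///
/// \returns true if any function in \p M was modified.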
bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
ProfileSummaryInfo *_PSI, CallGraph *CG) {
GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
PSI = _PSI;
if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
ProfileSummary::PSK_Sample);
PSI->refresh();
}
// Compute the total number of samples collected in this profile.
for (const auto &I : Reader->getProfiles())
TotalCollectedSamples += I.second.getTotalSamples();
auto Remapper = Reader->getRemapper();
// Populate the symbol map.
for (const auto &N_F : M.getValueSymbolTable()) {
StringRef OrigName = N_F.getKey();
Function *F = dyn_cast<Function>(N_F.getValue());
if (F == nullptr || OrigName.empty())
continue;
SymbolMap[OrigName] = F;
StringRef NewName = FunctionSamples::getCanonicalFnName(*F);
if (OrigName != NewName && !NewName.empty()) {
auto r = SymbolMap.insert(std::make_pair(NewName, F));
      // Failing to insert means there is already an entry in SymbolMap,
      // i.e. multiple functions are mapped to the same stripped name. In
      // this case of a name conflict, set the value to nullptr to avoid
      // confusion.
if (!r.second)
r.first->second = nullptr;
OrigName = NewName;
}
// Insert the remapped names into SymbolMap.
if (Remapper) {
if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
if (*MapName != OrigName && !MapName->empty())
SymbolMap.insert(std::make_pair(*MapName, F));
}
}
}
assert(SymbolMap.count(StringRef()) == 0 &&
"No empty StringRef should be added in SymbolMap");
bool retval = false;
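  // Annotate functions in the computed order, resetting per-function state
  // in between.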
for (auto F : buildFunctionOrder(M, CG)) {
assert(!F->isDeclaration());
clearFunctionData();
retval |= runOnFunction(*F, AM);
}
  // Account for cold calls not inlined.
if (!ProfileIsCS)
for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
notInlinedCallInfo)
updateProfileCallee(pair.first, pair.second.entryCount);
return retval;
}
bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
ACT = &getAnalysis<AssumptionCacheTracker>();
TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
ProfileSummaryInfo *PSI =
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
}
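
/// Annotate a single function \p F with its sample profile.
///
/// The entry count is initialized conservatively, respecting
/// profile-sample-accurate and the profile symbol list, before the samples
/// for \p F are looked up in the context tracker (for CS profiles) or in the
/// reader. Annotations are emitted only when non-empty samples are found.
///
/// \returns true if \p F was modified.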
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
DILocation2SampleMap.clear();
  // By default the entry count is initialized to -1, which will be treated
  // conservatively by getEntryCount as the same as unknown (None). This is
  // to avoid newly added code being treated as cold. If we have samples,
  // this will be overwritten in emitAnnotations.
uint64_t initialEntryCount = -1;
ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
    // Initialize all the function entry counts to 0, so that all functions
    // without a profile will be regarded as cold.
initialEntryCount = 0;
    // profile-sample-accurate is a user assertion which takes precedence over
    // the symbol list. When profile-sample-accurate is on, ignore the symbol
    // list.
ProfAccForSymsInList = false;
}
CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
  // PSL -- the profile symbol list includes all the symbols in the sampled
  // binary. If ProfileAccurateForSymsInList is enabled, PSL is used to treat
  // old functions without samples as cold, without having to worry about new
  // and hot functions being mistakenly treated as cold.
if (ProfAccForSymsInList) {
// Initialize the entry count to 0 for functions in the list.
if (PSL->contains(F.getName()))
initialEntryCount = 0;
    // A function in the symbol list but without samples will be regarded as
    // cold. To minimize the potential negative performance impact this could
    // have, we want to be a little conservative here: if a function shows up
    // in the profile at all, whether as an outline function, an inline
    // instance, or a call target, treat the function as not cold. This
    // handles cases where most callsites of a function are inlined in the
    // sampled binary but not in the current build (because of source code
    // drift, imprecise debug information, or callsites that are individually
    // cold but not cold in aggregate...), so an outline function that shows
    // up as cold in the sampled binary will actually not be cold in the
    // current build.
StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
if (NamesInProfile.count(CanonName))
initialEntryCount = -1;
}
// Initialize entry count when the function has no existing entry
// count value.
if (!F.getEntryCount().hasValue())
F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
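  // Obtain an OptimizationRemarkEmitter, either from the function analysis
  // manager (new pass manager) or a locally owned one (legacy pass manager).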
std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
if (AM) {
auto &FAM =
AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
.getManager();
ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
} else {
OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
ORE = OwnedORE.get();
}
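  // For CS profiles, the function's base samples come from the context
  // tracker; otherwise query the reader directly.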
if (ProfileIsCS)
Samples = ContextTracker->getBaseSamplesFor(F);
else
Samples = Reader->getSamplesFor(F);
if (Samples && !Samples->empty())
return emitAnnotations(F);
return false;
}
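
/// New pass manager entry point: wire up the analysis getters, initialize the
/// loader, and run it on \p M. All analyses are preserved when the profile
/// cannot be loaded or when no function is modified.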
PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
ModuleAnalysisManager &AM) {
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
return FAM.getResult<AssumptionAnalysis>(F);
};
auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
return FAM.getResult<TargetIRAnalysis>(F);
};
auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
return FAM.getResult<TargetLibraryAnalysis>(F);
};
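  // Prefer the file names configured on this pass instance and fall back to
  // the global SampleProfileFile / SampleProfileRemappingFile options.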
SampleProfileLoader SampleLoader(
ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
: ProfileRemappingFileName,
LTOPhase, GetAssumptionCache, GetTTI, GetTLI);
if (!SampleLoader.doInitialization(M, &FAM))
return PreservedAnalyses::all();
ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
if (!SampleLoader.runOnModule(M, &AM, PSI, &CG))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}