[MemProf] Context disambiguation cloning pass [patch 4/4]
Applies ThinLTO cloning decisions made during the thin link and recorded in the summary index to the IR during the ThinLTO backend. Depends on D141077. Differential Revision: https://reviews.llvm.org/D149117
This commit is contained in:
@@ -99,6 +99,10 @@ public:
|
||||
ImmutablePass *
|
||||
createImmutableModuleSummaryIndexWrapperPass(const ModuleSummaryIndex *Index);
|
||||
|
||||
/// Returns true if the instruction could have memprof metadata, used to ensure
|
||||
/// consistency between summary analysis and the ThinLTO backend processing.
|
||||
bool mayHaveMemprofSummary(const CallBase *CB);
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif // LLVM_ANALYSIS_MODULESUMMARYANALYSIS_H
|
||||
|
||||
@@ -18,13 +18,13 @@
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/StringSet.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/ModuleSummaryIndex.h"
|
||||
#include "llvm/IR/PassManager.h"
|
||||
#include <functional>
|
||||
|
||||
namespace llvm {
|
||||
class GlobalValueSummary;
|
||||
class Module;
|
||||
class ModuleSummaryIndex;
|
||||
class OptimizationRemarkEmitter;
|
||||
|
||||
class MemProfContextDisambiguation
|
||||
@@ -34,8 +34,19 @@ class MemProfContextDisambiguation
|
||||
Module &M,
|
||||
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
|
||||
|
||||
/// In the ThinLTO backend, apply the cloning decisions in ImportSummary to
|
||||
/// the IR.
|
||||
bool applyImport(Module &M);
|
||||
|
||||
/// Import summary containing cloning decisions for the ThinLTO backend.
|
||||
const ModuleSummaryIndex *ImportSummary;
|
||||
|
||||
// Owns the import summary specified by internal options for testing the
|
||||
// ThinLTO backend via opt (to simulate distributed ThinLTO).
|
||||
std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;
|
||||
|
||||
public:
|
||||
MemProfContextDisambiguation() {}
|
||||
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
|
||||
|
||||
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
|
||||
|
||||
|
||||
@@ -284,6 +284,10 @@ static void computeFunctionSummary(
|
||||
std::vector<CallsiteInfo> Callsites;
|
||||
std::vector<AllocInfo> Allocs;
|
||||
|
||||
#ifndef NDEBUG
|
||||
DenseSet<const CallBase *> CallsThatMayHaveMemprofSummary;
|
||||
#endif
|
||||
|
||||
bool HasInlineAsmMaybeReferencingInternal = false;
|
||||
bool HasIndirBranchToBlockAddress = false;
|
||||
bool HasUnknownCall = false;
|
||||
@@ -427,6 +431,10 @@ static void computeFunctionSummary(
|
||||
.updateHotness(getHotness(Candidate.Count, PSI));
|
||||
}
|
||||
|
||||
// Summarize memprof related metadata. This is only needed for ThinLTO.
|
||||
if (!IsThinLTO)
|
||||
continue;
|
||||
|
||||
// TODO: Skip indirect calls for now. Need to handle these better, likely
|
||||
// by creating multiple Callsites, one per target, then speculatively
|
||||
// devirtualize while applying clone info in the ThinLTO backends. This
|
||||
@@ -437,6 +445,14 @@ static void computeFunctionSummary(
|
||||
if (!CalledFunction)
|
||||
continue;
|
||||
|
||||
// Ensure we keep this analysis in sync with the handling in the ThinLTO
|
||||
// backend (see MemProfContextDisambiguation::applyImport). Save this call
|
||||
// so that we can skip it in checking the reverse case later.
|
||||
assert(mayHaveMemprofSummary(CB));
|
||||
#ifndef NDEBUG
|
||||
CallsThatMayHaveMemprofSummary.insert(CB);
|
||||
#endif
|
||||
|
||||
// Compute the list of stack ids first (so we can trim them from the stack
|
||||
// ids on any MIBs).
|
||||
CallStack<MDNode, MDNode::op_iterator> InstCallsite(
|
||||
@@ -546,6 +562,25 @@ static void computeFunctionSummary(
|
||||
? CalleeInfo::HotnessType::Cold
|
||||
: CalleeInfo::HotnessType::Critical);
|
||||
|
||||
#ifndef NDEBUG
|
||||
// Make sure that all calls we decided could not have memprof summaries get a
|
||||
// false value for mayHaveMemprofSummary, to ensure that this handling remains
|
||||
// in sync with the ThinLTO backend handling.
|
||||
if (IsThinLTO) {
|
||||
for (const BasicBlock &BB : F) {
|
||||
for (const Instruction &I : BB) {
|
||||
const auto *CB = dyn_cast<CallBase>(&I);
|
||||
if (!CB)
|
||||
continue;
|
||||
// We already checked these above.
|
||||
if (CallsThatMayHaveMemprofSummary.count(CB))
|
||||
continue;
|
||||
assert(!mayHaveMemprofSummary(CB));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
bool NonRenamableLocal = isNonRenamableLocal(F);
|
||||
bool NotEligibleForImport = NonRenamableLocal ||
|
||||
HasInlineAsmMaybeReferencingInternal ||
|
||||
@@ -1042,3 +1077,36 @@ ImmutablePass *llvm::createImmutableModuleSummaryIndexWrapperPass(
|
||||
|
||||
INITIALIZE_PASS(ImmutableModuleSummaryIndexWrapperPass, "module-summary-info",
|
||||
"Module summary info", false, true)
|
||||
|
||||
bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
|
||||
if (!CB)
|
||||
return false;
|
||||
if (CB->isDebugOrPseudoInst())
|
||||
return false;
|
||||
auto *CI = dyn_cast<CallInst>(CB);
|
||||
auto *CalledValue = CB->getCalledOperand();
|
||||
auto *CalledFunction = CB->getCalledFunction();
|
||||
if (CalledValue && !CalledFunction) {
|
||||
CalledValue = CalledValue->stripPointerCasts();
|
||||
// Stripping pointer casts can reveal a called function.
|
||||
CalledFunction = dyn_cast<Function>(CalledValue);
|
||||
}
|
||||
// Check if this is an alias to a function. If so, get the
|
||||
// called aliasee for the checks below.
|
||||
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
|
||||
assert(!CalledFunction &&
|
||||
"Expected null called function in callsite for alias");
|
||||
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
|
||||
}
|
||||
// Check if this is a direct call to a known function or a known
|
||||
// intrinsic, or an indirect call with profile data.
|
||||
if (CalledFunction) {
|
||||
if (CI && CalledFunction->isIntrinsic())
|
||||
return false;
|
||||
} else {
|
||||
// TODO: For now skip indirect calls. See comments in
|
||||
// computeFunctionSummary for what is needed to handle this.
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1531,6 +1531,11 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
|
||||
ModulePassManager MPM;
|
||||
|
||||
if (ImportSummary) {
|
||||
// For ThinLTO we must apply the context disambiguation decisions early, to
|
||||
// ensure we can correctly match the callsites to summary data.
|
||||
if (EnableMemProfContextDisambiguation)
|
||||
MPM.addPass(MemProfContextDisambiguation(ImportSummary));
|
||||
|
||||
// These passes import type identifier resolutions for whole-program
|
||||
// devirtualization and CFI. They must run early because other passes may
|
||||
// disturb the specific instruction patterns that these passes look for,
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "llvm/Analysis/MemoryProfileInfo.h"
|
||||
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
|
||||
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
|
||||
#include "llvm/Bitcode/BitcodeReader.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
@@ -52,10 +53,30 @@ using namespace llvm::memprof;
|
||||
|
||||
// Statistics collected both during whole program analysis (regular LTO and
// the thin link) and during the ThinLTO backend, used to evaluate the
// effectiveness of context disambiguation cloning.
// NOTE: fixed typo "ambigous" -> "ambiguous" in UnclonableAllocsThinBackend.
STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
|
||||
|
||||
static cl::opt<std::string> DotFilePathPrefix(
|
||||
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
|
||||
@@ -78,6 +99,11 @@ static cl::opt<bool>
|
||||
VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
|
||||
cl::desc("Perform frequent verification checks on nodes."));
|
||||
|
||||
static cl::opt<std::string> MemProfImportSummary(
|
||||
"memprof-import-summary",
|
||||
cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
|
||||
cl::Hidden);
|
||||
|
||||
/// CRTP base for graphs built from either IR or ThinLTO summary index.
|
||||
///
|
||||
/// The graph represents the call contexts in all memprof metadata on allocation
|
||||
@@ -109,8 +135,8 @@ public:
|
||||
/// Assign callsite clones to functions, cloning functions as needed to
|
||||
/// accommodate the combinations of their callsite clones reached by callers.
|
||||
/// For regular LTO this clones functions and callsites in the IR, but for
|
||||
/// ThinLTO the cloning decisions are noted in the summaries and applied
|
||||
/// later.
|
||||
/// ThinLTO the cloning decisions are noted in the summaries and later applied
|
||||
/// in applyImport.
|
||||
bool assignFunctions();
|
||||
|
||||
void dump() const;
|
||||
@@ -2779,6 +2805,358 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Create NumClones - 1 clones of function \p F in module \p M (version 0 is
// the original copy and is left in place). Returns one ValueToValueMapTy per
// new clone, mapping original values to their counterparts in that clone.
// Also clones any aliases to F recorded in \p FuncToAliasMap, since modified
// callsites may invoke the clone via an alias.
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
        &FuncToAliasMap) {
  // The first "clone" is the original copy, we should only call this if we
  // needed to create new clones.
  assert(NumClones > 1);
  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
  // One VMap per new clone (clone I uses VMaps[I-1]).
  VMaps.reserve(NumClones - 1);
  FunctionsClonedThinBackend++;
  for (unsigned I = 1; I < NumClones; I++) {
    VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
    auto *NewF = CloneFunction(&F, *VMaps.back());
    FunctionClonesThinBackend++;
    // Strip memprof and callsite metadata from clone as they are no longer
    // needed.
    for (auto &BB : *NewF) {
      for (auto &Inst : BB) {
        Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
        Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
    std::string Name = getMemProfFuncName(F.getName(), I);
    auto *PrevF = M.getFunction(Name);
    if (PrevF) {
      // We might have created this when adjusting callsite in another
      // function. It should be a declaration.
      assert(PrevF->isDeclaration());
      // Take over the placeholder's name and uses, then remove it, so the
      // clone becomes the single definition under this name.
      NewF->takeName(PrevF);
      PrevF->replaceAllUsesWith(NewF);
      PrevF->eraseFromParent();
    } else
      NewF->setName(Name);
    ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
             << "created clone " << ore::NV("NewFunction", NewF));

    // Now handle aliases to this function, and clone those as well.
    if (!FuncToAliasMap.count(&F))
      continue;
    for (auto *A : FuncToAliasMap[&F]) {
      std::string Name = getMemProfFuncName(A->getName(), I);
      auto *PrevA = M.getNamedAlias(Name);
      // Create the cloned alias pointing at the new function clone.
      auto *NewA = GlobalAlias::create(A->getValueType(),
                                       A->getType()->getPointerAddressSpace(),
                                       A->getLinkage(), Name, NewF);
      NewA->copyAttributesFrom(A);
      if (PrevA) {
        // We might have created this when adjusting callsite in another
        // function. It should be a declaration.
        assert(PrevA->isDeclaration());
        NewA->takeName(PrevA);
        PrevA->replaceAllUsesWith(NewA);
        PrevA->eraseFromParent();
      }
    }
  }
  return VMaps;
}
|
||||
|
||||
// Locate the summary for F. This is complicated by the fact that it might
|
||||
// have been internalized or promoted.
|
||||
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
|
||||
const ModuleSummaryIndex *ImportSummary) {
|
||||
// FIXME: Ideally we would retain the original GUID in some fashion on the
|
||||
// function (e.g. as metadata), but for now do our best to locate the
|
||||
// summary without that information.
|
||||
ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
|
||||
if (!TheFnVI)
|
||||
// See if theFn was internalized, by checking index directly with
|
||||
// original name (this avoids the name adjustment done by getGUID() for
|
||||
// internal symbols).
|
||||
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
|
||||
if (TheFnVI)
|
||||
return TheFnVI;
|
||||
// Now query with the original name before any promotion was performed.
|
||||
StringRef OrigName =
|
||||
ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
|
||||
std::string OrigId = GlobalValue::getGlobalIdentifier(
|
||||
OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
|
||||
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
|
||||
if (TheFnVI)
|
||||
return TheFnVI;
|
||||
// Could be a promoted local imported from another module. We need to pass
|
||||
// down more info here to find the original module id. For now, try with
|
||||
// the OrigName which might have been stored in the OidGuidMap in the
|
||||
// index. This would not work if there were same-named locals in multiple
|
||||
// modules, however.
|
||||
auto OrigGUID =
|
||||
ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
|
||||
if (OrigGUID)
|
||||
TheFnVI = ImportSummary->getValueInfo(OrigGUID);
|
||||
return TheFnVI;
|
||||
}
|
||||
|
||||
// In the ThinLTO backend, apply the cloning decisions recorded in
// ImportSummary to the IR: clone functions (and aliases to them) the number
// of times the thin link decided, tag allocation calls with "memprof"
// cold/notcold attributes per version, and redirect callsites in each clone
// to the appropriate callee clone. Returns true if the module was changed.
bool MemProfContextDisambiguation::applyImport(Module &M) {
  assert(ImportSummary);
  bool Changed = false;

  // Clones are identified by a suffix in their name; skip re-processing them.
  auto IsMemProfClone = [](const Function &F) {
    return F.getName().contains(MemProfCloneSuffix);
  };

  // We also need to clone any aliases that reference cloned functions, because
  // the modified callsites may invoke via the alias. Keep track of the aliases
  // for each function.
  std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
      FuncToAliasMap;
  for (auto &A : M.aliases()) {
    auto *Aliasee = A.getAliaseeObject();
    if (auto *F = dyn_cast<Function>(Aliasee))
      FuncToAliasMap[F].insert(&A);
  }

  for (auto &F : M) {
    if (F.isDeclaration() || IsMemProfClone(F))
      continue;

    OptimizationRemarkEmitter ORE(&F);

    // VMaps[I-1] maps original values to those in clone I (clone 0 is the
    // original function and has no VMap).
    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
    bool ClonesCreated = false;
    unsigned NumClonesCreated = 0;
    // Lazily clone F the first time a summary record requires more than one
    // version; subsequent records must agree on the clone count.
    auto CloneFuncIfNeeded = [&](unsigned NumClones) {
      // We should at least have version 0 which is the original copy.
      assert(NumClones > 0);
      // If only one copy needed use original.
      if (NumClones == 1)
        return;
      // If we already performed cloning of this function, confirm that the
      // requested number of clones matches (the thin link should ensure the
      // number of clones for each constituent callsite is consistent within
      // each function), before returning.
      if (ClonesCreated) {
        assert(NumClonesCreated == NumClones);
        return;
      }
      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
      // The first "clone" is the original copy, which doesn't have a VMap.
      assert(VMaps.size() == NumClones - 1);
      Changed = true;
      ClonesCreated = true;
      NumClonesCreated = NumClones;
    };

    // Locate the summary for F.
    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
    // If not found, this could be an imported local (see comment in
    // findValueInfoForFunc). Skip for now as it will be cloned in its original
    // module (where it would have been promoted to global scope so should
    // satisfy any reference in this module).
    if (!TheFnVI)
      continue;

    auto *GVSummary =
        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
    if (!GVSummary)
      // Must have been imported, use the first summary (might be multiple if
      // this was a linkonce_odr).
      GVSummary = TheFnVI.getSummaryList().front().get();

    // If this was an imported alias skip it as we won't have the function
    // summary, and it should be cloned in the original module.
    if (isa<AliasSummary>(GVSummary))
      continue;

    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());

    // Nothing to apply for this function.
    if (FS->allocs().empty() && FS->callsites().empty())
      continue;

    // Walk the summary records in parallel with the instruction walk below.
    auto SI = FS->callsites().begin();
    auto AI = FS->allocs().begin();

    // Assume for now that the instructions are in the exact same order
    // as when the summary was created, but confirm this is correct by
    // matching the stack ids.
    for (auto &BB : F) {
      for (auto &I : BB) {
        auto *CB = dyn_cast<CallBase>(&I);
        // Same handling as when creating module summary.
        if (!mayHaveMemprofSummary(CB))
          continue;

        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
            I.getMetadata(LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics.
        if (CB->getAttributes().hasFnAttr("memprof")) {
          assert(!MemProfMD);
          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          // Remove any remaining callsite metadata and we can skip the rest of
          // the handling for this instruction, since no cloning needed.
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

          // Sanity check that the MIB stack ids match between the summary and
          // instruction metadata.
          auto MIBIter = AllocNode.MIBs.begin();
          for (auto &MDOp : MemProfMD->operands()) {
            assert(MIBIter != AllocNode.MIBs.end());
            auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackMDNode = getMIBStackNode(MIBMD);
            assert(StackMDNode);
            SmallVector<unsigned> StackIdsFromMetadata;
            CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
            for (auto ContextIter =
                     StackContext.beginAfterSharedPrefix(CallsiteContext);
                 ContextIter != StackContext.end(); ++ContextIter) {
              // If this is a direct recursion, simply skip the duplicate
              // entries, to be consistent with how the summary ids were
              // generated during ModuleSummaryAnalysis.
              if (!StackIdsFromMetadata.empty() &&
                  StackIdsFromMetadata.back() == *ContextIter)
                continue;
              assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     *ContextIter);
              StackIdIndexIter++;
            }
            MIBIter++;
          }

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();

          // If there is only one version that means we didn't end up
          // considering this function for cloning, and in that case the alloc
          // will still be none type or should have gotten the default NotCold.
          // Skip that after calling clone helper since that does some sanity
          // checks that confirm we haven't decided yet that we need cloning.
          if (AllocNode.Versions.size() == 1) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));

          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
            auto A = llvm::Attribute::get(F.getContext(), "memprof",
                                          AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a VMaps
              // entry).
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(A);
            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
        } else if (!CallsiteContext.empty()) {
          // Consult the next callsite node.
          assert(SI != FS->callsites().end());
          auto &StackNode = *(SI++);

#ifndef NDEBUG
          // Sanity check that the stack ids match between the summary and
          // instruction metadata.
          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
          for (auto StackId : CallsiteContext) {
            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                   StackId);
            StackIdIndexIter++;
          }
#endif

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());

          // Should have skipped indirect calls via mayHaveMemprofSummary.
          assert(CB->getCalledFunction());
          assert(!IsMemProfClone(*CB->getCalledFunction()));

          // Update the calls per the summary info.
          // Save orig name since it gets updated in the first iteration
          // below.
          auto CalleeOrigName = CB->getCalledFunction()->getName();
          for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
            // Do nothing if this version calls the original version of its
            // callee.
            if (!StackNode.Clones[J])
              continue;
            // The callee clone may not exist in this module yet (e.g. it is
            // cloned in its defining module); insert a declaration if needed.
            auto NewF = M.getOrInsertFunction(
                getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
                CB->getCalledFunction()->getFunctionType());
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->setCalledFunction(NewF);
            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                     << ore::NV("Call", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " assigned to call function clone "
                     << ore::NV("Callee", NewF.getCallee()));
          }
        }
        // Memprof and callsite metadata on memory allocations no longer needed.
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
  }

  return Changed;
}
|
||||
|
||||
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
||||
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
|
||||
if (DumpCCG) {
|
||||
@@ -2820,12 +3198,46 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
|
||||
bool MemProfContextDisambiguation::processModule(
|
||||
Module &M,
|
||||
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
|
||||
bool Changed = false;
|
||||
|
||||
// If we have an import summary, then the cloning decisions were made during
|
||||
// the thin link on the index. Apply them and return.
|
||||
if (ImportSummary)
|
||||
return applyImport(M);
|
||||
|
||||
ModuleCallsiteContextGraph CCG(M, OREGetter);
|
||||
Changed = CCG.process();
|
||||
return CCG.process();
|
||||
}
|
||||
|
||||
return Changed;
|
||||
// Construct the pass. If \p Summary is non-null it is the import summary
// provided by the ThinLTO backend pipeline. Otherwise, if the
// -memprof-import-summary option was given (testing via opt), read and parse
// that bitcode file into an owned index. Read/parse failures are logged to
// errs() and leave ImportSummary null (best-effort, no error propagation
// from a constructor).
MemProfContextDisambiguation::MemProfContextDisambiguation(
    const ModuleSummaryIndex *Summary)
    : ImportSummary(Summary) {
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
    assert(MemProfImportSummary.empty());
    return;
  }
  if (MemProfImportSummary.empty())
    return;

  auto ReadSummaryFile =
      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  // Keep ownership of the parsed index for the pass lifetime, and point the
  // non-owning ImportSummary at it.
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}
|
||||
|
||||
PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
|
||||
|
||||
@@ -44,12 +44,14 @@
|
||||
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
|
||||
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
|
||||
; RUN: --check-prefix=STATS
|
||||
; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
|
||||
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
|
||||
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
|
||||
|
||||
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
|
||||
|
||||
|
||||
;; Try again but with distributed ThinLTO
|
||||
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
|
||||
@@ -71,11 +73,18 @@
|
||||
;; Check distributed index
|
||||
; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
|
||||
|
||||
;; Run ThinLTO backend
|
||||
; RUN: opt -passes=memprof-context-disambiguation \
|
||||
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
|
||||
; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
|
||||
; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
source_filename = "memprof-basic.ll"
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define i32 @main() {
|
||||
define i32 @main() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z3foov(), !callsite !0
|
||||
%call1 = call ptr @_Z3foov(), !callsite !1
|
||||
@@ -86,7 +95,7 @@ declare void @_ZdaPv()
|
||||
|
||||
declare i32 @sleep()
|
||||
|
||||
define internal ptr @_Z3barv() {
|
||||
define internal ptr @_Z3barv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7
|
||||
ret ptr null
|
||||
@@ -94,13 +103,13 @@ entry:
|
||||
|
||||
declare ptr @_Znam(i64)
|
||||
|
||||
define internal ptr @_Z3bazv() {
|
||||
define internal ptr @_Z3bazv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z3barv(), !callsite !8
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define internal ptr @_Z3foov() {
|
||||
define internal ptr @_Z3foov() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z3bazv(), !callsite !9
|
||||
ret ptr null
|
||||
@@ -109,6 +118,8 @@ entry:
|
||||
; uselistorder directives
|
||||
uselistorder ptr @_Z3foov, { 1, 0 }
|
||||
|
||||
attributes #0 = { noinline optnone }
|
||||
|
||||
!0 = !{i64 8632435727821051414}
|
||||
!1 = !{i64 -3421689549917153178}
|
||||
!2 = !{!3, !5}
|
||||
@@ -252,9 +263,50 @@ uselistorder ptr @_Z3foov, { 1, 0 }
|
||||
; DUMP: Clone of [[BAR]]
|
||||
|
||||
|
||||
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
|
||||
; REMARKS: created clone _Z3barv.memprof.1
|
||||
; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
|
||||
; REMARKS: created clone _Z3bazv.memprof.1
|
||||
; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1
|
||||
; REMARKS: created clone _Z3foov.memprof.1
|
||||
; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1
|
||||
|
||||
|
||||
; IR: define {{.*}} @main
|
||||
;; The first call to foo does not allocate cold memory. It should call the
|
||||
;; original functions, which ultimately call the original allocation decorated
|
||||
;; with a "notcold" attribute.
|
||||
; IR: call {{.*}} @_Z3foov()
|
||||
;; The second call to foo allocates cold memory. It should call cloned functions
|
||||
;; which ultimately call a cloned allocation decorated with a "cold" attribute.
|
||||
; IR: call {{.*}} @_Z3foov.memprof.1()
|
||||
; IR: define internal {{.*}} @_Z3barv()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
|
||||
; IR: define internal {{.*}} @_Z3bazv()
|
||||
; IR: call {{.*}} @_Z3barv()
|
||||
; IR: define internal {{.*}} @_Z3foov()
|
||||
; IR: call {{.*}} @_Z3bazv()
|
||||
; IR: define internal {{.*}} @_Z3barv.memprof.1()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
|
||||
; IR: define internal {{.*}} @_Z3bazv.memprof.1()
|
||||
; IR: call {{.*}} @_Z3barv.memprof.1()
|
||||
; IR: define internal {{.*}} @_Z3foov.memprof.1()
|
||||
; IR: call {{.*}} @_Z3bazv.memprof.1()
|
||||
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
|
||||
; IR: attributes #[[COLD]] = { "memprof"="cold" }
|
||||
|
||||
|
||||
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
|
||||
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
|
||||
; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
|
||||
; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
|
||||
|
||||
|
||||
; DOT: digraph "postbuild" {
|
||||
|
||||
@@ -66,13 +66,15 @@
|
||||
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
|
||||
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
|
||||
; RUN: --check-prefix=STATS
|
||||
; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
|
||||
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
|
||||
;; We should clone D once for the cold allocations via C.
|
||||
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
|
||||
|
||||
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
|
||||
|
||||
|
||||
;; Try again but with distributed ThinLTO
|
||||
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
|
||||
@@ -95,11 +97,18 @@
|
||||
;; Check distributed index
|
||||
; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
|
||||
|
||||
;; Run ThinLTO backend
|
||||
; RUN: opt -passes=memprof-context-disambiguation \
|
||||
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
|
||||
; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
|
||||
; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
source_filename = "duplicate-context-ids.ll"
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal ptr @_Z1Dv() {
|
||||
define internal ptr @_Z1Dv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
|
||||
ret ptr null
|
||||
@@ -107,31 +116,31 @@ entry:
|
||||
|
||||
declare ptr @_Znam(i64)
|
||||
|
||||
define internal ptr @_Z1Fv() {
|
||||
define internal ptr @_Z1Fv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z1Dv(), !callsite !6
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define internal ptr @_Z1Cv() {
|
||||
define internal ptr @_Z1Cv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z1Dv(), !callsite !7
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define internal ptr @_Z1Bv() {
|
||||
define internal ptr @_Z1Bv() #0 {
|
||||
entry:
|
||||
%call.i = call ptr @_Z1Dv(), !callsite !8
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define internal ptr @_Z1Ev() {
|
||||
define internal ptr @_Z1Ev() #0 {
|
||||
entry:
|
||||
%call.i = call ptr @_Z1Dv(), !callsite !9
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define i32 @main() {
|
||||
define i32 @main() #0 {
|
||||
entry:
|
||||
call ptr @_Z1Bv()
|
||||
call ptr @_Z1Ev()
|
||||
@@ -143,6 +152,8 @@ declare void @_ZdaPv()
|
||||
|
||||
declare i32 @sleep()
|
||||
|
||||
attributes #0 = { noinline optnone}
|
||||
|
||||
!0 = !{!1, !3}
|
||||
!1 = !{!2, !"cold"}
|
||||
!2 = !{i64 6541423618768552252, i64 -6270142974039008131}
|
||||
@@ -300,10 +311,43 @@ declare i32 @sleep()
|
||||
; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
|
||||
; DUMP: Clone of [[D]]
|
||||
|
||||
; REMARKS: created clone _Z1Dv.memprof.1
|
||||
; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
|
||||
; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1
|
||||
; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1
|
||||
|
||||
|
||||
;; The allocation via F does not allocate cold memory. It should call the
|
||||
;; original D, which ultimately call the original allocation decorated
|
||||
;; with a "notcold" attribute.
|
||||
; IR: define internal {{.*}} @_Z1Dv()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
|
||||
; IR: define internal {{.*}} @_Z1Fv()
|
||||
; IR: call {{.*}} @_Z1Dv()
|
||||
;; The allocations via B and E allocate cold memory. They should call the
|
||||
;; cloned D, which ultimately call the cloned allocation decorated with a
|
||||
;; "cold" attribute.
|
||||
; IR: define internal {{.*}} @_Z1Bv()
|
||||
; IR: call {{.*}} @_Z1Dv.memprof.1()
|
||||
; IR: define internal {{.*}} @_Z1Ev()
|
||||
; IR: call {{.*}} @_Z1Dv.memprof.1()
|
||||
; IR: define internal {{.*}} @_Z1Dv.memprof.1()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
|
||||
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
|
||||
; IR: attributes #[[COLD]] = { "memprof"="cold" }
|
||||
|
||||
|
||||
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
|
||||
; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
|
||||
|
||||
|
||||
; DOTPRE: digraph "prestackupdate" {
|
||||
|
||||
@@ -58,7 +58,9 @@
|
||||
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
|
||||
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
|
||||
; RUN: --check-prefix=STATS
|
||||
; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
|
||||
|
||||
|
||||
;; Try again but with distributed ThinLTO
|
||||
@@ -73,13 +75,20 @@
|
||||
; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
|
||||
; RUN: --check-prefix=STATS
|
||||
|
||||
;; Run ThinLTO backend
|
||||
; RUN: opt -passes=memprof-context-disambiguation \
|
||||
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
|
||||
; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
|
||||
; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
|
||||
source_filename = "funcassigncloning.ll"
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
; Function Attrs: noinline optnone
|
||||
define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) {
|
||||
define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
|
||||
entry:
|
||||
%call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
|
||||
%call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
|
||||
@@ -107,7 +116,7 @@ entry:
|
||||
}
|
||||
|
||||
; Function Attrs: noinline optnone
|
||||
define i32 @main() {
|
||||
define i32 @main() #0 {
|
||||
entry:
|
||||
call void @_Z1BPPcS0_()
|
||||
call void @_Z1CPPcS0_()
|
||||
@@ -122,6 +131,8 @@ declare i32 @sleep()
|
||||
; uselistorder directives
|
||||
uselistorder ptr @_Znam, { 1, 0 }
|
||||
|
||||
attributes #0 = { noinline optnone }
|
||||
|
||||
!0 = !{!1, !3, !5}
|
||||
!1 = !{!2, !"cold"}
|
||||
!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
|
||||
@@ -230,6 +241,54 @@ uselistorder ptr @_Znam, { 1, 0 }
|
||||
; DUMP: Clone of [[ENEW2ORIG]]
|
||||
|
||||
|
||||
;; We greedily create a clone of E that is initially used by the clones of the
|
||||
;; first call to new. However, we end up with an incompatible set of callers
|
||||
;; given the second call to new which has clones with a different combination of
|
||||
;; callers. Eventually, we create 2 more clones, and the first clone becomes dead.
|
||||
; REMARKS: created clone _Z1EPPcS0_.memprof.1
|
||||
; REMARKS: created clone _Z1EPPcS0_.memprof.2
|
||||
; REMARKS: created clone _Z1EPPcS0_.memprof.3
|
||||
; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold
|
||||
; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold
|
||||
; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3
|
||||
; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2
|
||||
|
||||
|
||||
;; Original version of E is used for the non-cold allocations, both from B.
|
||||
; IR: define internal {{.*}} @_Z1EPPcS0_(
|
||||
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
|
||||
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
|
||||
; IR: define internal {{.*}} @_Z1BPPcS0_(
|
||||
; IR: call {{.*}} @_Z1EPPcS0_(
|
||||
;; C calls a clone of E with the first new allocating cold memory and the
|
||||
;; second allocating non-cold memory.
|
||||
; IR: define internal {{.*}} @_Z1CPPcS0_(
|
||||
; IR: call {{.*}} @_Z1EPPcS0_.memprof.3(
|
||||
;; D calls a clone of E with the first new allocating non-cold memory and the
|
||||
;; second allocating cold memory.
|
||||
; IR: define internal {{.*}} @_Z1DPPcS0_(
|
||||
; IR: call {{.*}} @_Z1EPPcS0_.memprof.2(
|
||||
; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2(
|
||||
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
|
||||
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
|
||||
; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3(
|
||||
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
|
||||
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
|
||||
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
|
||||
; IR: attributes #[[COLD]] = { "memprof"="cold" }
|
||||
|
||||
|
||||
; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
|
||||
; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS-BE: 8 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
|
||||
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
|
||||
; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
|
||||
; STATS-BE: 4 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
|
||||
|
||||
@@ -66,13 +66,15 @@
|
||||
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
|
||||
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
|
||||
; RUN: --check-prefix=STATS
|
||||
; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
|
||||
;; We should only create a single clone of foo, for the direct call
|
||||
;; from main allocating cold memory.
|
||||
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
|
||||
|
||||
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
|
||||
|
||||
|
||||
;; Try again but with distributed ThinLTO
|
||||
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
|
||||
@@ -94,6 +96,13 @@
|
||||
;; from main allocating cold memory.
|
||||
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
|
||||
|
||||
;; Run ThinLTO backend
|
||||
; RUN: opt -passes=memprof-context-disambiguation \
|
||||
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
|
||||
; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
|
||||
; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS
|
||||
|
||||
source_filename = "indirectcall.ll"
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
@@ -101,12 +110,12 @@ target triple = "x86_64-unknown-linux-gnu"
|
||||
@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr
|
||||
@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr
|
||||
|
||||
define internal ptr @_Z3barP1A(ptr %a) {
|
||||
define internal ptr @_Z3barP1A(ptr %a) #0 {
|
||||
entry:
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define i32 @main() {
|
||||
define i32 @main() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z3foov(), !callsite !0
|
||||
%call1 = call ptr @_Z3foov(), !callsite !1
|
||||
@@ -121,19 +130,19 @@ declare void @_ZdaPv()
|
||||
|
||||
declare i32 @sleep()
|
||||
|
||||
define internal ptr @_ZN1A1xEv() {
|
||||
define internal ptr @_ZN1A1xEv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z3foov(), !callsite !6
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define internal ptr @_ZN1B1xEv() {
|
||||
define internal ptr @_ZN1B1xEv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z3foov(), !callsite !7
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define internal ptr @_Z3foov() {
|
||||
define internal ptr @_Z3foov() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21
|
||||
ret ptr null
|
||||
@@ -144,6 +153,8 @@ declare ptr @_Znam(i64)
|
||||
; uselistorder directives
|
||||
uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
|
||||
|
||||
attributes #0 = { noinline optnone }
|
||||
|
||||
!0 = !{i64 8632435727821051414}
|
||||
!1 = !{i64 -3421689549917153178}
|
||||
!2 = !{i64 6792096022461663180}
|
||||
@@ -384,9 +395,39 @@ uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
|
||||
; DUMP: Clone of [[FOO]]
|
||||
|
||||
|
||||
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
|
||||
; REMARKS: created clone _Z3foov.memprof.1
|
||||
; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold
|
||||
|
||||
|
||||
; IR: define {{.*}} @main(
|
||||
; IR: call {{.*}} @_Z3foov()
|
||||
;; Only the second call to foo, which allocates cold memory via direct calls,
|
||||
;; is replaced with a call to a clone that calls a cold allocation.
|
||||
; IR: call {{.*}} @_Z3foov.memprof.1()
|
||||
; IR: call {{.*}} @_Z3barP1A(
|
||||
; IR: call {{.*}} @_Z3barP1A(
|
||||
; IR: call {{.*}} @_Z3barP1A(
|
||||
; IR: call {{.*}} @_Z3barP1A(
|
||||
; IR: define internal {{.*}} @_Z3foov()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
|
||||
; IR: define internal {{.*}} @_Z3foov.memprof.1()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
|
||||
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
|
||||
; IR: attributes #[[COLD]] = { "memprof"="cold" }
|
||||
|
||||
|
||||
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
|
||||
; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
|
||||
|
||||
|
||||
; DOT: digraph "postbuild" {
|
||||
|
||||
@@ -54,13 +54,16 @@
|
||||
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
|
||||
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
|
||||
; RUN: --check-prefix=STATS
|
||||
; RUN: --check-prefix=STATS --check-prefix=STATS-BE \
|
||||
; RUN: --check-prefix=STATS-INPROCESS-BE --check-prefix=REMARKS
|
||||
|
||||
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
|
||||
;; We should create clones for foo and bar for the call from main to allocate
|
||||
;; cold memory.
|
||||
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
|
||||
|
||||
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
|
||||
|
||||
|
||||
;; Try again but with distributed ThinLTO
|
||||
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
|
||||
@@ -80,11 +83,19 @@
|
||||
;; cold memory.
|
||||
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
|
||||
|
||||
;; Run ThinLTO backend
|
||||
; RUN: opt -passes=memprof-context-disambiguation \
|
||||
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
|
||||
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
|
||||
; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
|
||||
; RUN: --check-prefix=STATS-BE --check-prefix=STATS-DISTRIB-BE \
|
||||
; RUN: --check-prefix=REMARKS
|
||||
|
||||
source_filename = "inlined.ll"
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal ptr @_Z3barv() {
|
||||
define internal ptr @_Z3barv() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
|
||||
ret ptr null
|
||||
@@ -92,19 +103,19 @@ entry:
|
||||
|
||||
declare ptr @_Znam(i64)
|
||||
|
||||
define internal ptr @_Z3bazv() {
|
||||
define internal ptr @_Z3bazv() #0 {
|
||||
entry:
|
||||
%call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define internal ptr @_Z3foov() {
|
||||
define internal ptr @_Z3foov() #0 {
|
||||
entry:
|
||||
%call.i = call ptr @_Z3barv(), !callsite !7
|
||||
ret ptr null
|
||||
}
|
||||
|
||||
define i32 @main() {
|
||||
define i32 @main() #0 {
|
||||
entry:
|
||||
%call = call ptr @_Z3foov(), !callsite !8
|
||||
%call1 = call ptr @_Z3foov(), !callsite !9
|
||||
@@ -115,6 +126,8 @@ declare void @_ZdaPv()
|
||||
|
||||
declare i32 @sleep()
|
||||
|
||||
attributes #0 = { noinline optnone }
|
||||
|
||||
!0 = !{!1, !3}
|
||||
!1 = !{!2, !"notcold"}
|
||||
!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
|
||||
@@ -281,9 +294,50 @@ declare i32 @sleep()
|
||||
; DUMP: Clone of [[BAR]]
|
||||
|
||||
|
||||
; REMARKS: created clone _Z3barv.memprof.1
|
||||
; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
|
||||
; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
|
||||
; REMARKS: created clone _Z3foov.memprof.1
|
||||
; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1
|
||||
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
|
||||
|
||||
|
||||
; IR: define internal {{.*}} @_Z3barv()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
|
||||
; IR: define internal {{.*}} @_Z3foov()
|
||||
; IR: call {{.*}} @_Z3barv()
|
||||
; IR: define {{.*}} @main()
|
||||
;; The first call to foo does not allocate cold memory. It should call the
|
||||
;; original functions, which ultimately call the original allocation decorated
|
||||
;; with a "notcold" attribute.
|
||||
; IR: call {{.*}} @_Z3foov()
|
||||
;; The second call to foo allocates cold memory. It should call cloned functions
|
||||
;; which ultimately call a cloned allocation decorated with a "cold" attribute.
|
||||
; IR: call {{.*}} @_Z3foov.memprof.1()
|
||||
; IR: define internal {{.*}} @_Z3barv.memprof.1()
|
||||
; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
|
||||
; IR: define internal {{.*}} @_Z3foov.memprof.1()
|
||||
; IR: call {{.*}} @_Z3barv.memprof.1()
|
||||
; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
|
||||
; IR: attributes #[[COLD]] = { "memprof"="cold" }
|
||||
|
||||
|
||||
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
|
||||
; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
|
||||
; STATS-INPROCESS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
|
||||
;; The distributed backend hasn't yet eliminated the now-dead baz with
|
||||
;; the allocation from bar inlined, so it has one more allocation.
|
||||
; STATS-DISTRIB-BE: 3 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
|
||||
; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
|
||||
; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
|
||||
; STATS-INPROCESS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
|
||||
;; The distributed backend hasn't yet eliminated the now-dead baz with
|
||||
;; the allocation from bar inlined, so it has one more allocation.
|
||||
; STATS-DISTRIB-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
|
||||
|
||||
|
||||
; DOT: digraph "postbuild" {
|
||||
|
||||
Reference in New Issue
Block a user