Restore "[MemProf] Context disambiguation cloning pass [patch 3/4]"
This reverts commit 6fbf022908, restoring commit bf6ff4fd4b with a fix for a bot failure due to a previously unstable iteration order.

Differential Revision: https://reviews.llvm.org/D141077
@@ -25,11 +25,14 @@ namespace llvm {
 class GlobalValueSummary;
 class Module;
 class ModuleSummaryIndex;
+class OptimizationRemarkEmitter;
 
 class MemProfContextDisambiguation
     : public PassInfoMixin<MemProfContextDisambiguation> {
   /// Run the context disambiguator on \p M, returns true if any changes made.
-  bool processModule(Module &M);
+  bool processModule(
+      Module &M,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
 public:
   MemProfContextDisambiguation() {}
 
@@ -27,8 +27,10 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -39,6 +41,7 @@
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include <sstream>
 #include <vector>
 using namespace llvm;
@@ -46,6 +49,13 @@ using namespace llvm::memprof;
 
 #define DEBUG_TYPE "memprof-context-disambiguation"
 
+STATISTIC(FunctionClonesAnalysis,
+          "Number of function clones created during whole program analysis");
+STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
+                            "cloned) during whole program analysis");
+STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
+                         "during whole program analysis");
+
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
     cl::value_desc("filename"),
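Note: the dot-file option above pairs with -memprof-export-to-dot; when a non-empty prefix is supplied the pass writes one graph per stage named <prefix>ccg.<stage>.dot (e.g. the %t.ccg.postbuild.dot and %t.ccg.cloned.dot files checked by the RUN lines in the tests updated further below).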
@@ -95,6 +105,13 @@ public:
   /// behavior of an allocation based on its context.
   void identifyClones();
 
+  /// Assign callsite clones to functions, cloning functions as needed to
+  /// accommodate the combinations of their callsite clones reached by callers.
+  /// For regular LTO this clones functions and callsites in the IR, but for
+  /// ThinLTO the cloning decisions are noted in the summaries and applied
+  /// later.
+  bool assignFunctions();
+
   void dump() const;
   void print(raw_ostream &OS) const;
 
@@ -375,6 +392,28 @@ private:
     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
   }
 
+  /// Update the allocation call to record type of allocated memory.
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
+    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
+    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
+  }
+
+  /// Update non-allocation call to invoke (possibly cloned) function
+  /// CalleeFunc.
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
+    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
+  }
+
+  /// Clone the given function for the given callsite, recording mapping of all
+  /// of the functions tracked calls to their new versions in the CallMap.
+  /// Assigns new clones to clone number CloneNo.
+  FuncInfo cloneFunctionForCallsite(
+      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
+        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
+  }
+
   /// Gets a label to use in the dot graph for the given call clone in the given
   /// function.
   std::string getLabel(const FuncTy *Func, const CallTy Call,
@@ -469,7 +508,9 @@ class ModuleCallsiteContextGraph
     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                   Instruction *> {
 public:
-  ModuleCallsiteContextGraph(Module &M);
+  ModuleCallsiteContextGraph(
+      Module &M,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
 private:
   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
@@ -479,10 +520,19 @@ private:
   bool calleeMatchesFunc(Instruction *Call, const Function *Func);
   uint64_t getLastStackId(Instruction *Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+                       Instruction *>::FuncInfo
+  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+                           std::map<CallInfo, CallInfo> &CallMap,
+                           std::vector<CallInfo> &CallsWithMetadataInFunc,
+                           unsigned CloneNo);
   std::string getLabel(const Function *Func, const Instruction *Call,
                        unsigned CloneNo) const;
 
   const Module &Mod;
+  function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
 };
 
 /// Represents a call in the summary index graph, which can either be an
@@ -527,6 +577,14 @@ private:
   bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
   uint64_t getLastStackId(IndexCall &Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                       IndexCall>::FuncInfo
+  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+                           std::map<CallInfo, CallInfo> &CallMap,
+                           std::vector<CallInfo> &CallsWithMetadataInFunc,
+                           unsigned CloneNo);
   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                        unsigned CloneNo) const;
 
@@ -1282,10 +1340,14 @@ uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
   return Index.getStackIdAtIndex(CallsiteContext.back());
 }
 
+static const std::string MemProfCloneSuffix = ".memprof.";
+
 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
+  // We use CloneNo == 0 to refer to the original version, which doesn't get
+  // renamed with a suffix.
   if (!CloneNo)
     return Base.str();
-  return (Base + ".memprof." + Twine(CloneNo)).str();
+  return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
 }
 
 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
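For readers skimming the diff, a minimal standalone sketch of the naming scheme getMemProfFuncName implements (my illustration, not part of the patch; the helper name below is hypothetical): clone 0 keeps the original symbol name, and clone N gets a ".memprof.N" suffix.

#include <string>

// Hypothetical free-standing equivalent of getMemProfFuncName above.
std::string memProfCloneName(const std::string &Base, unsigned CloneNo) {
  if (CloneNo == 0)
    return Base; // clone 0 is the original function, name unchanged
  return Base + ".memprof." + std::to_string(CloneNo);
}

// memProfCloneName("_Z3foov", 0) == "_Z3foov"
// memProfCloneName("_Z3foov", 1) == "_Z3foov.memprof.1"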
@@ -1347,7 +1409,9 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
   return StackIds;
 }
 
-ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) {
+ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
+    Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
+    : Mod(M), OREGetter(OREGetter) {
   for (auto &F : M) {
     std::vector<CallInfo> CallsWithMetadata;
     for (auto &BB : F) {
@@ -1661,7 +1725,7 @@ static void checkEdge(
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
-                      bool CheckEdges = false) {
+                      bool CheckEdges = true) {
   if (Node->isRemoved())
     return;
   // Node's context ids should be the union of both its callee and caller edge
@@ -1701,7 +1765,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
   for (const auto Node : nodes<GraphType>(this)) {
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
     for (auto &Edge : Node->CallerEdges)
       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
   }
@@ -1925,12 +1989,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     NewEdge->Callee->CallerEdges.push_back(NewEdge);
   }
   if (VerifyCCG) {
-    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee);
-    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee);
+    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
+    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
-      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee);
+      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
+                                            /*CheckEdges=*/false);
     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
-      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee);
+      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
+                                            /*CheckEdges=*/false);
   }
 }
 
@@ -1945,7 +2011,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
   if (VerifyNodes)
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
   assert(!Node->CloneOf);
 
   // If Node as a null call, then either it wasn't found in the module (regular
@@ -2099,7 +2165,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   for (auto *Clone : Node->Clones) {
     removeNoneTypeCalleeEdges(Clone);
     if (VerifyNodes)
-      checkNode<DerivedCCG, FuncTy, CallTy>(Clone, /*CheckEdges=*/true);
+      checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
   }
   // We should still have some context ids on the original Node.
   assert(!Node->ContextIds.empty());
@@ -2120,7 +2186,595 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
         }));
 
   if (VerifyNodes)
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
 }
 
+static std::string getAllocTypeAttributeString(AllocationType Type) {
+  switch (Type) {
+  case AllocationType::NotCold:
+    return "notcold";
+    break;
+  case AllocationType::Cold:
+    return "cold";
+    break;
+  default:
+    dbgs() << "Unexpected alloc type " << (uint8_t)Type;
+    assert(false);
+  }
+  llvm_unreachable("invalid alloc type");
+}
+
+void ModuleCallsiteContextGraph::updateAllocationCall(
+    CallInfo &Call, AllocationType AllocType) {
+  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
+                                "memprof", AllocTypeString);
+  cast<CallBase>(Call.call())->addFnAttr(A);
+  OREGetter(Call.call()->getFunction())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
+            << ore::NV("AllocationCall", Call.call()) << " in clone "
+            << ore::NV("Caller", Call.call()->getFunction())
+            << " marked with memprof allocation attribute "
+            << ore::NV("Attribute", AllocTypeString));
+}
+
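As a usage note (my addition, a sketch assuming the standard CallBase string-attribute accessors, not code from this patch): a later consumer can read back the "memprof" attribute attached above roughly as follows.

#include "llvm/IR/InstrTypes.h"

// Returns "cold", "notcold", or "" if the call was not annotated.
static llvm::StringRef getMemProfAttr(const llvm::CallBase &CB) {
  if (CB.hasFnAttr("memprof"))
    return CB.getFnAttr("memprof").getValueAsString();
  return "";
}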
+void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
+                                                     AllocationType AllocType) {
+  auto *AI = Call.call().dyn_cast<AllocInfo *>();
+  assert(AI);
+  assert(AI->Versions.size() > Call.cloneNo());
+  AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
+}
+
+void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                            FuncInfo CalleeFunc) {
+  if (CalleeFunc.cloneNo() > 0)
+    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
+  OREGetter(CallerCall.call()->getFunction())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
+            << ore::NV("Call", CallerCall.call()) << " in clone "
+            << ore::NV("Caller", CallerCall.call()->getFunction())
+            << " assigned to call function clone "
+            << ore::NV("Callee", CalleeFunc.func()));
+}
+
+void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                           FuncInfo CalleeFunc) {
+  auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
+  assert(CI &&
+         "Caller cannot be an allocation which should not have profiled calls");
+  assert(CI->Clones.size() > CallerCall.cloneNo());
+  CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+}
+
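To make the ThinLTO encoding concrete: Versions and Clones are parallel arrays indexed by the clone number of the enclosing function's summary record, which the distributed index later prints as the versions:/clones: lists checked by the DISTRIB lines in the tests below. A simplified sketch of that shape (type and field names mine, for illustration only):

#include <cstdint>
#include <vector>

struct AllocInfoSketch {
  // Versions[K] = allocation type that clone K of the enclosing
  // function should use for this allocation (e.g. notcold/cold).
  std::vector<uint8_t> Versions;
};
struct CallsiteInfoSketch {
  // Clones[K] = which clone of the callee that clone K of the
  // enclosing (caller) function should call at this callsite.
  std::vector<unsigned> Clones;
};
// "allocs: ((versions: (notcold, cold)" in the distributed index thus
// means: the original function keeps the not-cold allocation and
// function clone 1 gets the cold one.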
+CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+                     Instruction *>::FuncInfo
+ModuleCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Use existing LLVM facilities for cloning and obtaining Call in clone
+  ValueToValueMapTy VMap;
+  auto *NewFunc = CloneFunction(Func.func(), VMap);
+  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
+  assert(!Func.func()->getParent()->getFunction(Name));
+  NewFunc->setName(Name);
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
+  }
+  OREGetter(Func.func())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
+            << "created clone " << ore::NV("NewFunction", NewFunc));
+  return {NewFunc, CloneNo};
+}
+
+CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                     IndexCall>::FuncInfo
+IndexCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Check how many clones we have of Call (and therefore function).
+  // The next clone number is the current size of versions array.
+  // Confirm this matches the CloneNo provided by the caller, which is based on
+  // the number of function clones we have.
+  assert(CloneNo ==
+         (Call.call().is<AllocInfo *>()
+              ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
+              : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
+  // Walk all the instructions in this function. Create a new version for
+  // each (by adding an entry to the Versions/Clones summary array), and copy
+  // over the version being called for the function clone being cloned here.
+  // Additionally, add an entry to the CallMap for the new function clone,
+  // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
+  // to the new call clone.
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
+      assert(AI->Versions.size() == CloneNo);
+      // We assign the allocation type later (in updateAllocationCall), just add
+      // an entry for it here.
+      AI->Versions.push_back(0);
+    } else {
+      auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
+      assert(CI && CI->Clones.size() == CloneNo);
+      // We assign the clone number later (in updateCall), just add an entry for
+      // it here.
+      CI->Clones.push_back(0);
+    }
+    CallMap[Inst] = {Inst.call(), CloneNo};
+  }
+  return {Func.func(), CloneNo};
+}
+
+// This method assigns cloned callsites to functions, cloning the functions as
+// needed. The assignment is greedy and proceeds roughly as follows:
+//
+// For each function Func:
+//   For each call with graph Node having clones:
+//     Initialize ClonesWorklist to Node and its clones
+//     Initialize NodeCloneCount to 0
+//     While ClonesWorklist is not empty:
+//        Clone = pop front ClonesWorklist
+//        NodeCloneCount++
+//        If Func has been cloned less than NodeCloneCount times:
+//           If NodeCloneCount is 1:
+//                Assign Clone to original Func
+//                Continue
+//           Create a new function clone
+//           If other callers not assigned to call a function clone yet:
+//                Assign them to call new function clone
+//                Continue
+//           Assign any other caller calling the cloned version to new clone
+//
+//        For each caller of Clone:
+//           If caller is assigned to call a specific function clone:
+//                If we cannot assign Clone to that function clone:
+//                     Create new callsite Clone NewClone
+//                     Add NewClone to ClonesWorklist
+//                     Continue
+//                Assign Clone to existing caller's called function clone
+//           Else:
+//                If Clone not already assigned to a function clone:
+//                     Assign to first function clone without assignment
+//                Assign caller to selected function clone
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
+  bool Changed = false;
+
+  // Keep track of the assignment of nodes (callsites) to function clones they
+  // call.
+  DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
+
+  // Update caller node to call function version CalleeFunc, by recording the
+  // assignment in CallsiteToCalleeFuncCloneMap.
+  auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
+                                        const FuncInfo &CalleeFunc) {
+    assert(Caller->hasCall());
+    CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
+  };
+
+  // Walk all functions for which we saw calls with memprof metadata, and handle
+  // cloning for each of its calls.
+  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
+    FuncInfo OrigFunc(Func);
+    // Map from each clone of OrigFunc to a map of remappings of each call of
+    // interest (from original uncloned call to the corresponding cloned call in
+    // that function clone).
+    std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
+    for (auto &Call : CallsWithMetadata) {
+      ContextNode *Node = getNodeForInst(Call);
+      // Skip call if we do not have a node for it (all uses of its stack ids
+      // were either on inlined chains or pruned from the MIBs), or if we did
+      // not create any clones for it.
+      if (!Node || Node->Clones.empty())
+        continue;
+      assert(Node->hasCall() &&
+             "Not having a call should have prevented cloning");
+
+      // Track the assignment of function clones to clones of the current
+      // callsite Node being handled.
+      std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
+
+      // Assign callsite version CallsiteClone to function version FuncClone,
+      // and also assign (possibly cloned) Call to CallsiteClone.
+      auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
+                                                CallInfo &Call,
+                                                ContextNode *CallsiteClone,
+                                                bool IsAlloc) {
+        // Record the clone of callsite node assigned to this function clone.
+        FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
+
+        assert(FuncClonesToCallMap.count(FuncClone));
+        std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
+        CallInfo CallClone(Call);
+        if (CallMap.count(Call))
+          CallClone = CallMap[Call];
+        CallsiteClone->setCall(CallClone);
+      };
+
+      // Keep track of the clones of callsite Node that need to be assigned to
+      // function clones. This list may be expanded in the loop body below if we
+      // find additional cloning is required.
+      std::deque<ContextNode *> ClonesWorklist;
+      // Ignore original Node if we moved all of its contexts to clones.
+      if (!Node->ContextIds.empty())
+        ClonesWorklist.push_back(Node);
+      ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
+                            Node->Clones.end());
+
+      // Now walk through all of the clones of this callsite Node that we need,
+      // and determine the assignment to a corresponding clone of the current
+      // function (creating new function clones as needed).
+      unsigned NodeCloneCount = 0;
+      while (!ClonesWorklist.empty()) {
+        ContextNode *Clone = ClonesWorklist.front();
+        ClonesWorklist.pop_front();
+        NodeCloneCount++;
+        if (VerifyNodes)
+          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+
+        // Need to create a new function clone if we have more callsite clones
+        // than existing function clones, which would have been assigned to an
+        // earlier clone in the list (we assign callsite clones to function
+        // clones greedily).
+        if (FuncClonesToCallMap.size() < NodeCloneCount) {
+          // If this is the first callsite copy, assign to original function.
+          if (NodeCloneCount == 1) {
+            // Since FuncClonesToCallMap is empty in this case, no clones have
+            // been created for this function yet, and no callers should have
+            // been assigned a function clone for this callee node yet.
+            assert(llvm::none_of(
+                Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+                  return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+                }));
+            // Initialize with empty call map, assign Clone to original function
+            // and its callers, and skip to the next clone.
+            FuncClonesToCallMap[OrigFunc] = {};
+            AssignCallsiteCloneToFuncClone(
+                OrigFunc, Call, Clone,
+                AllocationCallToContextNodeMap.count(Call));
+            for (auto &CE : Clone->CallerEdges) {
+              // Ignore any caller that does not have a recorded callsite Call.
+              if (!CE->Caller->hasCall())
+                continue;
+              RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
+            }
+            continue;
+          }
+
+          // First locate which copy of OrigFunc to clone again. If a caller
+          // of this callsite clone was already assigned to call a particular
+          // function clone, we need to redirect all of those callers to the
+          // new function clone, and update their other callees within this
+          // function.
+          FuncInfo PreviousAssignedFuncClone;
+          auto EI = llvm::find_if(
+              Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+                return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+              });
+          bool CallerAssignedToCloneOfFunc = false;
+          if (EI != Clone->CallerEdges.end()) {
+            const std::shared_ptr<ContextEdge> &Edge = *EI;
+            PreviousAssignedFuncClone =
+                CallsiteToCalleeFuncCloneMap[Edge->Caller];
+            CallerAssignedToCloneOfFunc = true;
+          }
+
+          // Clone function and save it along with the CallInfo map created
+          // during cloning in the FuncClonesToCallMap.
+          std::map<CallInfo, CallInfo> NewCallMap;
+          unsigned CloneNo = FuncClonesToCallMap.size();
+          assert(CloneNo > 0 && "Clone 0 is the original function, which "
+                                "should already exist in the map");
+          FuncInfo NewFuncClone = cloneFunctionForCallsite(
+              OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
+          FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
+          FunctionClonesAnalysis++;
+          Changed = true;
+
+          // If no caller callsites were already assigned to a clone of this
+          // function, we can simply assign this clone to the new func clone
+          // and update all callers to it, then skip to the next clone.
+          if (!CallerAssignedToCloneOfFunc) {
+            AssignCallsiteCloneToFuncClone(
+                NewFuncClone, Call, Clone,
+                AllocationCallToContextNodeMap.count(Call));
+            for (auto &CE : Clone->CallerEdges) {
+              // Ignore any caller that does not have a recorded callsite Call.
+              if (!CE->Caller->hasCall())
+                continue;
+              RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+            }
+            continue;
+          }
+
+          // We may need to do additional node cloning in this case.
+          // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
+          // that were previously assigned to call PreviousAssignedFuncClone,
+          // to record that they now call NewFuncClone.
+          for (auto CE : Clone->CallerEdges) {
+            // Ignore any caller that does not have a recorded callsite Call.
+            if (!CE->Caller->hasCall())
+              continue;
+
+            if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
+                // We subsequently fall through to later handling that
+                // will perform any additional cloning required for
+                // callers that were calling other function clones.
+                CallsiteToCalleeFuncCloneMap[CE->Caller] !=
+                    PreviousAssignedFuncClone)
+              continue;
+
+            RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+
+            // If we are cloning a function that was already assigned to some
+            // callers, then essentially we are creating new callsite clones
+            // of the other callsites in that function that are reached by those
+            // callers. Clone the other callees of the current callsite's caller
+            // that were already assigned to PreviousAssignedFuncClone
+            // accordingly. This is important since we subsequently update the
+            // calls from the nodes in the graph and their assignments to callee
+            // functions recorded in CallsiteToCalleeFuncCloneMap.
+            for (auto CalleeEdge : CE->Caller->CalleeEdges) {
+              // Skip any that have been removed on an earlier iteration when
+              // cleaning up newly None type callee edges.
+              if (!CalleeEdge)
+                continue;
+              ContextNode *Callee = CalleeEdge->Callee;
+              // Skip the current callsite, we are looking for other
+              // callsites Caller calls, as well as any that does not have a
+              // recorded callsite Call.
+              if (Callee == Clone || !Callee->hasCall())
+                continue;
+              ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
+              removeNoneTypeCalleeEdges(NewClone);
+              // Moving the edge may have resulted in some none type
+              // callee edges on the original Callee.
+              removeNoneTypeCalleeEdges(Callee);
+              assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+              // If the Callee node was already assigned to call a specific
+              // function version, make sure its new clone is assigned to call
+              // that same function clone.
+              if (CallsiteToCalleeFuncCloneMap.count(Callee))
+                RecordCalleeFuncOfCallsite(
+                    NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
+              // Update NewClone with the new Call clone of this callsite's Call
+              // created for the new function clone created earlier.
+              // Recall that we have already ensured when building the graph
+              // that each caller can only call callsites within the same
+              // function, so we are guaranteed that Callee Call is in the
+              // current OrigFunc.
+              // CallMap is set up as indexed by original Call at clone 0.
+              CallInfo OrigCall(Callee->getOrigNode()->Call);
+              OrigCall.setCloneNo(0);
+              std::map<CallInfo, CallInfo> &CallMap =
+                  FuncClonesToCallMap[NewFuncClone];
+              assert(CallMap.count(OrigCall));
+              CallInfo NewCall(CallMap[OrigCall]);
+              assert(NewCall);
+              NewClone->setCall(NewCall);
+            }
+          }
+          // Fall through to handling below to perform the recording of the
+          // function for this callsite clone. This enables handling of cases
+          // where the callers were assigned to different clones of a function.
+        }
+
+        // See if we can use existing function clone. Walk through
+        // all caller edges to see if any have already been assigned to
+        // a clone of this callsite's function. If we can use it, do so. If not,
+        // because that function clone is already assigned to a different clone
+        // of this callsite, then we need to clone again.
+        // Basically, this checking is needed to handle the case where different
+        // caller functions/callsites may need versions of this function
+        // containing different mixes of callsite clones across the different
+        // callsites within the function. If that happens, we need to create
+        // additional function clones to handle the various combinations.
+        //
+        // Keep track of any new clones of this callsite created by the
+        // following loop, as well as any existing clone that we decided to
+        // assign this clone to.
+        std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
+        FuncInfo FuncCloneAssignedToCurCallsiteClone;
+        // We need to be able to remove Edge from CallerEdges, so need to adjust
+        // iterator in the loop.
+        for (auto EI = Clone->CallerEdges.begin();
+             EI != Clone->CallerEdges.end();) {
+          auto Edge = *EI;
+          // Ignore any caller that does not have a recorded callsite Call.
+          if (!Edge->Caller->hasCall()) {
+            EI++;
+            continue;
+          }
+          // If this caller already assigned to call a version of OrigFunc, need
+          // to ensure we can assign this callsite clone to that function clone.
+          if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
+            FuncInfo FuncCloneCalledByCaller =
+                CallsiteToCalleeFuncCloneMap[Edge->Caller];
+            // First we need to confirm that this function clone is available
+            // for use by this callsite node clone.
+            //
+            // While FuncCloneToCurNodeCloneMap is built only for this Node and
+            // its callsite clones, one of those callsite clones X could have
+            // been assigned to the same function clone called by Edge's caller
+            // - if Edge's caller calls another callsite within Node's original
+            // function, and that callsite has another caller reaching clone X.
+            // We need to clone Node again in this case.
+            if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
+                 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
+                     Clone) ||
+                // Detect when we have multiple callers of this callsite that
+                // have already been assigned to specific, and different, clones
+                // of OrigFunc (due to other unrelated callsites in Func they
+                // reach via call contexts). Is this Clone of callsite Node
+                // assigned to a different clone of OrigFunc? If so, clone Node
+                // again.
+                (FuncCloneAssignedToCurCallsiteClone &&
+                 FuncCloneAssignedToCurCallsiteClone !=
+                     FuncCloneCalledByCaller)) {
+              // We need to use a different newly created callsite clone, in
+              // order to assign it to another new function clone on a
+              // subsequent iteration over the Clones array (adjusted below).
+              // Note we specifically do not reset the
+              // CallsiteToCalleeFuncCloneMap entry for this caller, so that
+              // when this new clone is processed later we know which version of
+              // the function to copy (so that other callsite clones we have
+              // assigned to that function clone are properly cloned over). See
+              // comments in the function cloning handling earlier.
+
+              // Check if we already have cloned this callsite again while
+              // walking through caller edges, for a caller calling the same
+              // function clone. If so, we can move this edge to that new clone
+              // rather than creating yet another new clone.
+              if (FuncCloneToNewCallsiteCloneMap.count(
+                      FuncCloneCalledByCaller)) {
+                ContextNode *NewClone =
+                    FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
+                moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
+                // Cleanup any none type edges cloned over.
+                removeNoneTypeCalleeEdges(NewClone);
+              } else {
+                // Create a new callsite clone.
+                ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
+                removeNoneTypeCalleeEdges(NewClone);
+                FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
+                    NewClone;
+                // Add to list of clones and process later.
+                ClonesWorklist.push_back(NewClone);
+                assert(EI == Clone->CallerEdges.end() ||
+                       Clone->AllocTypes != (uint8_t)AllocationType::None);
+                assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+              }
+              // Moving the caller edge may have resulted in some none type
+              // callee edges.
+              removeNoneTypeCalleeEdges(Clone);
+              // We will handle the newly created callsite clone in a subsequent
+              // iteration over this Node's Clones. Continue here since we
+              // already adjusted iterator EI while moving the edge.
+              continue;
+            }
+
+            // Otherwise, we can use the function clone already assigned to this
+            // caller.
+            if (!FuncCloneAssignedToCurCallsiteClone) {
+              FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
+              // Assign Clone to FuncCloneCalledByCaller
+              AssignCallsiteCloneToFuncClone(
+                  FuncCloneCalledByCaller, Call, Clone,
+                  AllocationCallToContextNodeMap.count(Call));
+            } else
+              // Don't need to do anything - callsite is already calling this
+              // function clone.
+              assert(FuncCloneAssignedToCurCallsiteClone ==
+                     FuncCloneCalledByCaller);
+
+          } else {
+            // We have not already assigned this caller to a version of
+            // OrigFunc. Do the assignment now.
+
+            // First check if we have already assigned this callsite clone to a
+            // clone of OrigFunc for another caller during this iteration over
+            // its caller edges.
+            if (!FuncCloneAssignedToCurCallsiteClone) {
+              // Find first function in FuncClonesToCallMap without an assigned
+              // clone of this callsite Node. We should always have one
+              // available at this point due to the earlier cloning when the
+              // FuncClonesToCallMap size was smaller than the clone number.
+              for (auto &CF : FuncClonesToCallMap) {
+                if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
+                  FuncCloneAssignedToCurCallsiteClone = CF.first;
+                  break;
+                }
+              }
+              assert(FuncCloneAssignedToCurCallsiteClone);
+              // Assign Clone to FuncCloneAssignedToCurCallsiteClone
+              AssignCallsiteCloneToFuncClone(
+                  FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+                  AllocationCallToContextNodeMap.count(Call));
+            } else
+              assert(FuncCloneToCurNodeCloneMap
+                         [FuncCloneAssignedToCurCallsiteClone] == Clone);
+            // Update callers to record function version called.
+            RecordCalleeFuncOfCallsite(Edge->Caller,
+                                       FuncCloneAssignedToCurCallsiteClone);
+          }
+
+          EI++;
+        }
+      }
+      if (VerifyCCG) {
+        checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+        for (const auto &PE : Node->CalleeEdges)
+          checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+        for (const auto &CE : Node->CallerEdges)
+          checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+        for (auto *Clone : Node->Clones) {
+          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+          for (const auto &PE : Clone->CalleeEdges)
+            checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+          for (const auto &CE : Clone->CallerEdges)
+            checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+        }
+      }
+    }
+  }
+
+  auto UpdateCalls = [&](ContextNode *Node,
+                         DenseSet<const ContextNode *> &Visited,
+                         auto &&UpdateCalls) {
+    auto Inserted = Visited.insert(Node);
+    if (!Inserted.second)
+      return;
+
+    for (auto *Clone : Node->Clones)
+      UpdateCalls(Clone, Visited, UpdateCalls);
+
+    for (auto &Edge : Node->CallerEdges)
+      UpdateCalls(Edge->Caller, Visited, UpdateCalls);
+
+    // Skip if either no call to update, or if we ended up with no context ids
+    // (we moved all edges onto other clones).
+    if (!Node->hasCall() || Node->ContextIds.empty())
+      return;
+
+    if (Node->IsAllocation) {
+      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      return;
+    }
+
+    if (!CallsiteToCalleeFuncCloneMap.count(Node))
+      return;
+
+    auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
+    updateCall(Node->Call, CalleeFunc);
+  };
+
+  // Sort the allocation nodes based on the OrigStackOrAllocId, which increase
+  // in insertion order, so that the following loop is deterministic (since the
+  // AllocationCallToContextNodeMap is keyed by a pointer). Specifically this
+  // can affect the order of the remarks emitted for regular LTO IR updates
+  // during the call updating.
+  std::vector<ContextNode *> AllocationNodes;
+  AllocationNodes.reserve(AllocationCallToContextNodeMap.size());
+  for (auto &Entry : AllocationCallToContextNodeMap)
+    AllocationNodes.push_back(Entry.second);
+  std::sort(AllocationNodes.begin(), AllocationNodes.end(),
+            [](const ContextNode *A, const ContextNode *B) {
+              return A->OrigStackOrAllocId < B->OrigStackOrAllocId;
+            });
+
+  // Performs DFS traversal starting from allocation nodes to update calls to
+  // reflect cloning decisions recorded earlier. For regular LTO this will
+  // update the actual calls in the IR to call the appropriate function clone
+  // (and add attributes to allocation calls), whereas for ThinLTO the decisions
+  // are recorded in the summary entries.
+  DenseSet<const ContextNode *> Visited;
+  for (auto *AllocNode : AllocationNodes)
+    UpdateCalls(AllocNode, Visited, UpdateCalls);
+
+  return Changed;
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
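As a reading aid for the long function above, here is a compressed, self-contained sketch of the greedy worklist idea documented in the comment block preceding assignFunctions (my code, not the patch's; it deliberately ignores caller constraints and the extra callsite cloning the real loop performs):

#include <deque>
#include <map>
#include <vector>

// Toy model: callsite clones are integers, function clones are indices,
// with index 0 standing for the original function.
std::map<int, unsigned> greedyAssign(const std::vector<int> &CallsiteClones) {
  std::deque<int> Worklist(CallsiteClones.begin(), CallsiteClones.end());
  std::map<int, unsigned> AssignedFuncClone;
  unsigned NumFuncCopies = 0; // copy 0 == the original function
  unsigned NodeCloneCount = 0;
  while (!Worklist.empty()) {
    int Clone = Worklist.front();
    Worklist.pop_front();
    ++NodeCloneCount;
    // More callsite clones seen than function copies exist: materialize a
    // new copy (the first "copy" is the original function itself).
    if (NumFuncCopies < NodeCloneCount)
      ++NumFuncCopies;
    AssignedFuncClone[Clone] = NumFuncCopies - 1;
  }
  return AssignedFuncClone;
}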
@@ -2149,13 +2803,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
   if (ExportToDot)
     exportToDot("cloned");
 
-  return false;
+  bool Changed = assignFunctions();
+
+  if (DumpCCG) {
+    dbgs() << "CCG after assigning function clones:\n";
+    dbgs() << *this;
+  }
+  if (ExportToDot)
+    exportToDot("clonefuncassign");
+
+  return Changed;
 }
 
-bool MemProfContextDisambiguation::processModule(Module &M) {
+bool MemProfContextDisambiguation::processModule(
+    Module &M,
+    function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
   bool Changed = false;
 
-  ModuleCallsiteContextGraph CCG(M);
+  ModuleCallsiteContextGraph CCG(M, OREGetter);
   Changed = CCG.process();
 
   return Changed;
@@ -2163,7 +2828,11 @@ bool MemProfContextDisambiguation::processModule(Module &M) {
 
 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                     ModuleAnalysisManager &AM) {
-  if (!processModule(M))
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  };
+  if (!processModule(M, OREGetter))
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
@@ -39,13 +39,35 @@
 ; RUN:  -r=%t.o,_Znam, \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
 
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
 ; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+; RUN:  cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should have cloned bar, baz, and foo, for the cold memory allocation.
+; RUN:  cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
 
 source_filename = "memprof-basic.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -227,6 +249,11 @@ uselistorder ptr @_Z3foov, { 1, 0 }
 ; DUMP: Clone of [[BAR]]
 
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: label="postbuild";
 ; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
@@ -258,3 +285,9 @@ uselistorder ptr @_Z3foov, { 1, 0 }
 ; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"];
 ; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
 ; DOTCLONED: }
+
+
+; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1)
+; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1)
+; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1)
@@ -1,7 +1,8 @@
 ;; Test callsite context graph generation for call graph with with MIBs
 ;; that have pruned contexts that partially match multiple inlined
 ;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -60,7 +61,9 @@
 ; RUN:  -r=%t.o,_Znam, \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
 
 ; RUN:  cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -68,6 +71,27 @@
 ; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+; RUN:  cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+;; We should clone D once for the cold allocations via C.
+; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
 source_filename = "duplicate-context-ids.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -104,7 +128,13 @@ entry:
   ret ptr null
 }
 
-declare i32 @main()
+define i32 @main() {
+entry:
+  call ptr @_Z1Bv()
+  call ptr @_Z1Ev()
+  call ptr @_Z1Fv()
+  ret i32 0
+}
 
 declare void @_ZdaPv()
 
@@ -268,6 +298,11 @@ declare i32 @sleep()
 ; DUMP: Clone of [[D]]
 
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOTPRE: digraph "prestackupdate" {
 ; DOTPRE: label="prestackupdate";
 ; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
@@ -305,3 +340,9 @@ declare i32 @sleep()
 ; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"];
 ; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
 ; DOTCLONED: }
+
+; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1)
+; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
+; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0)
+; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll (new file, 232 lines)
@@ -0,0 +1,232 @@
+;; Test context disambiguation for a callgraph containing multiple memprof
+;; contexts and no inlining, where we need to perform additional cloning
+;; during function assignment/cloning to handle the combination of contexts
+;; to 2 different allocations.
+;;
+;; void E(char **buf1, char **buf2) {
+;;   *buf1 = new char[10];
+;;   *buf2 = new char[10];
+;; }
+;;
+;; void B(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void C(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void D(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;; int main(int argc, char **argv) {
+;;   char *cold1, *cold2, *default1, *default2, *default3, *default4;
+;;   B(&default1, &default2);
+;;   C(&default3, &cold1);
+;;   D(&cold2, &default4);
+;;   memset(cold1, 0, 10);
+;;   memset(cold2, 0, 10);
+;;   memset(default1, 0, 10);
+;;   memset(default2, 0, 10);
+;;   memset(default3, 0, 10);
+;;   memset(default4, 0, 10);
+;;   delete[] default1;
+;;   delete[] default2;
+;;   delete[] default3;
+;;   delete[] default4;
+;;   sleep(10);
+;;   delete[] cold1;
+;;   delete[] cold2;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+
+source_filename = "funcassigncloning.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline optnone
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) {
+entry:
+  %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
+  %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
+  ret void
+}
+
+declare ptr @_Znam(i64)
+
+define internal void @_Z1BPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
+  ret void
+}
+
+define internal void @_Z1CPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
+  ret void
+}
+
+; Function Attrs: noinline optnone
+define i32 @main() {
+entry:
+  call void @_Z1BPPcS0_()
+  call void @_Z1CPPcS0_()
+  call void @_Z1DPPcS0_()
+  ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+
+
+;; Originally we create a single clone of each call to new from E, since each
+;; allocates cold memory for a single caller.
+
+; DUMP: CCG after cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 2 3
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]]
+
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 6
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: CallerEdges:
+
+; DUMP: Node [[C]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 2 5
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 3 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[ENEW2ORIG]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: AllocType 2 StackIds: 1
+; DUMP: AllocType 1 StackIds: 0
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 4 6
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: Clones: [[ENEW2CLONE]]
+
+; DUMP: Node [[ENEW1CLONE]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Clone of [[ENEW1ORIG]]
+
+; DUMP: Node [[ENEW2CLONE]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: AllocType 2 StackIds: 1
+; DUMP: AllocType 1 StackIds: 0
+; DUMP: (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 5
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: Clone of [[ENEW2ORIG]]
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
@@ -1,7 +1,7 @@
;; Tests callsite context graph generation for call graph containing indirect
;; calls. Currently this should result in conservative behavior, such that the
;; indirect call receives a null call in its graph node, to prevent subsequent
;; cloning.
;; cloning. Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -61,7 +61,9 @@
; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
; RUN: --check-prefix=STATS

; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should only create a single clone of foo, for the direct call
@@ -69,6 +71,26 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED


;; Try again but with distributed ThinLTO
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
; RUN: -thinlto-distributed-indexes \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_ZdaPv, \
; RUN: -r=%t.o,sleep, \
; RUN: -r=%t.o,_Znam, \
; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \
; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
; RUN: --check-prefix=STATS

; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should only create a single clone of foo, for the direct call
;; from main allocating cold memory.
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED

source_filename = "indirectcall.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -359,6 +381,11 @@ uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
; DUMP: Clone of [[FOO]]


; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis


; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"];

@@ -1,6 +1,7 @@
;; Test callsite context graph generation for call graph with two memprof
;; contexts and partial inlining, requiring generation of a new fused node to
;; represent the inlined sequence while matching callsite nodes onto the graph.
;; Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -48,7 +49,9 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
; RUN: --check-prefix=STATS

; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should create clones for foo and bar for the call from main to allocate
@@ -56,6 +59,24 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED


;; Try again but with distributed ThinLTO
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
; RUN: -thinlto-distributed-indexes \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_ZdaPv, \
; RUN: -r=%t.o,sleep, \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
; RUN: --check-prefix=STATS

; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should create clones for foo and bar for the call from main to allocate
;; cold memory.
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED

source_filename = "inlined.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -257,6 +278,11 @@ declare i32 @sleep()
; DUMP: Clone of [[BAR]]


; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis


; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"];

@@ -1,5 +1,5 @@
;; Test callsite context graph generation for simple call graph with
;; two memprof contexts and no inlining.
;; two memprof contexts and no inlining, as well as graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -34,7 +34,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
; RUN: --check-prefix=STATS --check-prefix=REMARKS

; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
@@ -222,6 +224,48 @@ attributes #6 = { builtin }
; DUMP: Clone of [[BAR]]


; REMARKS: created clone _Z3barv.memprof.1
; REMARKS: created clone _Z3bazv.memprof.1
; REMARKS: created clone _Z3foov.memprof.1
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1
; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1
; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
; REMARKS: call in clone main assigned to call function clone _Z3foov
; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv
; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv
; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold


; IR: define {{.*}} @main
;; The first call to foo does not allocate cold memory. It should call the
;; original functions, which ultimately call the original allocation decorated
;; with a "notcold" attribute.
; IR: call {{.*}} @_Z3foov()
;; The second call to foo allocates cold memory. It should call cloned functions
;; which ultimately call a cloned allocation decorated with a "cold" attribute.
; IR: call {{.*}} @_Z3foov.memprof.1()
; IR: define internal {{.*}} @_Z3barv()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
; IR: define internal {{.*}} @_Z3bazv()
; IR: call {{.*}} @_Z3barv()
; IR: define internal {{.*}} @_Z3foov()
; IR: call {{.*}} @_Z3bazv()
; IR: define internal {{.*}} @_Z3barv.memprof.1()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
; IR: define internal {{.*}} @_Z3bazv.memprof.1()
; IR: call {{.*}} @_Z3barv.memprof.1()
; IR: define internal {{.*}} @_Z3foov.memprof.1()
; IR: call {{.*}} @_Z3bazv.memprof.1()
; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }

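;; To relate the remarks and stats: the three function clones counted below
;; are _Z3barv.memprof.1, _Z3bazv.memprof.1 and _Z3foov.memprof.1 from the
;; remarks above, i.e. the entire chain from main down to the allocation is
;; cloned once so the cold context can reach a separate allocation call.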
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis


; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];

@@ -1,7 +1,8 @@
;; Test callsite context graph generation for call graph with MIBs
;; that have pruned contexts that partially match multiple inlined
;; callsite contexts, requiring duplication of context ids and nodes
;; while matching callsite nodes onto the graph.
;; while matching callsite nodes onto the graph. Also tests graph and IR
;; cloning.
;;
;; Original code looks like:
;;
@@ -55,7 +56,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
; RUN: --check-prefix=STATS --check-prefix=REMARKS

; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -263,6 +266,39 @@ attributes #6 = { builtin }
; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
; DUMP: Clone of [[D]]

; REMARKS: created clone _Z1Dv.memprof.1
; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1
; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.memprof.1
; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1
; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv
; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold

;; The allocation via F does not allocate cold memory. It should call the
;; original D, which ultimately calls the original allocation decorated
;; with a "notcold" attribute.
; IR: define internal {{.*}} @_Z1Dv()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
; IR: define internal {{.*}} @_Z1Fv()
; IR: call {{.*}} @_Z1Dv()
;; The allocations via B and E allocate cold memory. They should call the
;; cloned D, which ultimately calls the cloned allocation decorated with a
;; "cold" attribute.
; IR: define internal {{.*}} @_Z1Bv()
; IR: call {{.*}} @_Z1Dv.memprof.1()
; IR: define internal {{.*}} @_Z1Ev()
; IR: call {{.*}} @_Z1Dv.memprof.1()
; IR: define internal {{.*}} @_Z1Dv.memprof.1()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }

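;; Summarizing the remarks above: callers B, C and E all reach the cold
;; context and share the single clone _Z1Dv.memprof.1, while F keeps calling
;; the original _Z1Dv, which is why only one function clone is counted below
;; even though context ids had to be duplicated while matching the graph.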
; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis


; DOTPRE: digraph "prestackupdate" {
; DOTPRE: label="prestackupdate";

@@ -0,0 +1,244 @@
;; Test context disambiguation for a callgraph containing multiple memprof
;; contexts and no inlining, where we need to perform additional cloning
;; during function assignment/cloning to handle the combination of contexts
;; to 2 different allocations.
;;
;; void E(char **buf1, char **buf2) {
;;   *buf1 = new char[10];
;;   *buf2 = new char[10];
;; }
;;
;; void B(char **buf1, char **buf2) {
;;   E(buf1, buf2);
;; }
;;
;; void C(char **buf1, char **buf2) {
;;   E(buf1, buf2);
;; }
;;
;; void D(char **buf1, char **buf2) {
;;   E(buf1, buf2);
;; }
;;
;; int main(int argc, char **argv) {
;;   char *cold1, *cold2, *default1, *default2, *default3, *default4;
;;   B(&default1, &default2);
;;   C(&default3, &cold1);
;;   D(&cold2, &default4);
;;   memset(cold1, 0, 10);
;;   memset(cold2, 0, 10);
;;   memset(default1, 0, 10);
;;   memset(default2, 0, 10);
;;   memset(default3, 0, 10);
;;   memset(default4, 0, 10);
;;   delete[] default1;
;;   delete[] default2;
;;   delete[] default3;
;;   delete[] default4;
;;   sleep(10);
;;   delete[] cold1;
;;   delete[] cold2;
;;   return 0;
;; }
;;
;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
;; memory freed after sleep(10) results in cold lifetimes.
;;
;; The IR was then reduced using llvm-reduce with the expected FileCheck input.

; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
; RUN: --check-prefix=STATS --check-prefix=REMARKS


target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
entry:
  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7
  %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15
  ret void
}

declare ptr @_Znam(i64) #1

define internal void @_Z1BPPcS0_(ptr %0, ptr %1) {
entry:
  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16
  ret void
}

; Function Attrs: noinline
define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 {
entry:
  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17
  ret void
}

define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 {
entry:
  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18
  ret void
}

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4

declare i32 @sleep() #5

; uselistorder directives
uselistorder ptr @_Znam, { 1, 0 }

attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" }
attributes #1 = { "no-trapping-math"="true" }
attributes #2 = { noinline }
attributes #3 = { "frame-pointer"="all" }
attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) }
attributes #5 = { "disable-tail-calls"="true" }
attributes #6 = { builtin }

!0 = !{!1, !3, !5}
!1 = !{!2, !"cold"}
!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
!3 = !{!4, !"notcold"}
!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
!5 = !{!6, !"notcold"}
!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
!7 = !{i64 -3461278137325233666}
!8 = !{!9, !11, !13}
!9 = !{!10, !"notcold"}
!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
!11 = !{!12, !"cold"}
!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
!13 = !{!14, !"notcold"}
!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
!15 = !{i64 -1415475215210681400}
!16 = !{i64 -2441057035866683071}
!17 = !{i64 -3483158674395044949}
!18 = !{i64 -7799663586031895603}
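;; A reading aid for the dumps below: !7 and !15 are the two allocation
;; callsites in E, while !16, !17 and !18 are the callsites in B, C and D
;; respectively. Matching these ids against the context tuples above shows the
;; first new is cold only when reached via D (!2) and the second new is cold
;; only when reached via C (!12).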

;; Originally we create a single clone of each call to new from E, since each
;; allocates cold memory for a single caller.

; DUMP: CCG after cloning:
; DUMP: Callsite Context Graph:
; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
; DUMP: AllocTypes: NotCold
; DUMP: ContextIds: 2 3
; DUMP: CalleeEdges:
; DUMP: CallerEdges:
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]]

; DUMP: Node [[D:0x[a-z0-9]+]]
; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0)
; DUMP: AllocTypes: NotColdCold
; DUMP: ContextIds: 1 6
; DUMP: CalleeEdges:
; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
; DUMP: CallerEdges:

; DUMP: Node [[C]]
; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0)
; DUMP: AllocTypes: NotColdCold
; DUMP: ContextIds: 2 5
; DUMP: CalleeEdges:
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
; DUMP: CallerEdges:

; DUMP: Node [[B]]
; DUMP: call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0)
; DUMP: AllocTypes: NotCold
; DUMP: ContextIds: 3 4
; DUMP: CalleeEdges:
; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
; DUMP: CallerEdges:

; DUMP: Node [[ENEW2ORIG]]
; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
; DUMP: AllocTypes: NotCold
; DUMP: ContextIds: 4 6
; DUMP: CalleeEdges:
; DUMP: CallerEdges:
; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
; DUMP: Clones: [[ENEW2CLONE]]

; DUMP: Node [[ENEW1CLONE]]
; DUMP: %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
; DUMP: AllocTypes: Cold
; DUMP: ContextIds: 1
; DUMP: CalleeEdges:
; DUMP: CallerEdges:
; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
; DUMP: Clone of [[ENEW1ORIG]]

; DUMP: Node [[ENEW2CLONE]]
; DUMP: %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6 (clone 0)
; DUMP: AllocTypes: Cold
; DUMP: ContextIds: 5
; DUMP: CalleeEdges:
; DUMP: CallerEdges:
; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
; DUMP: Clone of [[ENEW2ORIG]]

;; We greedily create a clone of E that is initially used by the clones of the
;; first call to new. However, we end up with an incompatible set of callers,
;; since the second call to new has clones with a different combination of
;; callers. Eventually, we create 2 more clones, and the first clone becomes dead.
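;; Per-caller allocation types for the two calls to new, as reflected in the
;; remarks and IR checks below:
;;   B -> (notcold, notcold): keeps the original _Z1EPPcS0_
;;   D -> (cold, notcold):    gets _Z1EPPcS0_.memprof.2
;;   C -> (notcold, cold):    gets _Z1EPPcS0_.memprof.3
;; _Z1EPPcS0_.memprof.1 is the transient clone that ends up with no callers.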
; REMARKS: created clone _Z1EPPcS0_.memprof.1
; REMARKS: created clone _Z1EPPcS0_.memprof.2
; REMARKS: created clone _Z1EPPcS0_.memprof.3
; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2
; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold
; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3
; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold
; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_
; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold
; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold
; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold

;; Original version of E is used for the non-cold allocations, both from B.
; IR: define internal {{.*}} @_Z1EPPcS0_(
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
; IR: define internal {{.*}} @_Z1BPPcS0_(
; IR: call {{.*}} @_Z1EPPcS0_(
;; C calls a clone of E with the first new allocating non-cold memory and the
;; second allocating cold memory.
; IR: define internal {{.*}} @_Z1CPPcS0_(
; IR: call {{.*}} @_Z1EPPcS0_.memprof.3(
;; D calls a clone of E with the first new allocating cold memory and the
;; second allocating non-cold memory.
; IR: define internal {{.*}} @_Z1DPPcS0_(
; IR: call {{.*}} @_Z1EPPcS0_.memprof.2(
;; Transient clone that will get removed as it ends up with no callers.
;; Its calls to new never get updated with a memprof attribute as a result.
; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.1(
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]]
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]]
; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2(
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3(
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
; IR: attributes #[[DEFAULT]] = { builtin }
; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }

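;; The stats below count the allocation calls updated in the surviving
;; clones: 2 notcold in the original E plus one notcold each in .memprof.2 and
;; .memprof.3 gives 4 notcold, and one cold in each of .memprof.2 and
;; .memprof.3 gives 2 cold. The dead .memprof.1 calls are never updated and
;; are not counted.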
; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
@@ -1,7 +1,7 @@
;; Tests callsite context graph generation for call graph containing indirect
;; calls. Currently this should result in conservative behavior, such that the
;; indirect call receives a null call in its graph node, to prevent subsequent
;; cloning.
;; cloning. Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -54,7 +54,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
; RUN: --check-prefix=STATS --check-prefix=REMARKS

; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should only create a single clone of foo, for the direct call
@@ -340,6 +342,41 @@ attributes #7 = { builtin }
; DUMP: Clone of [[FOO]]


; REMARKS: created clone _Z3foov.memprof.1
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold
; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov
; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov
; REMARKS: call in clone main assigned to call function clone _Z3foov
; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold

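;; Only main's second direct call to foo is redirected to the clone below; the
;; paths through the indirect callsites in bar (via _ZN1A1xEv and _ZN1B1xEv)
;; keep the original _Z3foov, since the null call recorded for the indirect
;; callsite conservatively blocks cloning, as described at the top of this test.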
; IR: define {{.*}} @main(
; IR: call {{.*}} @_Z3foov()
;; Only the second call to foo, which allocates cold memory via direct calls,
;; is replaced with a call to a clone that calls a cold allocation.
; IR: call {{.*}} @_Z3foov.memprof.1()
; IR: call {{.*}} @_Z3barP1A(
; IR: call {{.*}} @_Z3barP1A(
; IR: call {{.*}} @_Z3barP1A(
; IR: call {{.*}} @_Z3barP1A(
; IR: define internal {{.*}} @_ZN1A1xEv(
; IR: call {{.*}} @_Z3foov()
; IR: define internal {{.*}} @_ZN1B1xEv(
; IR: call {{.*}} @_Z3foov()
; IR: define internal {{.*}} @_Z3foov()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
; IR: define internal {{.*}} @_Z3foov.memprof.1()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }


; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis


; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"];

@@ -1,6 +1,7 @@
;; Test callsite context graph generation for call graph with two memprof
;; contexts and partial inlining, requiring generation of a new fused node to
;; represent the inlined sequence while matching callsite nodes onto the graph.
;; Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -43,7 +44,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
; RUN: -stats -pass-remarks=memprof-context-disambiguation \
; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
; RUN: --check-prefix=STATS --check-prefix=REMARKS

; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should create clones for foo and bar for the call from main to allocate
@@ -251,6 +254,42 @@ attributes #7 = { builtin }
; DUMP: Clone of [[BAR]]


; REMARKS: created clone _Z3barv.memprof.1
; REMARKS: created clone _Z3foov.memprof.1
; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1
; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
; REMARKS: call in clone main assigned to call function clone _Z3foov
; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv
; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold


; IR: define internal {{.*}} @_Z3barv()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
; IR: define internal {{.*}} @_Z3foov()
; IR: call {{.*}} @_Z3barv()
; IR: define {{.*}} @main()
;; The first call to foo does not allocate cold memory. It should call the
;; original functions, which ultimately call the original allocation decorated
;; with a "notcold" attribute.
; IR: call {{.*}} @_Z3foov()
;; The second call to foo allocates cold memory. It should call cloned functions
;; which ultimately call a cloned allocation decorated with a "cold" attribute.
; IR: call {{.*}} @_Z3foov.memprof.1()
; IR: define internal {{.*}} @_Z3barv.memprof.1()
; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
; IR: define internal {{.*}} @_Z3foov.memprof.1()
; IR: call {{.*}} @_Z3barv.memprof.1()
; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }


; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis


; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];