//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
/// - SplitGraph & associated classes
/// - Graph representation of the module and of the dependencies that
/// matter for splitting.
/// - RecursiveSearchSplitting
/// - Core splitting algorithm.
/// - SplitProposal
/// - Represents a suggested solution for splitting the input module. These
/// solutions can be scored to determine the best one when multiple
/// solutions are available.
/// - Driver/pass "run" function glues everything together.
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cassert>
#include <cmath>
#include <memory>
#include <utility>
#include <vector>
#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif
#define DEBUG_TYPE "amdgpu-split-module"
namespace llvm {
namespace {
static cl::opt<unsigned> MaxDepth(
"amdgpu-module-splitting-max-depth",
cl::desc(
"maximum search depth. 0 forces a greedy approach. "
"warning: the algorithm is up to O(2^N), where N is the max depth."),
cl::init(8));
static cl::opt<float> LargeFnFactor(
"amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
cl::desc(
"when max depth is reached and we can no longer branch out, this "
"value determines if a function is worth merging into an already "
"existing partition to reduce code duplication. This is a factor "
"of the ideal partition size, e.g. 2.0 means we consider the "
"function for merging if its cost (including its callees) is 2x the "
"size of an ideal partition."));
static cl::opt<float> LargeFnOverlapForMerge(
"amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
cl::desc("when a function is considered for merging into a partition that "
"already contains some of its callees, do the merge if at least "
"n% of the code it can reach is already present inside the "
"partition; e.g. 0.7 means only merge >70%"));
static cl::opt<bool> NoExternalizeGlobals(
"amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
cl::desc("disables externalization of global variable with local linkage; "
"may cause globals to be duplicated which increases binary size"));
static cl::opt<bool> NoExternalizeOnAddrTaken(
"amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
cl::desc(
"disables externalization of functions whose addresses are taken"));
static cl::opt<std::string>
ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
cl::Hidden,
cl::desc("output file to write out the dotgraph "
"representation of the input module"));
static cl::opt<std::string> PartitionSummariesOutput(
"amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
cl::desc("output file to write out a summary of "
"the partitions created for each module"));
#ifndef NDEBUG
static cl::opt<bool>
UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
cl::desc("use a lock file so only one process in the system "
"can run this pass at once. useful to avoid mangled "
"debug output in multithreaded environments."));
static cl::opt<bool>
DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
cl::Hidden,
cl::desc("print all proposals received and whether "
"they were rejected or accepted"));
#endif
struct SplitModuleTimer : NamedRegionTimer {
SplitModuleTimer(StringRef Name, StringRef Desc)
: NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
TimePassesIsEnabled) {}
};
//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//
using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;
/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object to print (Num/Dem) as a percentage using "%0.2f".
static auto formatRatioOf(CostType Num, CostType Dem) {
CostType DemOr1 = Dem ? Dem : 1;
return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}
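// For illustration: formatRatioOf(250, 1000) renders as "25.00", i.e. the
// ratio expressed as a percentage with two decimals. A zero denominator is
// substituted with 1 to avoid dividing by zero.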
/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// Kernel functions and external functions fall into this category. If we were
/// to clone them, we would end up with multiple symbol definitions and a very
/// unhappy linker.
static bool isNonCopyable(const Function &F) {
return F.hasExternalLinkage() || !F.isDefinitionExact() ||
AMDGPU::isEntryFunctionCC(F.getCallingConv());
}
/// If \p GV has local linkage, make it external + hidden.
static void externalize(GlobalValue &GV) {
if (GV.hasLocalLinkage()) {
GV.setLinkage(GlobalValue::ExternalLinkage);
GV.setVisibility(GlobalValue::HiddenVisibility);
}
// Unnamed entities must be named consistently between modules. setName will
// give a distinct name to each such entity.
if (!GV.hasName())
GV.setName("__llvmsplit_unnamed");
}
/// Cost analysis function. Calculates the cost of each function in \p M
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param[out] CostMap Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
FunctionsCostMap &CostMap) {
SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");
LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
CostType ModuleCost = 0;
[[maybe_unused]] CostType KernelCost = 0;
for (auto &Fn : M) {
if (Fn.isDeclaration())
continue;
CostType FnCost = 0;
const auto &TTI = GetTTI(Fn);
for (const auto &BB : Fn) {
for (const auto &I : BB) {
auto Cost =
TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
assert(Cost != InstructionCost::getMax());
// Assume expensive if we can't tell the cost of an instruction.
CostType CostVal =
Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
assert((FnCost + CostVal) >= FnCost && "Overflow!");
FnCost += CostVal;
}
}
assert(FnCost != 0);
CostMap[&Fn] = FnCost;
assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
ModuleCost += FnCost;
if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
KernelCost += FnCost;
}
if (CostMap.empty())
return 0;
assert(ModuleCost);
LLVM_DEBUG({
const CostType FnCost = ModuleCost - KernelCost;
dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
<< "" << KernelCost << " ("
<< format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
<< "% of the module), functions cost " << FnCost << " ("
<< format("%0.2f", (float(FnCost) / ModuleCost) * 100)
<< "% of the module)\n";
});
return ModuleCost;
}
/// \return true if \p F can be indirectly called
static bool canBeIndirectlyCalled(const Function &F) {
if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
return false;
return !F.hasLocalLinkage() ||
F.hasAddressTaken(/*PutOffender=*/nullptr,
/*IgnoreCallbackUses=*/false,
/*IgnoreAssumeLikeCalls=*/true,
/*IgnoreLLVMUsed=*/true,
/*IgnoreARCAttachedCall=*/false,
/*IgnoreCastedDirectCall=*/true);
}
//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//
/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the module,
/// but it is not guaranteed that the graph is strictly equal to the CG. It
/// currently always is but it's designed in a way that would eventually allow
/// us to create abstract nodes, or nodes for different entities such as global
/// variables or any other meaningful constraint we must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph {
public:
class Node;
enum class EdgeKind : uint8_t {
/// The nodes are related through a direct call. This is a "strong" edge as
/// it means the Src will directly reference the Dst.
DirectCall,
/// The nodes are related through an indirect call.
/// This is a "weaker" edge and is only considered when traversing the graph
/// starting from a kernel. We need this edge for resource usage analysis.
///
/// The reason why we have this edge in the first place is due to how
/// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
/// the resource usage of the kernel containing the indirect call is the
/// max resource usage of all functions that can be indirectly called.
IndirectCall,
};
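// Illustrative sketch (hypothetical module): if kernel K directly calls helper
// A, and A contains an indirect call, buildGraph adds a DirectCall edge K->A
// plus IndirectCall edges from A to every function that can be indirectly
// called. Because K uses an entry-function CC, a dependency walk rooted at K
// also follows those IndirectCall edges, so it reaches every function whose
// resource usage could be attributed to K.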
/// An edge between two nodes. Edges are directional, and tagged with a
/// "kind".
struct Edge {
Edge(Node *Src, Node *Dst, EdgeKind Kind)
: Src(Src), Dst(Dst), Kind(Kind) {}
Node *Src; ///< Source
Node *Dst; ///< Destination
EdgeKind Kind;
};
using EdgesVec = SmallVector<const Edge *, 0>;
using edges_iterator = EdgesVec::const_iterator;
using nodes_iterator = const Node *const *;
SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
CostType ModuleCost)
: M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}
void buildGraph(CallGraph &CG);
#ifndef NDEBUG
bool verifyGraph() const;
#endif
bool empty() const { return Nodes.empty(); }
const iterator_range<nodes_iterator> nodes() const {
return {Nodes.begin(), Nodes.end()};
}
const Node &getNode(unsigned ID) const { return *Nodes[ID]; }
unsigned getNumNodes() const { return Nodes.size(); }
BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }
const Module &getModule() const { return M; }
CostType getModuleCost() const { return ModuleCost; }
CostType getCost(const Function &F) const { return CostMap.at(&F); }
/// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
/// IDs).
CostType calculateCost(const BitVector &BV) const;
private:
/// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
/// updates \p Cache.
Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
const GlobalValue &GV);
// Create a new edge between two nodes and add it to both nodes.
const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);
const Module &M;
const FunctionsCostMap &CostMap;
CostType ModuleCost;
// Final list of nodes with stable ordering.
SmallVector<Node *> Nodes;
SpecificBumpPtrAllocator<Node> NodesPool;
// Edges are trivially destructible objects, so as a small optimization we
// use a BumpPtrAllocator which avoids destructor calls but also makes
// allocation faster.
static_assert(
std::is_trivially_destructible_v<Edge>,
"Edge must be trivially destructible to use the BumpPtrAllocator");
BumpPtrAllocator EdgesPool;
};
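// Minimal usage sketch, mirroring what the driver at the bottom of this file
// does (CostMap/ModuleCost are assumed to come from calculateFunctionCosts,
// and CG is a CallGraph built over M):
//   SplitGraph SG(M, CostMap, ModuleCost);
//   SG.buildGraph(CG);
//   for (const SplitGraph::Node *N : SG.nodes())
//     dbgs() << N->getName() << " cost:" << N->getIndividualCost() << '\n';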
/// Nodes in the SplitGraph contain both incoming and outgoing edges.
/// Incoming edges have this node as their Dst, and Outgoing ones have this node
/// as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node {
friend class SplitGraph;
public:
Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
bool IsNonCopyable)
: ID(ID), GV(GV), IndividualCost(IndividualCost),
IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
if (auto *Fn = dyn_cast<Function>(&GV))
IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
}
/// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
/// nodes in the graph. This ID can be used as an index in a BitVector.
unsigned getID() const { return ID; }
const Function &getFunction() const { return cast<Function>(GV); }
/// \returns the cost to import this component into a given module, not
/// accounting for any dependencies that may need to be imported as well.
CostType getIndividualCost() const { return IndividualCost; }
bool isNonCopyable() const { return IsNonCopyable; }
bool isEntryFunctionCC() const { return IsEntryFnCC; }
/// \returns whether this is an entry point in the graph. Entry points are
/// defined as follows: if you take all entry points in the graph, and iterate
/// their dependencies, you are guaranteed to visit all nodes in the graph at
/// least once.
bool isGraphEntryPoint() const { return IsGraphEntry; }
StringRef getName() const { return GV.getName(); }
bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
}
bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
}
iterator_range<edges_iterator> incoming_edges() const {
return IncomingEdges;
}
iterator_range<edges_iterator> outgoing_edges() const {
return OutgoingEdges;
}
bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }
/// Visit all children of this node in a recursive fashion. Also visits Self.
/// If \ref shouldFollowIndirectCalls returns false, then this only follows
/// DirectCall edges.
///
/// \param Visitor Visitor Function.
void visitAllDependencies(std::function<void(const Node &)> Visitor) const;
/// Adds the dependencies of this node in \p BV by setting the bit
/// corresponding to each node.
///
/// Implemented using \ref visitAllDependencies, hence it follows the same
/// rules regarding dependencies traversal.
///
/// \param[out] BV The bitvector where the bits should be set.
void getDependencies(BitVector &BV) const {
visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
}
private:
void markAsGraphEntry() { IsGraphEntry = true; }
unsigned ID;
const GlobalValue &GV;
CostType IndividualCost;
bool IsNonCopyable : 1;
bool IsEntryFnCC : 1;
bool IsGraphEntry : 1;
// TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
// together)
EdgesVec IncomingEdges;
EdgesVec OutgoingEdges;
};
void SplitGraph::Node::visitAllDependencies(
std::function<void(const Node &)> Visitor) const {
const bool FollowIndirect = shouldFollowIndirectCalls();
// FIXME: If this can access SplitGraph in the future, use a BitVector
// instead.
DenseSet<const Node *> Seen;
SmallVector<const Node *, 8> WorkList({this});
while (!WorkList.empty()) {
const Node *CurN = WorkList.pop_back_val();
if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
continue;
Visitor(*CurN);
for (const Edge *E : CurN->outgoing_edges()) {
if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
continue;
WorkList.push_back(E->Dst);
}
}
}
/// Checks if \p I has MD_callees and if it does, parse it and put the function
/// in \p Callees.
///
/// \returns true if there was metadata and it was parsed correctly. false if
/// there was no MD or if it contained unknown entries and parsing failed.
/// If this returns false, \p Callees will contain incomplete information
/// and must not be used.
static bool handleCalleesMD(const Instruction &I,
SetVector<Function *> &Callees) {
auto *MD = I.getMetadata(LLVMContext::MD_callees);
if (!MD)
return false;
for (const auto &Op : MD->operands()) {
Function *Callee = mdconst::extract_or_null<Function>(Op);
if (!Callee)
return false;
Callees.insert(Callee);
}
return true;
}
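// Usage sketch: given a call site I that carries the metadata, e.g.
//   call void %fp(), !callees !0    ; with !0 = !{ptr @foo, ptr @bar}
// the helper fills Callees with {@foo, @bar} and returns true:
//   SetVector<Function *> Callees;
//   if (handleCalleesMD(I, Callees)) {
//     // Callees now lists every function this call may target.
//   }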
void SplitGraph::buildGraph(CallGraph &CG) {
SplitModuleTimer SMT("buildGraph", "graph construction");
LLVM_DEBUG(
dbgs()
<< "[build graph] constructing graph representation of the input\n");
// FIXME(?): Is the callgraph really worth using if we have to iterate the
// function again whenever it fails to give us enough information?
// We build the graph by just iterating all functions in the module and
// working on their direct callees. At the end, all nodes should be linked
// together as expected.
DenseMap<const GlobalValue *, Node *> Cache;
SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
for (const Function &Fn : M) {
if (Fn.isDeclaration())
continue;
// Look at direct callees and create the necessary edges in the graph.
SetVector<const Function *> DirectCallees;
bool CallsExternal = false;
for (auto &CGEntry : *CG[&Fn]) {
auto *CGNode = CGEntry.second;
if (auto *Callee = CGNode->getFunction()) {
if (!Callee->isDeclaration())
DirectCallees.insert(Callee);
} else if (CGNode == CG.getCallsExternalNode())
CallsExternal = true;
}
// Keep track of this function if it contains an indirect call and/or if it
// can be indirectly called.
if (CallsExternal) {
LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for ";
Fn.printAsOperand(dbgs());
dbgs() << " - analyzing function\n");
SetVector<Function *> KnownCallees;
bool HasUnknownIndirectCall = false;
for (const auto &Inst : instructions(Fn)) {
// look at all calls without a direct callee.
const auto *CB = dyn_cast<CallBase>(&Inst);
if (!CB || CB->getCalledFunction())
continue;
// inline assembly can be ignored, unless InlineAsmIsIndirectCall is
// true.
if (CB->isInlineAsm()) {
LLVM_DEBUG(dbgs() << " found inline assembly\n");
continue;
}
if (handleCalleesMD(Inst, KnownCallees))
continue;
// If we failed to parse any !callees MD, or some was missing,
// the entire KnownCallees list is now unreliable.
KnownCallees.clear();
// Everything else is handled conservatively. If we fall into the
// conservative case don't bother analyzing further.
HasUnknownIndirectCall = true;
break;
}
if (HasUnknownIndirectCall) {
LLVM_DEBUG(dbgs() << " indirect call found\n");
FnsWithIndirectCalls.push_back(&Fn);
} else if (!KnownCallees.empty())
DirectCallees.insert_range(KnownCallees);
}
Node &N = getNode(Cache, Fn);
for (const auto *Callee : DirectCallees)
createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);
if (canBeIndirectlyCalled(Fn))
IndirectlyCallableFns.push_back(&Fn);
}
// Post-process functions with indirect calls.
for (const Function *Fn : FnsWithIndirectCalls) {
for (const Function *Candidate : IndirectlyCallableFns) {
Node &Src = getNode(Cache, *Fn);
Node &Dst = getNode(Cache, *Candidate);
createEdge(Src, Dst, EdgeKind::IndirectCall);
}
}
// Now, find all entry points.
SmallVector<Node *, 16> CandidateEntryPoints;
BitVector NodesReachableByKernels = createNodesBitVector();
for (Node *N : Nodes) {
// Functions with an Entry CC are always graph entry points too.
if (N->isEntryFunctionCC()) {
N->markAsGraphEntry();
N->getDependencies(NodesReachableByKernels);
} else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
CandidateEntryPoints.push_back(N);
}
for (Node *N : CandidateEntryPoints) {
// This can be another entry point if it's not reachable by a kernel
// TODO: We could sort all of the possible new entries in a stable order
// (e.g. by cost), then consume them one by one until
// NodesReachableByKernels is all 1s. It'd allow us to avoid
// considering some nodes as non-entries in some specific cases.
if (!NodesReachableByKernels.test(N->getID()))
N->markAsGraphEntry();
}
#ifndef NDEBUG
assert(verifyGraph());
#endif
}
#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
unsigned ExpectedID = 0;
// Exceptionally using a set here in case IDs are messed up.
DenseSet<const Node *> SeenNodes;
DenseSet<const Function *> SeenFunctionNodes;
for (const Node *N : Nodes) {
if (N->getID() != (ExpectedID++)) {
errs() << "Node IDs are incorrect!\n";
return false;
}
if (!SeenNodes.insert(N).second) {
errs() << "Node seen more than once!\n";
return false;
}
if (&getNode(N->getID()) != N) {
errs() << "getNode doesn't return the right node\n";
return false;
}
for (const Edge *E : N->IncomingEdges) {
if (!E->Src || !E->Dst || (E->Dst != N) ||
(find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
errs() << "ill-formed incoming edges\n";
return false;
}
}
for (const Edge *E : N->OutgoingEdges) {
if (!E->Src || !E->Dst || (E->Src != N) ||
(find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
errs() << "ill-formed outgoing edges\n";
return false;
}
}
const Function &Fn = N->getFunction();
if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
if (N->hasAnyIncomingEdges()) {
errs() << "Kernels cannot have incoming edges\n";
return false;
}
}
if (Fn.isDeclaration()) {
errs() << "declarations shouldn't have nodes!\n";
return false;
}
auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
if (!Inserted) {
errs() << "one function has multiple nodes!\n";
return false;
}
}
if (ExpectedID != Nodes.size()) {
errs() << "Node IDs out of sync!\n";
return false;
}
if (createNodesBitVector().size() != getNumNodes()) {
errs() << "nodes bit vector doesn't have the right size!\n";
return false;
}
// Check we respect the promise of Node::isGraphEntryPoint
BitVector BV = createNodesBitVector();
for (const Node *N : nodes()) {
if (N->isGraphEntryPoint())
N->getDependencies(BV);
}
// Ensure each function in the module has an associated node.
for (const auto &Fn : M) {
if (!Fn.isDeclaration()) {
if (!SeenFunctionNodes.contains(&Fn)) {
errs() << "Fn has no associated node in the graph!\n";
return false;
}
}
}
if (!BV.all()) {
errs() << "not all nodes are reachable through the graph's entry points!\n";
return false;
}
return true;
}
#endif
CostType SplitGraph::calculateCost(const BitVector &BV) const {
CostType Cost = 0;
for (unsigned NodeID : BV.set_bits())
Cost += getNode(NodeID).getIndividualCost();
return Cost;
}
SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
const GlobalValue &GV) {
auto &N = Cache[&GV];
if (N)
return *N;
CostType Cost = 0;
bool NonCopyable = false;
if (const Function *Fn = dyn_cast<Function>(&GV)) {
NonCopyable = isNonCopyable(*Fn);
Cost = CostMap.at(Fn);
}
N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
Nodes.push_back(N);
assert(&getNode(N->getID()) == N);
return *N;
}
const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
EdgeKind EK) {
const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
Src.OutgoingEdges.push_back(E);
Dst.IncomingEdges.push_back(E);
return *E;
}
//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//
/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies, collect the resulting
/// proposals, and choose the best one out of them.
class SplitProposal {
public:
SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
}
void setName(StringRef NewName) { Name = NewName; }
StringRef getName() const { return Name; }
const BitVector &operator[](unsigned PID) const {
return Partitions[PID].second;
}
void add(unsigned PID, const BitVector &BV) {
Partitions[PID].second |= BV;
updateScore(PID);
}
void print(raw_ostream &OS) const;
LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
// Find the cheapest partition (lowest cost). In case of ties, always returns
// the highest partition number.
unsigned findCheapestPartition() const;
/// Calculate the CodeSize and Bottleneck scores.
void calculateScores();
#ifndef NDEBUG
void verifyCompleteness() const;
#endif
/// Only available after \ref calculateScores is called.
///
/// The ratio between the aggregated cost of all partitions and the cost of the
/// input module, indicating how much code duplication this proposal creates.
/// e.g. 1.2 means the partitions together are roughly 20% bigger than the
/// input because some functions were duplicated across partitions.
///
/// Value is always rounded up to 2 decimal places.
///
/// A perfect score would be 1.0 (no duplication); the higher the value, the
/// worse the proposal.
double getCodeSizeScore() const { return CodeSizeScore; }
/// Only available after \ref calculateScores is called.
///
/// A number between [0, 1] which indicates how big of a bottleneck is
/// expected from the largest partition.
///
/// A score of 1.0 means the biggest partition is as big as the source module,
/// so build time will be equal to or greater than the build time of the
/// initial input.
///
/// Value is always rounded up to 2 decimal places.
///
/// This is one of the metrics used to estimate this proposal's build time.
double getBottleneckScore() const { return BottleneckScore; }
private:
void updateScore(unsigned PID) {
assert(SG);
for (auto &[PCost, Nodes] : Partitions) {
TotalCost -= PCost;
PCost = SG->calculateCost(Nodes);
TotalCost += PCost;
}
}
/// \see getCodeSizeScore
double CodeSizeScore = 0.0;
/// \see getBottleneckScore
double BottleneckScore = 0.0;
/// Aggregated cost of all partitions
CostType TotalCost = 0;
const SplitGraph *SG = nullptr;
std::string Name;
std::vector<std::pair<CostType, BitVector>> Partitions;
};
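// Usage sketch, mirroring the driver and RecursiveSearchSplitting below
// (ClusterBV stands for any BitVector of node IDs, e.g. one built with
// Node::getDependencies):
//   SplitProposal SP(SG, NumParts);
//   SP.add(/*PID=*/0, ClusterBV);                // copy these nodes into P0
//   SP.calculateScores();
//   double CodeSize = SP.getCodeSizeScore();     // duplication estimate
//   double Bottleneck = SP.getBottleneckScore(); // largest-partition estimate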
void SplitProposal::print(raw_ostream &OS) const {
assert(SG);
OS << "[proposal] " << Name << ", total cost:" << TotalCost
<< ", code size score:" << format("%0.3f", CodeSizeScore)
<< ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
for (const auto &[PID, Part] : enumerate(Partitions)) {
const auto &[Cost, NodeIDs] = Part;
OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
<< '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
}
}
unsigned SplitProposal::findCheapestPartition() const {
assert(!Partitions.empty());
CostType CurCost = std::numeric_limits<CostType>::max();
unsigned CurPID = InvalidPID;
for (const auto &[Idx, Part] : enumerate(Partitions)) {
if (Part.first <= CurCost) {
CurPID = Idx;
CurCost = Part.first;
}
}
assert(CurPID != InvalidPID);
return CurPID;
}
void SplitProposal::calculateScores() {
if (Partitions.empty())
return;
assert(SG);
CostType LargestPCost = 0;
for (auto &[PCost, Nodes] : Partitions) {
if (PCost > LargestPCost)
LargestPCost = PCost;
}
CostType ModuleCost = SG->getModuleCost();
CodeSizeScore = double(TotalCost) / ModuleCost;
assert(CodeSizeScore >= 0.0);
BottleneckScore = double(LargestPCost) / ModuleCost;
CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}
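// Worked example for calculateScores (hypothetical numbers): with a module
// cost of 1000 and three partitions costing 450, 400 and 350, TotalCost is
// 1200, so CodeSizeScore = 1200/1000 = 1.20 (roughly 20% of the code ends up
// duplicated) and BottleneckScore = 450/1000 = 0.45 (the largest partition is
// 45% of the input, which bounds the expected parallel build time).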
#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
if (Partitions.empty())
return;
BitVector Result = Partitions[0].second;
for (const auto &P : drop_begin(Partitions))
Result |= P.second;
assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif
//===-- RecursiveSearchStrategy -------------------------------------------===//
/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we haven't
/// reached maximum search depth, we recurse and explore both options and their
/// consequences. Both branches will yield a proposal, and the driver will grade
/// both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice. Most
/// of the time we will just use the least-pressured (cheapest) partition, but
/// if a cluster is particularly big and there is a good amount of overlap with
/// an existing partition, we will choose that partition instead.
class RecursiveSearchSplitting {
public:
using SubmitProposalFn = function_ref<void(SplitProposal)>;
RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
SubmitProposalFn SubmitProposal);
void run();
private:
struct WorkListEntry {
WorkListEntry(const BitVector &BV) : Cluster(BV) {}
unsigned NumNonEntryNodes = 0;
CostType TotalCost = 0;
CostType CostExcludingGraphEntryPoints = 0;
BitVector Cluster;
};
/// Collects all graph entry points' clusters and sorts them so the most
/// expensive clusters are processed first. This will merge clusters together if
/// they share a non-copyable dependency.
void setupWorkList();
/// Recursive function that assigns the worklist item at \p Idx into a
/// partition of \p SP.
///
/// \p Depth is the current search depth. When this value is equal to
/// \ref MaxDepth, we can no longer recurse.
///
/// This function only recurses if there is more than one possible assignment,
/// otherwise it is iterative to avoid creating a call stack that is as big as
/// \ref WorkList.
void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);
/// \return A pair: first element is the PID of the partition that has the
/// most similarities with \p Entry, or \ref InvalidPID if no partition was
/// found with at least one element in common. The second element is the
/// aggregated cost of all dependencies in common between \p Entry and that
/// partition.
std::pair<unsigned, CostType>
findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);
const SplitGraph &SG;
unsigned NumParts;
SubmitProposalFn SubmitProposal;
// A Cluster is considered large when its cost, excluding entry points,
// exceeds this value.
CostType LargeClusterThreshold = 0;
unsigned NumProposalsSubmitted = 0;
SmallVector<WorkListEntry> WorkList;
};
RecursiveSearchSplitting::RecursiveSearchSplitting(
const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
: SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
// Arbitrary max value as a safeguard. Anything above 10 will already be
// slow; this is just a cap to prevent extreme resource exhaustion or
// unbounded run time.
if (MaxDepth > 16)
report_fatal_error("[amdgpu-split-module] search depth of " +
Twine(MaxDepth) + " is too high!");
LargeClusterThreshold =
(LargeFnFactor != 0.0)
? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
: std::numeric_limits<CostType>::max();
LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
<< LargeClusterThreshold << "\n");
}
void RecursiveSearchSplitting::run() {
{
SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
setupWorkList();
}
{
SplitModuleTimer SMT("recursive_search_pick", "partitioning");
SplitProposal SP(SG, NumParts);
pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP);
}
}
void RecursiveSearchSplitting::setupWorkList() {
// e.g. if A and B are two worklist items, and they both call a non-copyable
// dependency C, this does:
// A=C
// B=C
// => NodeEC will create a single group (A, B, C) and we create a new
// WorkList entry for that group.
EquivalenceClasses<unsigned> NodeEC;
for (const SplitGraph::Node *N : SG.nodes()) {
if (!N->isGraphEntryPoint())
continue;
NodeEC.insert(N->getID());
N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
if (&Dep != N && Dep.isNonCopyable())
NodeEC.unionSets(N->getID(), Dep.getID());
});
}
for (const auto &Node : NodeEC) {
if (!Node->isLeader())
continue;
BitVector Cluster = SG.createNodesBitVector();
for (unsigned M : NodeEC.members(*Node)) {
const SplitGraph::Node &N = SG.getNode(M);
if (N.isGraphEntryPoint())
N.getDependencies(Cluster);
}
WorkList.emplace_back(std::move(Cluster));
}
// Calculate costs and other useful information.
for (WorkListEntry &Entry : WorkList) {
for (unsigned NodeID : Entry.Cluster.set_bits()) {
const SplitGraph::Node &N = SG.getNode(NodeID);
const CostType Cost = N.getIndividualCost();
Entry.TotalCost += Cost;
if (!N.isGraphEntryPoint()) {
Entry.CostExcludingGraphEntryPoints += Cost;
++Entry.NumNonEntryNodes;
}
}
}
stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
if (A.TotalCost != B.TotalCost)
return A.TotalCost > B.TotalCost;
if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;
if (A.NumNonEntryNodes != B.NumNonEntryNodes)
return A.NumNonEntryNodes > B.NumNonEntryNodes;
return A.Cluster.count() > B.Cluster.count();
});
LLVM_DEBUG({
dbgs() << "[recursive search] worklist:\n";
for (const auto &[Idx, Entry] : enumerate(WorkList)) {
dbgs() << " - [" << Idx << "]: ";
for (unsigned NodeID : Entry.Cluster.set_bits())
dbgs() << NodeID << " ";
dbgs() << "(total_cost:" << Entry.TotalCost
<< ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
<< ")\n";
}
});
}
void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
SplitProposal SP) {
while (Idx < WorkList.size()) {
// Step 1: Determine candidate PIDs.
//
const WorkListEntry &Entry = WorkList[Idx];
const BitVector &Cluster = Entry.Cluster;
// Default option is to do load-balancing, AKA assign to least pressured
// partition.
const unsigned CheapestPID = SP.findCheapestPartition();
assert(CheapestPID != InvalidPID);
// Explore assigning to the kernel that contains the most dependencies in
// common.
const auto [MostSimilarPID, SimilarDepsCost] =
findMostSimilarPartition(Entry, SP);
// We can choose to explore only one path if we only have one valid path, or
// if we reached maximum search depth and can no longer branch out.
unsigned SinglePIDToTry = InvalidPID;
if (MostSimilarPID == InvalidPID) // no similar PID found
SinglePIDToTry = CheapestPID;
else if (MostSimilarPID == CheapestPID) // both landed on the same PID
SinglePIDToTry = CheapestPID;
else if (Depth >= MaxDepth) {
// We have to choose one path. Use a heuristic to guess which one will be
// more appropriate.
if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
// Check if the amount of code in common makes it worth it.
assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
const double Ratio = static_cast<double>(SimilarDepsCost) /
Entry.CostExcludingGraphEntryPoints;
assert(Ratio >= 0.0 && Ratio <= 1.0);
if (Ratio > LargeFnOverlapForMerge) {
// For debug, just print "L", so we'll see "L3=P3" for instance, which
// will mean we reached max depth and chose P3 based on this
// heuristic.
LLVM_DEBUG(dbgs() << 'L');
SinglePIDToTry = MostSimilarPID;
}
} else
SinglePIDToTry = CheapestPID;
}
// Step 2: Explore candidates.
// When we only explore one possible path, and thus branch depth doesn't
// increase, do not recurse; iterate instead.
if (SinglePIDToTry != InvalidPID) {
LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
// Only one path to explore, don't clone SP, don't increase depth.
SP.add(SinglePIDToTry, Cluster);
++Idx;
continue;
}
assert(MostSimilarPID != InvalidPID);
// We explore multiple paths: recurse at increased depth, then stop this
// function.
LLVM_DEBUG(dbgs() << '\n');
// lb = load balancing = put in cheapest partition
{
SplitProposal BranchSP = SP;
LLVM_DEBUG(dbgs().indent(Depth)
<< " [lb] " << Idx << "=P" << CheapestPID << "? ");
BranchSP.add(CheapestPID, Cluster);
pickPartition(Depth + 1, Idx + 1, BranchSP);
}
// ms = most similar = put in partition with the most in common
{
SplitProposal BranchSP = SP;
LLVM_DEBUG(dbgs().indent(Depth)
<< " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
BranchSP.add(MostSimilarPID, Cluster);
pickPartition(Depth + 1, Idx + 1, BranchSP);
}
return;
}
// Step 3: If we assigned all WorkList items, submit the proposal.
assert(Idx == WorkList.size());
assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
"Search got out of bounds?");
SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
std::to_string(NumProposalsSubmitted++));
LLVM_DEBUG(dbgs() << '\n');
SubmitProposal(SP);
}
std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
const SplitProposal &SP) {
if (!Entry.NumNonEntryNodes)
return {InvalidPID, 0};
// We take the partition that is the most similar using Cost as a metric.
// So we take the set of nodes in common, compute their aggregated cost, and
// pick the partition with the highest cost in common.
unsigned ChosenPID = InvalidPID;
CostType ChosenCost = 0;
for (unsigned PID = 0; PID < NumParts; ++PID) {
BitVector BV = SP[PID];
BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?!
if (BV.none())
continue;
const CostType Cost = SG.calculateCost(BV);
if (ChosenPID == InvalidPID || ChosenCost < Cost ||
(ChosenCost == Cost && PID > ChosenPID)) {
ChosenPID = PID;
ChosenCost = Cost;
}
}
return {ChosenPID, ChosenCost};
}
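// Worked example (hypothetical numbers): if Entry's cluster is {2, 3, 5}, P0
// already contains nodes {3, 5} with an aggregated cost of 40, and P1 only
// contains {5} with a cost of 25, this returns {0, 40}: P0 is the most similar
// partition and 40 is the cost it has in common with the cluster.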
//===----------------------------------------------------------------------===//
// DOTGraph Printing Support
//===----------------------------------------------------------------------===//
const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
return E->Dst;
}
using SplitGraphEdgeDstIterator =
mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;
} // namespace
template <> struct GraphTraits<SplitGraph> {
using NodeRef = const SplitGraph::Node *;
using nodes_iterator = SplitGraph::nodes_iterator;
using ChildIteratorType = SplitGraphEdgeDstIterator;
using EdgeRef = const SplitGraph::Edge *;
using ChildEdgeIteratorType = SplitGraph::edges_iterator;
static NodeRef getEntryNode(NodeRef N) { return N; }
static ChildIteratorType child_begin(NodeRef Ref) {
return {Ref->outgoing_edges().begin(), mapEdgeToDst};
}
static ChildIteratorType child_end(NodeRef Ref) {
return {Ref->outgoing_edges().end(), mapEdgeToDst};
}
static nodes_iterator nodes_begin(const SplitGraph &G) {
return G.nodes().begin();
}
static nodes_iterator nodes_end(const SplitGraph &G) {
return G.nodes().end();
}
};
template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
static std::string getGraphName(const SplitGraph &SG) {
return SG.getModule().getName().str();
}
std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
return N->getName().str();
}
static std::string getNodeDescription(const SplitGraph::Node *N,
const SplitGraph &SG) {
std::string Result;
if (N->isEntryFunctionCC())
Result += "entry-fn-cc ";
if (N->isNonCopyable())
Result += "non-copyable ";
Result += "cost:" + std::to_string(N->getIndividualCost());
return Result;
}
static std::string getNodeAttributes(const SplitGraph::Node *N,
const SplitGraph &SG) {
return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
}
static std::string getEdgeAttributes(const SplitGraph::Node *N,
SplitGraphEdgeDstIterator EI,
const SplitGraph &SG) {
switch ((*EI.getCurrent())->Kind) {
case SplitGraph::EdgeKind::DirectCall:
return "";
case SplitGraph::EdgeKind::IndirectCall:
return "style=\"dashed\"";
}
llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
}
};
//===----------------------------------------------------------------------===//
// Driver
//===----------------------------------------------------------------------===//
namespace {
// If we didn't externalize GVs, then local GVs need to be conservatively
// imported into every module (including their initializers), and then cleaned
// up afterwards.
static bool needsConservativeImport(const GlobalValue *GV) {
if (const auto *Var = dyn_cast<GlobalVariable>(GV))
return Var->hasLocalLinkage();
return isa<GlobalAlias>(GV);
}
/// Prints a summary of the partition \p N, represented by module \p M, to \p
/// OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
unsigned PartCost, unsigned ModuleCost) {
OS << "*** Partition P" << N << " ***\n";
for (const auto &Fn : M) {
if (!Fn.isDeclaration())
OS << " - [function] " << Fn.getName() << "\n";
}
for (const auto &GV : M.globals()) {
if (GV.hasInitializer())
OS << " - [global] " << GV.getName() << "\n";
}
OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
<< "% of the source\n";
}
static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");
LLVM_DEBUG({
New.verifyCompleteness();
if (DebugProposalSearch)
New.print(dbgs());
});
const double CurBScore = Best.getBottleneckScore();
const double CurCSScore = Best.getCodeSizeScore();
const double NewBScore = New.getBottleneckScore();
const double NewCSScore = New.getCodeSizeScore();
// TODO: Improve this
// We can probably lower the precision of the comparison at first
// e.g. if we have
// - (Current): BScore: 0.489 CSCore 1.105
// - (New): BScore: 0.475 CSCore 1.305
// Currently we'd choose the new one because the bottleneck score is
// lower, but the new one duplicates more code. It may be worth it to
// discard the new proposal as the impact on build time is negligible.
// Compare them
bool IsBest = false;
if (NewBScore < CurBScore)
IsBest = true;
else if (NewBScore == CurBScore)
IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.
if (IsBest)
Best = std::move(New);
LLVM_DEBUG(if (DebugProposalSearch) {
if (IsBest)
dbgs() << "[search] new best proposal!\n";
else
dbgs() << "[search] discarding - not profitable\n";
});
}
/// Trivial helper to create an identical copy of \p M.
static std::unique_ptr<Module> cloneAll(const Module &M) {
ValueToValueMapTy VMap;
return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}
/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgOutput if requested.
static void writeDOTGraph(const SplitGraph &SG) {
if (ModuleDotCfgOutput.empty())
return;
std::error_code EC;
raw_fd_ostream OS(ModuleDotCfgOutput, EC);
if (EC) {
errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
<< "' - DOTGraph will not be printed\n";
}
WriteGraph(OS, SG, /*ShortName=*/false,
/*Title=*/SG.getModule().getName());
}
static void splitAMDGPUModule(
GetTTIFn GetTTI, Module &M, unsigned NumParts,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
CallGraph CG(M);
// Externalize functions whose address are taken.
//
// This is needed because partitioning is purely based on calls, but sometimes
// a kernel/function may just look at the address of another local function
// and not do anything (no calls). After partitioning, that local function may
// end up in a different module (so it's just a declaration in the module
// where its address is taken), which emits an "undefined hidden symbol" linker
// error.
//
// Additionally, it guides partitioning to not duplicate this function if it's
// called directly at some point.
//
// TODO: Could we be smarter about this? This makes all functions whose
// addresses are taken non-copyable. We should probably model this type of
// constraint in the graph and use it to guide splitting, instead of
// externalizing like this. Maybe non-copyable should really mean "keep one
// visible copy, then internalize all other copies" for some functions?
if (!NoExternalizeOnAddrTaken) {
for (auto &Fn : M) {
// TODO: Should aliases count? Probably not but they're so rare I'm not
// sure it's worth fixing.
if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
dbgs() << " because its address is taken\n");
externalize(Fn);
}
}
}
// Externalize local GVs, which avoids duplicating their initializers, which
// in turn helps keep code size in check.
if (!NoExternalizeGlobals) {
for (auto &GV : M.globals()) {
if (GV.hasLocalLinkage())
LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
externalize(GV);
}
}
// Start by calculating the cost of every function in the module, as well as
// the module's overall cost.
FunctionsCostMap FnCosts;
const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
// Build the SplitGraph, which represents the module's functions and models
// their dependencies accurately.
SplitGraph SG(M, FnCosts, ModuleCost);
SG.buildGraph(CG);
if (SG.empty()) {
LLVM_DEBUG(
dbgs()
<< "[!] no nodes in graph, input is empty - no splitting possible\n");
ModuleCallback(cloneAll(M));
return;
}
LLVM_DEBUG({
dbgs() << "[graph] nodes:\n";
for (const SplitGraph::Node *N : SG.nodes()) {
dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
<< (N->isGraphEntryPoint() ? "(entry)" : "") << " "
<< (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
}
});
writeDOTGraph(SG);
LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");
std::optional<SplitProposal> Proposal;
const auto EvaluateProposal = [&](SplitProposal SP) {
SP.calculateScores();
if (!Proposal)
Proposal = std::move(SP);
else
evaluateProposal(*Proposal, std::move(SP));
};
// TODO: It would be very easy to create new strategies by just adding a base
// class to RecursiveSearchSplitting and abstracting it away.
RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
<< Proposal->getName() << "\n";);
if (!Proposal) {
LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
ModuleCallback(cloneAll(M));
return;
}
LLVM_DEBUG(Proposal->print(dbgs()););
std::optional<raw_fd_ostream> SummariesOS;
if (!PartitionSummariesOutput.empty()) {
std::error_code EC;
SummariesOS.emplace(PartitionSummariesOutput, EC);
if (EC)
errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
<< "' - Partition summaries will not be printed\n";
}
for (unsigned PID = 0; PID < NumParts; ++PID) {
SplitModuleTimer SMT2("modules_creation",
"creating modules for each partition");
LLVM_DEBUG(dbgs() << "[split] creating new modules\n");
DenseSet<const Function *> FnsInPart;
for (unsigned NodeID : (*Proposal)[PID].set_bits())
FnsInPart.insert(&SG.getNode(NodeID).getFunction());
ValueToValueMapTy VMap;
CostType PartCost = 0;
std::unique_ptr<Module> MPart(
CloneModule(M, VMap, [&](const GlobalValue *GV) {
// Functions go in their assigned partition.
if (const auto *Fn = dyn_cast<Function>(GV)) {
if (FnsInPart.contains(Fn)) {
PartCost += SG.getCost(*Fn);
return true;
}
return false;
}
// Everything else goes in the first partition.
return needsConservativeImport(GV) || PID == 0;
}));
// FIXME: Aliases aren't seen often, and their handling isn't perfect so
// bugs are possible.
// Clean-up conservatively imported GVs without any users.
for (auto &GV : make_early_inc_range(MPart->global_values())) {
if (needsConservativeImport(&GV) && GV.use_empty())
GV.eraseFromParent();
}
if (SummariesOS)
printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
LLVM_DEBUG(
printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));
ModuleCallback(std::move(MPart));
}
}
} // namespace
PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
ModuleAnalysisManager &MAM) {
SplitModuleTimer SMT(
"total", "total pass runtime (incl. potentially waiting for lockfile)");
FunctionAnalysisManager &FAM =
MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
return FAM.getResult<TargetIRAnalysis>(F);
};
bool Done = false;
#ifndef NDEBUG
if (UseLockFile) {
SmallString<128> LockFilePath;
sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
sys::path::append(LockFilePath, "amdgpu-split-module-debug");
LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
<< "'\n");
while (true) {
llvm::LockFileManager Lock(LockFilePath.str());
bool Owned;
if (Error Err = Lock.tryLock().moveInto(Owned)) {
consumeError(std::move(Err));
LLVM_DEBUG(
dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
"output may be mangled by other processes\n");
} else if (!Owned) {
switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) {
case WaitForUnlockResult::Success:
break;
case WaitForUnlockResult::OwnerDied:
continue; // try again to get the lock.
case WaitForUnlockResult::Timeout:
LLVM_DEBUG(
dbgs()
<< "[amdgpu-split-module] unable to acquire lockfile, debug "
"output may be mangled by other processes\n");
Lock.unsafeMaybeUnlock();
break; // give up
}
}
splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
Done = true;
break;
}
}
#endif
if (!Done)
splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
// We can change linkage/visibilities in the input, so to be safe, consider
// that nothing is preserved. This pass runs last anyway.
return PreservedAnalyses::none();
}
} // namespace llvm