788 lines
28 KiB
C++
788 lines
28 KiB
C++
//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file Implements a module splitting algorithm designed to support the
|
|
/// FullLTO --lto-partitions option for parallel codegen. This is completely
|
|
/// different from the common SplitModule pass, as this system is designed with
|
|
/// AMDGPU in mind.
|
|
///
|
|
/// The basic idea of this module splitting implementation is the same as
|
|
/// SplitModule: load-balance the module's functions across a set of N
|
|
/// partitions to allow parallel codegen. However, it does it very
|
|
/// differently than the target-agnostic variant:
|
|
/// - The module has "split roots", which are kernels in the vast
|
|
// majority of cases.
|
|
/// - Each root has a set of dependencies, and when a root and its
|
|
/// dependencies is considered "big", we try to put it in a partition where
|
|
/// most dependencies are already imported, to avoid duplicating large
|
|
/// amounts of code.
|
|
/// - There's special care for indirect calls in order to ensure
|
|
/// AMDGPUResourceUsageAnalysis can work correctly.
|
|
///
|
|
/// This file also includes a more elaborate logging system to enable
|
|
/// users to easily generate logs that (if desired) do not include any value
|
|
/// names, in order to not leak information about the source file.
|
|
/// Such logs are very helpful to understand and fix potential issues with
|
|
/// module splitting.
|
|
|
|
#include "AMDGPUSplitModule.h"
|
|
#include "AMDGPUTargetMachine.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/ADT/StringRef.h"
|
|
#include "llvm/Analysis/CallGraph.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/Instruction.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/IR/User.h"
|
|
#include "llvm/IR/Value.h"
|
|
#include "llvm/Support/Casting.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/FileSystem.h"
|
|
#include "llvm/Support/Path.h"
|
|
#include "llvm/Support/Process.h"
|
|
#include "llvm/Support/SHA256.h"
|
|
#include "llvm/Support/Threading.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/Transforms/Utils/Cloning.h"
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <iterator>
|
|
#include <memory>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "amdgpu-split-module"
|
|
|
|
namespace {
|
|
|
|
static cl::opt<float> LargeFnFactor(
|
|
"amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
|
|
cl::Hidden,
|
|
cl::desc(
|
|
"consider a function as large and needing special treatment when the "
|
|
"cost of importing it into a partition"
|
|
"exceeds the average cost of a partition by this factor; e;g. 2.0 "
|
|
"means if the function and its dependencies is 2 times bigger than "
|
|
"an average partition; 0 disables large functions handling entirely"));
|
|
|
|
static cl::opt<float> LargeFnOverlapForMerge(
|
|
"amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
|
|
cl::Hidden,
|
|
cl::desc(
|
|
"defines how much overlap between two large function's dependencies "
|
|
"is needed to put them in the same partition"));
|
|
|
|
static cl::opt<bool> NoExternalizeGlobals(
|
|
"amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
|
|
cl::desc("disables externalization of global variable with local linkage; "
|
|
"may cause globals to be duplicated which increases binary size"));
|
|
|
|
static cl::opt<std::string>
|
|
LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
|
|
cl::desc("output directory for AMDGPU module splitting logs"));
|
|
|
|
static cl::opt<bool>
|
|
LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
|
|
cl::desc("hash value names before printing them in the AMDGPU "
|
|
"module splitting logs"));
|
|
|
|
using CostType = InstructionCost::CostType;
|
|
using PartitionID = unsigned;
|
|
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
|
|
|
|
static bool isEntryPoint(const Function *F) {
|
|
return AMDGPU::isEntryFunctionCC(F->getCallingConv());
|
|
}
|
|
|
|
static std::string getName(const Value &V) {
|
|
static bool HideNames;
|
|
|
|
static llvm::once_flag HideNameInitFlag;
|
|
llvm::call_once(HideNameInitFlag, [&]() {
|
|
if (LogPrivate.getNumOccurrences())
|
|
HideNames = LogPrivate;
|
|
else {
|
|
const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
|
|
HideNames = (EV.value_or("0") != "0");
|
|
}
|
|
});
|
|
|
|
if (!HideNames)
|
|
return V.getName().str();
|
|
return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
|
|
/*LowerCase=*/true);
|
|
}
|
|
|
|
/// Main logging helper.
|
|
///
|
|
/// Logging can be configured by the following environment variable.
|
|
/// AMD_SPLIT_MODULE_LOG_DIR=<filepath>
|
|
/// If set, uses <filepath> as the directory to write logfiles to
|
|
/// each time module splitting is used.
|
|
/// AMD_SPLIT_MODULE_LOG_PRIVATE
|
|
/// If set to anything other than zero, all names are hidden.
|
|
///
|
|
/// Both environment variables have corresponding CL options which
|
|
/// takes priority over them.
|
|
///
|
|
/// Any output printed to the log files is also printed to dbgs() when -debug is
|
|
/// used and LLVM_DEBUG is defined.
|
|
///
|
|
/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
|
|
/// cannot be removed from the code (by building without debug). This probably
|
|
/// has a small performance cost because if some computation/formatting is
|
|
/// needed for logging purpose, it may be done everytime only to be ignored
|
|
/// by the logger.
|
|
///
|
|
/// As this pass only runs once and is not doing anything computationally
|
|
/// expensive, this is likely a reasonable trade-off.
|
|
///
|
|
/// If some computation should really be avoided when unused, users of the class
|
|
/// can check whether any logging will occur by using the bool operator.
|
|
///
|
|
/// \code
|
|
/// if (SML) {
|
|
/// // Executes only if logging to a file or if -debug is available and
|
|
/// used.
|
|
/// }
|
|
/// \endcode
|
|
class SplitModuleLogger {
|
|
public:
|
|
SplitModuleLogger(const Module &M) {
|
|
std::string LogDir = LogDirOpt;
|
|
if (LogDir.empty())
|
|
LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
|
|
|
|
// No log dir specified means we don't need to log to a file.
|
|
// We may still log to dbgs(), though.
|
|
if (LogDir.empty())
|
|
return;
|
|
|
|
// If a log directory is specified, create a new file with a unique name in
|
|
// that directory.
|
|
int Fd;
|
|
SmallString<0> PathTemplate;
|
|
SmallString<0> RealPath;
|
|
sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
|
|
if (auto Err =
|
|
sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
|
|
report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
|
|
"': " + Err.message(),
|
|
/*CrashDiag=*/false);
|
|
}
|
|
|
|
FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
|
|
}
|
|
|
|
bool hasLogFile() const { return FileOS != nullptr; }
|
|
|
|
raw_ostream &logfile() {
|
|
assert(FileOS && "no logfile!");
|
|
return *FileOS;
|
|
}
|
|
|
|
/// \returns true if this SML will log anything either to a file or dbgs().
|
|
/// Can be used to avoid expensive computations that are ignored when logging
|
|
/// is disabled.
|
|
operator bool() const {
|
|
return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
|
|
}
|
|
|
|
private:
|
|
std::unique_ptr<raw_fd_ostream> FileOS;
|
|
};
|
|
|
|
template <typename Ty>
|
|
static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
|
|
static_assert(
|
|
!std::is_same_v<Ty, Value>,
|
|
"do not print values to logs directly, use handleName instead!");
|
|
LLVM_DEBUG(dbgs() << Val);
|
|
if (SML.hasLogFile())
|
|
SML.logfile() << Val;
|
|
return SML;
|
|
}
|
|
|
|
/// Calculate the cost of each function in \p M
|
|
/// \param SML Log Helper
|
|
/// \param GetTTI Abstract getter for TargetTransformInfo.
|
|
/// \param M Module to analyze.
|
|
/// \param CostMap[out] Resulting Function -> Cost map.
|
|
/// \return The module's total cost.
|
|
static CostType
|
|
calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
|
|
DenseMap<const Function *, CostType> &CostMap) {
|
|
CostType ModuleCost = 0;
|
|
CostType KernelCost = 0;
|
|
|
|
for (auto &Fn : M) {
|
|
if (Fn.isDeclaration())
|
|
continue;
|
|
|
|
CostType FnCost = 0;
|
|
const auto &TTI = GetTTI(Fn);
|
|
for (const auto &BB : Fn) {
|
|
for (const auto &I : BB) {
|
|
auto Cost =
|
|
TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
|
|
assert(Cost != InstructionCost::getMax());
|
|
// Assume expensive if we can't tell the cost of an instruction.
|
|
CostType CostVal =
|
|
Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
|
|
assert((FnCost + CostVal) >= FnCost && "Overflow!");
|
|
FnCost += CostVal;
|
|
}
|
|
}
|
|
|
|
assert(FnCost != 0);
|
|
|
|
CostMap[&Fn] = FnCost;
|
|
assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
|
|
ModuleCost += FnCost;
|
|
|
|
if (isEntryPoint(&Fn))
|
|
KernelCost += FnCost;
|
|
}
|
|
|
|
CostType FnCost = (ModuleCost - KernelCost);
|
|
CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
|
|
SML << "=> Total Module Cost: " << ModuleCost << '\n'
|
|
<< " => KernelCost: " << KernelCost << " ("
|
|
<< format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
|
|
<< " => FnsCost: " << FnCost << " ("
|
|
<< format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
|
|
|
|
return ModuleCost;
|
|
}
|
|
|
|
static bool canBeIndirectlyCalled(const Function &F) {
|
|
if (F.isDeclaration() || isEntryPoint(&F))
|
|
return false;
|
|
return !F.hasLocalLinkage() ||
|
|
F.hasAddressTaken(/*PutOffender=*/nullptr,
|
|
/*IgnoreCallbackUses=*/false,
|
|
/*IgnoreAssumeLikeCalls=*/true,
|
|
/*IgnoreLLVMUsed=*/true,
|
|
/*IgnoreARCAttachedCall=*/false,
|
|
/*IgnoreCastedDirectCall=*/true);
|
|
}
|
|
|
|
/// When a function or any of its callees performs an indirect call, this
|
|
/// takes over \ref addAllDependencies and adds all potentially callable
|
|
/// functions to \p Fns so they can be counted as dependencies of the function.
|
|
///
|
|
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
|
|
/// presence of an indirect call, the function's resource usage is the same as
|
|
/// the most expensive function in the module.
|
|
/// \param M The module.
|
|
/// \param Fns[out] Resulting list of functions.
|
|
static void addAllIndirectCallDependencies(const Module &M,
|
|
DenseSet<const Function *> &Fns) {
|
|
for (const auto &Fn : M) {
|
|
if (canBeIndirectlyCalled(Fn))
|
|
Fns.insert(&Fn);
|
|
}
|
|
}
|
|
|
|
/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
|
|
/// callee until all reachable functions have been gathered.
|
|
///
|
|
/// \param SML Log Helper
|
|
/// \param CG Call graph for \p Fn's module.
|
|
/// \param Fn Current function to look at.
|
|
/// \param Fns[out] Resulting list of functions.
|
|
/// \param OnlyDirect Whether to only consider direct callees.
|
|
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
|
|
/// point, either in \p Fn or in one of the function it calls. When that
|
|
/// happens, we fall back to adding all callable functions inside \p Fn's module
|
|
/// to \p Fns.
|
|
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
|
|
const Function &Fn,
|
|
DenseSet<const Function *> &Fns, bool OnlyDirect,
|
|
bool &HadIndirectCall) {
|
|
assert(!Fn.isDeclaration());
|
|
|
|
const Module &M = *Fn.getParent();
|
|
SmallVector<const Function *> WorkList({&Fn});
|
|
while (!WorkList.empty()) {
|
|
const auto &CurFn = *WorkList.pop_back_val();
|
|
assert(!CurFn.isDeclaration());
|
|
|
|
// Scan for an indirect call. If such a call is found, we have to
|
|
// conservatively assume this can call all non-entrypoint functions in the
|
|
// module.
|
|
|
|
for (auto &CGEntry : *CG[&CurFn]) {
|
|
auto *CGNode = CGEntry.second;
|
|
auto *Callee = CGNode->getFunction();
|
|
if (!Callee) {
|
|
if (OnlyDirect)
|
|
continue;
|
|
|
|
// Functions have an edge towards CallsExternalNode if they're external
|
|
// declarations, or if they do an indirect call. As we only process
|
|
// definitions here, we know this means the function has an indirect
|
|
// call. We then have to conservatively assume this can call all
|
|
// non-entrypoint functions in the module.
|
|
if (CGNode != CG.getCallsExternalNode())
|
|
continue; // this is another function-less node we don't care about.
|
|
|
|
SML << "Indirect call detected in " << getName(CurFn)
|
|
<< " - treating all non-entrypoint functions as "
|
|
"potential dependencies\n";
|
|
|
|
// TODO: Print an ORE as well ?
|
|
addAllIndirectCallDependencies(M, Fns);
|
|
HadIndirectCall = true;
|
|
continue;
|
|
}
|
|
|
|
if (Callee->isDeclaration())
|
|
continue;
|
|
|
|
auto [It, Inserted] = Fns.insert(Callee);
|
|
if (Inserted)
|
|
WorkList.push_back(Callee);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Contains information about a function and its dependencies.
|
|
/// This is a splitting root. The splitting algorithm works by
|
|
/// assigning these to partitions.
|
|
struct FunctionWithDependencies {
|
|
FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
|
|
const DenseMap<const Function *, CostType> &FnCosts,
|
|
const Function *Fn)
|
|
: Fn(Fn) {
|
|
// When Fn is not a kernel, we don't need to collect indirect callees.
|
|
// Resource usage analysis is only performed on kernels, and we collect
|
|
// indirect callees for resource usage analysis.
|
|
addAllDependencies(SML, CG, *Fn, Dependencies,
|
|
/*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
|
|
TotalCost = FnCosts.at(Fn);
|
|
for (const auto *Dep : Dependencies) {
|
|
TotalCost += FnCosts.at(Dep);
|
|
|
|
// We cannot duplicate functions with external linkage, or functions that
|
|
// may be overriden at runtime.
|
|
HasNonDuplicatableDependecy |=
|
|
(Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
|
|
}
|
|
}
|
|
|
|
const Function *Fn = nullptr;
|
|
DenseSet<const Function *> Dependencies;
|
|
/// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
|
|
bool HasIndirectCall = false;
|
|
/// Whether any of \p Fn's dependencies cannot be duplicated.
|
|
bool HasNonDuplicatableDependecy = false;
|
|
|
|
CostType TotalCost = 0;
|
|
|
|
/// \returns true if this function and its dependencies can be considered
|
|
/// large according to \p Threshold.
|
|
bool isLarge(CostType Threshold) const {
|
|
return TotalCost > Threshold && !Dependencies.empty();
|
|
}
|
|
};
|
|
|
|
/// Calculates how much overlap there is between \p A and \p B.
|
|
/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
|
|
/// and B have no shared elements. Kernels do not count in overlap calculation.
|
|
static float calculateOverlap(const DenseSet<const Function *> &A,
|
|
const DenseSet<const Function *> &B) {
|
|
DenseSet<const Function *> Total;
|
|
for (const auto *F : A) {
|
|
if (!isEntryPoint(F))
|
|
Total.insert(F);
|
|
}
|
|
|
|
if (Total.empty())
|
|
return 0.0f;
|
|
|
|
unsigned NumCommon = 0;
|
|
for (const auto *F : B) {
|
|
if (isEntryPoint(F))
|
|
continue;
|
|
|
|
auto [It, Inserted] = Total.insert(F);
|
|
if (!Inserted)
|
|
++NumCommon;
|
|
}
|
|
|
|
return static_cast<float>(NumCommon) / Total.size();
|
|
}
|
|
|
|
/// Performs all of the partitioning work on \p M.
|
|
/// \param SML Log Helper
|
|
/// \param M Module to partition.
|
|
/// \param NumParts Number of partitions to create.
|
|
/// \param ModuleCost Total cost of all functions in \p M.
|
|
/// \param FnCosts Map of Function -> Cost
|
|
/// \param WorkList Functions and their dependencies to process in order.
|
|
/// \returns The created partitions (a vector of size \p NumParts )
|
|
static std::vector<DenseSet<const Function *>>
|
|
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
|
|
CostType ModuleCost,
|
|
const DenseMap<const Function *, CostType> &FnCosts,
|
|
const SmallVector<FunctionWithDependencies> &WorkList) {
|
|
|
|
SML << "\n--Partitioning Starts--\n";
|
|
|
|
// Calculate a "large function threshold". When more than one function's total
|
|
// import cost exceeds this value, we will try to assign it to an existing
|
|
// partition to reduce the amount of duplication needed.
|
|
//
|
|
// e.g. let two functions X and Y have a import cost of ~10% of the module, we
|
|
// assign X to a partition as usual, but when we get to Y, we check if it's
|
|
// worth also putting it in Y's partition.
|
|
const CostType LargeFnThreshold =
|
|
LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
|
|
: std::numeric_limits<CostType>::max();
|
|
|
|
std::vector<DenseSet<const Function *>> Partitions;
|
|
Partitions.resize(NumParts);
|
|
|
|
// Assign functions to partitions, and try to keep the partitions more or
|
|
// less balanced. We do that through a priority queue sorted in reverse, so we
|
|
// can always look at the partition with the least content.
|
|
//
|
|
// There are some cases where we will be deliberately unbalanced though.
|
|
// - Large functions: we try to merge with existing partitions to reduce code
|
|
// duplication.
|
|
// - Functions with indirect or external calls always go in the first
|
|
// partition (P0).
|
|
auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
|
|
const std::pair<PartitionID, CostType> &b) {
|
|
// When two partitions have the same cost, assign to the one with the
|
|
// biggest ID first. This allows us to put things in P0 last, because P0 may
|
|
// have other stuff added later.
|
|
if (a.second == b.second)
|
|
return a.first < b.first;
|
|
return a.second > b.second;
|
|
};
|
|
|
|
// We can't use priority_queue here because we need to be able to access any
|
|
// element. This makes this a bit inefficient as we need to sort it again
|
|
// everytime we change it, but it's a very small array anyway (likely under 64
|
|
// partitions) so it's a cheap operation.
|
|
std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
|
|
for (unsigned I = 0; I < NumParts; ++I)
|
|
BalancingQueue.emplace_back(I, 0);
|
|
|
|
// Helper function to handle assigning a function to a partition. This takes
|
|
// care of updating the balancing queue.
|
|
const auto AssignToPartition = [&](PartitionID PID,
|
|
const FunctionWithDependencies &FWD) {
|
|
auto &FnsInPart = Partitions[PID];
|
|
FnsInPart.insert(FWD.Fn);
|
|
FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
|
|
|
|
SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> ";
|
|
if (!FWD.Dependencies.empty()) {
|
|
SML << FWD.Dependencies.size() << " dependencies added\n";
|
|
};
|
|
|
|
// Update the balancing queue. we scan backwards because in the common case
|
|
// the partition is at the end.
|
|
for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
|
|
if (QueuePID == PID) {
|
|
CostType NewCost = 0;
|
|
for (auto *Fn : Partitions[PID])
|
|
NewCost += FnCosts.at(Fn);
|
|
|
|
SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
|
|
if (Cost) {
|
|
SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
|
|
<< "% increase)";
|
|
}
|
|
SML << '\n';
|
|
|
|
Cost = NewCost;
|
|
}
|
|
}
|
|
|
|
sort(BalancingQueue, ComparePartitions);
|
|
};
|
|
|
|
for (auto &CurFn : WorkList) {
|
|
// When a function has indirect calls, it must stay in the first partition
|
|
// alongside every reachable non-entry function. This is a nightmare case
|
|
// for splitting as it severely limits what we can do.
|
|
if (CurFn.HasIndirectCall) {
|
|
SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
|
|
<< " defaulting to P0\n";
|
|
AssignToPartition(0, CurFn);
|
|
continue;
|
|
}
|
|
|
|
// When a function has non duplicatable dependencies, we have to keep it in
|
|
// the first partition as well. This is a conservative approach, a
|
|
// finer-grained approach could keep track of which dependencies are
|
|
// non-duplicatable exactly and just make sure they're grouped together.
|
|
if (CurFn.HasNonDuplicatableDependecy) {
|
|
SML << "Function with externally visible dependency "
|
|
<< getName(*CurFn.Fn) << " defaulting to P0\n";
|
|
AssignToPartition(0, CurFn);
|
|
continue;
|
|
}
|
|
|
|
// Be smart with large functions to avoid duplicating their dependencies.
|
|
if (CurFn.isLarge(LargeFnThreshold)) {
|
|
assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
|
|
SML << "Large Function: " << getName(*CurFn.Fn)
|
|
<< " - looking for partition with at least "
|
|
<< format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
|
|
|
|
bool Assigned = false;
|
|
for (const auto &[PID, Fns] : enumerate(Partitions)) {
|
|
float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
|
|
SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
|
|
<< PID << '\n';
|
|
if (Overlap > LargeFnOverlapForMerge) {
|
|
SML << " selecting P" << PID << '\n';
|
|
AssignToPartition(PID, CurFn);
|
|
Assigned = true;
|
|
}
|
|
}
|
|
|
|
if (Assigned)
|
|
continue;
|
|
}
|
|
|
|
// Normal "load-balancing", assign to partition with least pressure.
|
|
auto [PID, CurCost] = BalancingQueue.back();
|
|
AssignToPartition(PID, CurFn);
|
|
}
|
|
|
|
if (SML) {
|
|
for (const auto &[Idx, Part] : enumerate(Partitions)) {
|
|
CostType Cost = 0;
|
|
for (auto *Fn : Part)
|
|
Cost += FnCosts.at(Fn);
|
|
SML << "P" << Idx << " has a total cost of " << Cost << " ("
|
|
<< format("%0.2f", (float(Cost) / ModuleCost) * 100)
|
|
<< "% of source module)\n";
|
|
}
|
|
|
|
SML << "--Partitioning Done--\n\n";
|
|
}
|
|
|
|
// Check no functions were missed.
|
|
#ifndef NDEBUG
|
|
DenseSet<const Function *> AllFunctions;
|
|
for (const auto &Part : Partitions)
|
|
AllFunctions.insert(Part.begin(), Part.end());
|
|
|
|
for (auto &Fn : M) {
|
|
if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
|
|
assert(AllFunctions.contains(&Fn) && "Missed a function?!");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
return Partitions;
|
|
}
|
|
|
|
static void externalize(GlobalValue &GV) {
|
|
if (GV.hasLocalLinkage()) {
|
|
GV.setLinkage(GlobalValue::ExternalLinkage);
|
|
GV.setVisibility(GlobalValue::HiddenVisibility);
|
|
}
|
|
|
|
// Unnamed entities must be named consistently between modules. setName will
|
|
// give a distinct name to each such entity.
|
|
if (!GV.hasName())
|
|
GV.setName("__llvmsplit_unnamed");
|
|
}
|
|
|
|
static bool hasDirectCaller(const Function &Fn) {
|
|
for (auto &U : Fn.uses()) {
|
|
if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static void splitAMDGPUModule(
|
|
GetTTIFn GetTTI, Module &M, unsigned N,
|
|
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
|
|
|
|
SplitModuleLogger SML(M);
|
|
|
|
CallGraph CG(M);
|
|
|
|
// Externalize functions whose address are taken.
|
|
//
|
|
// This is needed because partitioning is purely based on calls, but sometimes
|
|
// a kernel/function may just look at the address of another local function
|
|
// and not do anything (no calls). After partitioning, that local function may
|
|
// end up in a different module (so it's just a declaration in the module
|
|
// where its address is taken), which emits a "undefined hidden symbol" linker
|
|
// error.
|
|
//
|
|
// Additionally, it guides partitioning to not duplicate this function if it's
|
|
// called directly at some point.
|
|
for (auto &Fn : M) {
|
|
if (Fn.hasAddressTaken()) {
|
|
if (Fn.hasLocalLinkage()) {
|
|
SML << "[externalize] " << Fn.getName()
|
|
<< " because its address is taken\n";
|
|
}
|
|
externalize(Fn);
|
|
}
|
|
}
|
|
|
|
// Externalize local GVs, which avoids duplicating their initializers, which
|
|
// in turns helps keep code size in check.
|
|
if (!NoExternalizeGlobals) {
|
|
for (auto &GV : M.globals()) {
|
|
if (GV.hasLocalLinkage())
|
|
SML << "[externalize] GV " << GV.getName() << '\n';
|
|
externalize(GV);
|
|
}
|
|
}
|
|
|
|
// Start by calculating the cost of every function in the module, as well as
|
|
// the module's overall cost.
|
|
DenseMap<const Function *, CostType> FnCosts;
|
|
const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
|
|
|
|
// First, gather ever kernel into the worklist.
|
|
SmallVector<FunctionWithDependencies> WorkList;
|
|
for (auto &Fn : M) {
|
|
if (isEntryPoint(&Fn) && !Fn.isDeclaration())
|
|
WorkList.emplace_back(SML, CG, FnCosts, &Fn);
|
|
}
|
|
|
|
// Then, find missing functions that need to be considered as additional
|
|
// roots. These can't be called in theory, but in practice we still have to
|
|
// handle them to avoid linker errors.
|
|
{
|
|
DenseSet<const Function *> SeenFunctions;
|
|
for (const auto &FWD : WorkList) {
|
|
SeenFunctions.insert(FWD.Fn);
|
|
SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
|
|
}
|
|
|
|
for (auto &Fn : M) {
|
|
// If this function is not part of any kernel's dependencies and isn't
|
|
// directly called, consider it as a root.
|
|
if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
|
|
!SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
|
|
WorkList.emplace_back(SML, CG, FnCosts, &Fn);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort the worklist so the most expensive roots are seen first.
|
|
sort(WorkList, [&](auto &A, auto &B) {
|
|
// Sort by total cost, and if the total cost is identical, sort
|
|
// alphabetically.
|
|
if (A.TotalCost == B.TotalCost)
|
|
return A.Fn->getName() < B.Fn->getName();
|
|
return A.TotalCost > B.TotalCost;
|
|
});
|
|
|
|
if (SML) {
|
|
SML << "Worklist\n";
|
|
for (const auto &FWD : WorkList) {
|
|
SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
|
|
<< " indirect:" << FWD.HasIndirectCall
|
|
<< " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
|
|
<< ")\n";
|
|
// Sort function names before printing to ensure determinism.
|
|
SmallVector<std::string> SortedDepNames;
|
|
SortedDepNames.reserve(FWD.Dependencies.size());
|
|
for (const auto *Dep : FWD.Dependencies)
|
|
SortedDepNames.push_back(getName(*Dep));
|
|
sort(SortedDepNames);
|
|
|
|
for (const auto &Name : SortedDepNames)
|
|
SML << " [dependency] " << Name << '\n';
|
|
}
|
|
}
|
|
|
|
// This performs all of the partitioning work.
|
|
auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
|
|
assert(Partitions.size() == N);
|
|
|
|
// If we didn't externalize GVs, then local GVs need to be conservatively
|
|
// imported into every module (including their initializers), and then cleaned
|
|
// up afterwards.
|
|
const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
|
|
// We conservatively import private/internal GVs into every module and clean
|
|
// them up afterwards.
|
|
const auto *Var = dyn_cast<GlobalVariable>(GV);
|
|
return Var && Var->hasLocalLinkage();
|
|
};
|
|
|
|
SML << "Creating " << N << " modules...\n";
|
|
unsigned TotalFnImpls = 0;
|
|
for (unsigned I = 0; I < N; ++I) {
|
|
const auto &FnsInPart = Partitions[I];
|
|
|
|
ValueToValueMapTy VMap;
|
|
std::unique_ptr<Module> MPart(
|
|
CloneModule(M, VMap, [&](const GlobalValue *GV) {
|
|
// Functions go in their assigned partition.
|
|
if (const auto *Fn = dyn_cast<Function>(GV))
|
|
return FnsInPart.contains(Fn);
|
|
|
|
if (NeedsConservativeImport(GV))
|
|
return true;
|
|
|
|
// Everything else goes in the first partition.
|
|
return I == 0;
|
|
}));
|
|
|
|
// Clean-up conservatively imported GVs without any users.
|
|
for (auto &GV : make_early_inc_range(MPart->globals())) {
|
|
if (NeedsConservativeImport(&GV) && GV.use_empty())
|
|
GV.eraseFromParent();
|
|
}
|
|
|
|
unsigned NumAllFns = 0, NumKernels = 0;
|
|
for (auto &Cur : *MPart) {
|
|
if (!Cur.isDeclaration()) {
|
|
++NumAllFns;
|
|
if (isEntryPoint(&Cur))
|
|
++NumKernels;
|
|
}
|
|
}
|
|
TotalFnImpls += NumAllFns;
|
|
SML << " - Module " << I << " with " << NumAllFns << " functions ("
|
|
<< NumKernels << " kernels)\n";
|
|
ModuleCallback(std::move(MPart));
|
|
}
|
|
|
|
SML << TotalFnImpls << " function definitions across all modules ("
|
|
<< format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
|
|
<< "% of original module)\n";
|
|
}
|
|
} // namespace
|
|
|
|
PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
|
|
ModuleAnalysisManager &MAM) {
|
|
FunctionAnalysisManager &FAM =
|
|
MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
|
|
const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
|
|
return FAM.getResult<TargetIRAnalysis>(F);
|
|
};
|
|
splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
|
|
// We don't change the original module.
|
|
return PreservedAnalyses::all();
|
|
}
|