We need to pass something to functions for this to work. It isn't derivable just from the kernarg segment pointer because the implicit arguments are placed after the kernel arguments. Also fixes missing test for the intrinsic. llvm-svn: 309398
314 lines
8.8 KiB
C++
314 lines
8.8 KiB
C++
//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUSubtarget.h"
|
|
#include "llvm/ADT/Triple.h"
|
|
#include "llvm/Analysis/CallGraphSCCPass.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/InstIterator.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
|
|
private:
|
|
const TargetMachine *TM = nullptr;
|
|
AMDGPUAS AS;
|
|
|
|
bool addFeatureAttributes(Function &F);
|
|
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
|
|
|
|
bool doInitialization(CallGraph &CG) override;
|
|
bool runOnSCC(CallGraphSCC &SCC) override;
|
|
StringRef getPassName() const override {
|
|
return "AMDGPU Annotate Kernel Features";
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesAll();
|
|
CallGraphSCCPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
|
|
static bool visitConstantExprsRecursively(
|
|
const Constant *EntryC,
|
|
SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
|
|
AMDGPUAS AS);
|
|
};
|
|
|
|
}
|
|
|
|
char AMDGPUAnnotateKernelFeatures::ID = 0;
|
|
|
|
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
|
|
|
|
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
|
|
"Add AMDGPU function attributes", false, false)
|
|
|
|
|
|
// The queue ptr is only needed when casting to flat, not from it.
|
|
static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
|
|
return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
|
|
}
|
|
|
|
static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
|
|
const AMDGPUAS &AS) {
|
|
return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
|
|
}
|
|
|
|
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
|
|
AMDGPUAS AS) {
|
|
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
|
|
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
|
|
return castRequiresQueuePtr(SrcAS, AS);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
|
|
const Constant *EntryC,
|
|
SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
|
|
AMDGPUAS AS) {
|
|
|
|
if (!ConstantExprVisited.insert(EntryC).second)
|
|
return false;
|
|
|
|
SmallVector<const Constant *, 16> Stack;
|
|
Stack.push_back(EntryC);
|
|
|
|
while (!Stack.empty()) {
|
|
const Constant *C = Stack.pop_back_val();
|
|
|
|
// Check this constant expression.
|
|
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
|
|
if (visitConstantExpr(CE, AS))
|
|
return true;
|
|
}
|
|
|
|
// Visit all sub-expressions.
|
|
for (const Use &U : C->operands()) {
|
|
const auto *OpC = dyn_cast<Constant>(U);
|
|
if (!OpC)
|
|
continue;
|
|
|
|
if (!ConstantExprVisited.insert(OpC).second)
|
|
continue;
|
|
|
|
Stack.push_back(OpC);
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// We do not need to note the x workitem or workgroup id because they are always
|
|
// initialized.
|
|
//
|
|
// TODO: We should not add the attributes if the known compile time workgroup
|
|
// size is 1 for y/z.
|
|
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
|
|
bool &NonKernelOnly,
|
|
bool &IsQueuePtr) {
|
|
switch (ID) {
|
|
case Intrinsic::amdgcn_workitem_id_x:
|
|
NonKernelOnly = true;
|
|
return "amdgpu-work-item-id-x";
|
|
case Intrinsic::amdgcn_workgroup_id_x:
|
|
NonKernelOnly = true;
|
|
return "amdgpu-work-group-id-x";
|
|
case Intrinsic::amdgcn_workitem_id_y:
|
|
case Intrinsic::r600_read_tidig_y:
|
|
return "amdgpu-work-item-id-y";
|
|
case Intrinsic::amdgcn_workitem_id_z:
|
|
case Intrinsic::r600_read_tidig_z:
|
|
return "amdgpu-work-item-id-z";
|
|
case Intrinsic::amdgcn_workgroup_id_y:
|
|
case Intrinsic::r600_read_tgid_y:
|
|
return "amdgpu-work-group-id-y";
|
|
case Intrinsic::amdgcn_workgroup_id_z:
|
|
case Intrinsic::r600_read_tgid_z:
|
|
return "amdgpu-work-group-id-z";
|
|
case Intrinsic::amdgcn_dispatch_ptr:
|
|
return "amdgpu-dispatch-ptr";
|
|
case Intrinsic::amdgcn_dispatch_id:
|
|
return "amdgpu-dispatch-id";
|
|
case Intrinsic::amdgcn_kernarg_segment_ptr:
|
|
return "amdgpu-kernarg-segment-ptr";
|
|
case Intrinsic::amdgcn_implicitarg_ptr:
|
|
return "amdgpu-implicitarg-ptr";
|
|
case Intrinsic::amdgcn_queue_ptr:
|
|
case Intrinsic::trap:
|
|
case Intrinsic::debugtrap:
|
|
IsQueuePtr = true;
|
|
return "amdgpu-queue-ptr";
|
|
default:
|
|
return "";
|
|
}
|
|
}
|
|
|
|
// Copies the function attribute \p Name from \p Callee onto \p Parent when the
// callee carries it. Returns true if the attribute was present on the callee.
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (!Callee.hasFnAttribute(Name))
    return false;

  Parent.addFnAttr(Name);
  return true;
}
|
|
|
|
// Propagates the feature attributes present on \p Callee to \p Parent.
// \p NeedQueuePtr is set to true when the callee has "amdgpu-queue-ptr"; it is
// never cleared here.
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef PropagatedAttrs[] = {
    "amdgpu-work-item-id-x",
    "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",
    "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y",
    "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",
    "amdgpu-dispatch-id",
    "amdgpu-kernarg-segment-ptr",
    "amdgpu-implicitarg-ptr"
  };

  // The queue pointer is handled separately so the caller learns about it.
  const bool CalleeNeedsQueuePtr =
      handleAttr(Parent, Callee, "amdgpu-queue-ptr");
  if (CalleeNeedsQueuePtr)
    NeedQueuePtr = true;

  for (StringRef Attr : PropagatedAttrs)
    handleAttr(Parent, Callee, Attr);
}
|
|
|
|
// Scans every instruction of \p F and adds the "amdgpu-*" attributes the
// function needs:
//  - attributes implied by the intrinsics it calls,
//  - attributes copied from directly called non-intrinsic functions,
//  - "amdgpu-queue-ptr" when a queue-pointer intrinsic/trap is used, a callee
//    needs it, or (on subtargets without aperture registers) a cast from the
//    local or private address space appears as an instruction or inside a
//    constant operand,
//  - "amdgpu-flat-scratch" for entry functions that contain calls on
//    subtargets with a flat address space.
//
// Returns true if the function was (possibly) changed; a direct call to a
// non-intrinsic function always reports a change.
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  const bool HasFlat = ST.hasFlatAddressSpace();
  const bool HasApertureRegs = ST.hasApertureRegs();
  const bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  // Constants already inspected for this function; shared by all operands.
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;

  for (Instruction &I : instructions(F)) {
    CallSite CS(&I);
    if (CS) {
      const Function *Callee = CS.getCalledFunction();

      // TODO: Do something with indirect calls.
      if (!Callee) {
        // Inline asm does not count as a call.
        HaveCall |= !CS.isInlineAsm();
        continue;
      }

      const Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID != Intrinsic::not_intrinsic) {
        bool NonKernelOnly = false;
        const StringRef AttrName =
            intrinsicToAttrName(IID, NonKernelOnly, NeedQueuePtr);

        // Attributes flagged NonKernelOnly are skipped for entry functions.
        if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
          F.addFnAttr(AttrName);
          Changed = true;
        }
      } else {
        HaveCall = true;
        copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
        Changed = true;
      }
    }

    // Nothing more to learn from casts once the queue pointer is known to be
    // needed, or when the subtarget has aperture registers.
    if (NeedQueuePtr || HasApertureRegs)
      continue;

    const auto *ASC = dyn_cast<AddrSpaceCastInst>(&I);
    if (ASC && castRequiresQueuePtr(ASC, AS)) {
      NeedQueuePtr = true;
      continue;
    }

    // Look for qualifying address space casts hidden in constant operands.
    for (const Use &U : I.operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (OpC && visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
        NeedQueuePtr = true;
        break;
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}
|
|
|
|
// Runs addFeatureAttributes on every function in \p SCC that has a body.
// Call graph nodes without a function and function declarations are skipped.
// Returns true if any function was changed.
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}
|
|
|
|
// Caches the target machine (obtained from TargetPassConfig) and the address
// space numbering of the module behind \p CG. Reports a fatal error when no
// TargetPassConfig is available. Always returns false.
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  TargetPassConfig *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (TPC == nullptr)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  AS = AMDGPU::getAMDGPUAS(CG.getModule());
  return false;
}
|
|
|
|
// Factory: returns a newly allocated AMDGPUAnnotateKernelFeatures pass.
Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  Pass *NewPass = new AMDGPUAnnotateKernelFeatures();
  return NewPass;
}
|