to reflect the new license. We understand that people may be surprised that we're moving the header entirely to discuss the new license. We checked this carefully with the Foundation's lawyer and we believe this is the correct approach. Essentially, all code in the project is now made available by the LLVM project under our new license, so you will see that the license headers include that license only. Some of our contributors have contributed code under our old license, and accordingly, we have retained a copy of our old license notice in the top-level files in each project and repository. llvm-svn: 351636
322 lines
9.1 KiB
C++
322 lines
9.1 KiB
C++
//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file This pass adds target attributes to functions which use intrinsics
|
|
/// which will impact calling convention lowering.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUSubtarget.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/StringRef.h"
|
|
#include "llvm/ADT/Triple.h"
|
|
#include "llvm/Analysis/CallGraph.h"
|
|
#include "llvm/Analysis/CallGraphSCCPass.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/CallSite.h"
|
|
#include "llvm/IR/Constant.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/Instruction.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/Intrinsics.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/IR/Type.h"
|
|
#include "llvm/IR/Use.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/Casting.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
|
|
private:
|
|
const TargetMachine *TM = nullptr;
|
|
|
|
bool addFeatureAttributes(Function &F);
|
|
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
|
|
|
|
bool doInitialization(CallGraph &CG) override;
|
|
bool runOnSCC(CallGraphSCC &SCC) override;
|
|
|
|
StringRef getPassName() const override {
|
|
return "AMDGPU Annotate Kernel Features";
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesAll();
|
|
CallGraphSCCPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
static bool visitConstantExpr(const ConstantExpr *CE);
|
|
static bool visitConstantExprsRecursively(
|
|
const Constant *EntryC,
|
|
SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
|
|
};
|
|
|
|
} // end anonymous namespace
|
|
|
|
// Storage for the pass identifier that the constructor hands to
// CallGraphSCCPass.
char AMDGPUAnnotateKernelFeatures::ID = 0;
|
|
|
|
// Exposes the identifier above under the llvm namespace so code outside this
// file can refer to the pass.
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
|
|
|
|
// Pass registration: DEBUG_TYPE doubles as the pass argument name and the
// string literal is the description.
// NOTE(review): the two trailing `false` values are presumably the
// "CFG only" and "is analysis" flags of INITIALIZE_PASS -- confirm.
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
|
|
"Add AMDGPU function attributes", false, false)
|
|
|
|
|
|
// The queue ptr is only needed when casting to flat, not from it.
|
|
static bool castRequiresQueuePtr(unsigned SrcAS) {
|
|
return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
|
|
}
|
|
|
|
static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
|
|
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
|
|
}
|
|
|
|
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
|
|
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
|
|
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
|
|
return castRequiresQueuePtr(SrcAS);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
|
|
const Constant *EntryC,
|
|
SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
|
|
|
|
if (!ConstantExprVisited.insert(EntryC).second)
|
|
return false;
|
|
|
|
SmallVector<const Constant *, 16> Stack;
|
|
Stack.push_back(EntryC);
|
|
|
|
while (!Stack.empty()) {
|
|
const Constant *C = Stack.pop_back_val();
|
|
|
|
// Check this constant expression.
|
|
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
|
|
if (visitConstantExpr(CE))
|
|
return true;
|
|
}
|
|
|
|
// Visit all sub-expressions.
|
|
for (const Use &U : C->operands()) {
|
|
const auto *OpC = dyn_cast<Constant>(U);
|
|
if (!OpC)
|
|
continue;
|
|
|
|
if (!ConstantExprVisited.insert(OpC).second)
|
|
continue;
|
|
|
|
Stack.push_back(OpC);
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// We do not need to note the x workitem or workgroup id because they are always
|
|
// initialized.
|
|
//
|
|
// TODO: We should not add the attributes if the known compile time workgroup
|
|
// size is 1 for y/z.
|
|
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
|
|
bool &NonKernelOnly,
|
|
bool &IsQueuePtr) {
|
|
switch (ID) {
|
|
case Intrinsic::amdgcn_workitem_id_x:
|
|
NonKernelOnly = true;
|
|
return "amdgpu-work-item-id-x";
|
|
case Intrinsic::amdgcn_workgroup_id_x:
|
|
NonKernelOnly = true;
|
|
return "amdgpu-work-group-id-x";
|
|
case Intrinsic::amdgcn_workitem_id_y:
|
|
case Intrinsic::r600_read_tidig_y:
|
|
return "amdgpu-work-item-id-y";
|
|
case Intrinsic::amdgcn_workitem_id_z:
|
|
case Intrinsic::r600_read_tidig_z:
|
|
return "amdgpu-work-item-id-z";
|
|
case Intrinsic::amdgcn_workgroup_id_y:
|
|
case Intrinsic::r600_read_tgid_y:
|
|
return "amdgpu-work-group-id-y";
|
|
case Intrinsic::amdgcn_workgroup_id_z:
|
|
case Intrinsic::r600_read_tgid_z:
|
|
return "amdgpu-work-group-id-z";
|
|
case Intrinsic::amdgcn_dispatch_ptr:
|
|
return "amdgpu-dispatch-ptr";
|
|
case Intrinsic::amdgcn_dispatch_id:
|
|
return "amdgpu-dispatch-id";
|
|
case Intrinsic::amdgcn_kernarg_segment_ptr:
|
|
return "amdgpu-kernarg-segment-ptr";
|
|
case Intrinsic::amdgcn_implicitarg_ptr:
|
|
return "amdgpu-implicitarg-ptr";
|
|
case Intrinsic::amdgcn_queue_ptr:
|
|
case Intrinsic::trap:
|
|
case Intrinsic::debugtrap:
|
|
IsQueuePtr = true;
|
|
return "amdgpu-queue-ptr";
|
|
default:
|
|
return "";
|
|
}
|
|
}
|
|
|
|
/// Copies the string attribute \p Name from \p Callee onto \p Parent when the
/// callee carries it.
///
/// \returns true if \p Callee has the attribute (whether or not \p Parent
///          already had it), false otherwise.
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  const bool CalleeHasAttr = Callee.hasFnAttribute(Name);
  if (CalleeHasAttr)
    Parent.addFnAttr(Name);

  return CalleeHasAttr;
}
|
|
|
|
/// Propagates every tracked "amdgpu-*" attribute present on \p Callee to
/// \p Parent.
///
/// \param [in,out] NeedQueuePtr set to true when the callee carries
///        "amdgpu-queue-ptr"; never reset to false here.
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
      "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
      "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};

  // The queue pointer is handled separately because the caller needs to know
  // about it.
  NeedQueuePtr |= handleAttr(Parent, Callee, "amdgpu-queue-ptr");

  for (const StringRef &AttrName : AttrNames)
    (void)handleAttr(Parent, Callee, AttrName);
}
|
|
|
|
/// Scans every instruction of \p F and adds the "amdgpu-*" attributes the
/// function needs:
///  - attributes implied by the intrinsics it calls,
///  - attributes inherited from directly called non-intrinsic functions,
///  - "amdgpu-queue-ptr" when an address space cast (instruction or constant
///    expression operand) requires it and the subtarget has no aperture
///    registers,
///  - "amdgpu-flat-scratch" for entry functions that contain calls on
///    subtargets with a flat address space.
///
/// \returns true if an attribute was added, or if a direct call to a
///          non-intrinsic function was seen (that case is reported as a
///          change unconditionally).
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  const bool HasFlat = ST.hasFlatAddressSpace();
  const bool HasApertureRegs = ST.hasApertureRegs();
  const bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  // Shared across all operands in the function so each constant is walked at
  // most once.
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          // Inline asm is not counted as a call.
          HaveCall |= !CS.isInlineAsm();
          continue;
        }

        const Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID != Intrinsic::not_intrinsic) {
          bool NonKernelOnly = false;
          StringRef AttrName =
              intrinsicToAttrName(IID, NonKernelOnly, NeedQueuePtr);

          // Attributes flagged as non-kernel-only are skipped for entry
          // functions.
          const bool Applies = IsFunc || !NonKernelOnly;
          if (Applies && !AttrName.empty()) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        } else {
          // Direct call to a real function: inherit its attributes.
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        }
      }

      // No further queue pointer analysis is needed once it is known to be
      // required, or when the subtarget has aperture registers.
      if (NeedQueuePtr || HasApertureRegs)
        continue;

      const auto *ASC = dyn_cast<AddrSpaceCastInst>(&I);
      if (ASC && castRequiresQueuePtr(ASC)) {
        NeedQueuePtr = true;
        continue;
      }

      // Look for qualifying address space casts hidden in constant operands.
      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (OpC && visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}
|
|
|
|
/// Annotates every function in \p SCC that has a body.
///
/// Call graph nodes without an associated function and functions that are
/// only declarations are skipped.
///
/// \returns true if addFeatureAttributes() reported a change for at least one
///          function in the SCC.
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  // The module and its target triple were previously fetched here but never
  // read, so those unused locals have been dropped.
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}
|
|
|
|
/// Caches the TargetMachine obtained from the TargetPassConfig analysis in
/// TM. Reports a fatal error when no TargetPassConfig is available.
///
/// \returns false always.
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  TargetPassConfig *PassConfig = getAnalysisIfAvailable<TargetPassConfig>();
  if (PassConfig == nullptr)
    report_fatal_error("TargetMachine is required");

  TM = &PassConfig->getTM<TargetMachine>();
  return false;
}
|
|
|
|
/// Factory for the pass: returns a newly heap-allocated
/// AMDGPUAnnotateKernelFeatures instance.
/// NOTE(review): ownership presumably transfers to the caller (typically a
/// pass manager) -- confirm.
Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  Pass *NewPass = new AMDGPUAnnotateKernelFeatures();
  return NewPass;
}
|