Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over, with additional checks to ensure correct implicitarg_ptr alignment. Special care is taken so that metadata for the hidden arguments is not added twice when generating the code object.
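For illustration, a minimal sketch of how a consumer could recognize the preloaded hidden arguments from the attributes involved here; the helper name is hypothetical and not part of the patch, and it assumes hidden arguments carry the "amdgpu-hidden-argument" string attribute and are additionally marked inreg when selected for preloading:

#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Hypothetical helper: count the hidden kernel arguments that were appended to
// the signature and marked for SGPR preloading. Hidden arguments carry the
// "amdgpu-hidden-argument" string attribute; preloaded ones are also inreg.
static unsigned countPreloadedHiddenArgs(const Function &F) {
  unsigned N = 0;
  for (const Argument &Arg : F.args())
    if (Arg.hasAttribute("amdgpu-hidden-argument") && Arg.hasInRegAttr())
      ++N;
  return N;
}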
376 lines
13 KiB
C++
//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

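// Worked example (values are illustrative, assuming a wave size of 64, 4 EUs
// per CU, a maximum flat workgroup size of 256, and 64 KiB of LDS):
//   WavesPerWorkgroup = ceil(256 / 64) = 4
//   For NWaves = 8:  WorkGroupsPerCU = max(1, (8 * 4) / 4) = 8
//   Maximum LDS per workgroup = 65536 / 8 = 8192 bytes.
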
// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine; and
// taking into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

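// Worked example (illustrative, assuming wave size 64, 4 EUs per CU, 64 KiB of
// LDS, a maximum flat workgroup size of 256, and 16 KiB of LDS in use):
//   NumGroups        = 65536 / 16384 = 4 (then clamped to MaxWorkGroupsPerCu)
//   MaxGroupNumWaves = ceil(256 / 64) = 4
//   MaxWaves         = ceil((4 * 4) / 4) = 4 waves per EU,
//   finally clamped to getMaxWavesPerEU().
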
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

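// Example: a kernel carrying the IR attribute
//   "amdgpu-flat-work-group-size"="128,256"
// yields {128, 256} here, provided the minimum does not exceed the maximum and
// both values lie within the subtarget's supported range; otherwise the
// calling-convention default is returned.
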
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

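// Example: "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves per EU;
// the request is honored only if it does not conflict with the limits implied
// by the flat workgroup size and the subtarget (see getEffectiveWavesPerEU).
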
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

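// Example: with !reqd_work_group_size metadata of {64, 1, 1}, a call to
// llvm.amdgcn.workitem.id.x is annotated with the result range [0, 64), while
// an x-dimension local-size query receives [64, 65); without the metadata the
// upper bound comes from the maximum flat workgroup size instead.
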
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

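// Worked example (illustrative, assuming an HSA kernel with 8 bytes of
// explicit arguments, an 8-byte implicitarg_ptr alignment, and 256 bytes of
// implicit arguments under code object v5):
//   TotalSize = alignTo(8, 8) + 256 = 264, then rounded up to a multiple of 4.
// Hidden arguments appended to the signature are skipped when summing the
// explicit bytes (see the "amdgpu-hidden-argument" check above), so preloading
// them does not grow the explicit portion of the segment.
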
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}