Revert "[AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#… (#144039)
…133242)"
This reverts commit 130080fab1 because it
causes issues in testcases similar to coalescer_remat.ll [1], i.e. when
we use a VGPR tuple but only write to its lower parts. The high VGPRs
would then not be included in the vgpr_count, and accessing them would
be an out of bounds violation.
[1]
https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
This commit is contained in:
@@ -4263,9 +4263,10 @@ same *vendor-name*.
|
||||
wavefront for
|
||||
GFX6-GFX9. A register
|
||||
is required if it is
|
||||
written to, or
|
||||
used explicitly, or
|
||||
if a higher numbered
|
||||
register is written to. This
|
||||
register is used
|
||||
explicitly. This
|
||||
includes the special
|
||||
SGPRs for VCC, Flat
|
||||
Scratch (GFX7-GFX9)
|
||||
@@ -4283,10 +4284,10 @@ same *vendor-name*.
|
||||
each work-item for
|
||||
GFX6-GFX9. A register
|
||||
is required if it is
|
||||
written to, or
|
||||
used explicitly, or
|
||||
if a higher numbered
|
||||
register is
|
||||
written to.
|
||||
register is used
|
||||
explicitly.
|
||||
".agpr_count" integer Required Number of accumulator
|
||||
registers required by
|
||||
each work-item for
|
||||
|
||||
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
// dispatch registers are function args.
|
||||
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
|
||||
|
||||
if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
|
||||
if (isShader(F.getCallingConv())) {
|
||||
bool IsPixelShader =
|
||||
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
|
||||
|
||||
@@ -1060,6 +1060,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
|
||||
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
|
||||
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
|
||||
} else if (isKernel(F.getCallingConv()) &&
|
||||
MFI->getNumKernargPreloadedSGPRs()) {
|
||||
// Consider cases where the total number of UserSGPRs with trailing
|
||||
// allocated preload SGPRs, is greater than the number of explicitly
|
||||
// referenced SGPRs.
|
||||
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
|
||||
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
|
||||
ProgInfo.NumSGPR =
|
||||
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
|
||||
}
|
||||
|
||||
// Adjust number of registers used to meet default/requested minimum/maximum
|
||||
|
||||
@@ -137,29 +137,274 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
|
||||
if (MFI->isStackRealigned())
|
||||
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
|
||||
|
||||
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
|
||||
Info.UsesVCC =
|
||||
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
|
||||
|
||||
Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
|
||||
Info.NumExplicitSGPR =
|
||||
TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
|
||||
if (ST.hasMAIInsts())
|
||||
Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
|
||||
|
||||
// Preloaded registers are written by the hardware, not defined in the
|
||||
// function body, so they need special handling.
|
||||
if (MFI->isEntryFunction()) {
|
||||
Info.NumExplicitSGPR =
|
||||
std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
|
||||
Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
|
||||
// If there are no calls, MachineRegisterInfo can tell us the used register
|
||||
// count easily.
|
||||
// A tail call isn't considered a call for MachineFrameInfo's purposes.
|
||||
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
|
||||
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
|
||||
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
|
||||
if (ST.hasMAIInsts())
|
||||
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
|
||||
return Info;
|
||||
}
|
||||
|
||||
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
|
||||
return Info;
|
||||
|
||||
int32_t MaxVGPR = -1;
|
||||
int32_t MaxAGPR = -1;
|
||||
int32_t MaxSGPR = -1;
|
||||
Info.CalleeSegmentSize = 0;
|
||||
|
||||
for (const MachineBasicBlock &MBB : MF) {
|
||||
for (const MachineInstr &MI : MBB) {
|
||||
// TODO: Check regmasks? Do they occur anywhere except calls?
|
||||
for (const MachineOperand &MO : MI.operands()) {
|
||||
unsigned Width = 0;
|
||||
bool IsSGPR = false;
|
||||
bool IsAGPR = false;
|
||||
|
||||
if (!MO.isReg())
|
||||
continue;
|
||||
|
||||
Register Reg = MO.getReg();
|
||||
switch (Reg) {
|
||||
case AMDGPU::EXEC:
|
||||
case AMDGPU::EXEC_LO:
|
||||
case AMDGPU::EXEC_HI:
|
||||
case AMDGPU::SCC:
|
||||
case AMDGPU::M0:
|
||||
case AMDGPU::M0_LO16:
|
||||
case AMDGPU::M0_HI16:
|
||||
case AMDGPU::SRC_SHARED_BASE_LO:
|
||||
case AMDGPU::SRC_SHARED_BASE:
|
||||
case AMDGPU::SRC_SHARED_LIMIT_LO:
|
||||
case AMDGPU::SRC_SHARED_LIMIT:
|
||||
case AMDGPU::SRC_PRIVATE_BASE_LO:
|
||||
case AMDGPU::SRC_PRIVATE_BASE:
|
||||
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
|
||||
case AMDGPU::SRC_PRIVATE_LIMIT:
|
||||
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
|
||||
case AMDGPU::SGPR_NULL:
|
||||
case AMDGPU::SGPR_NULL64:
|
||||
case AMDGPU::MODE:
|
||||
continue;
|
||||
|
||||
case AMDGPU::NoRegister:
|
||||
assert(MI.isDebugInstr() &&
|
||||
"Instruction uses invalid noreg register");
|
||||
continue;
|
||||
|
||||
case AMDGPU::VCC:
|
||||
case AMDGPU::VCC_LO:
|
||||
case AMDGPU::VCC_HI:
|
||||
case AMDGPU::VCC_LO_LO16:
|
||||
case AMDGPU::VCC_LO_HI16:
|
||||
case AMDGPU::VCC_HI_LO16:
|
||||
case AMDGPU::VCC_HI_HI16:
|
||||
Info.UsesVCC = true;
|
||||
continue;
|
||||
|
||||
case AMDGPU::FLAT_SCR:
|
||||
case AMDGPU::FLAT_SCR_LO:
|
||||
case AMDGPU::FLAT_SCR_HI:
|
||||
continue;
|
||||
|
||||
case AMDGPU::XNACK_MASK:
|
||||
case AMDGPU::XNACK_MASK_LO:
|
||||
case AMDGPU::XNACK_MASK_HI:
|
||||
llvm_unreachable("xnack_mask registers should not be used");
|
||||
|
||||
case AMDGPU::LDS_DIRECT:
|
||||
llvm_unreachable("lds_direct register should not be used");
|
||||
|
||||
case AMDGPU::TBA:
|
||||
case AMDGPU::TBA_LO:
|
||||
case AMDGPU::TBA_HI:
|
||||
case AMDGPU::TMA:
|
||||
case AMDGPU::TMA_LO:
|
||||
case AMDGPU::TMA_HI:
|
||||
llvm_unreachable("trap handler registers should not be used");
|
||||
|
||||
case AMDGPU::SRC_VCCZ:
|
||||
llvm_unreachable("src_vccz register should not be used");
|
||||
|
||||
case AMDGPU::SRC_EXECZ:
|
||||
llvm_unreachable("src_execz register should not be used");
|
||||
|
||||
case AMDGPU::SRC_SCC:
|
||||
llvm_unreachable("src_scc register should not be used");
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
|
||||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
|
||||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 1;
|
||||
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
|
||||
AMDGPU::VGPR_16RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 1;
|
||||
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
|
||||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 1;
|
||||
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 2;
|
||||
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 2;
|
||||
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 2;
|
||||
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 3;
|
||||
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 3;
|
||||
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 3;
|
||||
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 4;
|
||||
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 4;
|
||||
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 4;
|
||||
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 5;
|
||||
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 5;
|
||||
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 5;
|
||||
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 6;
|
||||
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 6;
|
||||
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 6;
|
||||
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 7;
|
||||
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 7;
|
||||
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 7;
|
||||
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 8;
|
||||
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 8;
|
||||
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 8;
|
||||
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 9;
|
||||
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 9;
|
||||
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 9;
|
||||
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 10;
|
||||
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 10;
|
||||
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 10;
|
||||
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 11;
|
||||
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 11;
|
||||
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 11;
|
||||
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 12;
|
||||
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 12;
|
||||
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 12;
|
||||
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 16;
|
||||
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 16;
|
||||
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 16;
|
||||
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 32;
|
||||
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 32;
|
||||
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
IsAGPR = true;
|
||||
Width = 32;
|
||||
} else {
|
||||
// We only expect TTMP registers or registers that do not belong to
|
||||
// any RC.
|
||||
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
|
||||
AMDGPU::TTMP_64RegClass.contains(Reg) ||
|
||||
AMDGPU::TTMP_128RegClass.contains(Reg) ||
|
||||
AMDGPU::TTMP_256RegClass.contains(Reg) ||
|
||||
AMDGPU::TTMP_512RegClass.contains(Reg) ||
|
||||
!TRI.getPhysRegBaseClass(Reg)) &&
|
||||
"Unknown register class");
|
||||
}
|
||||
unsigned HWReg = TRI.getHWRegIndex(Reg);
|
||||
int MaxUsed = HWReg + Width - 1;
|
||||
if (IsSGPR) {
|
||||
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
|
||||
} else if (IsAGPR) {
|
||||
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
|
||||
} else {
|
||||
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
|
||||
}
|
||||
}
|
||||
|
||||
if (MI.isCall()) {
|
||||
// Pseudo used just to encode the underlying global. Is there a better
|
||||
// way to track this?
|
||||
@@ -219,5 +464,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
|
||||
}
|
||||
}
|
||||
|
||||
Info.NumExplicitSGPR = MaxSGPR + 1;
|
||||
Info.NumVGPR = MaxVGPR + 1;
|
||||
Info.NumAGPR = MaxAGPR + 1;
|
||||
|
||||
return Info;
|
||||
}
|
||||
|
||||
@@ -970,25 +970,10 @@ public:
|
||||
return NumUserSGPRs;
|
||||
}
|
||||
|
||||
// Get the number of preloaded SGPRs for compute kernels.
|
||||
unsigned getNumPreloadedSGPRs() const {
|
||||
return NumUserSGPRs + NumSystemSGPRs;
|
||||
}
|
||||
|
||||
// Get the number of preloaded VGPRs for compute kernels.
|
||||
unsigned getNumPreloadedVGPRs() const {
|
||||
if (hasWorkItemIDZ())
|
||||
return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
|
||||
|
||||
if (hasWorkItemIDY())
|
||||
return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
|
||||
|
||||
if (hasWorkItemIDX())
|
||||
return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned getNumKernargPreloadedSGPRs() const {
|
||||
return UserSGPRInfo.getNumKernargPreloadSGPRs();
|
||||
}
|
||||
|
||||
@@ -4055,20 +4055,6 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned
|
||||
SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
|
||||
const TargetRegisterClass &RC) const {
|
||||
for (MCPhysReg Reg : reverse(RC.getRegisters())) {
|
||||
for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
|
||||
if (llvm::any_of(MRI.def_instructions(*AI), [](const MachineInstr &MI) {
|
||||
return !MI.isImplicitDef();
|
||||
}))
|
||||
return getHWRegIndex(Reg) + 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
SmallVector<StringLiteral>
|
||||
SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
|
||||
const MachineFunction &MF) const {
|
||||
|
||||
@@ -486,11 +486,6 @@ public:
|
||||
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
|
||||
const TargetRegisterClass &RC) const;
|
||||
|
||||
// \returns the number of registers of a given \p RC defined in a function.
|
||||
// Does not go inside function calls.
|
||||
unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
|
||||
const TargetRegisterClass &RC) const;
|
||||
|
||||
std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
|
||||
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
|
||||
: std::optional<uint8_t>{};
|
||||
|
||||
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
|
||||
; GPRIDX-NEXT: gds_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
|
||||
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 24
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 17
|
||||
; GPRIDX-NEXT: workitem_vgpr_count = 3
|
||||
; GPRIDX-NEXT: reserved_vgpr_first = 0
|
||||
; GPRIDX-NEXT: reserved_vgpr_count = 0
|
||||
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
|
||||
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
|
||||
; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GFX10-NEXT: priority = 0
|
||||
; GFX10-NEXT: float_mode = 240
|
||||
; GFX10-NEXT: priv = 0
|
||||
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
|
||||
; GFX10-NEXT: gds_segment_byte_size = 0
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX10-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 18
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 10
|
||||
; GFX10-NEXT: workitem_vgpr_count = 3
|
||||
; GFX10-NEXT: reserved_vgpr_first = 0
|
||||
; GFX10-NEXT: reserved_vgpr_count = 0
|
||||
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
|
||||
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
|
||||
; GFX11-NEXT: priority = 0
|
||||
; GFX11-NEXT: float_mode = 240
|
||||
; GFX11-NEXT: priv = 0
|
||||
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
|
||||
; GFX11-NEXT: gds_segment_byte_size = 0
|
||||
; GFX11-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX11-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX11-NEXT: wavefront_sgpr_count = 16
|
||||
; GFX11-NEXT: wavefront_sgpr_count = 7
|
||||
; GFX11-NEXT: workitem_vgpr_count = 3
|
||||
; GFX11-NEXT: reserved_vgpr_first = 0
|
||||
; GFX11-NEXT: reserved_vgpr_count = 0
|
||||
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
|
||||
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GPRIDX-NEXT: priority = 0
|
||||
; GPRIDX-NEXT: float_mode = 240
|
||||
; GPRIDX-NEXT: priv = 0
|
||||
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GPRIDX-NEXT: gds_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
|
||||
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 24
|
||||
; GPRIDX-NEXT: workitem_vgpr_count = 3
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 16
|
||||
; GPRIDX-NEXT: workitem_vgpr_count = 2
|
||||
; GPRIDX-NEXT: reserved_vgpr_first = 0
|
||||
; GPRIDX-NEXT: reserved_vgpr_count = 0
|
||||
; GPRIDX-NEXT: reserved_sgpr_first = 0
|
||||
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
|
||||
; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GFX10-NEXT: priority = 0
|
||||
; GFX10-NEXT: float_mode = 240
|
||||
; GFX10-NEXT: priv = 0
|
||||
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX10-NEXT: gds_segment_byte_size = 0
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX10-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 18
|
||||
; GFX10-NEXT: workitem_vgpr_count = 3
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 10
|
||||
; GFX10-NEXT: workitem_vgpr_count = 2
|
||||
; GFX10-NEXT: reserved_vgpr_first = 0
|
||||
; GFX10-NEXT: reserved_vgpr_count = 0
|
||||
; GFX10-NEXT: reserved_sgpr_first = 0
|
||||
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
|
||||
; GFX11-NEXT: priority = 0
|
||||
; GFX11-NEXT: float_mode = 240
|
||||
; GFX11-NEXT: priv = 0
|
||||
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX11-NEXT: gds_segment_byte_size = 0
|
||||
; GFX11-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX11-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX11-NEXT: wavefront_sgpr_count = 16
|
||||
; GFX11-NEXT: wavefront_sgpr_count = 6
|
||||
; GFX11-NEXT: workitem_vgpr_count = 2
|
||||
; GFX11-NEXT: reserved_vgpr_first = 0
|
||||
; GFX11-NEXT: reserved_vgpr_count = 0
|
||||
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
|
||||
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GPRIDX-NEXT: priority = 0
|
||||
; GPRIDX-NEXT: float_mode = 240
|
||||
; GPRIDX-NEXT: priv = 0
|
||||
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GPRIDX-NEXT: gds_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
|
||||
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 24
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 16
|
||||
; GPRIDX-NEXT: workitem_vgpr_count = 3
|
||||
; GPRIDX-NEXT: reserved_vgpr_first = 0
|
||||
; GPRIDX-NEXT: reserved_vgpr_count = 0
|
||||
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
|
||||
; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GFX10-NEXT: priority = 0
|
||||
; GFX10-NEXT: float_mode = 240
|
||||
; GFX10-NEXT: priv = 0
|
||||
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX10-NEXT: gds_segment_byte_size = 0
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX10-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 18
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 10
|
||||
; GFX10-NEXT: workitem_vgpr_count = 3
|
||||
; GFX10-NEXT: reserved_vgpr_first = 0
|
||||
; GFX10-NEXT: reserved_vgpr_count = 0
|
||||
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
|
||||
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
|
||||
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
|
||||
; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
|
||||
; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
|
||||
; GFX11-NEXT: priority = 0
|
||||
; GFX11-NEXT: float_mode = 240
|
||||
; GFX11-NEXT: priv = 0
|
||||
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
|
||||
; GFX11-NEXT: gds_segment_byte_size = 0
|
||||
; GFX11-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX11-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX11-NEXT: wavefront_sgpr_count = 16
|
||||
; GFX11-NEXT: wavefront_sgpr_count = 7
|
||||
; GFX11-NEXT: workitem_vgpr_count = 3
|
||||
; GFX11-NEXT: reserved_vgpr_first = 0
|
||||
; GFX11-NEXT: reserved_vgpr_count = 0
|
||||
|
||||
@@ -13,9 +13,8 @@
|
||||
; CHECK: {{^}}kernel_illegal_agpr_use_asm:
|
||||
; CHECK: ; use a0
|
||||
|
||||
; GFX908: NumVgprs: 3
|
||||
; GFX90A: NumVgprs: 1
|
||||
; CHECK: NumAgprs: 0
|
||||
; CHECK: NumVgprs: 0
|
||||
; CHECK: NumAgprs: 1
|
||||
define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
|
||||
call void asm sideeffect "; use $0", "a"(i32 poison)
|
||||
ret void
|
||||
@@ -25,7 +24,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
|
||||
; CHECK: ; use a0
|
||||
|
||||
; CHECK: NumVgprs: 0
|
||||
; CHECK: NumAgprs: 0
|
||||
; CHECK: NumAgprs: 1
|
||||
define void @func_illegal_agpr_use_asm() #0 {
|
||||
call void asm sideeffect "; use $0", "a"(i32 poison)
|
||||
ret void
|
||||
|
||||
@@ -10,9 +10,9 @@
|
||||
|
||||
; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
|
||||
; ASM: .amdhsa_user_sgpr_count 12
|
||||
; ASM: .amdhsa_next_free_sgpr 15
|
||||
; ASM: ; TotalNumSgprs: 21
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 21
|
||||
; ASM: .amdhsa_next_free_sgpr 12
|
||||
; ASM: ; TotalNumSgprs: 18
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 18
|
||||
|
||||
; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
|
||||
; feild that are not explicitly referenced in the kernel. This test has 6 implicit
|
||||
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
|
||||
; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
|
||||
; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
|
||||
; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
|
||||
; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000 ................
|
||||
; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
|
||||
|
||||
; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
|
||||
; ASM: .amdhsa_user_sgpr_count 10
|
||||
; ASM: .amdhsa_next_free_sgpr 11
|
||||
; ASM: ; TotalNumSgprs: 17
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 17
|
||||
; ASM: .amdhsa_next_free_sgpr 10
|
||||
; ASM: ; TotalNumSgprs: 16
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 16
|
||||
|
||||
; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
|
||||
; implicit, and 6 extra.
|
||||
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
|
||||
|
||||
; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
|
||||
; ASM: .amdhsa_user_sgpr_count 3
|
||||
; ASM: .amdhsa_next_free_sgpr 4
|
||||
; ASM: ; TotalNumSgprs: 10
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 10
|
||||
; ASM: .amdhsa_next_free_sgpr 3
|
||||
; ASM: ; TotalNumSgprs: 9
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 9
|
||||
|
||||
; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
|
||||
|
||||
@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
|
||||
; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
|
||||
; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
|
||||
; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
|
||||
; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000 @...............
|
||||
; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
|
||||
|
||||
; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
|
||||
; ASM: .amdhsa_user_sgpr_count 2
|
||||
; ASM: .amdhsa_next_free_sgpr 3
|
||||
; ASM: ; TotalNumSgprs: 9
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 9
|
||||
; ASM: .amdhsa_next_free_sgpr 0
|
||||
; ASM: ; TotalNumSgprs: 6
|
||||
; ASM: ; NumSGPRsForWavesPerEU: 6
|
||||
|
||||
; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
|
||||
; Encoded like '00'.
|
||||
|
||||
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
|
||||
|
||||
; GCN: amdpal.pipelines:
|
||||
; GCN-NEXT: - .registers:
|
||||
; GFX8-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
|
||||
; GFX9-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
|
||||
; SDAG-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
|
||||
; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
|
||||
; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
|
||||
; GCN-NEXT: .shader_functions:
|
||||
; GCN-NEXT: dynamic_stack:
|
||||
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
|
||||
; GCN-NEXT: multiple_stack:
|
||||
; GCN-NEXT: .backend_stack_size: 0x24{{$}}
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x21{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x24{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x3{{$}}
|
||||
; GCN-NEXT: no_stack:
|
||||
; GCN-NEXT: .backend_stack_size: 0{{$}}
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x20{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: no_stack_call:
|
||||
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
|
||||
; GCN-NEXT: simple_lds:
|
||||
; GCN-NEXT: .backend_stack_size: 0{{$}}
|
||||
; GCN-NEXT: .lds_size: 0x100{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x20{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: simple_lds_recurse:
|
||||
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
|
||||
; GCN-NEXT: simple_stack:
|
||||
; GCN-NEXT: .backend_stack_size: 0x14{{$}}
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x21{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x2{{$}}
|
||||
; GCN-NEXT: simple_stack_call:
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
|
||||
|
||||
; ELF: Section {
|
||||
; ELF: Name: .text
|
||||
@@ -23,16 +23,8 @@
|
||||
; ELF: Section: .text (0x2)
|
||||
; ELF: }
|
||||
|
||||
; GFX10: NumSGPRsForWavesPerEU: 12
|
||||
; GFX10: NumVGPRsForWavesPerEU: 3
|
||||
|
||||
; Wave32 and 64 behave differently due to the UserSGPRInit16Bug,
|
||||
; which only affects Wave32.
|
||||
; GFX11W32: NumSGPRsForWavesPerEU: 16
|
||||
; GFX11W32: NumVGPRsForWavesPerEU: 1
|
||||
|
||||
; GFX11W64: NumSGPRsForWavesPerEU: 11
|
||||
; GFX11W64: NumVGPRsForWavesPerEU: 1
|
||||
; GFX10: NumSGPRsForWavesPerEU: 6
|
||||
; GFX10: NumVGPRsForWavesPerEU: 1
|
||||
|
||||
define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
|
||||
entry:
|
||||
|
||||
@@ -2,10 +2,10 @@
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
|
||||
|
||||
; CHECK-LABEL: {{^}}min_64_max_64:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @min_64_max_64() #0 {
|
||||
entry:
|
||||
ret void
|
||||
@@ -13,10 +13,10 @@ entry:
|
||||
attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
|
||||
|
||||
; CHECK-LABEL: {{^}}min_64_max_128:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @min_64_max_128() #1 {
|
||||
entry:
|
||||
ret void
|
||||
|
||||
@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
|
||||
|
||||
; Exactly 10 waves per execution unit.
|
||||
; CHECK-LABEL: {{^}}empty_exactly_10:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @empty_exactly_10() #2 {
|
||||
entry:
|
||||
ret void
|
||||
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
|
||||
|
||||
; At least 1 wave per execution unit.
|
||||
; CHECK-LABEL: {{^}}empty_at_least_1:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @empty_at_least_1() #3 {
|
||||
entry:
|
||||
ret void
|
||||
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
|
||||
|
||||
; At least 5 waves per execution unit.
|
||||
; CHECK-LABEL: {{^}}empty_at_least_5:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @empty_at_least_5() #4 {
|
||||
entry:
|
||||
ret void
|
||||
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
|
||||
|
||||
; At least 10 waves per execution unit.
|
||||
; CHECK-LABEL: {{^}}empty_at_least_10:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @empty_at_least_10() #5 {
|
||||
entry:
|
||||
ret void
|
||||
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64
|
||||
|
||||
; At most 10 waves per execution unit.
|
||||
; CHECK-LABEL: {{^}}empty_at_most_10:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @empty_at_most_10() #7 {
|
||||
entry:
|
||||
ret void
|
||||
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
|
||||
|
||||
; Between 5 and 10 waves per execution unit.
|
||||
; CHECK-LABEL: {{^}}empty_between_5_and_10:
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 3
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define amdgpu_kernel void @empty_between_5_and_10() #8 {
|
||||
entry:
|
||||
ret void
|
||||
|
||||
@@ -28,7 +28,7 @@ bb:
|
||||
}
|
||||
; ALL: .set .Laliasee_default.num_vgpr, 0
|
||||
; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
|
||||
; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
|
||||
; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32
|
||||
|
||||
attributes #0 = { noinline norecurse nounwind optnone }
|
||||
attributes #1 = { noinline norecurse nounwind readnone willreturn }
|
||||
|
||||
@@ -18,7 +18,7 @@ bb:
|
||||
|
||||
; CHECK: .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
|
||||
; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
|
||||
define internal void @aliasee_default_vgpr64_sgpr102() #1 {
|
||||
bb:
|
||||
call void asm sideeffect "; clobber v52 ", "~{v52}"()
|
||||
|
||||
@@ -24,7 +24,7 @@ bb:
|
||||
|
||||
; CHECK: .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
|
||||
define internal void @aliasee_vgpr32_sgpr76() #1 {
|
||||
bb:
|
||||
call void asm sideeffect "; clobber v26 ", "~{v26}"()
|
||||
|
||||
@@ -21,7 +21,7 @@ bb:
|
||||
|
||||
; CHECK: .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
|
||||
define internal void @aliasee_vgpr64_sgpr102() #1 {
|
||||
bb:
|
||||
call void asm sideeffect "; clobber v52 ", "~{v52}"()
|
||||
|
||||
@@ -21,7 +21,7 @@ bb:
|
||||
|
||||
; CHECK: .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
|
||||
; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
|
||||
define internal void @aliasee_vgpr256_sgpr102() #1 {
|
||||
bb:
|
||||
call void asm sideeffect "; clobber v252 ", "~{v252}"()
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
; Make sure to run a GPU with the SGPR allocation bug.
|
||||
|
||||
; GCN-LABEL: {{^}}use_vcc:
|
||||
; GCN: ; TotalNumSgprs: 2
|
||||
; GCN: ; TotalNumSgprs: 34
|
||||
; GCN: ; NumVgprs: 0
|
||||
define void @use_vcc() #1 {
|
||||
call void asm sideeffect "", "~{vcc}" () #0
|
||||
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_flat_scratch:
|
||||
; CI: ; TotalNumSgprs: 4
|
||||
; VI: ; TotalNumSgprs: 6
|
||||
; CI: ; TotalNumSgprs: 36
|
||||
; VI: ; TotalNumSgprs: 38
|
||||
; GCN: ; NumVgprs: 0
|
||||
define void @use_flat_scratch() #1 {
|
||||
call void asm sideeffect "", "~{flat_scratch}" () #0
|
||||
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
|
||||
; Make sure there's no assert when a sgpr96 is used.
|
||||
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
|
||||
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
|
||||
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
|
||||
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
|
||||
; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
|
||||
; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
|
||||
; VI-BUG: TotalNumSgprs: 96
|
||||
@@ -249,7 +249,7 @@ entry:
|
||||
; Make sure there's no assert when a sgpr160 is used.
|
||||
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
|
||||
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
|
||||
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
|
||||
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
|
||||
; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
|
||||
; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
|
||||
; VI-BUG: TotalNumSgprs: 96
|
||||
|
||||
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
|
||||
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
|
||||
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
|
||||
; It's probably OK if this is slightly higher:
|
||||
; CHECK: ; NumVgprs: 5
|
||||
; CHECK: ; NumVgprs: 8
|
||||
define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
|
||||
entry:
|
||||
%cmpflag = icmp eq i32 %flag, 1
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
|
||||
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
|
||||
@@ -35,7 +35,7 @@
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
|
||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
|
||||
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
|
||||
@@ -93,7 +93,7 @@ entry:
|
||||
; registers used.
|
||||
;
|
||||
; ALL-ASM-LABEL: {{^}}empty:
|
||||
; ALL-ASM: .amdhsa_next_free_vgpr 3
|
||||
; ALL-ASM: .amdhsa_next_free_vgpr 1
|
||||
; ALL-ASM: .amdhsa_next_free_sgpr 1
|
||||
define amdgpu_kernel void @empty(
|
||||
i32 %i,
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
; OSABI-HSA-ELF: .sgpr_count: 96
|
||||
; OSABI-HSA-ELF: .sgpr_spill_count: 0
|
||||
; OSABI-HSA-ELF: .symbol: elf_notes.kd
|
||||
; OSABI-HSA-ELF: .vgpr_count: 1
|
||||
; OSABI-HSA-ELF: .vgpr_count: 0
|
||||
; OSABI-HSA-ELF: .vgpr_spill_count: 0
|
||||
; OSABI-HSA-ELF: .wavefront_size: 64
|
||||
; OSABI-HSA-ELF: amdhsa.target: amdgcn-amd-amdhsa--gfx802
|
||||
|
||||
@@ -27,15 +27,15 @@
|
||||
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; CI: ; TotalNumSgprs: 12
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 12
|
||||
; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
|
||||
; VI-XNACK: ; TotalNumSgprs: 16
|
||||
; HSA-VI-XNACK: ; TotalNumSgprs: 22
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
|
||||
; CI: ; TotalNumSgprs: 8
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 8
|
||||
; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
|
||||
; VI-XNACK: ; TotalNumSgprs: 12
|
||||
; HSA-VI-XNACK: ; TotalNumSgprs: 12
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
|
||||
define amdgpu_kernel void @no_vcc_no_flat() {
|
||||
entry:
|
||||
call void asm sideeffect "", "~{s7}"()
|
||||
@@ -50,15 +50,15 @@ entry:
|
||||
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; CI: ; TotalNumSgprs: 14
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 14
|
||||
; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
|
||||
; VI-XNACK: ; TotalNumSgprs: 16
|
||||
; HSA-VI-XNACK: ; TotalNumSgprs: 22
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
|
||||
; CI: ; TotalNumSgprs: 10
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 10
|
||||
; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
|
||||
; VI-XNACK: ; TotalNumSgprs: 12
|
||||
; HSA-VI-XNACK: ; TotalNumSgprs: 12
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
|
||||
define amdgpu_kernel void @vcc_no_flat() {
|
||||
entry:
|
||||
call void asm sideeffect "", "~{s7},~{vcc}"()
|
||||
@@ -73,15 +73,15 @@ entry:
|
||||
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; CI: ; TotalNumSgprs: 16
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 18
|
||||
; CI: ; TotalNumSgprs: 12
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 14
|
||||
; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
|
||||
; VI-XNACK: ; TotalNumSgprs: 18
|
||||
; VI-XNACK: ; TotalNumSgprs: 14
|
||||
; HSA-VI-XNACK: ; TotalNumSgprs: 24
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
|
||||
define amdgpu_kernel void @no_vcc_flat() {
|
||||
entry:
|
||||
call void asm sideeffect "", "~{s7},~{flat_scratch}"()
|
||||
@@ -96,15 +96,15 @@ entry:
|
||||
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; CI: ; TotalNumSgprs: 16
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 18
|
||||
; CI: ; TotalNumSgprs: 12
|
||||
; VI-NOXNACK: ; TotalNumSgprs: 14
|
||||
; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
|
||||
; VI-XNACK: ; TotalNumSgprs: 18
|
||||
; VI-XNACK: ; TotalNumSgprs: 14
|
||||
; HSA-VI-XNACK: ; TotalNumSgprs: 24
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
|
||||
define amdgpu_kernel void @vcc_flat() {
|
||||
entry:
|
||||
call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
|
||||
@@ -122,15 +122,15 @@ entry:
|
||||
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; CI: NumSgprs: 16
|
||||
; VI-NOXNACK: NumSgprs: 18
|
||||
; CI: NumSgprs: 4
|
||||
; VI-NOXNACK: NumSgprs: 6
|
||||
; HSA-VI-NOXNACK: NumSgprs: 24
|
||||
; VI-XNACK: NumSgprs: 18
|
||||
; VI-XNACK: NumSgprs: 6
|
||||
; HSA-VI-XNACK: NumSgprs: 24
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
|
||||
define amdgpu_kernel void @use_flat_scr() #0 {
|
||||
entry:
|
||||
call void asm sideeffect "; clobber ", "~{flat_scratch}"()
|
||||
@@ -143,15 +143,15 @@ entry:
|
||||
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
|
||||
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; CI: NumSgprs: 16
|
||||
; VI-NOXNACK: NumSgprs: 18
|
||||
; CI: NumSgprs: 4
|
||||
; VI-NOXNACK: NumSgprs: 6
|
||||
; HSA-VI-NOXNACK: NumSgprs: 24
|
||||
; VI-XNACK: NumSgprs: 18
|
||||
; VI-XNACK: NumSgprs: 6
|
||||
; HSA-VI-XNACK: NumSgprs: 24
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
|
||||
define amdgpu_kernel void @use_flat_scr_lo() #0 {
|
||||
entry:
|
||||
call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
|
||||
@@ -166,15 +166,15 @@ entry:
|
||||
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; CI: NumSgprs: 16
|
||||
; VI-NOXNACK: NumSgprs: 18
|
||||
; CI: NumSgprs: 4
|
||||
; VI-NOXNACK: NumSgprs: 6
|
||||
; HSA-VI-NOXNACK: NumSgprs: 24
|
||||
; VI-XNACK: NumSgprs: 18
|
||||
; VI-XNACK: NumSgprs: 6
|
||||
; HSA-VI-XNACK: NumSgprs: 24
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
|
||||
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
|
||||
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
|
||||
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
|
||||
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
|
||||
define amdgpu_kernel void @use_flat_scr_hi() #0 {
|
||||
entry:
|
||||
call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
|
||||
|
||||
@@ -5,14 +5,14 @@
|
||||
; GCN-LABEL: {{^}}use_vcc:
|
||||
; GCN: .set use_vcc.num_vgpr, 0
|
||||
; GCN: .set use_vcc.num_agpr, 0
|
||||
; GCN: .set use_vcc.numbered_sgpr, 0
|
||||
; GCN: .set use_vcc.numbered_sgpr, 32
|
||||
; GCN: .set use_vcc.private_seg_size, 0
|
||||
; GCN: .set use_vcc.uses_vcc, 1
|
||||
; GCN: .set use_vcc.uses_flat_scratch, 0
|
||||
; GCN: .set use_vcc.has_dyn_sized_stack, 0
|
||||
; GCN: .set use_vcc.has_recursion, 0
|
||||
; GCN: .set use_vcc.has_indirect_call, 0
|
||||
; GCN: TotalNumSgprs: 4
|
||||
; GCN: TotalNumSgprs: 36
|
||||
; GCN: NumVgprs: 0
|
||||
; GCN: ScratchSize: 0
|
||||
define void @use_vcc() #1 {
|
||||
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
|
||||
; GCN-LABEL: {{^}}use_flat_scratch:
|
||||
; GCN: .set use_flat_scratch.num_vgpr, 0
|
||||
; GCN: .set use_flat_scratch.num_agpr, 0
|
||||
; GCN: .set use_flat_scratch.numbered_sgpr, 0
|
||||
; GCN: .set use_flat_scratch.numbered_sgpr, 32
|
||||
; GCN: .set use_flat_scratch.private_seg_size, 0
|
||||
; GCN: .set use_flat_scratch.uses_vcc, 0
|
||||
; GCN: .set use_flat_scratch.uses_flat_scratch, 1
|
||||
; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
|
||||
; GCN: .set use_flat_scratch.has_recursion, 0
|
||||
; GCN: .set use_flat_scratch.has_indirect_call, 0
|
||||
; GCN: TotalNumSgprs: 6
|
||||
; GCN: TotalNumSgprs: 38
|
||||
; GCN: NumVgprs: 0
|
||||
; GCN: ScratchSize: 0
|
||||
define void @use_flat_scratch() #1 {
|
||||
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
|
||||
; GCN-LABEL: {{^}}use_10_vgpr:
|
||||
; GCN: .set use_10_vgpr.num_vgpr, 10
|
||||
; GCN: .set use_10_vgpr.num_agpr, 0
|
||||
; GCN: .set use_10_vgpr.numbered_sgpr, 0
|
||||
; GCN: .set use_10_vgpr.numbered_sgpr, 32
|
||||
; GCN: .set use_10_vgpr.private_seg_size, 0
|
||||
; GCN: .set use_10_vgpr.uses_vcc, 0
|
||||
; GCN: .set use_10_vgpr.uses_flat_scratch, 0
|
||||
; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
|
||||
; GCN: .set use_10_vgpr.has_recursion, 0
|
||||
; GCN: .set use_10_vgpr.has_indirect_call, 0
|
||||
; GCN: TotalNumSgprs: 4
|
||||
; GCN: TotalNumSgprs: 36
|
||||
; GCN: NumVgprs: 10
|
||||
; GCN: ScratchSize: 0
|
||||
define void @use_10_vgpr() #1 {
|
||||
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
|
||||
; GCN-LABEL: {{^}}use_50_vgpr:
|
||||
; GCN: .set use_50_vgpr.num_vgpr, 50
|
||||
; GCN: .set use_50_vgpr.num_agpr, 0
|
||||
; GCN: .set use_50_vgpr.numbered_sgpr, 0
|
||||
; GCN: .set use_50_vgpr.numbered_sgpr, 32
|
||||
; GCN: .set use_50_vgpr.private_seg_size, 0
|
||||
; GCN: .set use_50_vgpr.uses_vcc, 0
|
||||
; GCN: .set use_50_vgpr.uses_flat_scratch, 0
|
||||
; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0
|
||||
; GCN: .set use_50_vgpr.has_recursion, 0
|
||||
; GCN: .set use_50_vgpr.has_indirect_call, 0
|
||||
; GCN: TotalNumSgprs: 4
|
||||
; GCN: TotalNumSgprs: 36
|
||||
; GCN: NumVgprs: 50
|
||||
; GCN: ScratchSize: 0
|
||||
define void @use_50_vgpr() #1 {
|
||||
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
|
||||
; GCN-LABEL: {{^}}use_stack0:
|
||||
; GCN: .set use_stack0.num_vgpr, 1
|
||||
; GCN: .set use_stack0.num_agpr, 0
|
||||
; GCN: .set use_stack0.numbered_sgpr, 0
|
||||
; GCN: .set use_stack0.numbered_sgpr, 33
|
||||
; GCN: .set use_stack0.private_seg_size, 2052
|
||||
; GCN: .set use_stack0.uses_vcc, 0
|
||||
; GCN: .set use_stack0.uses_flat_scratch, 0
|
||||
; GCN: .set use_stack0.has_dyn_sized_stack, 0
|
||||
; GCN: .set use_stack0.has_recursion, 0
|
||||
; GCN: .set use_stack0.has_indirect_call, 0
|
||||
; GCN: TotalNumSgprs: 4
|
||||
; GCN: TotalNumSgprs: 37
|
||||
; GCN: NumVgprs: 1
|
||||
; GCN: ScratchSize: 2052
|
||||
define void @use_stack0() #1 {
|
||||
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
|
||||
; GCN-LABEL: {{^}}use_stack1:
|
||||
; GCN: .set use_stack1.num_vgpr, 1
|
||||
; GCN: .set use_stack1.num_agpr, 0
|
||||
; GCN: .set use_stack1.numbered_sgpr, 0
|
||||
; GCN: .set use_stack1.numbered_sgpr, 33
|
||||
; GCN: .set use_stack1.private_seg_size, 404
|
||||
; GCN: .set use_stack1.uses_vcc, 0
|
||||
; GCN: .set use_stack1.uses_flat_scratch, 0
|
||||
; GCN: .set use_stack1.has_dyn_sized_stack, 0
|
||||
; GCN: .set use_stack1.has_recursion, 0
|
||||
; GCN: .set use_stack1.has_indirect_call, 0
|
||||
; GCN: TotalNumSgprs: 4
|
||||
; GCN: TotalNumSgprs: 37
|
||||
; GCN: NumVgprs: 1
|
||||
; GCN: ScratchSize: 404
|
||||
define void @use_stack1() #1 {
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
; CHECK: .max_flat_workgroup_size: 1024
|
||||
; CHECK: .name: test
|
||||
; CHECK: .private_segment_fixed_size: 0
|
||||
; CHECK: .sgpr_count: 16
|
||||
; CHECK: .sgpr_count: 10
|
||||
; CHECK: .symbol: test.kd
|
||||
; CHECK: .vgpr_count: {{3|6}}
|
||||
; WAVE64: .wavefront_size: 64
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7
|
||||
; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E
|
||||
; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
|
||||
; ELF: 0250: 6770725F 636F756E 7401B12E 76677072
|
||||
; ELF: 0250: 6770725F 636F756E 7402B12E 76677072
|
||||
; ELF: 0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
|
||||
; ELF: 0270: 61766566 726F6E74 5F73697A 6540AD61
|
||||
; ELF: 0280: 6D646873 612E7461 72676574 BD616D64
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: .shader_functions:
|
||||
|
||||
; Use VGPRs above the input arguments.
|
||||
; CHECK-LABEL: _miss_1:
|
||||
; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
|
||||
|
||||
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
|
||||
i32 %vcr, { i32 } %system.data,
|
||||
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
|
||||
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
|
||||
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
|
||||
local_unnamed_addr {
|
||||
entry:
|
||||
%system.data.value = extractvalue { i32 } %system.data, 0
|
||||
%dead.val = call i32 @llvm.amdgcn.dead.i32()
|
||||
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
|
||||
br i1 %is.whole.wave, label %shader, label %tail
|
||||
|
||||
shader:
|
||||
%system.data.extract = extractvalue { i32 } %system.data, 0
|
||||
%data.mul = mul i32 %system.data.extract, 2
|
||||
%data.add = add i32 %data.mul, 1
|
||||
call void asm sideeffect "; clobber v28", "~{v28}"()
|
||||
br label %tail
|
||||
|
||||
tail:
|
||||
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
|
||||
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
|
||||
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
|
||||
|
||||
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
|
||||
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
|
||||
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
|
||||
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
|
||||
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
|
||||
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
|
||||
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
|
||||
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
|
||||
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
|
||||
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
|
||||
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
|
||||
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
|
||||
|
||||
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
|
||||
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
|
||||
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
|
||||
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
|
||||
|
||||
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
|
||||
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
|
||||
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
|
||||
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
|
||||
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
|
||||
unreachable
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.dead.i32()
|
||||
declare i1 @llvm.amdgcn.init.whole.wave()
|
||||
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
|
||||
|
||||
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
|
||||
@@ -1,46 +0,0 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: .shader_functions:
|
||||
|
||||
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
|
||||
; CHECK-LABEL: leaf_shader:
|
||||
; CHECK: .vgpr_count:{{.*}}0x1{{$}}
|
||||
|
||||
; Function without calls.
|
||||
define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
|
||||
i32 %active.vgpr1, i32 %active.vgpr2,
|
||||
i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
|
||||
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
|
||||
local_unnamed_addr {
|
||||
entry:
|
||||
%dead.val = call i32 @llvm.amdgcn.dead.i32()
|
||||
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
|
||||
br i1 %is.whole.wave, label %compute, label %merge
|
||||
|
||||
compute:
|
||||
; Perform a more complex computation using active VGPRs
|
||||
%square = mul i32 %active.vgpr1, %active.vgpr1
|
||||
%product = mul i32 %square, %active.vgpr2
|
||||
%sum = add i32 %product, %input.value
|
||||
%result = add i32 %sum, 42
|
||||
br label %merge
|
||||
|
||||
merge:
|
||||
%final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
|
||||
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
|
||||
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
|
||||
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
|
||||
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
|
||||
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
|
||||
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
|
||||
|
||||
store i32 %final.result, ptr %output.ptr, align 4
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.dead.i32()
|
||||
declare i1 @llvm.amdgcn.init.whole.wave()
|
||||
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
|
||||
|
||||
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
|
||||
@@ -1,74 +0,0 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: .shader_functions:
|
||||
|
||||
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
|
||||
; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
|
||||
; In that case, the VGPR should be included in the .vgpr_count
|
||||
; CHECK-LABEL: _miss_1:
|
||||
; CHECK: .vgpr_count:{{.*}}0xd{{$}}
|
||||
|
||||
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
|
||||
i32 %vcr, { i32 } %system.data,
|
||||
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
|
||||
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
|
||||
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
|
||||
local_unnamed_addr {
|
||||
entry:
|
||||
%system.data.value = extractvalue { i32 } %system.data, 0
|
||||
%dead.val = call i32 @llvm.amdgcn.dead.i32()
|
||||
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
|
||||
br i1 %is.whole.wave, label %shader, label %tail
|
||||
|
||||
shader:
|
||||
%system.data.extract = extractvalue { i32 } %system.data, 0
|
||||
%data.mul = mul i32 %system.data.extract, 2
|
||||
%data.add = add i32 %data.mul, 1
|
||||
call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
|
||||
br label %tail
|
||||
|
||||
tail:
|
||||
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
|
||||
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
|
||||
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
|
||||
|
||||
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
|
||||
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
|
||||
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
|
||||
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
|
||||
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
|
||||
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
|
||||
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
|
||||
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
|
||||
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
|
||||
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
|
||||
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
|
||||
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
|
||||
|
||||
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
|
||||
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
|
||||
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
|
||||
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
|
||||
|
||||
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
|
||||
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
|
||||
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
|
||||
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
|
||||
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
|
||||
unreachable
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.dead.i32()
|
||||
declare i1 @llvm.amdgcn.init.whole.wave()
|
||||
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
|
||||
|
||||
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
|
||||
@@ -1,71 +0,0 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: .shader_functions:
|
||||
|
||||
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
|
||||
; CHECK-LABEL: _miss_1:
|
||||
; CHECK: .vgpr_count:{{.*}}0xa{{$}}
|
||||
|
||||
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
|
||||
i32 %vcr, { i32 } %system.data,
|
||||
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
|
||||
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
|
||||
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
|
||||
local_unnamed_addr {
|
||||
entry:
|
||||
%system.data.value = extractvalue { i32 } %system.data, 0
|
||||
%dead.val = call i32 @llvm.amdgcn.dead.i32()
|
||||
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
|
||||
br i1 %is.whole.wave, label %shader, label %tail
|
||||
|
||||
shader:
|
||||
%system.data.extract = extractvalue { i32 } %system.data, 0
|
||||
%data.mul = mul i32 %system.data.extract, 2
|
||||
%data.add = add i32 %data.mul, 1
|
||||
br label %tail
|
||||
|
||||
tail:
|
||||
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
|
||||
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
|
||||
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
|
||||
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
|
||||
|
||||
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
|
||||
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
|
||||
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
|
||||
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
|
||||
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
|
||||
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
|
||||
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
|
||||
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
|
||||
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
|
||||
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
|
||||
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
|
||||
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
|
||||
|
||||
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
|
||||
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
|
||||
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
|
||||
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
|
||||
|
||||
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
|
||||
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
|
||||
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
|
||||
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
|
||||
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
|
||||
unreachable
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.dead.i32()
|
||||
declare i1 @llvm.amdgcn.init.whole.wave()
|
||||
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
|
||||
|
||||
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
|
||||
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
|
||||
; GCN-NEXT: s_addc_u32 s17,
|
||||
; GCN-NEXT: s_setpc_b64 s[16:17]
|
||||
|
||||
; GCN: ; TotalNumSgprs: 18
|
||||
; GCN: ; TotalNumSgprs: 32
|
||||
; GCN: ; NumVgprs: 8
|
||||
define void @func_tail_call() #1 {
|
||||
tail call void @func()
|
||||
|
||||
@@ -24,7 +24,7 @@ define void @I_Quit() {
|
||||
; CHECK-LABEL: P_RemoveMobj:
|
||||
; CHECK: .set P_RemoveMobj.num_vgpr, 0
|
||||
; CHECK: .set P_RemoveMobj.num_agpr, 0
|
||||
; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
|
||||
; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
|
||||
; CHECK: .set P_RemoveMobj.private_seg_size, 0
|
||||
; CHECK: .set P_RemoveMobj.uses_vcc, 0
|
||||
; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
|
||||
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
|
||||
; CHECK-LABEL: P_SpawnMobj:
|
||||
; CHECK: .set P_SpawnMobj.num_vgpr, 0
|
||||
; CHECK: .set P_SpawnMobj.num_agpr, 0
|
||||
; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
|
||||
; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
|
||||
; CHECK: .set P_SpawnMobj.private_seg_size, 0
|
||||
; CHECK: .set P_SpawnMobj.uses_vcc, 0
|
||||
; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
|
||||
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
|
||||
; CHECK-LABEL: G_PlayerReborn:
|
||||
; CHECK: .set G_PlayerReborn.num_vgpr, 0
|
||||
; CHECK: .set G_PlayerReborn.num_agpr, 0
|
||||
; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
|
||||
; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
|
||||
; CHECK: .set G_PlayerReborn.private_seg_size, 0
|
||||
; CHECK: .set G_PlayerReborn.uses_vcc, 0
|
||||
; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
|
||||
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
|
||||
; CHECK-LABEL: P_SetThingPosition:
|
||||
; CHECK: .set P_SetThingPosition.num_vgpr, 0
|
||||
; CHECK: .set P_SetThingPosition.num_agpr, 0
|
||||
; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
|
||||
; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
|
||||
; CHECK: .set P_SetThingPosition.private_seg_size, 0
|
||||
; CHECK: .set P_SetThingPosition.uses_vcc, 0
|
||||
; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
|
||||
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
|
||||
; CHECK-LABEL: HU_Start:
|
||||
; CHECK: .set HU_Start.num_vgpr, 0
|
||||
; CHECK: .set HU_Start.num_agpr, 0
|
||||
; CHECK: .set HU_Start.numbered_sgpr, 0
|
||||
; CHECK: .set HU_Start.numbered_sgpr, 32
|
||||
; CHECK: .set HU_Start.private_seg_size, 0
|
||||
; CHECK: .set HU_Start.uses_vcc, 0
|
||||
; CHECK: .set HU_Start.uses_flat_scratch, 0
|
||||
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
|
||||
; CHECK-LABEL: AM_Stop:
|
||||
; CHECK: .set AM_Stop.num_vgpr, 0
|
||||
; CHECK: .set AM_Stop.num_agpr, 0
|
||||
; CHECK: .set AM_Stop.numbered_sgpr, 0
|
||||
; CHECK: .set AM_Stop.numbered_sgpr, 32
|
||||
; CHECK: .set AM_Stop.private_seg_size, 0
|
||||
; CHECK: .set AM_Stop.uses_vcc, 0
|
||||
; CHECK: .set AM_Stop.uses_flat_scratch, 0
|
||||
@@ -176,7 +176,7 @@ define void @AM_Stop() {
|
||||
; CHECK-LABEL: D_AdvanceDemo:
|
||||
; CHECK: .set D_AdvanceDemo.num_vgpr, 0
|
||||
; CHECK: .set D_AdvanceDemo.num_agpr, 0
|
||||
; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
|
||||
; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
|
||||
; CHECK: .set D_AdvanceDemo.private_seg_size, 0
|
||||
; CHECK: .set D_AdvanceDemo.uses_vcc, 0
|
||||
; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
|
||||
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
|
||||
; CHECK-LABEL: F_StartFinale:
|
||||
; CHECK: .set F_StartFinale.num_vgpr, 0
|
||||
; CHECK: .set F_StartFinale.num_agpr, 0
|
||||
; CHECK: .set F_StartFinale.numbered_sgpr, 0
|
||||
; CHECK: .set F_StartFinale.numbered_sgpr, 32
|
||||
; CHECK: .set F_StartFinale.private_seg_size, 0
|
||||
; CHECK: .set F_StartFinale.uses_vcc, 0
|
||||
; CHECK: .set F_StartFinale.uses_flat_scratch, 0
|
||||
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
|
||||
; CHECK-LABEL: F_Ticker:
|
||||
; CHECK: .set F_Ticker.num_vgpr, 0
|
||||
; CHECK: .set F_Ticker.num_agpr, 0
|
||||
; CHECK: .set F_Ticker.numbered_sgpr, 0
|
||||
; CHECK: .set F_Ticker.numbered_sgpr, 32
|
||||
; CHECK: .set F_Ticker.private_seg_size, 0
|
||||
; CHECK: .set F_Ticker.uses_vcc, 0
|
||||
; CHECK: .set F_Ticker.uses_flat_scratch, 0
|
||||
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
|
||||
; CHECK-LABEL: P_TempSaveGameFile:
|
||||
; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
|
||||
; CHECK: .set P_TempSaveGameFile.num_agpr, 0
|
||||
; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
|
||||
; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
|
||||
; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
|
||||
; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
|
||||
; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
|
||||
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
|
||||
; CHECK-LABEL: P_SaveGameFile:
|
||||
; CHECK: .set P_SaveGameFile.num_vgpr, 2
|
||||
; CHECK: .set P_SaveGameFile.num_agpr, 0
|
||||
; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
|
||||
; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
|
||||
; CHECK: .set P_SaveGameFile.private_seg_size, 0
|
||||
; CHECK: .set P_SaveGameFile.uses_vcc, 0
|
||||
; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
|
||||
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
|
||||
; CHECK-LABEL: R_FlatNumForName:
|
||||
; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
|
||||
; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
|
||||
; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
|
||||
; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
|
||||
; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
|
||||
; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
|
||||
; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
; CHECK-LABEL: {{^}}qux
|
||||
; CHECK: .set qux.num_vgpr, 13
|
||||
; CHECK: .set qux.num_agpr, 0
|
||||
; CHECK: .set qux.numbered_sgpr, 0
|
||||
; CHECK: .set qux.numbered_sgpr, 32
|
||||
; CHECK: .set qux.private_seg_size, 0
|
||||
; CHECK: .set qux.uses_vcc, 0
|
||||
; CHECK: .set qux.uses_flat_scratch, 0
|
||||
|
||||
@@ -83,13 +83,13 @@
|
||||
; CHECK-NEXT: multiple_stack:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x24
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .sgpr_count: 0x21
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
|
||||
; CHECK-NEXT: .vgpr_count: 0x3
|
||||
; CHECK-NEXT: no_stack:
|
||||
; CHECK-NEXT: .backend_stack_size: 0
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .sgpr_count: 0x20
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
|
||||
; CHECK-NEXT: .vgpr_count: 0x1
|
||||
; CHECK-NEXT: no_stack_call:
|
||||
@@ -122,7 +122,7 @@
|
||||
; CHECK-NEXT: simple_lds:
|
||||
; CHECK-NEXT: .backend_stack_size: 0
|
||||
; CHECK-NEXT: .lds_size: 0x100
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .sgpr_count: 0x20
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
|
||||
; CHECK-NEXT: .vgpr_count: 0x1
|
||||
; CHECK-NEXT: simple_lds_recurse:
|
||||
@@ -134,7 +134,7 @@
|
||||
; CHECK-NEXT: simple_stack:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x14
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .sgpr_count: 0x21
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
|
||||
; CHECK-NEXT: .vgpr_count: 0x2
|
||||
; CHECK-NEXT: simple_stack_call:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
|
||||
|
||||
; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
|
||||
; ;CHECK: NumVgprs: 2
|
||||
; ;CHECK: NumVgprs: 4
|
||||
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
|
||||
.entry:
|
||||
%i1 = extractelement <2 x float> %arg3, i32 1
|
||||
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>
|
||||
|
||||
; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
|
||||
; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
|
||||
; CHECK: NumVgprs: 2
|
||||
; CHECK: NumVgprs: 4
|
||||
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
|
||||
.entry:
|
||||
ret { <4 x float> } undef
|
||||
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
|
||||
; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
|
||||
; Additionally set the PSInputAddr to 0 via the metadata
|
||||
; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
|
||||
; CHECK: NumVgprs: 2
|
||||
; CHECK: NumVgprs: 4
|
||||
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
|
||||
.entry:
|
||||
ret { <4 x float> } undef
|
||||
|
||||
@@ -24,9 +24,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %
|
||||
|
||||
; SI-LABEL: {{^}}one_vgpr_used:
|
||||
; SI: NumVgprs: 1
|
||||
define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
|
||||
define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
|
||||
store i32 %x, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
|
||||
|
||||
@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
|
||||
}
|
||||
|
||||
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
|
||||
; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 22
|
||||
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 3
|
||||
; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4
|
||||
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
|
||||
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
|
||||
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
|
||||
; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
; CHECK-LABEL: {{^}}spill:
|
||||
; GCN: NumSgprs: 104
|
||||
; GCN-GCNTRACKERS: NumSgprs: 104
|
||||
; GCN: NumVgprs: 3
|
||||
; GCN-GCNTRACKERS: NumVgprs: 3
|
||||
; GCN: NumVgprs: 1
|
||||
; GCN-GCNTRACKERS: NumVgprs: 2
|
||||
; GCN: ScratchSize: 0
|
||||
; GCN-GCNTRACKERS: ScratchSize: 0
|
||||
; GCN: Occupancy: 5
|
||||
|
||||
@@ -11,8 +11,8 @@
|
||||
; allow scheduling of other instructions which reduce RP
|
||||
|
||||
; CHECK-LABEL: {{^}}return_72xi32:
|
||||
; GFX11-PAL: NumSgprs: 0
|
||||
; GFX11-PAL-GCNTRACKERS: NumSgprs: 0
|
||||
; GFX11-PAL: NumSgprs: 33
|
||||
; GFX11-PAL-GCNTRACKERS: NumSgprs: 33
|
||||
; GFX11-PAL: NumVgprs: 64
|
||||
; GFX11-PAL-GCNTRACKERS: NumVgprs: 64
|
||||
; GFX11-PAL: ScratchSize: 220
|
||||
|
||||
@@ -7,14 +7,14 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
|
||||
|
||||
; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
|
||||
; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
|
||||
; SI-MINREG: NumSgprs: {{[1-9]$}}
|
||||
; SI-MINREG: NumVgprs: {{[1-9]$}}
|
||||
|
||||
; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
|
||||
; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}
|
||||
|
||||
; stores may alias loads
|
||||
; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
|
||||
; VI-MINREG: NumSgprs: {{[0-9]$}}
|
||||
; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}
|
||||
|
||||
; stores may alias loads
|
||||
|
||||
@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
|
||||
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
|
||||
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
|
||||
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
|
||||
; VI-NEXT: .amdhsa_next_free_vgpr 3
|
||||
; VI-NEXT: .amdhsa_next_free_vgpr 1
|
||||
; VI-NEXT: .amdhsa_next_free_sgpr 18
|
||||
; VI-NEXT: .amdhsa_reserve_vcc 0
|
||||
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
|
||||
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
|
||||
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
|
||||
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
|
||||
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GFX9-NEXT: .amdhsa_next_free_vgpr 3
|
||||
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
|
||||
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
|
||||
; GFX9-NEXT: .amdhsa_reserve_vcc 0
|
||||
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
|
||||
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
|
||||
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
|
||||
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
|
||||
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
|
||||
; VI-NEXT: .amdhsa_next_free_vgpr 3
|
||||
; VI-NEXT: .amdhsa_next_free_vgpr 1
|
||||
; VI-NEXT: .amdhsa_next_free_sgpr 18
|
||||
; VI-NEXT: .amdhsa_reserve_vcc 0
|
||||
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
|
||||
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
|
||||
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
|
||||
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
|
||||
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GFX9-NEXT: .amdhsa_next_free_vgpr 3
|
||||
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
|
||||
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
|
||||
; GFX9-NEXT: .amdhsa_reserve_vcc 0
|
||||
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
|
||||
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
|
||||
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
|
||||
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
|
||||
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
|
||||
; VI-NEXT: .amdhsa_next_free_vgpr 3
|
||||
; VI-NEXT: .amdhsa_next_free_vgpr 1
|
||||
; VI-NEXT: .amdhsa_next_free_sgpr 18
|
||||
; VI-NEXT: .amdhsa_reserve_vcc 0
|
||||
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
|
||||
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
|
||||
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
|
||||
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
|
||||
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GFX9-NEXT: .amdhsa_next_free_vgpr 3
|
||||
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
|
||||
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
|
||||
; GFX9-NEXT: .amdhsa_reserve_vcc 0
|
||||
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
define amdgpu_kernel void @kern() #0 {
|
||||
; ASM-LABEL: kern:
|
||||
; ASM: .amdhsa_next_free_sgpr 8
|
||||
; ASM: .amdhsa_next_free_sgpr 5
|
||||
; ASM: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
|
||||
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
|
||||
; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
|
||||
|
||||
; ELF: AMDGPU Metadata
|
||||
; ELF: .sgpr_count: 12
|
||||
; ELF: .sgpr_count: 9
|
||||
entry:
|
||||
tail call void asm sideeffect "", "~{s[0:4]}"()
|
||||
ret void
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
define amdgpu_kernel void @kern() #0 {
|
||||
; ASM-LABEL: kern:
|
||||
; ASM: .amdhsa_next_free_sgpr 8
|
||||
; ASM: .amdhsa_next_free_sgpr 5
|
||||
; ASM: .amdhsa_reserve_xnack_mask 0
|
||||
|
||||
; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
|
||||
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
|
||||
; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
|
||||
|
||||
; ELF: AMDGPU Metadata
|
||||
; ELF: .sgpr_count: 8
|
||||
; ELF: .sgpr_count: 5
|
||||
entry:
|
||||
tail call void asm sideeffect "", "~{s[0:4]}"()
|
||||
ret void
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
define amdgpu_kernel void @kern() #0 {
|
||||
; ASM-LABEL: kern:
|
||||
; ASM: .amdhsa_next_free_sgpr 8
|
||||
; ASM: .amdhsa_next_free_sgpr 5
|
||||
; ASM: .amdhsa_reserve_xnack_mask 1
|
||||
|
||||
; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
|
||||
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
|
||||
; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
|
||||
|
||||
; ELF: AMDGPU Metadata
|
||||
; ELF: .sgpr_count: 12
|
||||
; ELF: .sgpr_count: 9
|
||||
entry:
|
||||
tail call void asm sideeffect "", "~{s[0:4]}"()
|
||||
ret void
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
; CHECK-LABEL: __unnamed_1:
|
||||
; CHECK: .set __unnamed_1.num_vgpr, 0
|
||||
; CHECK: .set __unnamed_1.num_agpr, 0
|
||||
; CHECK: .set __unnamed_1.numbered_sgpr, 0
|
||||
; CHECK: .set __unnamed_1.numbered_sgpr, 32
|
||||
; CHECK: .set __unnamed_1.private_seg_size, 0
|
||||
; CHECK: .set __unnamed_1.uses_vcc, 0
|
||||
; CHECK: .set __unnamed_1.uses_flat_scratch, 0
|
||||
@@ -16,7 +16,7 @@ entry:
|
||||
}
|
||||
|
||||
; CHECK-LABEL: __unnamed_2:
|
||||
; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
|
||||
; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
|
||||
; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
|
||||
; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
|
||||
; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
|
||||
|
||||
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}f1024_0:
|
||||
; GFX90A: NumVgprs: 1
|
||||
; GFX90A: NumVgprs: 32
|
||||
; GFX90A: NumAgprs: 1
|
||||
; GFX90A: TotalNumVgprs: 5
|
||||
; GFX90A: TotalNumVgprs: 33
|
||||
define void @f1024_0() #1024 {
|
||||
call void @foo()
|
||||
ret void
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
|
||||
; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
|
||||
target triple = "amdgcn-amd-amdhsa"
|
||||
|
||||
@global = addrspace(1) global i32 poison, align 4
|
||||
|
||||
; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
|
||||
; Only hardware-initialized VGPRs (v0) are read in this kernel.
|
||||
|
||||
; CHECK-LABEL: amdhsa.kernels:
|
||||
; CHECK-LABEL: kernel_x
|
||||
; CHECK: .vgpr_count: 1
|
||||
define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
|
||||
entry:
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: kernel_z
|
||||
; PACKED: .vgpr_count: 1
|
||||
; NOTPACKED: .vgpr_count: 3
|
||||
define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
|
||||
entry:
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
|
||||
@@ -1,35 +0,0 @@
|
||||
; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
|
||||
; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
|
||||
target triple = "amdgcn--amdpal"
|
||||
|
||||
@global = addrspace(1) global i32 poison, align 4
|
||||
|
||||
; CHECK-LABEL: amdpal.pipelines:
|
||||
|
||||
; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
|
||||
; CHECK-LABEL: .entry_point_symbol: kernel_use
|
||||
; CHECK: .vgpr_count: 0x20
|
||||
define amdgpu_cs void @kernel_use([32 x i32] %args) {
|
||||
entry:
|
||||
%a = extractvalue [32 x i32] %args, 14
|
||||
store i32 %a, ptr addrspace(1) @global
|
||||
ret void
|
||||
}
|
||||
|
||||
; Neither uses not writes a VGPR
|
||||
; CHECK-LABEL: chain_func:
|
||||
; CHECK: .vgpr_count: 0x1
|
||||
define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
|
||||
entry:
|
||||
call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
|
||||
ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
|
||||
unreachable
|
||||
}
|
||||
|
||||
; Neither uses not writes a VGPR
|
||||
; CHECK-LABEL: gfx_func:
|
||||
; CHECK: .vgpr_count: 0x1
|
||||
define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
|
||||
entry:
|
||||
ret [32 x i32] %args
|
||||
}
|
||||
Reference in New Issue
Block a user