[AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#133242)

Don't count register uses when determining the maximum number of
registers used by a function. Count only the defs. This is really an
underestimate of the true register usage, but in practice that's not
a problem because if a function uses a register, then it has either
defined it earlier, or some other function that executed before has
defined it.

In particular, the register counts are used:
1. When launching an entry function - in which case we're safe because
   the register counts of the entry function will include the register
   counts of all callees.
2. At function boundaries in dynamic VGPR mode. In this case it's safe
   because whenever we set the new VGPR allocation we take into account
   the outgoing_vgpr_count set by the middle-end.

The main advantage of doing this is that the artificial VGPR arguments
used only for preserving the inactive lanes when using the
llvm.amdgcn.init.whole.wave intrinsic are no longer counted. This
enables us to allocate only the registers we need in dynamic VGPR mode.

---------

Co-authored-by: Thomas Symalla <5754458+tsymalla@users.noreply.github.com>
This commit is contained in:
Diana Picus
2025-06-03 11:20:48 +02:00
committed by GitHub
parent e1276ece70
commit 130080fab1
48 changed files with 586 additions and 472 deletions

View File

@@ -4263,10 +4263,9 @@ same *vendor-name*.
wavefront for
GFX6-GFX9. A register
is required if it is
used explicitly, or
written to, or
if a higher numbered
register is used
explicitly. This
register is written to. This
includes the special
SGPRs for VCC, Flat
Scratch (GFX7-GFX9)
@@ -4284,10 +4283,10 @@ same *vendor-name*.
each work-item for
GFX6-GFX9. A register
is required if it is
used explicitly, or
written to, or
if a higher numbered
register is used
explicitly.
register is
written to.
".agpr_count" integer Required Number of accumulator
registers required by
each work-item for

View File

@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
if (isShader(F.getCallingConv())) {
if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
@@ -1060,15 +1060,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
} else if (isKernel(F.getCallingConv()) &&
MFI->getNumKernargPreloadedSGPRs()) {
// Consider cases where the total number of UserSGPRs with trailing
// allocated preload SGPRs, is greater than the number of explicitly
// referenced SGPRs.
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
ProgInfo.NumSGPR =
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum

View File

@@ -137,274 +137,29 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
if (MFI->isStackRealigned())
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
return Info;
Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR =
TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
// Preloaded registers are written by the hardware, not defined in the
// function body, so they need special handling.
if (MFI->isEntryFunction()) {
Info.NumExplicitSGPR =
std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
}
int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
return Info;
Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::M0_LO16:
case AMDGPU::M0_HI16:
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE_LO:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;
case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;
case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::VCC_LO_LO16:
case AMDGPU::VCC_LO_HI16:
case AMDGPU::VCC_HI_LO16:
case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
continue;
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");
case AMDGPU::LDS_DIRECT:
llvm_unreachable("lds_direct register should not be used");
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");
case AMDGPU::SRC_VCCZ:
llvm_unreachable("src_vccz register should not be used");
case AMDGPU::SRC_EXECZ:
llvm_unreachable("src_execz register should not be used");
case AMDGPU::SRC_SCC:
llvm_unreachable("src_scc register should not be used");
default:
break;
}
if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
AMDGPU::VGPR_16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 3;
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 4;
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
IsSGPR = false;
Width = 5;
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 5;
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
IsSGPR = false;
Width = 6;
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
IsSGPR = true;
Width = 6;
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 8;
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
IsSGPR = false;
Width = 9;
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
IsSGPR = true;
Width = 9;
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 9;
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
IsSGPR = false;
Width = 10;
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
IsSGPR = true;
Width = 10;
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 10;
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
IsSGPR = false;
Width = 11;
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
IsSGPR = true;
Width = 11;
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 11;
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
IsSGPR = false;
Width = 12;
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
IsSGPR = true;
Width = 12;
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 32;
} else {
// We only expect TTMP registers or registers that do not belong to
// any RC.
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
AMDGPU::TTMP_64RegClass.contains(Reg) ||
AMDGPU::TTMP_128RegClass.contains(Reg) ||
AMDGPU::TTMP_256RegClass.contains(Reg) ||
AMDGPU::TTMP_512RegClass.contains(Reg) ||
!TRI.getPhysRegBaseClass(Reg)) &&
"Unknown register class");
}
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else if (IsAGPR) {
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}
if (MI.isCall()) {
// Pseudo used just to encode the underlying global. Is there a better
// way to track this?
@@ -464,9 +219,5 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
}
}
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
return Info;
}

View File

@@ -970,10 +970,25 @@ public:
return NumUserSGPRs;
}
// Get the number of preloaded SGPRs for compute kernels.
unsigned getNumPreloadedSGPRs() const {
return NumUserSGPRs + NumSystemSGPRs;
}
// Get the number of preloaded VGPRs for compute kernels.
unsigned getNumPreloadedVGPRs() const {
if (hasWorkItemIDZ())
return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
if (hasWorkItemIDY())
return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
if (hasWorkItemIDX())
return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
return 0;
}
unsigned getNumKernargPreloadedSGPRs() const {
return UserSGPRInfo.getNumKernargPreloadSGPRs();
}

View File

@@ -4055,6 +4055,20 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
return 0;
}
unsigned
SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const {
for (MCPhysReg Reg : reverse(RC.getRegisters())) {
for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
if (std::any_of(
MRI.def_instr_begin(*AI), MRI.def_instr_end(),
[](const MachineInstr &MI) { return !MI.isImplicitDef(); }))
return getHWRegIndex(Reg) + 1;
}
}
return 0;
}
SmallVector<StringLiteral>
SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
const MachineFunction &MF) const {

View File

@@ -486,6 +486,11 @@ public:
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;
// \returns the number of registers of a given \p RC defined in a function.
// Does not go inside function calls.
unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;
std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
: std::optional<uint8_t>{};

View File

@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 17
; GPRIDX-NEXT: wavefront_sgpr_count = 24
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
; GFX10-NEXT: wavefront_sgpr_count = 10
; GFX10-NEXT: wavefront_sgpr_count = 18
; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
; GFX11-NEXT: wavefront_sgpr_count = 7
; GFX11-NEXT: wavefront_sgpr_count = 16
; GFX11-NEXT: workitem_vgpr_count = 3
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: wavefront_sgpr_count = 24
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
; GFX10-NEXT: wavefront_sgpr_count = 10
; GFX10-NEXT: workitem_vgpr_count = 2
; GFX10-NEXT: wavefront_sgpr_count = 18
; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
; GFX11-NEXT: wavefront_sgpr_count = 6
; GFX11-NEXT: wavefront_sgpr_count = 16
; GFX11-NEXT: workitem_vgpr_count = 2
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: wavefront_sgpr_count = 24
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: granulated_wavefront_sgpr_count = 2
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
; GFX10-NEXT: wavefront_sgpr_count = 10
; GFX10-NEXT: wavefront_sgpr_count = 18
; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
; GFX11-NEXT: wavefront_sgpr_count = 7
; GFX11-NEXT: wavefront_sgpr_count = 16
; GFX11-NEXT: workitem_vgpr_count = 3
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0

View File

@@ -13,8 +13,9 @@
; CHECK: {{^}}kernel_illegal_agpr_use_asm:
; CHECK: ; use a0
; CHECK: NumVgprs: 0
; CHECK: NumAgprs: 1
; GFX908: NumVgprs: 3
; GFX90A: NumVgprs: 1
; CHECK: NumAgprs: 0
define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
call void asm sideeffect "; use $0", "a"(i32 poison)
ret void
@@ -24,7 +25,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
; CHECK: ; use a0
; CHECK: NumVgprs: 0
; CHECK: NumAgprs: 1
; CHECK: NumAgprs: 0
define void @func_illegal_agpr_use_asm() #0 {
call void asm sideeffect "; use $0", "a"(i32 poison)
ret void

View File

@@ -10,9 +10,9 @@
; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
; ASM: .amdhsa_user_sgpr_count 12
; ASM: .amdhsa_next_free_sgpr 12
; ASM: ; TotalNumSgprs: 18
; ASM: ; NumSGPRsForWavesPerEU: 18
; ASM: .amdhsa_next_free_sgpr 15
; ASM: ; TotalNumSgprs: 21
; ASM: ; NumSGPRsForWavesPerEU: 21
; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
; feild that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000 ................
; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
; ASM: .amdhsa_user_sgpr_count 10
; ASM: .amdhsa_next_free_sgpr 10
; ASM: ; TotalNumSgprs: 16
; ASM: ; NumSGPRsForWavesPerEU: 16
; ASM: .amdhsa_next_free_sgpr 11
; ASM: ; TotalNumSgprs: 17
; ASM: ; NumSGPRsForWavesPerEU: 17
; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
; implicit, and 6 extra.
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
; ASM: .amdhsa_user_sgpr_count 3
; ASM: .amdhsa_next_free_sgpr 3
; ASM: ; TotalNumSgprs: 9
; ASM: ; NumSGPRsForWavesPerEU: 9
; ASM: .amdhsa_next_free_sgpr 4
; ASM: ; TotalNumSgprs: 10
; ASM: ; NumSGPRsForWavesPerEU: 10
; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000 @...............
; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
; ASM: .amdhsa_user_sgpr_count 2
; ASM: .amdhsa_next_free_sgpr 0
; ASM: ; TotalNumSgprs: 6
; ASM: ; NumSGPRsForWavesPerEU: 6
; ASM: .amdhsa_next_free_sgpr 3
; ASM: ; TotalNumSgprs: 9
; ASM: ; NumSGPRsForWavesPerEU: 9
; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
; Encoded like '00'.

View File

@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
; SDAG-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
; GFX8-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
; GFX9-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
; GCN-NEXT: multiple_stack:
; GCN-NEXT: .backend_stack_size: 0x24{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x21{{$}}
; GCN-NEXT: .sgpr_count: 0x1{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x24{{$}}
; GCN-NEXT: .vgpr_count: 0x3{{$}}
; GCN-NEXT: no_stack:
; GCN-NEXT: .backend_stack_size: 0{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x20{{$}}
; GCN-NEXT: .sgpr_count: 0x1{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT: .vgpr_count: 0x1{{$}}
; GCN-NEXT: no_stack_call:
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
; GCN-NEXT: simple_lds:
; GCN-NEXT: .backend_stack_size: 0{{$}}
; GCN-NEXT: .lds_size: 0x100{{$}}
; GCN-NEXT: .sgpr_count: 0x20{{$}}
; GCN-NEXT: .sgpr_count: 0x1{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT: .vgpr_count: 0x1{{$}}
; GCN-NEXT: simple_lds_recurse:
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
; GCN-NEXT: simple_stack:
; GCN-NEXT: .backend_stack_size: 0x14{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x21{{$}}
; GCN-NEXT: .sgpr_count: 0x1{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}}
; GCN-NEXT: .vgpr_count: 0x2{{$}}
; GCN-NEXT: simple_stack_call:

View File

@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
; ELF: Section {
; ELF: Name: .text
@@ -23,8 +23,16 @@
; ELF: Section: .text (0x2)
; ELF: }
; GFX10: NumSGPRsForWavesPerEU: 6
; GFX10: NumVGPRsForWavesPerEU: 1
; GFX10: NumSGPRsForWavesPerEU: 12
; GFX10: NumVGPRsForWavesPerEU: 3
; Wave32 and 64 behave differently due to the UserSGPRInit16Bug,
; which only affects Wave32.
; GFX11W32: NumSGPRsForWavesPerEU: 16
; GFX11W32: NumVGPRsForWavesPerEU: 1
; GFX11W64: NumSGPRsForWavesPerEU: 11
; GFX11W64: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
entry:

View File

@@ -2,10 +2,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
; CHECK-LABEL: {{^}}min_64_max_64:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @min_64_max_64() #0 {
entry:
ret void
@@ -13,10 +13,10 @@ entry:
attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
; CHECK-LABEL: {{^}}min_64_max_128:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @min_64_max_128() #1 {
entry:
ret void

View File

@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_10:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @empty_exactly_10() #2 {
entry:
ret void
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
; At least 1 wave per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_1:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @empty_at_least_1() #3 {
entry:
ret void
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
; At least 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_5:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @empty_at_least_5() #4 {
entry:
ret void
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
; At least 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_10:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @empty_at_least_10() #5 {
entry:
ret void
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64
; At most 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_10:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @empty_at_most_10() #7 {
entry:
ret void
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
; Between 5 and 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_between_5_and_10:
; CHECK: SGPRBlocks: 0
; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 3
define amdgpu_kernel void @empty_between_5_and_10() #8 {
entry:
ret void

View File

@@ -28,7 +28,7 @@ bb:
}
; ALL: .set .Laliasee_default.num_vgpr, 0
; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32
; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
attributes #0 = { noinline norecurse nounwind optnone }
attributes #1 = { noinline norecurse nounwind readnone willreturn }

View File

@@ -18,7 +18,7 @@ bb:
; CHECK: .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
define internal void @aliasee_default_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()

View File

@@ -24,7 +24,7 @@ bb:
; CHECK: .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
define internal void @aliasee_vgpr32_sgpr76() #1 {
bb:
call void asm sideeffect "; clobber v26 ", "~{v26}"()

View File

@@ -21,7 +21,7 @@ bb:
; CHECK: .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
define internal void @aliasee_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()

View File

@@ -21,7 +21,7 @@ bb:
; CHECK: .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
define internal void @aliasee_vgpr256_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v252 ", "~{v252}"()

View File

@@ -7,7 +7,7 @@
; Make sure to run a GPU with the SGPR allocation bug.
; GCN-LABEL: {{^}}use_vcc:
; GCN: ; TotalNumSgprs: 34
; GCN: ; TotalNumSgprs: 2
; GCN: ; NumVgprs: 0
define void @use_vcc() #1 {
call void asm sideeffect "", "~{vcc}" () #0
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
}
; GCN-LABEL: {{^}}use_flat_scratch:
; CI: ; TotalNumSgprs: 36
; VI: ; TotalNumSgprs: 38
; CI: ; TotalNumSgprs: 4
; VI: ; TotalNumSgprs: 6
; GCN: ; NumVgprs: 0
define void @use_flat_scratch() #1 {
call void asm sideeffect "", "~{flat_scratch}" () #0
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when a sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
@@ -249,7 +249,7 @@ entry:
; Make sure there's no assert when a sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96

View File

@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
; CHECK: ; NumVgprs: 8
; CHECK: ; NumVgprs: 5
define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
entry:
%cmpflag = icmp eq i32 %flag, 1

View File

@@ -16,7 +16,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -35,7 +35,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -93,7 +93,7 @@ entry:
; registers used.
;
; ALL-ASM-LABEL: {{^}}empty:
; ALL-ASM: .amdhsa_next_free_vgpr 1
; ALL-ASM: .amdhsa_next_free_vgpr 3
; ALL-ASM: .amdhsa_next_free_sgpr 1
define amdgpu_kernel void @empty(
i32 %i,

View File

@@ -43,7 +43,7 @@
; OSABI-HSA-ELF: .sgpr_count: 96
; OSABI-HSA-ELF: .sgpr_spill_count: 0
; OSABI-HSA-ELF: .symbol: elf_notes.kd
; OSABI-HSA-ELF: .vgpr_count: 0
; OSABI-HSA-ELF: .vgpr_count: 1
; OSABI-HSA-ELF: .vgpr_spill_count: 0
; OSABI-HSA-ELF: .wavefront_size: 64
; OSABI-HSA-ELF: amdhsa.target: amdgcn-amd-amdhsa--gfx802

View File

@@ -27,15 +27,15 @@
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 8
; VI-NOXNACK: ; TotalNumSgprs: 8
; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
; VI-XNACK: ; TotalNumSgprs: 12
; HSA-VI-XNACK: ; TotalNumSgprs: 12
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 12
; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
; VI-XNACK: ; TotalNumSgprs: 16
; HSA-VI-XNACK: ; TotalNumSgprs: 22
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
define amdgpu_kernel void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7}"()
@@ -50,15 +50,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 10
; VI-NOXNACK: ; TotalNumSgprs: 10
; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
; VI-XNACK: ; TotalNumSgprs: 12
; HSA-VI-XNACK: ; TotalNumSgprs: 12
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
; CI: ; TotalNumSgprs: 14
; VI-NOXNACK: ; TotalNumSgprs: 14
; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
; VI-XNACK: ; TotalNumSgprs: 16
; HSA-VI-XNACK: ; TotalNumSgprs: 22
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
define amdgpu_kernel void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -73,15 +73,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
; CI: ; TotalNumSgprs: 16
; VI-NOXNACK: ; TotalNumSgprs: 18
; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
; VI-XNACK: ; TotalNumSgprs: 14
; VI-XNACK: ; TotalNumSgprs: 18
; HSA-VI-XNACK: ; TotalNumSgprs: 24
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
define amdgpu_kernel void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -96,15 +96,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
; CI: ; TotalNumSgprs: 16
; VI-NOXNACK: ; TotalNumSgprs: 18
; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
; VI-XNACK: ; TotalNumSgprs: 14
; VI-XNACK: ; TotalNumSgprs: 18
; HSA-VI-XNACK: ; TotalNumSgprs: 24
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
define amdgpu_kernel void @vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -122,15 +122,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
; CI: NumSgprs: 16
; VI-NOXNACK: NumSgprs: 18
; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
; VI-XNACK: NumSgprs: 18
; HSA-VI-XNACK: NumSgprs: 24
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
define amdgpu_kernel void @use_flat_scr() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -143,15 +143,15 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
; CI: NumSgprs: 16
; VI-NOXNACK: NumSgprs: 18
; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
; VI-XNACK: NumSgprs: 18
; HSA-VI-XNACK: NumSgprs: 24
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
define amdgpu_kernel void @use_flat_scr_lo() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -166,15 +166,15 @@ entry:
; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
; CI: NumSgprs: 16
; VI-NOXNACK: NumSgprs: 18
; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
; VI-XNACK: NumSgprs: 18
; HSA-VI-XNACK: NumSgprs: 24
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
define amdgpu_kernel void @use_flat_scr_hi() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()

View File

@@ -5,14 +5,14 @@
; GCN-LABEL: {{^}}use_vcc:
; GCN: .set use_vcc.num_vgpr, 0
; GCN: .set use_vcc.num_agpr, 0
; GCN: .set use_vcc.numbered_sgpr, 32
; GCN: .set use_vcc.numbered_sgpr, 0
; GCN: .set use_vcc.private_seg_size, 0
; GCN: .set use_vcc.uses_vcc, 1
; GCN: .set use_vcc.uses_flat_scratch, 0
; GCN: .set use_vcc.has_dyn_sized_stack, 0
; GCN: .set use_vcc.has_recursion, 0
; GCN: .set use_vcc.has_indirect_call, 0
; GCN: TotalNumSgprs: 36
; GCN: TotalNumSgprs: 4
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_vcc() #1 {
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
; GCN-LABEL: {{^}}use_flat_scratch:
; GCN: .set use_flat_scratch.num_vgpr, 0
; GCN: .set use_flat_scratch.num_agpr, 0
; GCN: .set use_flat_scratch.numbered_sgpr, 32
; GCN: .set use_flat_scratch.numbered_sgpr, 0
; GCN: .set use_flat_scratch.private_seg_size, 0
; GCN: .set use_flat_scratch.uses_vcc, 0
; GCN: .set use_flat_scratch.uses_flat_scratch, 1
; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
; GCN: .set use_flat_scratch.has_recursion, 0
; GCN: .set use_flat_scratch.has_indirect_call, 0
; GCN: TotalNumSgprs: 38
; GCN: TotalNumSgprs: 6
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_flat_scratch() #1 {
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
; GCN-LABEL: {{^}}use_10_vgpr:
; GCN: .set use_10_vgpr.num_vgpr, 10
; GCN: .set use_10_vgpr.num_agpr, 0
; GCN: .set use_10_vgpr.numbered_sgpr, 32
; GCN: .set use_10_vgpr.numbered_sgpr, 0
; GCN: .set use_10_vgpr.private_seg_size, 0
; GCN: .set use_10_vgpr.uses_vcc, 0
; GCN: .set use_10_vgpr.uses_flat_scratch, 0
; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_10_vgpr.has_recursion, 0
; GCN: .set use_10_vgpr.has_indirect_call, 0
; GCN: TotalNumSgprs: 36
; GCN: TotalNumSgprs: 4
; GCN: NumVgprs: 10
; GCN: ScratchSize: 0
define void @use_10_vgpr() #1 {
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
; GCN-LABEL: {{^}}use_50_vgpr:
; GCN: .set use_50_vgpr.num_vgpr, 50
; GCN: .set use_50_vgpr.num_agpr, 0
; GCN: .set use_50_vgpr.numbered_sgpr, 32
; GCN: .set use_50_vgpr.numbered_sgpr, 0
; GCN: .set use_50_vgpr.private_seg_size, 0
; GCN: .set use_50_vgpr.uses_vcc, 0
; GCN: .set use_50_vgpr.uses_flat_scratch, 0
; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_50_vgpr.has_recursion, 0
; GCN: .set use_50_vgpr.has_indirect_call, 0
; GCN: TotalNumSgprs: 36
; GCN: TotalNumSgprs: 4
; GCN: NumVgprs: 50
; GCN: ScratchSize: 0
define void @use_50_vgpr() #1 {
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
; GCN-LABEL: {{^}}use_stack0:
; GCN: .set use_stack0.num_vgpr, 1
; GCN: .set use_stack0.num_agpr, 0
; GCN: .set use_stack0.numbered_sgpr, 33
; GCN: .set use_stack0.numbered_sgpr, 0
; GCN: .set use_stack0.private_seg_size, 2052
; GCN: .set use_stack0.uses_vcc, 0
; GCN: .set use_stack0.uses_flat_scratch, 0
; GCN: .set use_stack0.has_dyn_sized_stack, 0
; GCN: .set use_stack0.has_recursion, 0
; GCN: .set use_stack0.has_indirect_call, 0
; GCN: TotalNumSgprs: 37
; GCN: TotalNumSgprs: 4
; GCN: NumVgprs: 1
; GCN: ScratchSize: 2052
define void @use_stack0() #1 {
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
; GCN-LABEL: {{^}}use_stack1:
; GCN: .set use_stack1.num_vgpr, 1
; GCN: .set use_stack1.num_agpr, 0
; GCN: .set use_stack1.numbered_sgpr, 33
; GCN: .set use_stack1.numbered_sgpr, 0
; GCN: .set use_stack1.private_seg_size, 404
; GCN: .set use_stack1.uses_vcc, 0
; GCN: .set use_stack1.uses_flat_scratch, 0
; GCN: .set use_stack1.has_dyn_sized_stack, 0
; GCN: .set use_stack1.has_recursion, 0
; GCN: .set use_stack1.has_indirect_call, 0
; GCN: TotalNumSgprs: 37
; GCN: TotalNumSgprs: 4
; GCN: NumVgprs: 1
; GCN: ScratchSize: 404
define void @use_stack1() #1 {

View File

@@ -15,7 +15,7 @@
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
; CHECK: .sgpr_count: 10
; CHECK: .sgpr_count: 16
; CHECK: .symbol: test.kd
; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64

View File

@@ -63,7 +63,7 @@
; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7
; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E
; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
; ELF: 0250: 6770725F 636F756E 7402B12E 76677072
; ELF: 0250: 6770725F 636F756E 7401B12E 76677072
; ELF: 0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
; ELF: 0270: 61766566 726F6E74 5F73697A 6540AD61
; ELF: 0280: 6D646873 612E7461 72676574 BD616D64

View File

@@ -0,0 +1,72 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
; CHECK-LABEL: .shader_functions:
; Use VGPRs above the input arguments.
; CHECK-LABEL: _miss_1:
; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
i32 %vcr, { i32 } %system.data,
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
local_unnamed_addr {
entry:
%system.data.value = extractvalue { i32 } %system.data, 0
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %shader, label %tail
shader:
%system.data.extract = extractvalue { i32 } %system.data, 0
%data.mul = mul i32 %system.data.extract, 2
%data.add = add i32 %data.mul, 1
call void asm sideeffect "; clobber v28", "~{v28}"()
br label %tail
tail:
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
unreachable
}
declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

View File

@@ -0,0 +1,46 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
; CHECK-LABEL: .shader_functions:
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
; CHECK-LABEL: leaf_shader:
; CHECK: .vgpr_count:{{.*}}0x1{{$}}
; Function without calls.
define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
i32 %active.vgpr1, i32 %active.vgpr2,
i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
local_unnamed_addr {
entry:
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %compute, label %merge
compute:
; Perform a more complex computation using active VGPRs
%square = mul i32 %active.vgpr1, %active.vgpr1
%product = mul i32 %square, %active.vgpr2
%sum = add i32 %product, %input.value
%result = add i32 %sum, 42
br label %merge
merge:
%final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
store i32 %final.result, ptr %output.ptr, align 4
ret void
}
declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

View File

@@ -0,0 +1,74 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
; CHECK-LABEL: .shader_functions:
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
; In that case, the VGPR should be included in the .vgpr_count
; CHECK-LABEL: _miss_1:
; CHECK: .vgpr_count:{{.*}}0xd{{$}}
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
i32 %vcr, { i32 } %system.data,
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
local_unnamed_addr {
entry:
%system.data.value = extractvalue { i32 } %system.data, 0
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %shader, label %tail
shader:
%system.data.extract = extractvalue { i32 } %system.data, 0
%data.mul = mul i32 %system.data.extract, 2
%data.add = add i32 %data.mul, 1
call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
br label %tail
tail:
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
unreachable
}
declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

View File

@@ -0,0 +1,71 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
; CHECK-LABEL: .shader_functions:
; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
; CHECK-LABEL: _miss_1:
; CHECK: .vgpr_count:{{.*}}0xa{{$}}
define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
i32 %vcr, { i32 } %system.data,
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
local_unnamed_addr {
entry:
%system.data.value = extractvalue { i32 } %system.data, 0
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %shader, label %tail
shader:
%system.data.extract = extractvalue { i32 } %system.data, 0
%data.mul = mul i32 %system.data.extract, 2
%data.add = add i32 %data.mul, 1
br label %tail
tail:
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
unreachable
}
declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

View File

@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
; GCN-NEXT: s_addc_u32 s17,
; GCN-NEXT: s_setpc_b64 s[16:17]
; GCN: ; TotalNumSgprs: 32
; GCN: ; TotalNumSgprs: 18
; GCN: ; NumVgprs: 8
define void @func_tail_call() #1 {
tail call void @func()

View File

@@ -24,7 +24,7 @@ define void @I_Quit() {
; CHECK-LABEL: P_RemoveMobj:
; CHECK: .set P_RemoveMobj.num_vgpr, 0
; CHECK: .set P_RemoveMobj.num_agpr, 0
; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
; CHECK: .set P_RemoveMobj.private_seg_size, 0
; CHECK: .set P_RemoveMobj.uses_vcc, 0
; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
; CHECK-LABEL: P_SpawnMobj:
; CHECK: .set P_SpawnMobj.num_vgpr, 0
; CHECK: .set P_SpawnMobj.num_agpr, 0
; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
; CHECK: .set P_SpawnMobj.private_seg_size, 0
; CHECK: .set P_SpawnMobj.uses_vcc, 0
; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
; CHECK-LABEL: G_PlayerReborn:
; CHECK: .set G_PlayerReborn.num_vgpr, 0
; CHECK: .set G_PlayerReborn.num_agpr, 0
; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
; CHECK: .set G_PlayerReborn.private_seg_size, 0
; CHECK: .set G_PlayerReborn.uses_vcc, 0
; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
; CHECK-LABEL: P_SetThingPosition:
; CHECK: .set P_SetThingPosition.num_vgpr, 0
; CHECK: .set P_SetThingPosition.num_agpr, 0
; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
; CHECK: .set P_SetThingPosition.private_seg_size, 0
; CHECK: .set P_SetThingPosition.uses_vcc, 0
; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
; CHECK-LABEL: HU_Start:
; CHECK: .set HU_Start.num_vgpr, 0
; CHECK: .set HU_Start.num_agpr, 0
; CHECK: .set HU_Start.numbered_sgpr, 32
; CHECK: .set HU_Start.numbered_sgpr, 0
; CHECK: .set HU_Start.private_seg_size, 0
; CHECK: .set HU_Start.uses_vcc, 0
; CHECK: .set HU_Start.uses_flat_scratch, 0
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
; CHECK-LABEL: AM_Stop:
; CHECK: .set AM_Stop.num_vgpr, 0
; CHECK: .set AM_Stop.num_agpr, 0
; CHECK: .set AM_Stop.numbered_sgpr, 32
; CHECK: .set AM_Stop.numbered_sgpr, 0
; CHECK: .set AM_Stop.private_seg_size, 0
; CHECK: .set AM_Stop.uses_vcc, 0
; CHECK: .set AM_Stop.uses_flat_scratch, 0
@@ -176,7 +176,7 @@ define void @AM_Stop() {
; CHECK-LABEL: D_AdvanceDemo:
; CHECK: .set D_AdvanceDemo.num_vgpr, 0
; CHECK: .set D_AdvanceDemo.num_agpr, 0
; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
; CHECK: .set D_AdvanceDemo.private_seg_size, 0
; CHECK: .set D_AdvanceDemo.uses_vcc, 0
; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
; CHECK-LABEL: F_StartFinale:
; CHECK: .set F_StartFinale.num_vgpr, 0
; CHECK: .set F_StartFinale.num_agpr, 0
; CHECK: .set F_StartFinale.numbered_sgpr, 32
; CHECK: .set F_StartFinale.numbered_sgpr, 0
; CHECK: .set F_StartFinale.private_seg_size, 0
; CHECK: .set F_StartFinale.uses_vcc, 0
; CHECK: .set F_StartFinale.uses_flat_scratch, 0
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
; CHECK-LABEL: F_Ticker:
; CHECK: .set F_Ticker.num_vgpr, 0
; CHECK: .set F_Ticker.num_agpr, 0
; CHECK: .set F_Ticker.numbered_sgpr, 32
; CHECK: .set F_Ticker.numbered_sgpr, 0
; CHECK: .set F_Ticker.private_seg_size, 0
; CHECK: .set F_Ticker.uses_vcc, 0
; CHECK: .set F_Ticker.uses_flat_scratch, 0
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
; CHECK-LABEL: P_TempSaveGameFile:
; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
; CHECK: .set P_TempSaveGameFile.num_agpr, 0
; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
; CHECK-LABEL: P_SaveGameFile:
; CHECK: .set P_SaveGameFile.num_vgpr, 2
; CHECK: .set P_SaveGameFile.num_agpr, 0
; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
; CHECK: .set P_SaveGameFile.private_seg_size, 0
; CHECK: .set P_SaveGameFile.uses_vcc, 0
; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
; CHECK-LABEL: R_FlatNumForName:
; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)

View File

@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}qux
; CHECK: .set qux.num_vgpr, 13
; CHECK: .set qux.num_agpr, 0
; CHECK: .set qux.numbered_sgpr, 32
; CHECK: .set qux.numbered_sgpr, 0
; CHECK: .set qux.private_seg_size, 0
; CHECK: .set qux.uses_vcc, 0
; CHECK: .set qux.uses_flat_scratch, 0

View File

@@ -83,13 +83,13 @@
; CHECK-NEXT: multiple_stack:
; CHECK-NEXT: .backend_stack_size: 0x24
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x21
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x20
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: no_stack_call:
@@ -122,7 +122,7 @@
; CHECK-NEXT: simple_lds:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x20
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: simple_lds_recurse:
@@ -134,7 +134,7 @@
; CHECK-NEXT: simple_stack:
; CHECK-NEXT: .backend_stack_size: 0x14
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x21
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: simple_stack_call:

View File

@@ -2,7 +2,7 @@
;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
; ;CHECK: NumVgprs: 4
; ;CHECK: NumVgprs: 2
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
.entry:
%i1 = extractelement <2 x float> %arg3, i32 1
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>
; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
; CHECK: NumVgprs: 4
; CHECK: NumVgprs: 2
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
.entry:
ret { <4 x float> } undef
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
; Additionally set the PSInputAddr to 0 via the metadata
; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
; CHECK: NumVgprs: 4
; CHECK: NumVgprs: 2
define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
.entry:
ret { <4 x float> } undef

View File

@@ -24,7 +24,9 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %
; SI-LABEL: {{^}}one_vgpr_used:
; SI: NumVgprs: 1
define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
store i32 %x, ptr addrspace(1) %out, align 4
ret void
}
attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

View File

@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
}
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 22
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 3
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False

View File

@@ -4,8 +4,8 @@
; CHECK-LABEL: {{^}}spill:
; GCN: NumSgprs: 104
; GCN-GCNTRACKERS: NumSgprs: 104
; GCN: NumVgprs: 1
; GCN-GCNTRACKERS: NumVgprs: 2
; GCN: NumVgprs: 3
; GCN-GCNTRACKERS: NumVgprs: 3
; GCN: ScratchSize: 0
; GCN-GCNTRACKERS: ScratchSize: 0
; GCN: Occupancy: 5

View File

@@ -11,8 +11,8 @@
; allow scheduling of other instructions which reduce RP
; CHECK-LABEL: {{^}}return_72xi32:
; GFX11-PAL: NumSgprs: 33
; GFX11-PAL-GCNTRACKERS: NumSgprs: 33
; GFX11-PAL: NumSgprs: 0
; GFX11-PAL-GCNTRACKERS: NumSgprs: 0
; GFX11-PAL: NumVgprs: 64
; GFX11-PAL-GCNTRACKERS: NumVgprs: 64
; GFX11-PAL: ScratchSize: 220

View File

@@ -7,14 +7,14 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
; SI-MINREG: NumSgprs: {{[1-9]$}}
; SI-MINREG: NumVgprs: {{[1-9]$}}
; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}
; stores may alias loads
; VI-MINREG: NumSgprs: {{[0-9]$}}
; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}
; stores may alias loads

View File

@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_vgpr 3
; VI-NEXT: .amdhsa_next_free_sgpr 18
; VI-NEXT: .amdhsa_reserve_vcc 0
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_vgpr 3
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
; GFX9-NEXT: .amdhsa_reserve_vcc 0
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_vgpr 3
; VI-NEXT: .amdhsa_next_free_sgpr 18
; VI-NEXT: .amdhsa_reserve_vcc 0
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_vgpr 3
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
; GFX9-NEXT: .amdhsa_reserve_vcc 0
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_vgpr 3
; VI-NEXT: .amdhsa_next_free_sgpr 18
; VI-NEXT: .amdhsa_reserve_vcc 0
; VI-NEXT: .amdhsa_reserve_flat_scratch 0
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_vgpr 3
; GFX9-NEXT: .amdhsa_next_free_sgpr 18
; GFX9-NEXT: .amdhsa_reserve_vcc 0
; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0

View File

@@ -6,7 +6,7 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_next_free_sgpr 8
; ASM: .amdhsa_reserve_xnack_mask 1
; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
; ELF: .sgpr_count: 12
entry:
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void

View File

@@ -6,7 +6,7 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_next_free_sgpr 8
; ASM: .amdhsa_reserve_xnack_mask 0
; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 5
; ELF: .sgpr_count: 8
entry:
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void

View File

@@ -6,7 +6,7 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_next_free_sgpr 8
; ASM: .amdhsa_reserve_xnack_mask 1
; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
; ELF: .sgpr_count: 12
entry:
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void

View File

@@ -3,7 +3,7 @@
; CHECK-LABEL: __unnamed_1:
; CHECK: .set __unnamed_1.num_vgpr, 0
; CHECK: .set __unnamed_1.num_agpr, 0
; CHECK: .set __unnamed_1.numbered_sgpr, 32
; CHECK: .set __unnamed_1.numbered_sgpr, 0
; CHECK: .set __unnamed_1.private_seg_size, 0
; CHECK: .set __unnamed_1.uses_vcc, 0
; CHECK: .set __unnamed_1.uses_flat_scratch, 0
@@ -16,7 +16,7 @@ entry:
}
; CHECK-LABEL: __unnamed_2:
; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)

View File

@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
}
; GCN-LABEL: {{^}}f1024_0:
; GFX90A: NumVgprs: 32
; GFX90A: NumVgprs: 1
; GFX90A: NumAgprs: 1
; GFX90A: TotalNumVgprs: 33
; GFX90A: TotalNumVgprs: 5
define void @f1024_0() #1024 {
call void @foo()
ret void

View File

@@ -0,0 +1,30 @@
; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
target triple = "amdgcn-amd-amdhsa"
@global = addrspace(1) global i32 poison, align 4
; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
; Only hardware-initialized VGPRs (v0) are read in this kernel.
; CHECK-LABEL: amdhsa.kernels:
; CHECK-LABEL: kernel_x
; CHECK: .vgpr_count: 1
define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x()
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret void
}
; CHECK-LABEL: kernel_z
; PACKED: .vgpr_count: 1
; NOTPACKED: .vgpr_count: 3
define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.z()
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret void
}
attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

View File

@@ -0,0 +1,35 @@
; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
target triple = "amdgcn--amdpal"
@global = addrspace(1) global i32 poison, align 4
; CHECK-LABEL: amdpal.pipelines:
; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
; CHECK-LABEL: .entry_point_symbol: kernel_use
; CHECK: .vgpr_count: 0x20
define amdgpu_cs void @kernel_use([32 x i32] %args) {
entry:
%a = extractvalue [32 x i32] %args, 14
store i32 %a, ptr addrspace(1) @global
ret void
}
; Neither uses not writes a VGPR
; CHECK-LABEL: chain_func:
; CHECK: .vgpr_count: 0x1
define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
entry:
call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
unreachable
}
; Neither uses not writes a VGPR
; CHECK-LABEL: gfx_func:
; CHECK: .vgpr_count: 0x1
define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
entry:
ret [32 x i32] %args
}