Files
clang-p2996/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
Diana Picus 72c3c30452 [AMDGPU] Allocate scratch space for dVGPRs for CWSR (#130055)
The CWSR trap handler needs to save and restore the VGPRs. When dynamic
VGPRs are in use, the fixed function hardware will only allocate enough
space for one VGPR block. The rest will have to be stored in scratch, at
offset 0.

This patch allocates the necessary space by:
- generating a prologue that checks at runtime if we're on a compute
queue (since CWSR only works on compute queues); for this we will have
to check the ME_ID bits of the ID_HW_ID2 register - if that is non-zero,
we can assume we're on a compute queue and initialize the SP and FP with
enough room for the dynamic VGPRs
- forcing all compute entry functions to use a FP so they can access
their locals/spills correctly (this isn't ideal but it's the quickest to
implement)

Note that at the moment we allocate enough space for the theoretical
maximum number of VGPRs that can be allocated dynamically (for blocks of
16 registers, this will be 128, of which we subtract the first 16, which
are already allocated by the fixed function hardware). Future patches
may decide to allocate less if they can prove the shader never allocates
that many blocks.

Also note that this should not affect any reported stack sizes (e.g. PAL
backend_stack_size etc).
2025-03-19 13:49:19 +01:00

73 lines
2.0 KiB
LLVM

; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
; Simplest case: an entry compute shader with no locals or calls still gets
; scratch reserved for the dynamic VGPRs (112 regs * 4 bytes = 448 bytes,
; per the comment on the RUN line).
define amdgpu_cs void @amdgpu_cs() #0 {
; CHECK-LABEL: {{^}}name: amdgpu_cs
; CHECK: scratchReservedForDynamicVGPRs: 448
ret void
}
; Compute kernels are entry points on the compute queue too, so they get the
; same 448-byte reservation as amdgpu_cs shaders.
define amdgpu_kernel void @kernel() #0 {
; CHECK-LABEL: {{^}}name: kernel
; CHECK: scratchReservedForDynamicVGPRs: 448
ret void
}
; A stack object (alloca) must not change the amount reserved for dynamic
; VGPRs; locals are addressed above the reserved region.
define amdgpu_cs void @with_local() #0 {
; CHECK-LABEL: {{^}}name: with_local
; CHECK: scratchReservedForDynamicVGPRs: 448
%local = alloca i32, addrspace(5)
store volatile i8 13, ptr addrspace(5) %local
ret void
}
; A non-leaf entry function (local + outgoing call) still reserves exactly
; 448 bytes; callee stack use does not interact with the reserved region.
define amdgpu_cs void @with_calls() #0 {
; CHECK-LABEL: {{^}}name: with_calls
; CHECK: scratchReservedForDynamicVGPRs: 448
%local = alloca i32, addrspace(5)
store volatile i8 15, ptr addrspace(5) %local
call amdgpu_gfx void @callee(i32 71)
ret void
}
; The over-aligned alloca (align 128) forces stack realignment; the reserved
; size is 512 here rather than 448 — presumably 448 rounded up to the next
; multiple of the 128-byte stack alignment so the realigned frame starts on
; an aligned boundary (TODO confirm against the prologue-epilog code).
define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
; CHECK-LABEL: {{^}}name: realign_stack
; CHECK: scratchReservedForDynamicVGPRs: 512
%v = alloca <32 x i32>, align 128, addrspace(5)
store <32 x i32> %x, ptr addrspace(5) %v
call amdgpu_gfx void @callee(i32 71)
ret void
}
; Non-entry functions and graphics shaders can't run on a compute queue,
; so they don't need to worry about CWSR.
; Graphics shader (amdgpu_gs): runs on a graphics queue, so CWSR does not
; apply and no scratch is reserved, even with locals and calls present.
define amdgpu_gs void @amdgpu_gs() #0 {
; CHECK-LABEL: {{^}}name: amdgpu_gs
; CHECK: scratchReservedForDynamicVGPRs: 0
%local = alloca i32, addrspace(5)
store volatile i8 15, ptr addrspace(5) %local
call amdgpu_gfx void @callee(i32 71)
ret void
}
; Non-entry callable function (amdgpu_gfx): not a compute-queue entry point,
; so nothing is reserved — the entry function that calls it is responsible
; for the reservation.
define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-LABEL: {{^}}name: amdgpu_gfx
; CHECK: scratchReservedForDynamicVGPRs: 0
%local = alloca i32, addrspace(5)
store volatile i8 15, ptr addrspace(5) %local
call amdgpu_gfx void @callee(i32 71)
ret void
}
; Default (C) calling convention: a non-entry function, so no dynamic-VGPR
; scratch reservation is made.
define void @default() #0 {
; CHECK-LABEL: {{^}}name: default
; CHECK: scratchReservedForDynamicVGPRs: 0
ret void
}
declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind }