The CWSR trap handler needs to save and restore the VGPRs. When dynamic
VGPRs are in use, the fixed function hardware will only allocate enough
space for one VGPR block; the rest will have to be stored in scratch, at
offset 0.

This patch allocates the necessary space by:
- generating a prologue that checks at runtime if we're on a compute
  queue (since CWSR only works on compute queues); for this we have to
  check the ME_ID bits of the ID_HW_ID2 register, and if that is
  non-zero we can assume we're on a compute queue and initialize the SP
  and FP with enough room for the dynamic VGPRs
- forcing all compute entry functions to use a FP so they can access
  their locals/spills correctly (this isn't ideal, but it's the quickest
  to implement)

Note that at the moment we allocate enough space for the theoretical
maximum number of VGPRs that can be allocated dynamically: for blocks of
16 registers, this is 128, minus the first 16, which are already
allocated by the fixed function hardware, i.e. 112 registers. Future
patches may allocate less if they can prove the shader never allocates
that many blocks.

Also note that this should not affect any reported stack sizes (e.g. PAL
backend_stack_size etc.).
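For illustration only, the runtime check described above might look
roughly like this (a sketch, not the exact emitted code; it assumes
ME_ID occupies bits [9:8] of HW_ID2, that s32/s33 serve as SP/FP, and
that 448 bytes (0x1c0) are reserved):

    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) ; read the ME_ID bits
    s_cmp_lg_u32 s33, 0                          ; non-zero => compute queue
    s_cmov_b32 s32, 0x1c0                        ; if so, SP starts past the reserve
                                                 ; (FP would be set up similarly)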
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
; (112 = 128 maximum dynamic VGPRs minus the 16 already allocated by the hardware; 4 bytes per lane per VGPR.)

define amdgpu_cs void @amdgpu_cs() #0 {
; CHECK-LABEL: {{^}}name: amdgpu_cs
; CHECK: scratchReservedForDynamicVGPRs: 448
  ret void
}

define amdgpu_kernel void @kernel() #0 {
; CHECK-LABEL: {{^}}name: kernel
; CHECK: scratchReservedForDynamicVGPRs: 448
  ret void
}

define amdgpu_cs void @with_local() #0 {
; CHECK-LABEL: {{^}}name: with_local
; CHECK: scratchReservedForDynamicVGPRs: 448
  %local = alloca i32, addrspace(5)
  store volatile i8 13, ptr addrspace(5) %local
  ret void
}

define amdgpu_cs void @with_calls() #0 {
; CHECK-LABEL: {{^}}name: with_calls
; CHECK: scratchReservedForDynamicVGPRs: 448
  %local = alloca i32, addrspace(5)
  store volatile i8 15, ptr addrspace(5) %local
  call amdgpu_gfx void @callee(i32 71)
  ret void
}
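; The alloca below requires 128-byte stack alignment, so we expect the
; 448-byte reservation to be rounded up to the next multiple of 128, i.e. 512.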
define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
; CHECK-LABEL: {{^}}name: realign_stack
; CHECK: scratchReservedForDynamicVGPRs: 512
  %v = alloca <32 x i32>, align 128, addrspace(5)
  store <32 x i32> %x, ptr addrspace(5) %v
  call amdgpu_gfx void @callee(i32 71)
  ret void
}

; Non-entry functions and graphics shaders can't run on a compute queue,
; so they don't need to worry about CWSR.
define amdgpu_gs void @amdgpu_gs() #0 {
; CHECK-LABEL: {{^}}name: amdgpu_gs
; CHECK: scratchReservedForDynamicVGPRs: 0
  %local = alloca i32, addrspace(5)
  store volatile i8 15, ptr addrspace(5) %local
  call amdgpu_gfx void @callee(i32 71)
  ret void
}

define amdgpu_gfx void @amdgpu_gfx() #0 {
; CHECK-LABEL: {{^}}name: amdgpu_gfx
; CHECK: scratchReservedForDynamicVGPRs: 0
  %local = alloca i32, addrspace(5)
  store volatile i8 15, ptr addrspace(5) %local
  call amdgpu_gfx void @callee(i32 71)
  ret void
}

define void @default() #0 {
; CHECK-LABEL: {{^}}name: default
; CHECK: scratchReservedForDynamicVGPRs: 0
  ret void
}

declare amdgpu_gfx void @callee(i32) #0

attributes #0 = { nounwind }