Unlike the callee-saved VGPR spill instructions emitted by `PEI::spillCalleeSavedRegs`, the CS VGPR spills inserted during emitPrologue/emitEpilogue require flipping the exec bits to avoid clobbering the inactive lanes of the VGPRs used for SGPR spilling. Currently, these spill instructions are referenced from the SP at function entry, and when the function performs stack realignment they end up with incorrect stack offsets. Even if we tried to adjust the offsets, the FP-SP delta becomes a runtime quantity under dynamic stack realignment, so the offsets would still be inaccurate. To fix this, use FP as the frame base in the spill instructions whenever the function has an FP; the offsets obtained for the CS objects are then always correct relative to FP.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D134949
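For illustration only, a minimal hand-written sketch of the kind of prologue spill this change affects (not output of this test; it assumes s33 as FP, s32 as SP, and s[0:3] as the scratch descriptor, with a hypothetical offset):

    s_or_saveexec_b64 s[4:5], -1                      ; force all lanes on so inactive lanes get spilled too
    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; CS VGPR spill, FP(s33)-relative rather than SP(s32)-relative
    s_mov_b64 exec, s[4:5]                            ; restore the original exec mask

With dynamic stack realignment, FP-SP is only known at runtime, so only FP-relative offsets to the CS objects remain correct.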
; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s

declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0

@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4

define amdgpu_gfx float @no_stack(float %arg0) #0 {
  %add = fadd float %arg0, 1.0
  ret float %add
}

define amdgpu_gfx float @simple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  ret float %add
}

define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  %stack2 = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack2
  %val2 = load volatile float, float addrspace(5)* %stack2
  %add2 = fadd float %add, %val2
  ret float %add2
}

define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
bb0:
  %cmp = fcmp ogt float %arg0, 0.0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  br label %bb2

bb2:
  %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
  ret float %res
}

define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
bb0:
  br label %bb1

bb1:
  %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %add = fadd float %arg0, %val
  %cmp = icmp sgt i32 %ctr, 0
  %newctr = sub i32 %ctr, 1
  br i1 %cmp, label %bb1, label %bb2

bb2:
  ret float %add
}

define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  ret float %res
}

define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  ret float %res
}

define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
  ret float %res
}

define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
  %fptr = load void()*, void()* addrspace(4)* @funcptr
  call amdgpu_gfx void %fptr()
  ret float %arg0
}

define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %fptr = load void()*, void()* addrspace(4)* @funcptr
  call amdgpu_gfx void %fptr()
  %add = fadd float %arg0, %val
  ret float %add
}

define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, float addrspace(5)* %stack
  %val = load volatile float, float addrspace(5)* %stack
  %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
  %add = fadd float %res, %val
  ret float %add
}

@lds = internal addrspace(3) global [64 x float] undef

define amdgpu_gfx float @simple_lds(float %arg0) #0 {
  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
  %val = load float, float addrspace(3)* %lds_ptr
  ret float %val
}

define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
  %val = load float, float addrspace(3)* %lds_ptr
  %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
  ret float %res
}

attributes #0 = { nounwind }

; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x28{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; SDAG-NEXT: .vgpr_count: 0x2{{$}}
; GISEL-NEXT: .vgpr_count: 0x3{{$}}
; GCN-NEXT: dynamic_stack_loop:
; GCN-NEXT: .lds_size: 0{{$}}
; SDAG-NEXT: .sgpr_count: 0x25{{$}}
; GISEL-NEXT: .sgpr_count: 0x26{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; SDAG-NEXT: .vgpr_count: 0x3{{$}}
; GISEL-NEXT: .vgpr_count: 0x4{{$}}
; GCN-NEXT: multiple_stack:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x21{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x24{{$}}
; GCN-NEXT: .vgpr_count: 0x3{{$}}
; GCN-NEXT: no_stack:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x20{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT: .vgpr_count: 0x1{{$}}
; GCN-NEXT: no_stack_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x25{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT: .vgpr_count: 0x3{{$}}
; GCN-NEXT: no_stack_extern_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: no_stack_extern_call_many_args:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: no_stack_indirect_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: simple_lds:
; GCN-NEXT: .lds_size: 0x100{{$}}
; GCN-NEXT: .sgpr_count: 0x20{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
; GCN-NEXT: .vgpr_count: 0x1{{$}}
; GCN-NEXT: simple_lds_recurse:
; GCN-NEXT: .lds_size: 0x100{{$}}
; GCN-NEXT: .sgpr_count: 0x28{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; GCN-NEXT: .vgpr_count: 0x2a{{$}}
; GCN-NEXT: simple_stack:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x21{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}}
; GCN-NEXT: .vgpr_count: 0x2{{$}}
; GCN-NEXT: simple_stack_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x25{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT: .vgpr_count: 0x4{{$}}
; GCN-NEXT: simple_stack_extern_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: simple_stack_indirect_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x30{{$}}
; GCN-NEXT: .vgpr_count: 0x2c{{$}}
; GCN-NEXT: simple_stack_recurse:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x28{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: ...