Currently, the custom SGPR spill lowering pass spills SGPRs into physical VGPR lanes and the remaining VGPRs are used by regalloc for vector regclass allocation. This imposes many restrictions that we ended up with unsuccessful SGPR spilling when there won't be enough VGPRs and we are forced to spill the leftover into memory during PEI. The custom spill handling during PEI has many edge cases and often breaks the compiler time to time. This patch implements spilling SGPRs into virtual VGPR lanes. Since we now split the register allocation for SGPRs and VGPRs, the virtual registers introduced for the spill lanes would get allocated automatically in the subsequent regalloc invocation for VGPRs. Spill to virtual registers will always be successful, even in the high-pressure situations, and hence it avoids most of the edge cases during PEI. We are now left with only the custom SGPR spills during PEI for special registers like the frame pointer which is an unproblematic case. Differential Revision: https://reviews.llvm.org/D124196
473 lines
18 KiB
LLVM
473 lines
18 KiB
LLVM
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
|
target datalayout = "A5"
|
|
|
|
; FIXME: Why is this commuted only sometimes?
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
|
|
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GCN-NEXT: s_setpc_b64
|
|
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
|
|
%add0 = add i32 %arg0, %arg1
|
|
ret i32 %add0
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
|
|
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
|
|
; GCN: s_waitcnt vmcnt(0)
|
|
; GCN: s_setpc_b64
|
|
; GCN: ; ScratchSize: 68
|
|
define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%add0 = add i32 %arg0, %arg1
|
|
ret i32 %add0
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
|
|
define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
|
|
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
|
|
; GCN: s_setpc_b64
|
|
; GCN: ; ScratchSize: 68
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
|
|
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
|
|
; GCN: s_setpc_b64
|
|
; GCN: ; ScratchSize: 136
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
|
|
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret void
|
|
}
|
|
|
|
; It doesn't make sense to do a tail from a kernel
|
|
; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
|
|
;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 {
|
|
%arg1.load = load i32, ptr addrspace(5) %arg1, align 4
|
|
%add0 = add i32 %arg0, %arg1.load
|
|
ret i32 %add0
|
|
}
|
|
|
|
; Tail call disallowed with byval in parent.
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
|
|
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
|
|
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
|
|
; GCN: s_swappc_b64
|
|
; GCN-NOT: v_readlane_b32 s32
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Tail call disallowed with byval in parent, not callee. The stack
|
|
; usage of incoming arguments must be <= the outgoing stack
|
|
; arguments.
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
|
|
; GCN-NOT: v0
|
|
; GCN-NOT: s32
|
|
; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
|
|
; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
|
|
; GCN-NEXT: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5)))
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
|
|
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
|
|
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
|
|
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
|
|
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
|
|
|
|
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GFX9: v_add3_u32 v0, v0, v3, v2
|
|
|
|
; GCN-NEXT: s_setpc_b64
|
|
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
|
|
%val_firststack = extractvalue [32 x i32] %large, 30
|
|
%val_laststack = extractvalue [32 x i32] %large, 31
|
|
%add0 = add i32 %arg0, %arg1
|
|
%add1 = add i32 %add0, %val_firststack
|
|
%add2 = add i32 %add1, %val_laststack
|
|
ret i32 %add2
|
|
}
|
|
|
|
; FIXME: Why load and store same location for stack args?
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
|
|
|
|
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
|
|
; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8
|
|
|
|
; GCN-NOT: s32
|
|
|
|
; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
|
|
; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8
|
|
|
|
; GCN-NOT: s32
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
|
|
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; If the callee requires more stack argument space than the caller,
|
|
; don't do a tail call.
|
|
; TODO: Do we really need this restriction?
|
|
|
|
; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
|
|
; GCN: s_swappc_b64
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Have another non-tail in the function
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
|
|
; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
|
|
; GCN-NEXT: s_mov_b32 s33, s32
|
|
; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
|
|
; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NEXT: s_mov_b64 exec
|
|
; GCN-DAG: s_addk_i32 s32, 0x400
|
|
; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2
|
|
|
|
; GCN-DAG: s_getpc_b64 s[4:5]
|
|
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
|
|
; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0
|
|
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
|
; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1
|
|
|
|
|
|
; GCN: s_swappc_b64
|
|
|
|
; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
|
|
|
; GCN: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
|
|
; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1
|
|
; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0
|
|
; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2
|
|
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-NEXT: s_addk_i32 s32, 0xfc00
|
|
; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
%ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Have stack object in caller and stack passed arguments. SP should be
|
|
; in same place at function exit.
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
|
|
; GCN-NOT: s33
|
|
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
|
|
|
|
; GCN-NOT: s33
|
|
|
|
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
|
|
; GCN: s_setpc_b64 s[4:5]
|
|
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
|
|
; GCN-NOT: s33
|
|
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
|
|
|
|
; GCN-NOT: s33
|
|
; GCN: s_setpc_b64 s[4:5]
|
|
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
|
|
ret i32 %ret
|
|
}
|
|
|
|
@func_ptr_gv = external unnamed_addr addrspace(4) constant ptr, align 4
|
|
|
|
; Do support tail calls with a uniform, but unknown, callee.
|
|
; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32:
|
|
; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]]
|
|
; GCN: s_setpc_b64 [[FUNC_PTR]]
|
|
define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv
|
|
%ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; We can't support a tail call to a divergent target. Use a waterfall
|
|
; loop around a regular call
|
|
; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32:
|
|
; GCN: v_readfirstlane_b32
|
|
; GCN: v_readfirstlane_b32
|
|
; GCN: s_and_saveexec_b64
|
|
; GCN: s_swappc_b64
|
|
; GCN: s_cbranch_execnz
|
|
; GCN: s_setpc_b64
|
|
define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%add = add i32 %b, %c
|
|
%ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
|
|
ret i32 %ret
|
|
}
|
|
|
|
declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64]))
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_fastcc_multi_byval:
|
|
; GCN-DAG: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
|
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
|
|
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
|
|
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:160
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:164
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:168
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:172
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:16{{$}}
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:20{{$}}
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:24{{$}}
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:28{{$}}
|
|
|
|
; GCN: s_setpc_b64 [[TARGET_ADDR]]
|
|
define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
|
|
entry:
|
|
%alloca0 = alloca [3 x i32], align 16, addrspace(5)
|
|
%alloca1 = alloca [2 x i64], align 8, addrspace(5)
|
|
store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca0
|
|
store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca1
|
|
tail call fastcc void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) %alloca0, ptr addrspace(5) byval([2 x i64]) %alloca1)
|
|
ret void
|
|
}
|
|
|
|
declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) align 16, [32 x i32], i32)
|
|
|
|
; Callee has a byval and non-byval stack passed argument
|
|
; GCN-LABEL: {{^}}sibling_call_byval_and_stack_passed:
|
|
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}}
|
|
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16
|
|
|
|
; GCN: v_mov_b32_e32 v0, 0
|
|
; GCN: v_mov_b32_e32 v30, 0
|
|
|
|
; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]]
|
|
define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
|
|
entry:
|
|
%alloca = alloca [3 x i32], align 16, addrspace(5)
|
|
store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca
|
|
tail call fastcc void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) %alloca, [32 x i32] zeroinitializer, i32 %stack.out.arg)
|
|
ret void
|
|
}
|
|
|
|
declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i64_fastcc_i64:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
|
|
ret i64 %ret
|
|
}
|
|
|
|
declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_p1i8_fastcc_p1i8:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a)
|
|
ret ptr addrspace(1) %ret
|
|
}
|
|
|
|
declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i16_fastcc_i16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
|
|
ret i16 %ret
|
|
}
|
|
|
|
declare hidden fastcc half @f16_fastcc_f16(half %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_f16_fastcc_f16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc half @f16_fastcc_f16(half %a)
|
|
ret half %ret
|
|
}
|
|
|
|
declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_v3i16_fastcc_v3i16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
|
|
ret <3 x i16> %ret
|
|
}
|
|
|
|
declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_v4i16_fastcc_v4i16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
|
|
ret <4 x i16> %ret
|
|
}
|
|
|
|
declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_v2i64_fastcc_v2i64:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
|
|
ret <2 x i64> %ret
|
|
}
|
|
|
|
attributes #0 = { nounwind }
|
|
attributes #1 = { nounwind noinline }
|