CSR SGPR spilling currently uses the early available physical VGPRs. It currently imposes a high register pressure while trying to allocate large VGPR tuples within the default register budget. This patch changes the spilling strategy by picking the VGPRs in the reverse order, the highest available VGPR first and later after regalloc shift them back to the lowest available range. With that, the initial VGPRs would be available for allocation and possibility of finding large number of contiguous registers will be more.
473 lines
18 KiB
LLVM
473 lines
18 KiB
LLVM
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
|
target datalayout = "A5"
|
|
|
|
; FIXME: Why is this commuted only sometimes?
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
|
|
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GCN-NEXT: s_setpc_b64
|
|
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
|
|
%add0 = add i32 %arg0, %arg1
|
|
ret i32 %add0
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
|
|
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
|
|
; GCN: s_waitcnt vmcnt(0)
|
|
; GCN: s_setpc_b64
|
|
; GCN: ; ScratchSize: 68
|
|
define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%add0 = add i32 %arg0, %arg1
|
|
ret i32 %add0
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
|
|
define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
|
|
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
|
|
; GCN: s_setpc_b64
|
|
; GCN: ; ScratchSize: 68
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
|
|
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
|
|
; GCN: s_setpc_b64
|
|
; GCN: ; ScratchSize: 136
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
|
|
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret void
|
|
}
|
|
|
|
; It doesn't make sense to do a tail from a kernel
|
|
; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
|
|
;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
|
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
|
define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 {
|
|
%arg1.load = load i32, ptr addrspace(5) %arg1, align 4
|
|
%add0 = add i32 %arg0, %arg1.load
|
|
ret i32 %add0
|
|
}
|
|
|
|
; Tail call disallowed with byval in parent.
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
|
|
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
|
|
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
|
|
; GCN: s_swappc_b64
|
|
; GCN-NOT: v_readlane_b32 s32
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Tail call disallowed with byval in parent, not callee. The stack
|
|
; usage of incoming arguments must be <= the outgoing stack
|
|
; arguments.
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
|
|
; GCN-NOT: v0
|
|
; GCN-NOT: s32
|
|
; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
|
|
; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
|
|
; GCN-NEXT: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5)))
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
|
|
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
|
|
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
|
|
|
|
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
|
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
|
|
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
|
|
|
|
|
|
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
|
; GFX9: v_add3_u32 v0, v0, v3, v2
|
|
|
|
; GCN-NEXT: s_setpc_b64
|
|
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
|
|
%val_firststack = extractvalue [32 x i32] %large, 30
|
|
%val_laststack = extractvalue [32 x i32] %large, 31
|
|
%add0 = add i32 %arg0, %arg1
|
|
%add1 = add i32 %add0, %val_firststack
|
|
%add2 = add i32 %add1, %val_laststack
|
|
ret i32 %add2
|
|
}
|
|
|
|
; FIXME: Why load and store same location for stack args?
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
|
|
|
|
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
|
|
; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8
|
|
|
|
; GCN-NOT: s32
|
|
|
|
; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
|
|
; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8
|
|
|
|
; GCN-NOT: s32
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
|
|
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; If the callee requires more stack argument space than the caller,
|
|
; don't do a tail call.
|
|
; TODO: Do we really need this restriction?
|
|
|
|
; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
|
|
; GCN: s_swappc_b64
|
|
; GCN: s_setpc_b64
|
|
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Have another non-tail in the function
|
|
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
|
|
; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
|
|
; GCN-NEXT: s_mov_b32 s33, s32
|
|
; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
|
|
; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NEXT: s_mov_b64 exec
|
|
; GCN-DAG: s_addk_i32 s32, 0x400
|
|
; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2
|
|
|
|
; GCN-DAG: s_getpc_b64 s[4:5]
|
|
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
|
; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
|
|
|
|
; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0
|
|
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
|
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1
|
|
|
|
|
|
; GCN: s_swappc_b64
|
|
|
|
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
|
|
|
; GCN: s_getpc_b64 s[4:5]
|
|
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
|
|
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
|
|
; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1
|
|
; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0
|
|
; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2
|
|
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
|
|
; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NEXT: s_mov_b64 exec, s[8:9]
|
|
; GCN-NEXT: s_addk_i32 s32, 0xfc00
|
|
; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
|
|
; GCN-NEXT: s_setpc_b64 s[4:5]
|
|
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
|
%ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; Have stack object in caller and stack passed arguments. SP should be
|
|
; in same place at function exit.
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
|
|
; GCN-NOT: s33
|
|
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
|
|
|
|
; GCN-NOT: s33
|
|
|
|
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
|
|
; GCN: s_setpc_b64 s[4:5]
|
|
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
|
|
; GCN-NOT: s33
|
|
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
|
|
|
|
; GCN-NOT: s33
|
|
; GCN: s_setpc_b64 s[4:5]
|
|
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
|
|
entry:
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
%gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
|
|
store volatile i32 9, ptr addrspace(5) %gep
|
|
%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
|
|
ret i32 %ret
|
|
}
|
|
|
|
@func_ptr_gv = external unnamed_addr addrspace(4) constant ptr, align 4
|
|
|
|
; Do support tail calls with a uniform, but unknown, callee.
|
|
; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32:
|
|
; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]]
|
|
; GCN: s_setpc_b64 [[FUNC_PTR]]
|
|
define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv
|
|
%ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
|
|
ret i32 %ret
|
|
}
|
|
|
|
; We can't support a tail call to a divergent target. Use a waterfall
|
|
; loop around a regular call
|
|
; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32:
|
|
; GCN: v_readfirstlane_b32
|
|
; GCN: v_readfirstlane_b32
|
|
; GCN: s_and_saveexec_b64
|
|
; GCN: s_swappc_b64
|
|
; GCN: s_cbranch_execnz
|
|
; GCN: s_setpc_b64
|
|
define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
|
|
entry:
|
|
%add = add i32 %b, %c
|
|
%ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
|
|
ret i32 %ret
|
|
}
|
|
|
|
declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64]))
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_fastcc_multi_byval:
|
|
; GCN-DAG: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
|
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
|
|
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
|
|
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:160
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:164
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:168
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:172
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:16{{$}}
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:20{{$}}
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:24{{$}}
|
|
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:28{{$}}
|
|
|
|
; GCN: s_setpc_b64 [[TARGET_ADDR]]
|
|
define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
|
|
entry:
|
|
%alloca0 = alloca [3 x i32], align 16, addrspace(5)
|
|
%alloca1 = alloca [2 x i64], align 8, addrspace(5)
|
|
store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca0
|
|
store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca1
|
|
tail call fastcc void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) %alloca0, ptr addrspace(5) byval([2 x i64]) %alloca1)
|
|
ret void
|
|
}
|
|
|
|
declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) align 16, [32 x i32], i32)
|
|
|
|
; Callee has a byval and non-byval stack passed argument
|
|
; GCN-LABEL: {{^}}sibling_call_byval_and_stack_passed:
|
|
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
|
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
|
|
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}}
|
|
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16
|
|
|
|
; GCN: v_mov_b32_e32 v0, 0
|
|
; GCN: v_mov_b32_e32 v30, 0
|
|
|
|
; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]]
|
|
define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
|
|
entry:
|
|
%alloca = alloca [3 x i32], align 16, addrspace(5)
|
|
store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca
|
|
tail call fastcc void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) %alloca, [32 x i32] zeroinitializer, i32 %stack.out.arg)
|
|
ret void
|
|
}
|
|
|
|
declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i64_fastcc_i64:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
|
|
ret i64 %ret
|
|
}
|
|
|
|
declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_p1i8_fastcc_p1i8:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a)
|
|
ret ptr addrspace(1) %ret
|
|
}
|
|
|
|
declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_i16_fastcc_i16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
|
|
ret i16 %ret
|
|
}
|
|
|
|
declare hidden fastcc half @f16_fastcc_f16(half %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_f16_fastcc_f16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc half @f16_fastcc_f16(half %a)
|
|
ret half %ret
|
|
}
|
|
|
|
declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_v3i16_fastcc_v3i16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
|
|
ret <3 x i16> %ret
|
|
}
|
|
|
|
declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_v4i16_fastcc_v4i16:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
|
|
ret <4 x i16> %ret
|
|
}
|
|
|
|
declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)
|
|
|
|
; GCN-LABEL: {{^}}sibling_call_v2i64_fastcc_v2i64:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: s_getpc_b64
|
|
; GCN-NEXT: s_add_u32
|
|
; GCN-NEXT: s_addc_u32
|
|
; GCN-NEXT: s_setpc_b64
|
|
define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
|
|
entry:
|
|
%ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
|
|
ret <2 x i64> %ret
|
|
}
|
|
|
|
attributes #0 = { nounwind }
|
|
attributes #1 = { nounwind noinline }
|