Unlike the callee-saved VGPR spill instructions emitted by `PEI::spillCalleeSavedRegs`, the CS VGPR spills inserted during emitPrologue/emitEpilogue require flipping the exec bits to avoid clobbering the inactive lanes of VGPRs used for SGPR spilling. Currently, these spill instructions are referenced from the SP at function entry, and when the callee performs a stack realignment they end up with incorrect stack offsets. Even if we try to adjust the offsets, FP-SP becomes a runtime quantity under dynamic stack realignment, so the offsets would still be inaccurate. To fix this, use FP as the frame base in the spill instructions whenever the function has an FP. The offsets obtained for the CS objects are then always the correct values relative to FP. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D134949
1188 lines
47 KiB
LLVM
1188 lines
47 KiB
LLVM
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s
|
|
|
|
; test_vopc_i32
; Each work-item loads the i32 at %arg[workitem.id.x], selects 1 when the
; value is signed-greater-than 0 and 2 otherwise, and stores the result back
; to the same address. The checks expect the compare to write vcc_lo under the
; GFX1032 prefix and the full vcc under the GFX1064 prefix, with v_cndmask
; reading that same mask.
; GCN-LABEL: {{^}}test_vopc_i32:
; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc_lo
; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc{{$}}
define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %cmp = icmp sgt i32 %load, 0
  %sel = select i1 %cmp, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %gep, align 4
  ret void
}
|
|
|
|
; test_vopc_f32
; Float counterpart of the i32 select test: loads the float at
; %arg[workitem.id.x], selects 1.0 when it compares "ugt" against 0.0 and 2.0
; otherwise, then stores the result in place. The checks expect a
; v_cmp_nge_f32 writing vcc_lo (GFX1032 prefix) or vcc (GFX1064 prefix).
; GCN-LABEL: {{^}}test_vopc_f32:
; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc_lo
; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc{{$}}
define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
  %load = load float, ptr addrspace(1) %gep, align 4
  %cmp = fcmp ugt float %load, 0.0
  %sel = select i1 %cmp, float 1.0, float 2.0
  store float %sel, ptr addrspace(1) %gep, align 4
  ret void
}
|
|
|
|
; test_vopc_vcmp
; Pixel-shader entry that feeds the result of "fcmp oge %x, 0.0" straight into
; llvm.amdgcn.kill. The checks expect the compare to be emitted as
; v_cmp_nle_f32 writing vcc_lo (GFX1032 prefix) or vcc (GFX1064 prefix).
; GCN-LABEL: {{^}}test_vopc_vcmp:
; GFX1032: v_cmp_nle_f32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1064: v_cmp_nle_f32_e32 vcc, 0, v{{[0-9]+}}
define amdgpu_ps void @test_vopc_vcmp(float %x) {
  %cmp = fcmp oge float %x, 0.0
  call void @llvm.amdgcn.kill(i1 %cmp)
  ret void
}
|
|
|
|
; test_vopc_2xf16
; Loads a <2 x half> at %arg[workitem.id.x], compares element 1 "ugt" against
; 0.0 and, when true, replaces the whole vector with <1.0, 1.0>; otherwise the
; loaded vector is stored back unchanged. The checks expect an SDWA compare
; selecting WORD_1 whose destination (captured as SC) is either the VCC
; register or an SGPR of the matching width, and a v_cndmask using SC.
; GCN-LABEL: {{^}}test_vopc_2xf16:
; GFX1032: v_cmp_le_f16_sdwa [[SC:vcc_lo|s[0-9]+]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD
; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
; GFX1064: v_cmp_le_f16_sdwa [[SC:vcc|s\[[0-9:]+\]]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD
; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
  %load = load <2 x half>, ptr addrspace(1) %gep, align 4
  %elt = extractelement <2 x half> %load, i32 1
  %cmp = fcmp ugt half %elt, 0.0
  %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load
  store <2 x half> %sel, ptr addrspace(1) %gep, align 4
  ret void
}
|
|
|
|
; test_vopc_class
; Stores zext(fabs(%x) == +infinity) to %out; 0x7FF0000000000000 is the
; double-format hex spelling of +infinity. The checks expect the comparison to
; be emitted as v_cmp_class_f32 with class mask 0x204, writing a mask register
; (captured as C) that v_cndmask then turns into 0 or 1.
; GCN-LABEL: {{^}}test_vopc_class:
; GFX1032: v_cmp_class_f32_e64 [[C:vcc_lo|s[0-9:]+]], s{{[0-9]+}}, 0x204
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]
; GFX1064: v_cmp_class_f32_e64 [[C:vcc|s\[[0-9:]+\]]], s{{[0-9]+}}, 0x204
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]{{$}}
define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 {
  %fabs = tail call float @llvm.fabs.f32(float %x)
  %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
  %ext = zext i1 %cmp to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; test_vcmp_vcnd_f16
; Stores 1.0 to %out when the half argument %x compares "oeq" with the
; constant 0x7FF0000000000000 (+infinity in double-format hex), and %x itself
; otherwise. The checks expect v_cmp_neq_f16 against the literal 0x7c00 and a
; v_cndmask that consumes the resulting mask (captured as C).
; GCN-LABEL: {{^}}test_vcmp_vcnd_f16:
; GFX1032: v_cmp_neq_f16_e64 [[C:vcc_lo|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}}
; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]

; GFX1064: v_cmp_neq_f16_e64 [[C:vcc|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}}
; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]{{$}}
define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 {
  %cmp = fcmp oeq half %x, 0x7FF0000000000000
  %sel = select i1 %cmp, half 1.0, half %x
  store half %sel, ptr addrspace(1) %out, align 2
  ret void
}
|
|
|
|
; test_vop3_cmp_f32_sop_and
; Loads the float at %arg[workitem.id.x] and selects 1.0 when it is both
; "ugt 0.0" and "ult 1.0", else 2.0, storing the result in place. The checks
; expect one compare into VCC, a second compare into an SGPR mask (C2), and a
; scalar AND of the two whose width is 32 bits under the GFX1032 prefix and
; 64 bits under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_vop3_cmp_f32_sop_and:
; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cmp_nle_f32_e64 [[C2:s[0-9]+]], 1.0, v{{[0-9]+}}
; GFX1032: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]]
; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cmp_nle_f32_e64 [[C2:s\[[0-9:]+\]]], 1.0, v{{[0-9]+}}
; GFX1064: s_and_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]]
define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
  %load = load float, ptr addrspace(1) %gep, align 4
  %cmp = fcmp ugt float %load, 0.0
  %cmp2 = fcmp ult float %load, 1.0
  %and = and i1 %cmp, %cmp2
  %sel = select i1 %and, float 1.0, float 2.0
  store float %sel, ptr addrspace(1) %gep, align 4
  ret void
}
|
|
|
|
; test_vop3_cmp_i32_sop_xor
; Loads the i32 at %arg[workitem.id.x] and selects 1 when exactly one of
; "sgt 0" and "slt 1" holds (i1 xor), else 2, storing the result in place.
; The checks expect the two compare masks to be combined with s_xor_b32 under
; the GFX1032 prefix and s_xor_b64 under the GFX1064 prefix. The FileCheck
; variable is named AND in the checks even though it captures the xor result.
; GCN-LABEL: {{^}}test_vop3_cmp_i32_sop_xor:
; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}}
; GFX1032: v_cmp_gt_i32_e64 [[C2:s[0-9]+]], 1, v{{[0-9]+}}
; GFX1032: s_xor_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}}
; GFX1064: v_cmp_gt_i32_e64 [[C2:s\[[0-9:]+\]]], 1, v{{[0-9]+}}
; GFX1064: s_xor_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %cmp = icmp sgt i32 %load, 0
  %cmp2 = icmp slt i32 %load, 1
  %xor = xor i1 %cmp, %cmp2
  %sel = select i1 %xor, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %gep, align 4
  ret void
}
|
|
|
|
; test_vop3_cmp_u32_sop_or
; Loads the i32 at %arg[workitem.id.x] and selects 1 when it is unsigned
; greater than 3 or unsigned less than 2, else 2, storing the result in place.
; The checks expect the two compare masks to be combined with s_or_b32 under
; the GFX1032 prefix and s_or_b64 under the GFX1064 prefix. The FileCheck
; variable is named AND in the checks even though it captures the or result.
; GCN-LABEL: {{^}}test_vop3_cmp_u32_sop_or:
; GFX1032: v_cmp_lt_u32_e32 vcc_lo, 3, v{{[0-9]+}}
; GFX1032: v_cmp_gt_u32_e64 [[C2:s[0-9]+]], 2, v{{[0-9]+}}
; GFX1032: s_or_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
; GFX1064: v_cmp_lt_u32_e32 vcc, 3, v{{[0-9]+}}
; GFX1064: v_cmp_gt_u32_e64 [[C2:s\[[0-9:]+\]]], 2, v{{[0-9]+}}
; GFX1064: s_or_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %cmp = icmp ugt i32 %load, 3
  %cmp2 = icmp ult i32 %load, 2
  %or = or i1 %cmp, %cmp2
  %sel = select i1 %or, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %gep, align 4
  ret void
}
|
|
|
|
; test_mask_if
; Simple if-region: the branch condition is workitem.id.x > 10 (unsigned);
; the taken path stores 0 to %arg. The checks expect the branch to be lowered
; to s_and_saveexec on VCC (b32 under the GFX1032 prefix, b64 under the
; GFX1064 prefix) followed by s_cbranch_execz.
; GCN-LABEL: {{^}}test_mask_if:
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: s_cbranch_execz
define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %cmp = icmp ugt i32 %lid, 10
  br i1 %cmp, label %if, label %endif

if:
  store i32 0, ptr addrspace(1) %arg, align 4
  br label %endif

endif:
  ret void
}
|
|
|
|
; test_loop_with_if
; Loop containing a nested conditional:
;   bb2  - loop header; continues to bb5 while the counter %tmp3 is less than
;          workitem.id.x, otherwise goes to bb11.
;   bb5  - loads %arg[%tmp3]; values > 10 go to bb10, the rest to bb11.
;   bb10 - stores the work-item id to the loaded address.
;   bb11 - computes %tmp3 / 2 (signed).
;   bb13 - merges both paths, adds 1, and loops while the merged value < 255.
; The checks follow the exec-mask bookkeeping for this control flow and expect
; b32 scalar mask operations on exec_lo/vcc_lo under the GFX1032 prefix and
; b64 operations on exec/vcc under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_loop_with_if:
; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}}
; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
; GCN: s_cbranch_execz
; GCN: .LBB{{.*}}:
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: s_cbranch_execz
; GCN: ; %bb.{{[0-9]+}}:
; GCN: .LBB{{.*}}:
; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}}
; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
; GCN: ; %bb.{{[0-9]+}}:
; GCN: ; %bb.{{[0-9]+}}:
; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}}
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}]
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
; GCN: s_cbranch_execz .LBB
; GCN: ; %bb.{{[0-9]+}}:
; GCN: .LBB{{.*}}:
; GCN: s_endpgm
define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb2

bb1:
  ret void

bb2:
  %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ]
  %tmp4 = icmp slt i32 %tmp3, %tmp
  br i1 %tmp4, label %bb5, label %bb11

bb5:
  %tmp6 = sext i32 %tmp3 to i64
  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
  %tmp8 = load i32, ptr addrspace(1) %tmp7, align 4
  %tmp9 = icmp sgt i32 %tmp8, 10
  br i1 %tmp9, label %bb10, label %bb11

bb10:
  store i32 %tmp, ptr addrspace(1) %tmp7, align 4
  br label %bb13

bb11:
  %tmp12 = sdiv i32 %tmp3, 2
  br label %bb13

bb13:
  %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ]
  %tmp15 = add nsw i32 %tmp14, 1
  %tmp16 = icmp slt i32 %tmp14, 255
  br i1 %tmp16, label %bb2, label %bb1
}
|
|
|
|
; test_loop_with_if_else_break
; Loop with two exits to %.loopexit:
;   bb         - skips the loop entirely when workitem.id.x is 0.
;   bb2        - loads %arg[%tmp3]; leaves the loop unless the value is > 10.
;   bb8        - stores the work-item id, increments the counter, and keeps
;                looping only while the counter is < 256 and < the id.
; The checks track the mask registers that accumulate the break conditions
; (MASK0, MASK1, TMP0, ACC) and expect b32 scalar operations on exec_lo/vcc_lo
; under the GFX1032 prefix and b64 operations on exec/vcc under the GFX1064
; prefix.
; GCN-LABEL: {{^}}test_loop_with_if_else_break:
; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
; GCN: s_cbranch_execz
; GCN: ; %bb.{{[0-9]+}}: ; %.preheader
; GCN: .LBB{{.*}}:

; GCN: global_store_dword
; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc
; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo
; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec
; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo
; GFX1064: s_and_b64 [[MASK0]], [[MASK0]], exec
; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]]
; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]]
; GCN: .LBB{{.*}}: ; %Flow
; GFX1032: s_and_b32 [[TMP0:s[0-9]+]], exec_lo, [[MASK1]]
; GFX1064: s_and_b64 [[TMP0:s\[[0-9:]+\]]], exec, [[MASK1]]
; GFX1032: s_or_b32 [[ACC:s[0-9]+]], [[TMP0]], [[ACC]]
; GFX1064: s_or_b64 [[ACC:s\[[0-9:]+\]]], [[TMP0]], [[ACC]]
; GFX1032: s_andn2_b32 exec_lo, exec_lo, [[ACC]]
; GFX1064: s_andn2_b64 exec, exec, [[ACC]]
; GCN: s_cbranch_execz
; GCN: .LBB{{.*}}:

; GFX1032-DAG: s_or_b32 [[MASK1]], [[MASK1]], exec_lo
; GFX1064-DAG: s_or_b64 [[MASK1]], [[MASK1]], exec
; GCN-DAG: global_load_dword [[LOAD:v[0-9]+]]
; GFX1032: v_cmp_gt_i32_e32 vcc_lo, 11, [[LOAD]]
; GFX1064: v_cmp_gt_i32_e32 vcc, 11, [[LOAD]]
define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp eq i32 %tmp, 0
  br i1 %tmp1, label %.loopexit, label %.preheader

.preheader:
  br label %bb2

bb2:
  %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ]
  %tmp4 = zext i32 %tmp3 to i64
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 10
  br i1 %tmp7, label %bb8, label %.loopexit

bb8:
  store i32 %tmp, ptr addrspace(1) %tmp5, align 4
  %tmp9 = add nuw nsw i32 %tmp3, 1
  %tmp10 = icmp ult i32 %tmp9, 256
  %tmp11 = icmp ult i32 %tmp9, %tmp
  %tmp12 = and i1 %tmp10, %tmp11
  br i1 %tmp12, label %bb2, label %.loopexit

.loopexit:
  ret void
}
|
|
|
|
; test_addc_vop2b
; 64-bit add of the i64 at %arg[workitem.id.x] and the kernel argument %arg1,
; stored back in place. The checks expect the add to be split into
; v_add_co_u32 plus v_add_co_ci_u32 with the carry carried in vcc_lo under the
; GFX1032 prefix and in vcc under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_addc_vop2b:
; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, vcc_lo
; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  %tmp5 = add nsw i64 %tmp4, %arg1
  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
  ret void
}
|
|
|
|
; test_subbrev_vop2b
; 64-bit subtract "loaded value - %arg1" on the i64 at %arg[workitem.id.x],
; stored back in place. The checks expect v_sub_co_u32 to produce a borrow in
; a mask register (captured as A0) that the following v_subrev_co_ci_u32
; consumes; the register class is 32-bit under the GFX1032 prefix and 64-bit
; under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_subbrev_vop2b:
; GFX1032: v_sub_co_u32 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
; GFX1064: v_sub_co_u32 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  %tmp5 = sub nsw i64 %tmp4, %arg1
  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
  ret void
}
|
|
|
|
; test_subb_vop2b
; Same shape as the subbrev test but with the operands swapped: computes
; "%arg1 - loaded value" on the i64 at %arg[workitem.id.x] and stores it back.
; The checks expect v_sub_co_u32 followed by v_sub_co_ci_u32 (non-reversed
; form), with the borrow captured as A0.
; GCN-LABEL: {{^}}test_subb_vop2b:
; GFX1032: v_sub_co_u32 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
; GFX1064: v_sub_co_u32 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  %tmp5 = sub nsw i64 %arg1, %tmp4
  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
  ret void
}
|
|
|
|
; test_udiv64
; Unsigned 64-bit division: divides the i64 at %arg[1] by the i64 at %arg[0]
; and stores the quotient to %arg[2]. The checks only pin the sequence of
; scalar s_add_u32 / s_addc_u32 instructions in the expanded division and use
; the common GCN prefix, so the same sequence is expected for every RUN line.
; GCN-LABEL: {{^}}test_udiv64:
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_addc_u32 s{{[0-9]+}}, 0, s{{[0-9]+}}
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_addc_u32 s{{[0-9]+}}, 0, s{{[0-9]+}}
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
bb:
  %tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1
  %tmp1 = load i64, ptr addrspace(1) %tmp, align 8
  %tmp2 = load i64, ptr addrspace(1) %arg, align 8
  %tmp3 = udiv i64 %tmp1, %tmp2
  %tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 2
  store i64 %tmp3, ptr addrspace(1) %tmp4, align 8
  ret void
}
|
|
|
|
; test_div_scale_f32
; Calls llvm.amdgcn.div.scale.f32 on two volatile-loaded neighbouring floats
; (%in[tid] and %in[tid + 1]) with the i1 operand set to false, and stores
; only the float member of the returned { float, i1 } pair. The checks expect
; the instruction's second (mask) destination to be a single SGPR under the
; GFX1032 prefix and an SGPR range under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_div_scale_f32:
; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1

  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
  %b = load volatile float, ptr addrspace(1) %gep.1, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; test_div_scale_f64
; Double-precision variant of the div.scale test: calls
; llvm.amdgcn.div.scale.f64 on two volatile-loaded neighbouring doubles with
; the i1 operand set to true and stores only the double member of the result.
; %aptr is accepted but not used by the body. The checks expect the mask
; destination to be a single SGPR under the GFX1032 prefix and an SGPR range
; under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_div_scale_f64:
; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1

  %a = load volatile double, ptr addrspace(1) %gep.0, align 8
  %b = load volatile double, ptr addrspace(1) %gep.1, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, ptr addrspace(1) %out, align 8
  ret void
}
|
|
|
|
; test_mad_i64_i32
; Returns sext(%arg0) * sext(%arg1) + %arg2 as an i64. The checks expect a
; single v_mad_i64_i32 whose second (scalar) destination is one SGPR under the
; GFX1032 prefix and an SGPR range under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_mad_i64_i32:
; GFX1032: v_mad_i64_i32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX1064: v_mad_i64_i32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
  %sext0 = sext i32 %arg0 to i64
  %sext1 = sext i32 %arg1 to i64
  %mul = mul i64 %sext0, %sext1
  %mad = add i64 %mul, %arg2
  ret i64 %mad
}
|
|
|
|
; test_mad_u64_u32
; Returns zext(%arg0) * zext(%arg1) + %arg2 as an i64. The checks expect a
; single v_mad_u64_u32 whose second (scalar) destination is one SGPR under the
; GFX1032 prefix and an SGPR range under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_mad_u64_u32:
; GFX1032: v_mad_u64_u32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX1064: v_mad_u64_u32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
  ; The operands are zero-extended here, so the locals are named zext*.
  %zext0 = zext i32 %arg0 to i64
  %zext1 = zext i32 %arg1 to i64
  %mul = mul i64 %zext0, %zext1
  %mad = add i64 %mul, %arg2
  ret i64 %mad
}
|
|
|
|
; test_div_fmas_f32
; Calls llvm.amdgcn.div.fmas.f32 with three float kernel arguments and an i1
; kernel argument %d as the condition, then stores the result to %out. The
; checks expect bit 0 of the scalar argument to be tested with s_bitcmp1_b32
; and materialised into vcc_lo (GFX1032 prefix) or vcc (GFX1064 prefix) with
; s_cselect before v_div_fmas_f32.
; GCN-LABEL: {{^}}test_div_fmas_f32:
; GCN: s_bitcmp1_b32 s{{[0-9]+}}, 0
; GFX1032: s_cselect_b32 vcc_lo, -1, 0
; GFX1064: s_cselect_b64 vcc, -1, 0
; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i1 %d) nounwind {
  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}
|
|
|
|
; test_div_fmas_f64
; Double-precision variant of the div.fmas test: calls
; llvm.amdgcn.div.fmas.f64 with three double kernel arguments and the i1
; kernel argument %d, then stores the result to %out. The checks expect the
; same s_bitcmp1_b32 / s_cselect sequence into vcc_lo (GFX1032 prefix) or vcc
; (GFX1064 prefix) before v_div_fmas_f64.
; GCN-LABEL: {{^}}test_div_fmas_f64:
; GCN: s_bitcmp1_b32 s{{[0-9]+}}, 0
; GFX1032: s_cselect_b32 vcc_lo, -1, 0
; GFX1064: s_cselect_b64 vcc, -1, 0
; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
|
|
|
|
; test_div_fmas_f32_i1_phi_vcc
; The div.fmas condition comes from a phi: it is false when control arrives
; directly from entry, and "volatile load from %dummy != 0" when it arrives
; through %bb (taken only when workitem.id.x is 0). The three float operands
; are loaded from %in[tid], %in[tid + 1] and %in[tid + 2]; the result is
; stored to %out[2]. The checks expect VCC to be zero-initialised, exec to be
; saved and restored around the conditional block, and the compare inside the
; block to write the same VCC register that was initialised.
; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
; GFX1032: s_mov_b32 [[VCC:vcc_lo]], 0{{$}}
; GFX1064: s_mov_b64 [[VCC:vcc]], 0{{$}}
; GFX1032: s_and_saveexec_b32 [[SAVE:s[0-9]+]], s{{[0-9]+}}{{$}}
; GFX1064: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], s[{{[0-9:]+}}]{{$}}

; GCN: load_dword [[LOAD:v[0-9]+]]
; GCN: v_cmp_ne_u32_e32 [[VCC]], 0, [[LOAD]]

; GCN: .LBB{{[0-9_]+}}:
; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE]]
; GFX1064: s_or_b64 exec, exec, [[SAVE]]
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
  %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
  %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
  %gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2

  %a = load float, ptr addrspace(1) %gep.a
  %b = load float, ptr addrspace(1) %gep.b
  %c = load float, ptr addrspace(1) %gep.c

  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %bb, label %exit

bb:
  %val = load volatile i32, ptr addrspace(1) %dummy
  %cmp1 = icmp ne i32 %val, 0
  br label %exit

exit:
  %cond = phi i1 [false, %entry], [%cmp1, %bb]
  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
  store float %result, ptr addrspace(1) %gep.out, align 4
  ret void
}
|
|
|
|
; fdiv_f32
; Plain "fdiv float %a, %b" on two kernel arguments, stored to %out. The
; checks expect the expanded division sequence: a first v_div_scale_f32 whose
; mask goes to an SGPR, v_rcp_f32, a second v_div_scale_f32 whose mask goes to
; VCC (vcc_lo under the GFX1032 prefix, vcc under the GFX1064 prefix), and no
; further mention of vcc before v_div_fmas_f32.
; GCN-LABEL: {{^}}fdiv_f32:
; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}

; GCN-NOT: vcc
; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; test_br_cc_f16
; Branches on "fcmp olt" of two halves loaded from %a and %b; the "one" path
; stores the value from %a to %r, the "two" path stores the value from %b.
; The checks expect a v_cmp_nlt_f16 into vcc_lo (GFX1032 prefix) or vcc
; (GFX1064 prefix) immediately followed by s_cbranch_vccnz.
; GCN-LABEL: {{^}}test_br_cc_f16:
; GFX1032: v_cmp_nlt_f16_e32 vcc_lo,
; GFX1064: v_cmp_nlt_f16_e32 vcc,
; GCN-NEXT: s_cbranch_vccnz
define amdgpu_kernel void @test_br_cc_f16(
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %b.val = load half, ptr addrspace(1) %b
  %fcmp = fcmp olt half %a.val, %b.val
  br i1 %fcmp, label %one, label %two

one:
  store half %a.val, ptr addrspace(1) %r
  ret void

two:
  store half %b.val, ptr addrspace(1) %r
  ret void
}
|
|
|
|
; test_brcc_i1
; Branches on an i1 kernel argument: when %val is non-zero, stores 222 to
; %out; otherwise returns without storing. %in is accepted but not used. The
; checks use only the common GCN prefix and expect a scalar bit test
; (s_bitcmp0_b32) immediately followed by s_cbranch_scc1.
; GCN-LABEL: {{^}}test_brcc_i1:
; GCN: s_bitcmp0_b32 s{{[0-9]+}}, 0
; GCN-NEXT: s_cbranch_scc1
define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 {
  %cmp0 = icmp ne i1 %val, 0
  br i1 %cmp0, label %store, label %end

store:
  store i32 222, ptr addrspace(1) %out
  ret void

end:
  ret void
}
|
|
|
|
; test_preserve_condition_undef_flag
; The branch condition is the AND of three float compares:
;   %arg2 > 0.0, %arg2 < 1.0, and %arg < %tmp4,
; where %tmp4 selects between %arg and 1.0 using the result of
; llvm.amdgcn.class.f32 applied to undef operands. %tmp (icmp on %arg1) is
; computed but not used. The checks expect three negated-predicate compares
; into SGPR masks, two scalar ORs combining them, and an AND with exec into
; VCC before s_cbranch_vccnz; the scalar ops are b32 under the GFX1032 prefix
; and b64 under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_preserve_condition_undef_flag:
; GFX1032-DAG: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0
; GFX1032-DAG: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0
; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0
; GFX1032: s_or_b32 [[OR1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX1032: s_or_b32 [[OR2:s[0-9]+]], [[OR1]], s{{[0-9]+}}
; GFX1032: s_and_b32 vcc_lo, exec_lo, [[OR2]]
; GFX1064-DAG: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0
; GFX1064-DAG: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0
; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0
; GFX1064: s_or_b64 [[OR1:s\[[0-9:]+\]]], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GFX1064: s_or_b64 [[OR2:s\[[0-9:]+\]]], [[OR1]], s[{{[0-9:]+}}]
; GFX1064: s_and_b64 vcc, exec, [[OR2]]
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 {
bb0:
  %tmp = icmp sgt i32 %arg1, 4
  %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
  %tmp4 = select i1 %undef, float %arg, float 1.000000e+00
  %tmp5 = fcmp ogt float %arg2, 0.000000e+00
  %tmp6 = fcmp olt float %arg2, 1.000000e+00
  %tmp7 = fcmp olt float %arg, %tmp4
  %tmp8 = and i1 %tmp5, %tmp6
  %tmp9 = and i1 %tmp8, %tmp7
  br i1 %tmp9, label %bb1, label %bb2

bb1:
  store volatile i32 0, ptr addrspace(1) undef
  br label %bb2

bb2:
  ret void
}
|
|
|
|
; test_invert_true_phi_cond_break_loop
; Loop whose continue condition is a phi in %Flow: it is the constant true
; when control comes straight from bb1, and "(id - %arg) >= volatile load"
; when it comes through bb4. The loop exits to bb9, which does a volatile
; store of 7 to an undef addrspace(3) pointer. The checks expect the phi
; condition to be inverted with a scalar XOR against -1 and then OR-ed into a
; mask, using b32 ops under the GFX1032 prefix and b64 ops under the GFX1064
; prefix.
; GCN-LABEL: {{^}}test_invert_true_phi_cond_break_loop:
; GFX1032: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, -1
; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], -1
; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 {
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp = sub i32 %id, %arg
  br label %bb1

bb1:                                              ; preds = %Flow, %bb
  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
  %lsr.iv.next = add i32 %lsr.iv, 1
  %cmp0 = icmp slt i32 %lsr.iv.next, 0
  br i1 %cmp0, label %bb4, label %Flow

bb4:                                              ; preds = %bb1
  %load = load volatile i32, ptr addrspace(1) undef, align 4
  %cmp1 = icmp sge i32 %tmp, %load
  br label %Flow

Flow:                                             ; preds = %bb4, %bb1
  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
  br i1 %tmp3, label %bb1, label %bb9

bb9:                                              ; preds = %Flow
  store volatile i32 7, ptr addrspace(3) undef
  ret void
}
|
|
|
|
; test_movrels_extract_neg_offset_vgpr
; Extracts an element of the constant vector <0, 1, 2, 3> using the dynamic
; index "workitem.id.x - 512" and stores it to %out. The checks expect the
; extract to be lowered to a chain of compare / v_cndmask pairs (one per
; candidate index 1, 2, 3), with each compare writing vcc_lo under the GFX1032
; prefix and vcc under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_movrels_extract_neg_offset_vgpr:
; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 1, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc_lo
; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 2, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc_lo
; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 3, v{{[0-9]+}}
; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc_lo
; GFX1064: v_cmp_eq_u32_e32 vcc, 1, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; GFX1064: v_cmp_ne_u32_e32 vcc, 2, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc
; GFX1064: v_cmp_ne_u32_e32 vcc, 3, v{{[0-9]+}}
; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) %out) #0 {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
  %index = add i32 %id, -512
  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
  store i32 %value, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; test_set_inactive
; Calls llvm.amdgcn.set.inactive.i32 with the kernel argument %in and the
; constant 42, and stores the result to %out. The checks expect the constant
; 42 to be written with v_mov between two exec inversions: s_not_b32 on
; exec_lo under the GFX1032 prefix, s_not_b64 on exec under the GFX1064
; prefix.
; GCN-LABEL: {{^}}test_set_inactive:
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 42
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1064: s_not_b64 exec, exec{{$}}
; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 42
; GFX1064: s_not_b64 exec, exec{{$}}
define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 {
  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
  store i32 %tmp, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; test_set_inactive_64
; 64-bit variant of the set.inactive test: calls llvm.amdgcn.set.inactive.i64
; with the kernel argument %in and the constant 0, and stores the result to
; %out. The checks expect two v_mov of 0 (one per 32-bit half) between two
; exec inversions: s_not_b32 on exec_lo under the GFX1032 prefix, s_not_b64 on
; exec under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_set_inactive_64:
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1032: s_not_b32 exec_lo, exec_lo
; GFX1064: s_not_b64 exec, exec{{$}}
; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
; GFX1064: s_not_b64 exec, exec{{$}}
define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
  store i64 %tmp, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; test_kill_i1_terminator_float
; Pixel-shader entry that calls llvm.amdgcn.kill with the constant false. The
; checks expect exec to be cleared directly: s_mov_b32 exec_lo, 0 under the
; GFX1032 prefix and s_mov_b64 exec, 0 under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_kill_i1_terminator_float:
; GFX1032: s_mov_b32 exec_lo, 0
; GFX1064: s_mov_b64 exec, 0
define amdgpu_ps void @test_kill_i1_terminator_float() #0 {
  call void @llvm.amdgcn.kill(i1 false)
  ret void
}
|
|
|
|
; test_kill_i1_terminator_i1
; Geometry-shader entry that calls llvm.amdgcn.kill with the dynamic condition
; "(%a < %b) or (%c < %d)" (signed compares) and then issues an export. The
; checks expect the live mask to be copied from exec (LIVE), the condition
; mask (OR) to be xor-ed with exec to obtain the killed lanes (KILL), those
; lanes to be removed from LIVE (MASK), and exec to be and-ed with MASK. All
; scalar ops are b32 on exec_lo under the GFX1032 prefix and b64 on exec under
; the GFX1064 prefix.
; GCN-LABEL: {{^}}test_kill_i1_terminator_i1:
; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
; GFX1032: s_or_b32 [[OR:s[0-9]+]],
; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[OR]], exec_lo
; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]],
; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[OR]], exec
; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
; GFX1064: s_and_b64 exec, exec, [[MASK]]
define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
  %c1 = icmp slt i32 %a, %b
  %c2 = icmp slt i32 %c, %d
  %x = or i1 %c1, %c2
  call void @llvm.amdgcn.kill(i1 %x)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
  ret void
}
|
|
|
|
; test_loop_vcc
; Pixel-shader loop driven by a float counter that starts at 0.0 and grows by
; 2.0 per iteration; the loop breaks once the counter is > 7.0. Each iteration
; feeds element 0 of the running <4 x float> into an image sample call (with
; undef resource and sampler descriptors) and carries the sampled vector to
; the next iteration. The value returned is the running vector at the time of
; the break. The checks expect a v_cmp_lt_f32 into vcc_lo (GFX1032 prefix) or
; vcc (GFX1064 prefix) followed by s_cbranch_vccz.
; GCN-LABEL: {{^}}test_loop_vcc:
; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
; GFX1064: v_cmp_lt_f32_e32 vcc,
; GCN: s_cbranch_vccz
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
entry:
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}
|
|
|
|
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
; test_wwm1
; Adds the two float arguments and passes the sum through the (deprecated)
; llvm.amdgcn.wwm.f32 intrinsic before returning it. The two inreg i32
; arguments are not used by the body. The checks expect exec to be saved with
; s_or_saveexec against -1 and later restored from the saved register (SAVE),
; using b32 ops under the GFX1032 prefix and b64 ops under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_wwm1:
; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1
; GFX1032: s_mov_b32 exec_lo, [[SAVE]]
; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1
; GFX1064: s_mov_b64 exec, [[SAVE]]
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
main_body:
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}
|
|
|
|
; test_wwm2
; Uses llvm.amdgcn.wwm.f32 inside a conditional block. The branch condition is
; derived from the mbcnt.lo/mbcnt.hi pair (compared "uge 16"); lanes that fail
; the test take the %if path, which loads a float through
; struct.buffer.load, doubles it, passes it through wwm, and adds the loaded
; value again. Lanes that skip the block return 0.0. The checks expect a
; compare into VCC, s_and_saveexec for the branch (SAVE1), a nested
; s_or_saveexec against -1 for the wwm region (SAVE2), and both exec values
; restored in reverse order; ops are b32 under the GFX1032 prefix and b64
; under the GFX1064 prefix.
; GCN-LABEL: {{^}}test_wwm2:
; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 16, v{{[0-9]+}}
; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo
; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1
; GFX1032: s_mov_b32 exec_lo, [[SAVE2]]
; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]]
; GFX1064: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}}
; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1
; GFX1064: s_mov_b64 exec, [[SAVE2]]
; GFX1064: s_or_b64 exec, exec, [[SAVE1]]
define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}
|
|
|
|
; Same shape as test_wwm1 but using llvm.amdgcn.strict.wwm. The expected
; exec save/restore sequence is identical: s_or_saveexec with -1, then a
; move of the saved value back into exec (exec_lo for wave32).
; GCN-LABEL: {{^}}test_strict_wwm1:
; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1
; GFX1032: s_mov_b32 exec_lo, [[SAVE]]
; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1
; GFX1064: s_mov_b64 exec, [[SAVE]]
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
main_body:
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
  ret float %out.0
}
|
|
|
|
; Same shape as test_wwm2 but using llvm.amdgcn.strict.wwm inside the
; divergent branch. The checks expect the branch's exec save (SAVE1) to
; enclose the whole-wave save/restore pair (SAVE2) and to be OR-ed back into
; exec afterwards.
; GCN-LABEL: {{^}}test_strict_wwm2:
; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 16, v{{[0-9]+}}
; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo
; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1
; GFX1032: s_mov_b32 exec_lo, [[SAVE2]]
; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]]
; GFX1064: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}}
; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1
; GFX1064: s_mov_b64 exec, [[SAVE2]]
; GFX1064: s_or_b64 exec, exec, [[SAVE1]]
define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}
|
|
|
|
|
|
; Pixel shader that interpolates two coordinates and samples a 2D image.
; The checks expect the original exec mask to be copied to an SGPR (ORIG),
; exec to be widened with s_wqm, and later narrowed again by AND-ing with
; ORIG. Wave32 uses the _b32 / exec_lo forms, wave64 the _b64 / exec forms.
; GCN-LABEL: {{^}}test_wqm1:
; GFX1032: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
; GFX1032: s_wqm_b32 exec_lo, exec_lo
; GFX1032: s_and_b32 exec_lo, exec_lo, [[ORIG]]
; GFX1064: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec{{$}}
; GFX1064: s_wqm_b64 exec, exec{{$}}
; GFX1064: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  ; Two-step interpolation (p1 then p2) for attribute channels 0 and 1.
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0)
  ret <4 x float> %tex
}
|
|
|
|
; Explicit llvm.amdgcn.wqm on the sum of two buffer loads. The checks expect
; exec to be widened with s_wqm and afterwards AND-ed with a saved SGPR mask,
; in 32-bit form for wave32 and 64-bit form for wave64.
; GCN-LABEL: {{^}}test_wqm2:
; GFX1032: s_wqm_b32 exec_lo, exec_lo
; GFX1032: s_and_b32 exec_lo, exec_lo, s{{[0-9]+}}
; GFX1064: s_wqm_b64 exec, exec{{$}}
; GFX1064: s_and_b64 exec, exec, s[{{[0-9:]+}}]
define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  ; The wqm intrinsic used here is the i32 overload, hence the bitcasts.
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}
|
|
|
|
; llvm.amdgcn.fcmp with an i64 result. For wave32 the checks expect the
; compare to write a single SGPR and the high dword of the stored value to be
; the constant 0; for wave64 both halves of the SGPR pair are copied to
; VGPRs. The fabs on %a is expected to fold into the compare as |...|.
; GCN-LABEL: {{^}}test_intr_fcmp_i64:
; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}}
; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064: v_cmp_eq_f32_e64 s[[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
; GCN: store_dwordx2 v{{[0-9]+}}, v[[[V_LO]]:[[V_HI]]], s
define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
  %temp = call float @llvm.fabs.f32(float %a)
  ; Condition code 1 is the one the checks above expect to select v_cmp_eq_f32.
  %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
  store i64 %result, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; llvm.amdgcn.icmp with an i64 result, comparing %src against 100 (0x64).
; For wave32 the checks expect a 32-bit compare result (vcc_lo or an SGPR)
; with the high dword of the stored value set to 0; for wave64 both halves
; of the SGPR pair are copied to VGPRs before the 64-bit store.
; GCN-LABEL: {{^}}test_intr_icmp_i64:
; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}}
; GFX1032-DAG: v_cmp_eq_u32_e64 [[C_LO:vcc_lo|s[0-9]+]], 0x64, {{s[0-9]+}}
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[C_LO]]
; GFX1064: v_cmp_eq_u32_e64 s[[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}}
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
; GCN: store_dwordx2 v{{[0-9]+}}, v[[[V_LO]]:[[V_HI]]], s
define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
  ; Condition code 32 is the one the checks above expect to select v_cmp_eq_u32.
  %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
  store i64 %result, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; llvm.amdgcn.fcmp with an i32 result. Only the low dword is stored: the
; wave32 checks expect a single-SGPR compare result, the wave64 checks expect
; an SGPR pair of which only the low half (C_LO) is moved to a VGPR.
; GCN-LABEL: {{^}}test_intr_fcmp_i32:
; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064: v_cmp_eq_f32_e64 s[[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s
define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
  %temp = call float @llvm.fabs.f32(float %a)
  %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
  store i32 %result, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; llvm.amdgcn.icmp with an i32 result, comparing %src against 100 (0x64).
; Only the low dword of the compare result is expected to be moved to a VGPR
; and stored; for wave64 the high half of the SGPR pair is matched but unused.
; GCN-LABEL: {{^}}test_intr_icmp_i32:
; GFX1032-DAG: v_cmp_eq_u32_e64 s[[C_LO:[0-9]+]], 0x64, {{s[0-9]+}}
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
; GFX1064: v_cmp_eq_u32_e64 s[[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}}
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s
define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) {
  %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
  store i32 %result, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; llvm.amdgcn.wqm.vote feeding llvm.amdgcn.kill. Expected sequence: compare
; into vcc, copy exec to LIVE, s_wqm of the compare result into WQM, XOR of
; WQM with exec into KILL, ANDN2 of LIVE with KILL into MASK, and finally
; exec AND-ed with MASK. Wave32 uses the _b32 / *_lo forms throughout.
; GCN-LABEL: {{^}}test_wqm_vote:
; GFX1032: v_cmp_neq_f32_e32 vcc_lo, 0
; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo
; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[WQM]], exec_lo
; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
; GFX1064: v_cmp_neq_f32_e32 vcc, 0
; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc
; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[WQM]], exec
; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
; GFX1064: s_and_b64 exec, exec, [[MASK]]
define amdgpu_ps void @test_wqm_vote(float %a) {
  %c1 = fcmp une float %a, 0.0
  %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
  call void @llvm.amdgcn.kill(i1 %c2)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
  ret void
}
|
|
|
|
; Branch on a constant true in a kernel using attribute group #2 (optnone
; noinline). The checks expect the always-true condition to be materialised
; by copying exec into vcc: exec_lo into vcc_lo for wave32, exec into vcc
; for wave64.
; GCN-LABEL: {{^}}test_branch_true:
; GFX1032: s_mov_b32 vcc_lo, exec_lo
; GFX1064: s_mov_b64 vcc, exec
define amdgpu_kernel void @test_branch_true() #2 {
entry:
  br i1 true, label %for.end, label %for.body.lr.ph

for.body.lr.ph:                                   ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body, %for.body.lr.ph
  br i1 undef, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}
|
|
|
|
; llvm.amdgcn.ps.live zero-extended to i32 and returned as float bits.
; The checks expect exec to be copied to an SGPR (32-bit for wave32, a
; 64-bit pair for wave64) which is then used as the select mask of a
; v_cndmask choosing between 0 and 1.
; GCN-LABEL: {{^}}test_ps_live:
; GFX1032: s_mov_b32 [[C:s[0-9]+]], exec_lo
; GFX1064: s_mov_b64 [[C:s\[[0-9:]+\]]], exec{{$}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]
define amdgpu_ps float @test_ps_live() #0 {
  %live = call i1 @llvm.amdgcn.ps.live()
  %live.32 = zext i1 %live to i32
  %r = bitcast i32 %live.32 to float
  ret float %r
}
|
|
|
|
; Triangle-shaped control flow on a uniform f64 compare. The checks expect
; the compare (emitted in its negated v_cmp_neq form) to write an SGPR
; result, which is then AND-ed with exec into vcc for the branch; wave32 uses
; a single SGPR and vcc_lo/exec_lo, wave64 an SGPR pair and vcc/exec.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64:
; GFX1032: v_cmp_neq_f64_e64 [[C:s[0-9]+]], s[{{[0-9:]+}}], 1.0
; GFX1032: s_and_b32 vcc_lo, exec_lo, [[C]]
; GFX1064: v_cmp_neq_f64_e64 [[C:s\[[0-9:]+\]]], s[{{[0-9:]+}}], 1.0
; GFX1064: s_and_b64 vcc, exec, [[C]]
define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %v = load double, ptr addrspace(1) %in
  %cc = fcmp oeq double %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd double %v, %v
  br label %endif

endif:
  %r = phi double [ %v, %entry ], [ %u, %if ]
  store double %r, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; GCN-LABEL: {{^}}test_vgprblocks_w32_attr:
; Test that the wave size can be overridden in function attributes and that the block size is correct as a result
; Attribute group #3 requests +wavefrontsize32. The VGPRBlocks value is only
; checked under the GFX10DEFWAVE prefix, i.e. the RUN line that passes no
; explicit wavefront-size feature.
; GFX10DEFWAVE: ; VGPRBlocks: 1
define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e,
                                                 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 {
main_body:
  ; Sum all twelve float arguments in a serial dependency chain.
  %s = fadd float %a, %b
  %s.1 = fadd float %s, %c
  %s.2 = fadd float %s.1, %d
  %s.3 = fadd float %s.2, %e
  %s.4 = fadd float %s.3, %f
  %s.5 = fadd float %s.4, %g
  %s.6 = fadd float %s.5, %h
  %s.7 = fadd float %s.6, %i
  %s.8 = fadd float %s.7, %j
  %s.9 = fadd float %s.8, %k
  %s.10 = fadd float %s.9, %l
  ret float %s.10
}
|
|
|
|
; GCN-LABEL: {{^}}test_vgprblocks_w64_attr:
; Test that the wave size can be overridden in function attributes and that the block size is correct as a result
; Attribute group #4 requests +wavefrontsize64. Same body as
; test_vgprblocks_w32_attr, but the expected VGPRBlocks value is 2 instead
; of 1; it is only checked under the GFX10DEFWAVE prefix.
; GFX10DEFWAVE: ; VGPRBlocks: 2
define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e,
                                                 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 {
main_body:
  ; Sum all twelve float arguments in a serial dependency chain.
  %s = fadd float %a, %b
  %s.1 = fadd float %s, %c
  %s.2 = fadd float %s.1, %d
  %s.3 = fadd float %s.2, %e
  %s.4 = fadd float %s.3, %f
  %s.5 = fadd float %s.4, %g
  %s.6 = fadd float %s.5, %h
  %s.7 = fadd float %s.6, %i
  %s.8 = fadd float %s.7, %j
  %s.9 = fadd float %s.8, %k
  %s.10 = fadd float %s.9, %l
  ret float %s.10
}
|
|
|
|
; llvm.amdgcn.icmp with an i64 result whose mask is shifted, OR-ed with the
; sign bit and fed to cttz before driving a branch. The checks only pin the
; compare against 0: it must write vcc_lo for wave32 and vcc for wave64.
; GCN-LABEL: {{^}}icmp64:
; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v
; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v
define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
entry:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %mul4 and %cmp are not used by anything below.
  %mul4 = mul nsw i32 %s, %n
  %cmp = icmp slt i32 0, %mul4
  br label %if.end

if.end:                                           ; preds = %entry
  %rem = urem i32 %id, %s
  %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32)
  %shr = lshr i64 %icmp, 1
  %notmask = shl nsw i64 -1, 0
  %and = and i64 %notmask, %shr
  ; Setting the top bit guarantees a non-zero input for cttz.
  %or = or i64 %and, -9223372036854775808
  %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
  %cast = trunc i64 %cttz to i32
  %cmp3 = icmp ugt i32 10, %cast
  %cmp6 = icmp ne i32 %rem, 0
  %brmerge = or i1 %cmp6, %cmp3
  br i1 %brmerge, label %if.end2, label %if.then

if.then:                                          ; preds = %if.end
  unreachable

if.end2:                                          ; preds = %if.end
  ret void
}
|
|
|
|
; Floating-point counterpart of icmp64: llvm.amdgcn.fcmp with an i64 result
; on an frem value, followed by the same shift / or / cttz chain. The checks
; only pin the compare against 0: vcc_lo for wave32, vcc for wave64.
; GCN-LABEL: {{^}}fcmp64:
; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v
; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v
define amdgpu_kernel void @fcmp64(float %n, float %s) {
entry:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.f = uitofp i32 %id to float
  ; %mul4 and %cmp are not used by anything below.
  %mul4 = fmul float %s, %n
  %cmp = fcmp ult float 0.0, %mul4
  br label %if.end

if.end:                                           ; preds = %entry
  %rem.f = frem float %id.f, %s
  %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1)
  %shr = lshr i64 %fcmp, 1
  %notmask = shl nsw i64 -1, 0
  %and = and i64 %notmask, %shr
  ; Setting the top bit guarantees a non-zero input for cttz.
  %or = or i64 %and, -9223372036854775808
  %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
  %cast = trunc i64 %cttz to i32
  %cmp3 = icmp ugt i32 10, %cast
  %cmp6 = fcmp one float %rem.f, 0.0
  %brmerge = or i1 %cmp6, %cmp3
  br i1 %brmerge, label %if.end2, label %if.then

if.then:                                          ; preds = %if.end
  unreachable

if.end2:                                          ; preds = %if.end
  ret void
}
|
|
|
|
; i32-result variant of icmp64: llvm.amdgcn.icmp returning i32, with the
; shift / or / cttz chain done in 32 bits. The checks only pin the compare
; against 0: vcc_lo for wave32, vcc for wave64.
; GCN-LABEL: {{^}}icmp32:
; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v
; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v
define amdgpu_kernel void @icmp32(i32 %n, i32 %s) {
entry:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %mul4 and %cmp are not used by anything below.
  %mul4 = mul nsw i32 %s, %n
  %cmp = icmp slt i32 0, %mul4
  br label %if.end

if.end:                                           ; preds = %entry
  %rem = urem i32 %id, %s
  %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32)
  %shr = lshr i32 %icmp, 1
  %notmask = shl nsw i32 -1, 0
  %and = and i32 %notmask, %shr
  ; Setting the top bit guarantees a non-zero input for cttz.
  %or = or i32 %and, 2147483648
  %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
  %cmp3 = icmp ugt i32 10, %cttz
  %cmp6 = icmp ne i32 %rem, 0
  %brmerge = or i1 %cmp6, %cmp3
  br i1 %brmerge, label %if.end2, label %if.then

if.then:                                          ; preds = %if.end
  unreachable

if.end2:                                          ; preds = %if.end
  ret void
}
|
|
|
|
; i32-result variant of fcmp64: llvm.amdgcn.fcmp returning i32 on an frem
; value, with the shift / or / cttz chain done in 32 bits. The checks only
; pin the compare against 0: vcc_lo for wave32, vcc for wave64.
; GCN-LABEL: {{^}}fcmp32:
; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v
; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v
define amdgpu_kernel void @fcmp32(float %n, float %s) {
entry:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.f = uitofp i32 %id to float
  ; %mul4 and %cmp are not used by anything below.
  %mul4 = fmul float %s, %n
  %cmp = fcmp ult float 0.0, %mul4
  br label %if.end

if.end:                                           ; preds = %entry
  %rem.f = frem float %id.f, %s
  %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1)
  %shr = lshr i32 %fcmp, 1
  %notmask = shl nsw i32 -1, 0
  %and = and i32 %notmask, %shr
  ; Setting the top bit guarantees a non-zero input for cttz.
  %or = or i32 %and, 2147483648
  %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
  %cmp3 = icmp ugt i32 10, %cttz
  %cmp6 = fcmp one float %rem.f, 0.0
  %brmerge = or i1 %cmp6, %cmp3
  br i1 %brmerge, label %if.end2, label %if.then

if.then:                                          ; preds = %if.end
  unreachable

if.end2:                                          ; preds = %if.end
  ret void
}
|
|
|
|
declare void @external_void_func_void() #1

; Test save/restore of VGPR needed for SGPR spilling.
;
; Expected prologue: the incoming frame pointer (s33) is copied to a scratch
; SGPR and s33 is set from the stack pointer (s32). With exec forced to all
; ones, v40 and v41 are then stored relative to s33 at offsets 0 and 4, and
; exec is restored. The stack pointer is bumped by 0x400 (wave64) or 0x200
; (wave32), the old frame pointer is written to lane 0 of v41, and the
; return address (s30/s31) to lanes 0 and 1 of v40.
;
; Expected epilogue: the mirror image, reloading v40/v41 relative to s33
; under an all-ones exec before s32 and s33 are restored.

; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
; GCN-NEXT: s_waitcnt_vscnt
; GCN-NEXT: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}}
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]

; GFX1064: s_addk_i32 s32, 0x400
; GFX1032: s_addk_i32 s32, 0x200
; GCN-NEXT: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0

; GCN-DAG: v_writelane_b32 v40, s30, 0

; GCN-DAG: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s30, v40, 0
; GCN-DAG: v_readlane_b32 s31, v40, 1

; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]+]], -1{{$}}
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
; GFX1064: s_addk_i32 s32, 0xfc00
; GFX1032: s_addk_i32 s32, 0xfe00
; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_with_call() #1 {
  call void @external_void_func_void()
  ret void
}
|
|
|
|
|
|
; Declarations of the intrinsics called by the tests in this file. Some of
; them (e.g. div.scale, div.fmas, class, set.inactive) are not referenced in
; this part of the file and are presumably used by earlier tests.
declare i32 @llvm.amdgcn.workitem.id.x()
declare float @llvm.fabs.f32(float)
declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1)
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1)
declare i1 @llvm.amdgcn.class.f32(float, i32)
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare float @llvm.amdgcn.strict.wwm.f32(float)
declare float @llvm.amdgcn.wwm.f32(float)
declare i32 @llvm.amdgcn.wqm.i32(i32)
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32)
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32)
declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32)
declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32)
declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32)
declare void @llvm.amdgcn.kill(i1)
declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i1 @llvm.amdgcn.ps.live()
declare i64 @llvm.cttz.i64(i64, i1)
declare i32 @llvm.cttz.i32(i32, i1)
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5
|
|
|
|
; Attribute groups referenced by the functions and declarations above.
; #0: default group for most shader/kernel tests (e.g. test_loop_vcc, test_wqm1).
attributes #0 = { nounwind readnone speculatable }
; #1: external_void_func_void and callee_no_stack_with_call.
attributes #1 = { nounwind }
; #2: test_branch_true, which must be compiled without optimisation.
attributes #2 = { nounwind readnone optnone noinline }
; #3 / #4: per-function wave size overrides used by the test_vgprblocks_* tests.
attributes #3 = { "target-features"="+wavefrontsize32" }
attributes #4 = { "target-features"="+wavefrontsize64" }
; #5: the llvm.amdgcn.exp.f32 declaration.
attributes #5 = { inaccessiblememonly nounwind }
|