si-wqm sometimes needs to save the LiveMask in the entry block. Later on, while looking for a place to enter WQM/WWM, it unconditionally skips over the first COPY instruction in the entry block. This is incorrect for functions where the LiveMask doesn't need to be saved: there, the first COPY is more likely a COPY from a function argument and might need to be in some non-exact mode. This patch fixes the issue by also checking that the source of the COPY is the EXEC register. This produces different code in 3 of the existing tests: In wwm-reserved.ll, an SGPR copy is now inside the WWM area rather than outside. This is benign. In wave32.ll, we end up with an extra register copy. This is because the first COPY in the block is now part of the WWM block, so si-pre-allocate-wwm-regs will allocate a new register for its destination (when it was outside of the WWM region, the register allocator could just re-use the same register). We might be able to improve this in si-pre-allocate-wwm-regs, but I haven't looked into it. The same thing happens in dual-source-blend-export.ll, but for that one it's harder to see because of the scheduling changes. I've uploaded the before/after si-wqm output for it here: https://reviews.llvm.org/differential/diff/553445/ Differential Revision: https://reviews.llvm.org/D158841
2960 lines
117 KiB
LLVM
2960 lines
117 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s
|
|
|
|
; Per-workitem i32 load, signed compare against 0 and select of 1 or 2, stored
; back to the same address. The assertions expect the VOPC compare to write
; vcc_lo in the GFX1032 (wave32) runs and the full vcc in the GFX1064 (wave64)
; runs.
define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
|
|
; GFX1032-LABEL: test_vopc_i32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vopc_i32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
|
|
%load = load i32, ptr addrspace(1) %gep, align 4
|
|
%cmp = icmp sgt i32 %load, 0
|
|
%sel = select i1 %cmp, i32 1, i32 2
|
|
store i32 %sel, ptr addrspace(1) %gep, align 4
|
|
ret void
|
|
}
|
|
|
|
; Float variant of the VOPC test: per-workitem load, `fcmp ugt` against 0.0 and
; select of 1.0 or 2.0. The assertions expect a v_cmp_nge_f32 writing vcc_lo
; (GFX1032 runs) or vcc (GFX1064 runs), consumed by v_cndmask_b32.
define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
|
|
; GFX1032-LABEL: test_vopc_f32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc_lo
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vopc_f32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
|
|
%load = load float, ptr addrspace(1) %gep, align 4
|
|
%cmp = fcmp ugt float %load, 0.0
|
|
%sel = select i1 %cmp, float 1.0, float 2.0
|
|
store float %sel, ptr addrspace(1) %gep, align 4
|
|
ret void
|
|
}
|
|
|
|
; Pixel-shader test that feeds `fcmp oge %x, 0.0` into llvm.amdgcn.kill. The
; assertions expect a v_cmp_nle_f32 into vcc_lo/vcc followed by an s_andn2 that
; updates exec_lo (GFX1032 runs) or exec (GFX1064 runs), plus a block that
; zeroes the exec mask and issues a null export.
define amdgpu_ps void @test_vopc_vcmp(float %x) {
|
|
; GFX1032-LABEL: test_vopc_vcmp:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
|
|
; GFX1032-NEXT: s_cbranch_scc0 .LBB2_1
|
|
; GFX1032-NEXT: s_endpgm
|
|
; GFX1032-NEXT: .LBB2_1:
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, 0
|
|
; GFX1032-NEXT: exp null off, off, off, off done vm
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vopc_vcmp:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_andn2_b64 exec, exec, vcc
|
|
; GFX1064-NEXT: s_cbranch_scc0 .LBB2_1
|
|
; GFX1064-NEXT: s_endpgm
|
|
; GFX1064-NEXT: .LBB2_1:
|
|
; GFX1064-NEXT: s_mov_b64 exec, 0
|
|
; GFX1064-NEXT: exp null off, off, off, off done vm
|
|
; GFX1064-NEXT: s_endpgm
|
|
%cmp = fcmp oge float %x, 0.0
|
|
call void @llvm.amdgcn.kill(i1 %cmp)
|
|
ret void
|
|
}
|
|
|
|
; Loads a <2 x half>, compares element 1 with `fcmp ugt` against 0.0 and
; selects between <1.0, 1.0> and the loaded vector. The assertions expect an
; SDWA compare (v_cmp_le_f16_sdwa) that writes vcc_lo in the GFX1032 runs and
; vcc in the GFX1064 runs.
define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
|
|
; GFX1032-LABEL: test_vopc_2xf16:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_le_f16_sdwa vcc_lo, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc_lo
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vopc_2xf16:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
|
|
%load = load <2 x half>, ptr addrspace(1) %gep, align 4
|
|
%elt = extractelement <2 x half> %load, i32 1
|
|
%cmp = fcmp ugt half %elt, 0.0
|
|
%sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load
|
|
store <2 x half> %sel, ptr addrspace(1) %gep, align 4
|
|
ret void
|
|
}
|
|
|
|
; Compares fabs(%x) for ordered equality with +infinity (0x7FF0000000000000)
; and stores the zero-extended result. The assertions expect this to become a
; single v_cmp_class_f32 with mask 0x204 whose result lands in one SGPR (s0)
; in the GFX1032 runs and in an SGPR pair (s[0:1]) in the GFX1064 runs.
define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 {
|
|
; GFX1032-LABEL: test_vopc_class:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vopc_class:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 0x204
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%fabs = tail call float @llvm.fabs.f32(float %x)
|
|
%cmp = fcmp oeq float %fabs, 0x7FF0000000000000
|
|
%ext = zext i1 %cmp to i32
|
|
store i32 %ext, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Half-precision compare-and-select: `fcmp oeq` of %x against the constant
; 0x7FF0000000000000, selecting 1.0 or %x. The assertions expect a
; v_cmp_neq_f16 against the immediate 0x7c00 writing vcc_lo (GFX1032 runs) or
; vcc (GFX1064 runs), followed by a v_cndmask_b32 that consumes it.
define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 {
|
|
; GFX1032-LABEL: test_vcmp_vcnd_f16:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s4
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo
|
|
; GFX1032-NEXT: global_store_short v1, v0, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vcmp_vcnd_f16:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, s4
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc
|
|
; GFX1064-NEXT: global_store_short v1, v0, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%cmp = fcmp oeq half %x, 0x7FF0000000000000
|
|
%sel = select i1 %cmp, half 1.0, half %x
|
|
store half %sel, ptr addrspace(1) %out, align 2
|
|
ret void
|
|
}
|
|
|
|
; Two float compares (`ugt 0.0` and `ult 1.0`) combined with `and` before a
; select. The assertions expect one compare in vcc_lo/vcc, the other in an
; SGPR (s0) or SGPR pair (s[0:1]), and the lane masks combined with s_and_b32
; in the GFX1032 runs versus s_and_b64 in the GFX1064 runs.
define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
|
|
; GFX1032-LABEL: test_vop3_cmp_f32_sop_and:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1
|
|
; GFX1032-NEXT: v_cmp_nle_f32_e64 s0, 1.0, v1
|
|
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s0
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vop3_cmp_f32_sop_and:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1
|
|
; GFX1064-NEXT: v_cmp_nle_f32_e64 s[0:1], 1.0, v1
|
|
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1]
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
|
|
%load = load float, ptr addrspace(1) %gep, align 4
|
|
%cmp = fcmp ugt float %load, 0.0
|
|
%cmp2 = fcmp ult float %load, 1.0
|
|
%and = and i1 %cmp, %cmp2
|
|
%sel = select i1 %and, float 1.0, float 2.0
|
|
store float %sel, ptr addrspace(1) %gep, align 4
|
|
ret void
|
|
}
|
|
|
|
; Two signed i32 compares (`sgt 0` and `slt 1`) combined with `xor` before a
; select. The assertions expect the two lane masks to be combined with
; s_xor_b32 in the GFX1032 runs and s_xor_b64 in the GFX1064 runs.
define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
|
|
; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1
|
|
; GFX1032-NEXT: v_cmp_gt_i32_e64 s0, 1, v1
|
|
; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
|
|
; GFX1064-NEXT: v_cmp_gt_i32_e64 s[0:1], 1, v1
|
|
; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1]
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
|
|
%load = load i32, ptr addrspace(1) %gep, align 4
|
|
%cmp = icmp sgt i32 %load, 0
|
|
%cmp2 = icmp slt i32 %load, 1
|
|
%xor = xor i1 %cmp, %cmp2
|
|
%sel = select i1 %xor, i32 1, i32 2
|
|
store i32 %sel, ptr addrspace(1) %gep, align 4
|
|
ret void
|
|
}
|
|
|
|
; Two unsigned i32 compares (`ugt 3` and `ult 2`) combined with `or` before a
; select. The assertions expect the two lane masks to be combined with
; s_or_b32 in the GFX1032 runs and s_or_b64 in the GFX1064 runs.
define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) {
|
|
; GFX1032-LABEL: test_vop3_cmp_u32_sop_or:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v1
|
|
; GFX1032-NEXT: v_cmp_gt_u32_e64 s0, 2, v1
|
|
; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vop3_cmp_u32_sop_or:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1
|
|
; GFX1064-NEXT: v_cmp_gt_u32_e64 s[0:1], 2, v1
|
|
; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1]
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
|
|
%load = load i32, ptr addrspace(1) %gep, align 4
|
|
%cmp = icmp ugt i32 %load, 3
|
|
%cmp2 = icmp ult i32 %load, 2
|
|
%or = or i1 %cmp, %cmp2
|
|
%sel = select i1 %or, i32 1, i32 2
|
|
store i32 %sel, ptr addrspace(1) %gep, align 4
|
|
ret void
|
|
}
|
|
|
|
; Branch on `workitem.id.x > 10`; only the `if` block stores. The assertions
; expect the branch to be lowered to an exec-mask update: s_and_saveexec_b32
; in the GFX1032 runs, s_and_saveexec_b64 in the GFX1064 runs, each followed
; by s_cbranch_execz to skip the store.
define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
|
|
; GFX1032-LABEL: test_mask_if:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %if
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_store_dword v0, v0, s[0:1]
|
|
; GFX1032-NEXT: .LBB9_2: ; %endif
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_mask_if:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %if
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_store_dword v0, v0, s[0:1]
|
|
; GFX1064-NEXT: .LBB9_2: ; %endif
|
|
; GFX1064-NEXT: s_endpgm
|
|
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%cmp = icmp ugt i32 %lid, 10
|
|
br i1 %cmp, label %if, label %endif
|
|
|
|
|
|
if:
|
|
store i32 0, ptr addrspace(1) %arg, align 4
|
|
br label %endif
|
|
|
|
|
|
endif:
|
|
ret void
|
|
}
|
|
|
|
; Loop (%bb2 .. %bb13) containing a nested conditional: %bb5 loads and tests a
; value, %bb10 stores, %bb11 halves the induction value with sdiv. The
; assertions expect all exec-mask bookkeeping (s_and_saveexec, s_or, s_andn2,
; s_xor) to use the _b32 forms on exec_lo in the GFX1032 runs and the _b64
; forms on exec in the GFX1064 runs.
define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
|
|
; GFX1032-LABEL: test_loop_with_if:
|
|
; GFX1032: ; %bb.0: ; %bb
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1032-NEXT: s_mov_b32 s2, 0
|
|
; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX1032-NEXT: s_branch .LBB10_2
|
|
; GFX1032-NEXT: .LBB10_1: ; %bb13
|
|
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
|
|
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4
|
|
; GFX1032-NEXT: v_add_nc_u32_e32 v1, 1, v4
|
|
; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
|
|
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB10_8
|
|
; GFX1032-NEXT: .LBB10_2: ; %bb2
|
|
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0
|
|
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
|
|
; GFX1032-NEXT: s_mov_b32 s3, 0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB10_4
|
|
; GFX1032-NEXT: ; %bb.3: ; %bb5
|
|
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1032-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
|
; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
|
|
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
|
|
; GFX1032-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
|
|
; GFX1032-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
|
|
; GFX1032-NEXT: global_load_dword v4, v[2:3], off
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v4
|
|
; GFX1032-NEXT: s_and_b32 s6, vcc_lo, exec_lo
|
|
; GFX1032-NEXT: s_or_b32 s4, s4, s6
|
|
; GFX1032-NEXT: .LBB10_4: ; %Flow
|
|
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
|
|
; GFX1032-NEXT: ; implicit-def: $vgpr4
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s5, s4
|
|
; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s5
|
|
; GFX1032-NEXT: ; %bb.5: ; %bb11
|
|
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1032-NEXT: v_lshrrev_b32_e32 v4, 31, v1
|
|
; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo
|
|
; GFX1032-NEXT: v_add_nc_u32_e32 v4, v1, v4
|
|
; GFX1032-NEXT: v_ashrrev_i32_e32 v4, 1, v4
|
|
; GFX1032-NEXT: ; %bb.6: ; %Flow1
|
|
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s4, s3
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB10_1
|
|
; GFX1032-NEXT: ; %bb.7: ; %bb10
|
|
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v4, v1
|
|
; GFX1032-NEXT: global_store_dword v[2:3], v0, off
|
|
; GFX1032-NEXT: s_branch .LBB10_1
|
|
; GFX1032-NEXT: .LBB10_8: ; %bb1
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_loop_with_if:
|
|
; GFX1064: ; %bb.0: ; %bb
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX1064-NEXT: s_branch .LBB10_2
|
|
; GFX1064-NEXT: .LBB10_1: ; %bb13
|
|
; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0xfe, v4
|
|
; GFX1064-NEXT: v_add_nc_u32_e32 v1, 1, v4
|
|
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB10_8
|
|
; GFX1064-NEXT: .LBB10_2: ; %bb2
|
|
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1064-NEXT: v_cmp_ge_i32_e64 s[6:7], v1, v0
|
|
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, v1, v0
|
|
; GFX1064-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB10_4
|
|
; GFX1064-NEXT: ; %bb.3: ; %bb5
|
|
; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1064-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
|
; GFX1064-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
|
|
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
|
|
; GFX1064-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_add_co_u32 v2, vcc, s0, v2
|
|
; GFX1064-NEXT: v_add_co_ci_u32_e32 v3, vcc, s1, v3, vcc
|
|
; GFX1064-NEXT: global_load_dword v4, v[2:3], off
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v4
|
|
; GFX1064-NEXT: s_and_b64 s[10:11], vcc, exec
|
|
; GFX1064-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
|
|
; GFX1064-NEXT: .LBB10_4: ; %Flow
|
|
; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GFX1064-NEXT: ; implicit-def: $vgpr4
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
|
|
; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
|
|
; GFX1064-NEXT: ; %bb.5: ; %bb11
|
|
; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1064-NEXT: v_lshrrev_b32_e32 v4, 31, v1
|
|
; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
|
|
; GFX1064-NEXT: v_add_nc_u32_e32 v4, v1, v4
|
|
; GFX1064-NEXT: v_ashrrev_i32_e32 v4, 1, v4
|
|
; GFX1064-NEXT: ; %bb.6: ; %Flow1
|
|
; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB10_1
|
|
; GFX1064-NEXT: ; %bb.7: ; %bb10
|
|
; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v4, v1
|
|
; GFX1064-NEXT: global_store_dword v[2:3], v0, off
|
|
; GFX1064-NEXT: s_branch .LBB10_1
|
|
; GFX1064-NEXT: .LBB10_8: ; %bb1
|
|
; GFX1064-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
br label %bb2
|
|
|
|
|
|
bb1:
|
|
ret void
|
|
|
|
|
|
bb2:
|
|
%tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ]
|
|
%tmp4 = icmp slt i32 %tmp3, %tmp
|
|
br i1 %tmp4, label %bb5, label %bb11
|
|
|
|
|
|
bb5:
|
|
%tmp6 = sext i32 %tmp3 to i64
|
|
%tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
|
|
%tmp8 = load i32, ptr addrspace(1) %tmp7, align 4
|
|
%tmp9 = icmp sgt i32 %tmp8, 10
|
|
br i1 %tmp9, label %bb10, label %bb11
|
|
|
|
|
|
bb10:
|
|
store i32 %tmp, ptr addrspace(1) %tmp7, align 4
|
|
br label %bb13
|
|
|
|
|
|
bb11:
|
|
%tmp12 = sdiv i32 %tmp3, 2
|
|
br label %bb13
|
|
|
|
|
|
bb13:
|
|
%tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ]
|
|
%tmp15 = add nsw i32 %tmp14, 1
|
|
%tmp16 = icmp slt i32 %tmp14, 255
|
|
br i1 %tmp16, label %bb2, label %bb1
|
|
}
|
|
|
|
|
|
|
|
; Loop guarded by `workitem.id.x != 0` with two exits to %.loopexit: one from
; %bb2 when the loaded value is not greater than 10, one from %bb8 when the
; trip-count conditions fail. The assertions expect the break mask to be
; accumulated and applied to exec with _b32 scalar ops on exec_lo in the
; GFX1032 runs and _b64 ops on exec in the GFX1064 runs.
define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
|
|
; GFX1032-LABEL: test_loop_with_if_else_break:
|
|
; GFX1032: ; %bb.0: ; %bb
|
|
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_mov_b32 s2, 0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
|
|
; GFX1032-NEXT: ; %bb.1: ; %.preheader
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: s_mov_b32 s3, 0
|
|
; GFX1032-NEXT: ; implicit-def: $sgpr4
|
|
; GFX1032-NEXT: s_branch .LBB11_4
|
|
; GFX1032-NEXT: .LBB11_2: ; %bb8
|
|
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
|
|
; GFX1032-NEXT: s_add_i32 s3, s3, 1
|
|
; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
|
|
; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1
|
|
; GFX1032-NEXT: s_add_u32 s0, s0, 4
|
|
; GFX1032-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
|
|
; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo
|
|
; GFX1032-NEXT: s_or_b32 s4, s4, s5
|
|
; GFX1032-NEXT: .LBB11_3: ; %Flow
|
|
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
|
|
; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4
|
|
; GFX1032-NEXT: s_or_b32 s2, s5, s2
|
|
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
|
|
; GFX1032-NEXT: .LBB11_4: ; %bb2
|
|
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v3, v2, s[0:1]
|
|
; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3
|
|
; GFX1032-NEXT: s_cbranch_vccz .LBB11_2
|
|
; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
|
|
; GFX1032-NEXT: ; implicit-def: $sgpr3
|
|
; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1
|
|
; GFX1032-NEXT: s_branch .LBB11_3
|
|
; GFX1032-NEXT: .LBB11_6: ; %.loopexit
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_loop_with_if_else_break:
|
|
; GFX1064: ; %bb.0: ; %bb
|
|
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_mov_b32 s6, 0
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
|
|
; GFX1064-NEXT: ; %bb.1: ; %.preheader
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX1064-NEXT: ; implicit-def: $sgpr4_sgpr5
|
|
; GFX1064-NEXT: s_branch .LBB11_4
|
|
; GFX1064-NEXT: .LBB11_2: ; %bb8
|
|
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
|
|
; GFX1064-NEXT: s_add_i32 s6, s6, 1
|
|
; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
|
|
; GFX1064-NEXT: v_cmp_ge_u32_e32 vcc, s6, v1
|
|
; GFX1064-NEXT: s_add_u32 s0, s0, 4
|
|
; GFX1064-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
|
|
; GFX1064-NEXT: s_and_b64 s[8:9], vcc, exec
|
|
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
|
|
; GFX1064-NEXT: .LBB11_3: ; %Flow
|
|
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
|
|
; GFX1064-NEXT: s_and_b64 s[8:9], exec, s[4:5]
|
|
; GFX1064-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
|
|
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
|
|
; GFX1064-NEXT: .LBB11_4: ; %bb2
|
|
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v3, v2, s[0:1]
|
|
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], exec
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v3
|
|
; GFX1064-NEXT: s_cbranch_vccz .LBB11_2
|
|
; GFX1064-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
|
|
; GFX1064-NEXT: ; implicit-def: $sgpr6
|
|
; GFX1064-NEXT: ; implicit-def: $sgpr0_sgpr1
|
|
; GFX1064-NEXT: s_branch .LBB11_3
|
|
; GFX1064-NEXT: .LBB11_6: ; %.loopexit
|
|
; GFX1064-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp1 = icmp eq i32 %tmp, 0
|
|
br i1 %tmp1, label %.loopexit, label %.preheader
|
|
|
|
|
|
.preheader:
|
|
br label %bb2
|
|
|
|
|
|
bb2:
|
|
%tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ]
|
|
%tmp4 = zext i32 %tmp3 to i64
|
|
%tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
|
|
%tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
|
|
%tmp7 = icmp sgt i32 %tmp6, 10
|
|
br i1 %tmp7, label %bb8, label %.loopexit
|
|
|
|
|
|
bb8:
|
|
store i32 %tmp, ptr addrspace(1) %tmp5, align 4
|
|
%tmp9 = add nuw nsw i32 %tmp3, 1
|
|
%tmp10 = icmp ult i32 %tmp9, 256
|
|
%tmp11 = icmp ult i32 %tmp9, %tmp
|
|
%tmp12 = and i1 %tmp10, %tmp11
|
|
br i1 %tmp12, label %bb2, label %.loopexit
|
|
|
|
|
|
.loopexit:
|
|
ret void
|
|
}
|
|
|
|
; 64-bit add of a per-workitem loaded value and the kernel argument %arg1. The
; assertions expect a v_add_co_u32 / v_add_co_ci_u32_e32 pair whose carry is
; passed through vcc_lo in the GFX1032 runs and vcc in the GFX1064 runs.
define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
|
|
; GFX1032-LABEL: test_addc_vop2b:
|
|
; GFX1032: ; %bb.0: ; %bb
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
|
|
; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_addc_vop2b:
|
|
; GFX1064: ; %bb.0: ; %bb
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s2
|
|
; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
|
|
%tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
|
|
%tmp5 = add nsw i64 %tmp4, %arg1
|
|
store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
|
|
ret void
|
|
}
|
|
|
|
; 64-bit subtract `loaded - %arg1`. The assertions expect v_sub_co_u32 for the
; low half and the reversed-operand borrow form v_subrev_co_ci_u32_e32 for the
; high half, with the borrow in vcc_lo (GFX1032 runs) or vcc (GFX1064 runs).
define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
|
|
; GFX1032-LABEL: test_subbrev_vop2b:
|
|
; GFX1032: ; %bb.0: ; %bb
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
|
|
; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_subbrev_vop2b:
|
|
; GFX1064: ; %bb.0: ; %bb
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s2
|
|
; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
|
|
%tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
|
|
%tmp5 = sub nsw i64 %tmp4, %arg1
|
|
store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
|
|
ret void
|
|
}
|
|
|
|
; 64-bit subtract with the operands the other way round: `%arg1 - loaded`.
; The assertions expect v_sub_co_u32 followed by the non-reversed borrow form
; v_sub_co_ci_u32_e32, with the borrow in vcc_lo (GFX1032 runs) or vcc
; (GFX1064 runs).
define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
|
|
; GFX1032-LABEL: test_subb_vop2b:
|
|
; GFX1032: ; %bb.0: ; %bb
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
|
|
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_subb_vop2b:
|
|
; GFX1064: ; %bb.0: ; %bb
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
|
|
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
bb:
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
|
|
%tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
|
|
%tmp5 = sub nsw i64 %arg1, %tmp4
|
|
store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
|
|
ret void
|
|
}
|
|
|
|
; 64-bit unsigned division: the dividend is loaded from element 1 of %arg, the
; divisor from element 0, and the quotient is stored to element 2.
; The expected code starts with an s_or_b64 of the two operands, zeroes one
; half of that result, and branches on it: %bb.1 holds the long expansion,
; while .LBB15_2 holds a short sequence built on v_rcp_iflag_f32 and 32-bit
; scalar arithmetic.
; Carry/borrow handling differs between the two RUN configurations: the wave32
; checks keep the v_add_co_u32/v_sub_co_u32 carry in a single SGPR and test it
; with s_cmp_lg_u32, whereas the wave64 checks keep it in an SGPR pair and test
; it with s_cmp_lg_u64.
define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
|
|
; GFX1032-LABEL: test_udiv64:
|
|
; GFX1032: ; %bb.0: ; %bb
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: s_or_b64 s[8:9], s[6:7], s[4:5]
|
|
; GFX1032-NEXT: s_mov_b32 s8, 0
|
|
; GFX1032-NEXT: s_cmp_lg_u64 s[8:9], 0
|
|
; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4
|
|
; GFX1032-NEXT: ; %bb.1:
|
|
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4
|
|
; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s5
|
|
; GFX1032-NEXT: s_sub_u32 s9, 0, s4
|
|
; GFX1032-NEXT: s_subb_u32 s10, 0, s5
|
|
; GFX1032-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
|
; GFX1032-NEXT: v_rcp_f32_e32 v0, v0
|
|
; GFX1032-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
|
; GFX1032-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
|
; GFX1032-NEXT: v_trunc_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
|
; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0
|
|
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
|
|
; GFX1032-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX1032-NEXT: s_mul_i32 s11, s9, s0
|
|
; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s1
|
|
; GFX1032-NEXT: s_mul_i32 s12, s10, s1
|
|
; GFX1032-NEXT: s_add_i32 s11, s13, s11
|
|
; GFX1032-NEXT: s_mul_i32 s14, s9, s1
|
|
; GFX1032-NEXT: s_add_i32 s11, s11, s12
|
|
; GFX1032-NEXT: s_mul_hi_u32 s13, s1, s14
|
|
; GFX1032-NEXT: s_mul_hi_u32 s15, s0, s14
|
|
; GFX1032-NEXT: s_mul_i32 s12, s0, s14
|
|
; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11
|
|
; GFX1032-NEXT: s_mul_i32 s1, s1, s11
|
|
; GFX1032-NEXT: s_mul_hi_u32 s16, s0, s11
|
|
; GFX1032-NEXT: s_add_u32 s1, s13, s1
|
|
; GFX1032-NEXT: s_addc_u32 s13, 0, s14
|
|
; GFX1032-NEXT: s_add_u32 s1, s1, s12
|
|
; GFX1032-NEXT: s_mul_i32 s11, s0, s11
|
|
; GFX1032-NEXT: s_addc_u32 s1, s13, s15
|
|
; GFX1032-NEXT: s_addc_u32 s12, s16, 0
|
|
; GFX1032-NEXT: s_add_u32 s1, s1, s11
|
|
; GFX1032-NEXT: s_addc_u32 s11, 0, s12
|
|
; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1
|
|
; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
|
|
; GFX1032-NEXT: s_addc_u32 s0, s0, s11
|
|
; GFX1032-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX1032-NEXT: s_mul_i32 s11, s9, s0
|
|
; GFX1032-NEXT: s_mul_hi_u32 s12, s9, s1
|
|
; GFX1032-NEXT: s_mul_i32 s10, s10, s1
|
|
; GFX1032-NEXT: s_add_i32 s11, s12, s11
|
|
; GFX1032-NEXT: s_mul_i32 s9, s9, s1
|
|
; GFX1032-NEXT: s_add_i32 s11, s11, s10
|
|
; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s9
|
|
; GFX1032-NEXT: s_mul_i32 s13, s0, s9
|
|
; GFX1032-NEXT: s_mul_hi_u32 s9, s1, s9
|
|
; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11
|
|
; GFX1032-NEXT: s_mul_i32 s1, s1, s11
|
|
; GFX1032-NEXT: s_mul_hi_u32 s10, s0, s11
|
|
; GFX1032-NEXT: s_add_u32 s1, s9, s1
|
|
; GFX1032-NEXT: s_addc_u32 s9, 0, s14
|
|
; GFX1032-NEXT: s_add_u32 s1, s1, s13
|
|
; GFX1032-NEXT: s_mul_i32 s11, s0, s11
|
|
; GFX1032-NEXT: s_addc_u32 s1, s9, s12
|
|
; GFX1032-NEXT: s_addc_u32 s9, s10, 0
|
|
; GFX1032-NEXT: s_add_u32 s1, s1, s11
|
|
; GFX1032-NEXT: s_addc_u32 s9, 0, s9
|
|
; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1
|
|
; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
|
|
; GFX1032-NEXT: s_addc_u32 s0, s0, s9
|
|
; GFX1032-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX1032-NEXT: s_mul_i32 s10, s6, s0
|
|
; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s0
|
|
; GFX1032-NEXT: s_mul_hi_u32 s11, s7, s0
|
|
; GFX1032-NEXT: s_mul_i32 s0, s7, s0
|
|
; GFX1032-NEXT: s_mul_hi_u32 s12, s6, s1
|
|
; GFX1032-NEXT: s_mul_hi_u32 s13, s7, s1
|
|
; GFX1032-NEXT: s_mul_i32 s1, s7, s1
|
|
; GFX1032-NEXT: s_add_u32 s10, s12, s10
|
|
; GFX1032-NEXT: s_addc_u32 s9, 0, s9
|
|
; GFX1032-NEXT: s_add_u32 s1, s10, s1
|
|
; GFX1032-NEXT: s_addc_u32 s1, s9, s13
|
|
; GFX1032-NEXT: s_addc_u32 s9, s11, 0
|
|
; GFX1032-NEXT: s_add_u32 s1, s1, s0
|
|
; GFX1032-NEXT: s_addc_u32 s9, 0, s9
|
|
; GFX1032-NEXT: s_mul_hi_u32 s0, s4, s1
|
|
; GFX1032-NEXT: s_mul_i32 s11, s4, s9
|
|
; GFX1032-NEXT: s_mul_i32 s12, s4, s1
|
|
; GFX1032-NEXT: s_add_i32 s0, s0, s11
|
|
; GFX1032-NEXT: v_sub_co_u32 v0, s11, s6, s12
|
|
; GFX1032-NEXT: s_mul_i32 s10, s5, s1
|
|
; GFX1032-NEXT: s_add_i32 s0, s0, s10
|
|
; GFX1032-NEXT: v_sub_co_u32 v1, s12, v0, s4
|
|
; GFX1032-NEXT: s_sub_i32 s10, s7, s0
|
|
; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
|
|
; GFX1032-NEXT: s_subb_u32 s10, s10, s5
|
|
; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
|
|
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
|
|
; GFX1032-NEXT: s_subb_u32 s10, s10, 0
|
|
; GFX1032-NEXT: s_cmp_ge_u32 s10, s5
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
|
|
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
|
|
; GFX1032-NEXT: s_cmp_eq_u32 s10, s5
|
|
; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0
|
|
; GFX1032-NEXT: s_add_u32 s10, s1, 1
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
|
|
; GFX1032-NEXT: s_addc_u32 s12, s9, 0
|
|
; GFX1032-NEXT: s_add_u32 s13, s1, 2
|
|
; GFX1032-NEXT: s_addc_u32 s14, s9, 0
|
|
; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
|
|
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v0
|
|
; GFX1032-NEXT: s_subb_u32 s0, s7, s0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, s13
|
|
; GFX1032-NEXT: s_cmp_ge_u32 s0, s5
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
|
|
; GFX1032-NEXT: s_cselect_b32 s7, -1, 0
|
|
; GFX1032-NEXT: s_cmp_eq_u32 s0, s5
|
|
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
|
|
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s14
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo
|
|
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo
|
|
; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
|
|
; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3
|
|
; GFX1032-NEXT: .LBB15_2:
|
|
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4
|
|
; GFX1032-NEXT: s_sub_i32 s1, 0, s4
|
|
; GFX1032-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
|
; GFX1032-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
|
; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0
|
|
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX1032-NEXT: s_mul_i32 s1, s1, s0
|
|
; GFX1032-NEXT: s_mul_hi_u32 s1, s0, s1
|
|
; GFX1032-NEXT: s_add_i32 s0, s0, s1
|
|
; GFX1032-NEXT: s_mul_hi_u32 s0, s6, s0
|
|
; GFX1032-NEXT: s_mul_i32 s1, s0, s4
|
|
; GFX1032-NEXT: s_add_i32 s5, s0, 1
|
|
; GFX1032-NEXT: s_sub_i32 s1, s6, s1
|
|
; GFX1032-NEXT: s_sub_i32 s6, s1, s4
|
|
; GFX1032-NEXT: s_cmp_ge_u32 s1, s4
|
|
; GFX1032-NEXT: s_cselect_b32 s0, s5, s0
|
|
; GFX1032-NEXT: s_cselect_b32 s1, s6, s1
|
|
; GFX1032-NEXT: s_add_i32 s5, s0, 1
|
|
; GFX1032-NEXT: s_cmp_ge_u32 s1, s4
|
|
; GFX1032-NEXT: s_mov_b32 s1, 0
|
|
; GFX1032-NEXT: s_cselect_b32 s0, s5, s0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX1032-NEXT: .LBB15_3:
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
|
|
; GFX1032-NEXT: s_endpgm
|
|
; GFX1032-NEXT: .LBB15_4:
|
|
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX1032-NEXT: s_branch .LBB15_2
|
|
;
|
|
; GFX1064-LABEL: test_udiv64:
|
|
; GFX1064: ; %bb.0: ; %bb
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5]
|
|
; GFX1064-NEXT: s_mov_b32 s0, 0
|
|
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX1064-NEXT: s_cbranch_scc0 .LBB15_4
|
|
; GFX1064-NEXT: ; %bb.1:
|
|
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4
|
|
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s5
|
|
; GFX1064-NEXT: s_sub_u32 s9, 0, s4
|
|
; GFX1064-NEXT: s_subb_u32 s10, 0, s5
|
|
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
|
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
|
|
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
|
; GFX1064-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
|
; GFX1064-NEXT: v_trunc_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
|
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
|
|
; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
|
|
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX1064-NEXT: s_mul_i32 s1, s9, s8
|
|
; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s0
|
|
; GFX1064-NEXT: s_mul_i32 s11, s10, s0
|
|
; GFX1064-NEXT: s_add_i32 s1, s12, s1
|
|
; GFX1064-NEXT: s_mul_i32 s13, s9, s0
|
|
; GFX1064-NEXT: s_add_i32 s1, s1, s11
|
|
; GFX1064-NEXT: s_mul_hi_u32 s12, s0, s13
|
|
; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
|
|
; GFX1064-NEXT: s_mul_i32 s11, s8, s13
|
|
; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1
|
|
; GFX1064-NEXT: s_mul_i32 s0, s0, s1
|
|
; GFX1064-NEXT: s_mul_hi_u32 s15, s8, s1
|
|
; GFX1064-NEXT: s_add_u32 s0, s12, s0
|
|
; GFX1064-NEXT: s_addc_u32 s12, 0, s13
|
|
; GFX1064-NEXT: s_add_u32 s0, s0, s11
|
|
; GFX1064-NEXT: s_mul_i32 s1, s8, s1
|
|
; GFX1064-NEXT: s_addc_u32 s0, s12, s14
|
|
; GFX1064-NEXT: s_addc_u32 s11, s15, 0
|
|
; GFX1064-NEXT: s_add_u32 s0, s0, s1
|
|
; GFX1064-NEXT: s_addc_u32 s11, 0, s11
|
|
; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
|
|
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX1064-NEXT: s_addc_u32 s8, s8, s11
|
|
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX1064-NEXT: s_mul_i32 s1, s9, s8
|
|
; GFX1064-NEXT: s_mul_hi_u32 s11, s9, s0
|
|
; GFX1064-NEXT: s_mul_i32 s10, s10, s0
|
|
; GFX1064-NEXT: s_add_i32 s1, s11, s1
|
|
; GFX1064-NEXT: s_mul_i32 s9, s9, s0
|
|
; GFX1064-NEXT: s_add_i32 s1, s1, s10
|
|
; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s9
|
|
; GFX1064-NEXT: s_mul_i32 s12, s8, s9
|
|
; GFX1064-NEXT: s_mul_hi_u32 s9, s0, s9
|
|
; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1
|
|
; GFX1064-NEXT: s_mul_i32 s0, s0, s1
|
|
; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s1
|
|
; GFX1064-NEXT: s_add_u32 s0, s9, s0
|
|
; GFX1064-NEXT: s_addc_u32 s9, 0, s13
|
|
; GFX1064-NEXT: s_add_u32 s0, s0, s12
|
|
; GFX1064-NEXT: s_mul_i32 s1, s8, s1
|
|
; GFX1064-NEXT: s_addc_u32 s0, s9, s11
|
|
; GFX1064-NEXT: s_addc_u32 s9, s10, 0
|
|
; GFX1064-NEXT: s_add_u32 s0, s0, s1
|
|
; GFX1064-NEXT: s_addc_u32 s9, 0, s9
|
|
; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0
|
|
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX1064-NEXT: s_addc_u32 s0, s8, s9
|
|
; GFX1064-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX1064-NEXT: s_mul_i32 s9, s6, s0
|
|
; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s0
|
|
; GFX1064-NEXT: s_mul_hi_u32 s10, s7, s0
|
|
; GFX1064-NEXT: s_mul_i32 s0, s7, s0
|
|
; GFX1064-NEXT: s_mul_hi_u32 s11, s6, s1
|
|
; GFX1064-NEXT: s_mul_hi_u32 s12, s7, s1
|
|
; GFX1064-NEXT: s_mul_i32 s1, s7, s1
|
|
; GFX1064-NEXT: s_add_u32 s9, s11, s9
|
|
; GFX1064-NEXT: s_addc_u32 s8, 0, s8
|
|
; GFX1064-NEXT: s_add_u32 s1, s9, s1
|
|
; GFX1064-NEXT: s_addc_u32 s1, s8, s12
|
|
; GFX1064-NEXT: s_addc_u32 s8, s10, 0
|
|
; GFX1064-NEXT: s_add_u32 s10, s1, s0
|
|
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
|
|
; GFX1064-NEXT: s_mul_hi_u32 s0, s4, s10
|
|
; GFX1064-NEXT: s_mul_i32 s1, s4, s11
|
|
; GFX1064-NEXT: s_mul_i32 s9, s4, s10
|
|
; GFX1064-NEXT: s_add_i32 s12, s0, s1
|
|
; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9
|
|
; GFX1064-NEXT: s_mul_i32 s8, s5, s10
|
|
; GFX1064-NEXT: s_add_i32 s12, s12, s8
|
|
; GFX1064-NEXT: v_sub_co_u32 v1, s[8:9], v0, s4
|
|
; GFX1064-NEXT: s_sub_i32 s13, s7, s12
|
|
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX1064-NEXT: s_subb_u32 s13, s13, s5
|
|
; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
|
|
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
|
|
; GFX1064-NEXT: s_subb_u32 s8, s13, 0
|
|
; GFX1064-NEXT: s_cmp_ge_u32 s8, s5
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
|
|
; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
|
|
; GFX1064-NEXT: s_cmp_eq_u32 s8, s5
|
|
; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0
|
|
; GFX1064-NEXT: s_add_u32 s8, s10, 1
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
|
|
; GFX1064-NEXT: s_addc_u32 s9, s11, 0
|
|
; GFX1064-NEXT: s_add_u32 s13, s10, 2
|
|
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
|
|
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
|
|
; GFX1064-NEXT: s_subb_u32 s0, s7, s12
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, s13
|
|
; GFX1064-NEXT: s_cmp_ge_u32 s0, s5
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
|
; GFX1064-NEXT: s_cselect_b32 s7, -1, 0
|
|
; GFX1064-NEXT: s_cmp_eq_u32 s0, s5
|
|
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
|
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s14
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1]
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc
|
|
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc
|
|
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
|
|
; GFX1064-NEXT: .LBB15_2:
|
|
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4
|
|
; GFX1064-NEXT: s_sub_i32 s1, 0, s4
|
|
; GFX1064-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
|
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
|
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
|
|
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX1064-NEXT: s_mul_i32 s1, s1, s0
|
|
; GFX1064-NEXT: s_mul_hi_u32 s1, s0, s1
|
|
; GFX1064-NEXT: s_add_i32 s0, s0, s1
|
|
; GFX1064-NEXT: s_mul_hi_u32 s0, s6, s0
|
|
; GFX1064-NEXT: s_mul_i32 s1, s0, s4
|
|
; GFX1064-NEXT: s_add_i32 s5, s0, 1
|
|
; GFX1064-NEXT: s_sub_i32 s1, s6, s1
|
|
; GFX1064-NEXT: s_sub_i32 s6, s1, s4
|
|
; GFX1064-NEXT: s_cmp_ge_u32 s1, s4
|
|
; GFX1064-NEXT: s_cselect_b32 s0, s5, s0
|
|
; GFX1064-NEXT: s_cselect_b32 s1, s6, s1
|
|
; GFX1064-NEXT: s_add_i32 s5, s0, 1
|
|
; GFX1064-NEXT: s_cmp_ge_u32 s1, s4
|
|
; GFX1064-NEXT: s_mov_b32 s1, 0
|
|
; GFX1064-NEXT: s_cselect_b32 s0, s5, s0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX1064-NEXT: .LBB15_3:
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
|
|
; GFX1064-NEXT: s_endpgm
|
|
; GFX1064-NEXT: .LBB15_4:
|
|
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX1064-NEXT: s_branch .LBB15_2
|
|
bb:
|
|
%tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1
|
|
; %tmp1 (element 1) is the dividend; %tmp2 (element 0) is the divisor.
%tmp1 = load i64, ptr addrspace(1) %tmp, align 8
|
|
%tmp2 = load i64, ptr addrspace(1) %arg, align 8
|
|
%tmp3 = udiv i64 %tmp1, %tmp2
|
|
%tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 2
|
|
store i64 %tmp3, ptr addrspace(1) %tmp4, align 8
|
|
ret void
|
|
}
|
|
|
|
; Calls llvm.amdgcn.div.scale.f32 on two floats volatile-loaded from
; %in[tid] and %in[tid + 1], with the i1 operand set to false, and stores
; only the float member of the returned { float, i1 } pair to %out.
; The second destination operand of the expected v_div_scale_f32 is a single
; SGPR (s2) in the wave32 checks and an SGPR pair (s[2:3]) in the wave64 checks.
define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GFX1032-LABEL: test_div_scale_f32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: v_div_scale_f32 v1, s2, v2, v2, v1
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_div_scale_f32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: v_div_scale_f32 v1, s[2:3], v2, v2, v1
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
|
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
|
|
|
|
%a = load volatile float, ptr addrspace(1) %gep.0, align 4
|
|
%b = load volatile float, ptr addrspace(1) %gep.1, align 4
|
|
|
|
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
|
|
; The i1 member of %result is never extracted; only member 0 is stored.
%result0 = extractvalue { float, i1 } %result, 0
|
|
store float %result0, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Double-precision counterpart of the div.scale test: calls
; llvm.amdgcn.div.scale.f64 on two doubles volatile-loaded from %in[tid] and
; %in[tid + 1], with the i1 operand set to true, and stores only the double
; member of the result to %out.
; The second destination operand of the expected v_div_scale_f64 is s2 in the
; wave32 checks and s[2:3] in the wave64 checks.
; NOTE(review): the %aptr argument is not referenced anywhere in the body.
define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 {
|
|
; GFX1032-LABEL: test_div_scale_f64:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_div_scale_f64:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
|
|
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
|
|
|
|
%a = load volatile double, ptr addrspace(1) %gep.0, align 8
|
|
%b = load volatile double, ptr addrspace(1) %gep.1, align 8
|
|
|
|
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
|
|
%result0 = extractvalue { double, i1 } %result, 0
|
|
store double %result0, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Signed 32x32 -> 64-bit multiply-add written as sext + mul + add. Both RUN
; configurations expect it to become a single v_mad_i64_i32.
; The instruction's second destination operand is s4 in the wave32 checks and
; s[4:5] in the wave64 checks.
; This is a non-kernel function: it returns i64 and the expected code ends in
; s_setpc_b64 rather than s_endpgm.
define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|
; GFX1032-LABEL: test_mad_i64_i32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1032-NEXT: v_mad_i64_i32 v[0:1], s4, v0, v1, v[2:3]
|
|
; GFX1032-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1064-LABEL: test_mad_i64_i32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1064-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
|
|
; GFX1064-NEXT: s_setpc_b64 s[30:31]
|
|
%sext0 = sext i32 %arg0 to i64
|
|
%sext1 = sext i32 %arg1 to i64
|
|
%mul = mul i64 %sext0, %sext1
|
|
%mad = add i64 %mul, %arg2
|
|
ret i64 %mad
|
|
}
|
|
|
|
; Unsigned 32x32 -> 64-bit multiply-add written as zext + mul + add. Both RUN
; configurations expect it to become a single v_mad_u64_u32.
; The instruction's second destination operand is s4 in the wave32 checks and
; s[4:5] in the wave64 checks.
; NOTE(review): the values are named %sext0/%sext1 but are produced by zext.
define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
|
|
; GFX1032-LABEL: test_mad_u64_u32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s4, v0, v1, v[2:3]
|
|
; GFX1032-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1064-LABEL: test_mad_u64_u32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
|
|
; GFX1064-NEXT: s_setpc_b64 s[30:31]
|
|
%sext0 = zext i32 %arg0 to i64
|
|
%sext1 = zext i32 %arg1 to i64
|
|
%mul = mul i64 %sext0, %sext1
|
|
%mad = add i64 %mul, %arg2
|
|
ret i64 %mad
|
|
}
|
|
|
|
; Calls llvm.amdgcn.div.fmas.f32 with three float kernel arguments and an i1
; kernel argument as the condition, then stores the result to %out.
; The expected code tests bit 0 of the loaded condition with s_bitcmp1_b32 and
; materialises it with s_cselect before the v_div_fmas_f32: into vcc_lo
; (s_cselect_b32) in the wave32 checks, into vcc (s_cselect_b64) in the
; wave64 checks.
define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i1 %d) nounwind {
|
|
; GFX1032-LABEL: test_div_fmas_f32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s5
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s6
|
|
; GFX1032-NEXT: s_bitcmp1_b32 s7, 0
|
|
; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0
|
|
; GFX1032-NEXT: v_div_fmas_f32 v0, s4, v0, v1
|
|
; GFX1032-NEXT: global_store_dword v2, v0, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_div_fmas_f32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s5
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s6
|
|
; GFX1064-NEXT: s_bitcmp1_b32 s7, 0
|
|
; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0
|
|
; GFX1064-NEXT: v_div_fmas_f32 v0, s4, v0, v1
|
|
; GFX1064-NEXT: global_store_dword v2, v0, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
|
|
store float %result, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Double-precision counterpart of the div.fmas test: calls
; llvm.amdgcn.div.fmas.f64 with three double kernel arguments and an i1 kernel
; argument as the condition, then stores the result to %out.
; As in the f32 variant, the expected code materialises the condition with
; s_bitcmp1_b32 + s_cselect into vcc_lo (wave32 checks) or vcc (wave64 checks)
; ahead of the v_div_fmas_f64.
define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
|
|
; GFX1032-LABEL: test_div_fmas_f64:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
|
|
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x44
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s8
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s9
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1032-NEXT: v_mov_b32_e32 v3, s11
|
|
; GFX1032-NEXT: s_bitcmp1_b32 s2, 0
|
|
; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0
|
|
; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_div_fmas_f64:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
|
|
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x44
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s8
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s9
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1064-NEXT: v_mov_b32_e32 v3, s11
|
|
; GFX1064-NEXT: s_bitcmp1_b32 s2, 0
|
|
; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0
|
|
; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
|
|
store double %result, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
|
|
|
|
; div.fmas whose i1 condition is a phi: false when coming straight from
; %entry, or (volatile-loaded %dummy value != 0) when coming through %bb,
; which only work-items with tid == 0 enter.
; The expected code initialises vcc (vcc_lo in the wave32 checks) to 0, guards
; %bb with s_and_saveexec, writes the compare result into vcc and ANDs it with
; exec, then restores exec with s_or before the v_div_fmas_f32.
; Mask operations are the _b32 forms on vcc_lo/exec_lo in the wave32 checks
; and the _b64 forms on vcc/exec in the wave64 checks.
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 {
|
|
; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
|
; GFX1032-NEXT: s_mov_b32 null, 0
|
|
; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
|
|
; GFX1032-NEXT: s_mov_b32 vcc_lo, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7]
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB22_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %bb
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo
|
|
; GFX1032-NEXT: .LBB22_2: ; %exit
|
|
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] offset:8
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
|
|
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
|
; GFX1064-NEXT: s_mov_b32 null, 0
|
|
; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
|
|
; GFX1064-NEXT: s_mov_b64 vcc, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7]
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB22_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %bb
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: global_load_dword v0, v0, s[8:9] glc dlc
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_and_b64 vcc, vcc, exec
|
|
; GFX1064-NEXT: .LBB22_2: ; %exit
|
|
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] offset:8
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|
%gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
|
|
%gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
|
%gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
|
|
%gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2
|
|
|
|
%a = load float, ptr addrspace(1) %gep.a
|
|
%b = load float, ptr addrspace(1) %gep.b
|
|
%c = load float, ptr addrspace(1) %gep.c
|
|
|
|
%cmp0 = icmp eq i32 %tid, 0
|
|
br i1 %cmp0, label %bb, label %exit
|
|
|
|
bb:
|
|
%val = load volatile i32, ptr addrspace(1) %dummy
|
|
%cmp1 = icmp ne i32 %val, 0
|
|
br label %exit
|
|
|
|
exit:
|
|
; The condition is false on the direct %entry edge and %cmp1 on the %bb edge.
%cond = phi i1 [false, %entry], [%cmp1, %bb]
|
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
|
|
store float %result, ptr addrspace(1) %gep.out, align 4
|
|
ret void
|
|
}
|
|
|
|
|
|
; Plain float division of two kernel arguments, stored to %out.
; The expected expansion is two v_div_scale_f32, a v_rcp_f32, a chain of
; v_fma/v_fmac/v_mul steps, then v_div_fmas_f32 and v_div_fixup_f32.
; The first v_div_scale_f32 writes its second result to s4 (wave32 checks) or
; s[4:5] (wave64 checks); the second writes it to vcc_lo or vcc respectively.
define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
|
|
; GFX1032-LABEL: fdiv_f32:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2
|
|
; GFX1032-NEXT: v_rcp_f32_e32 v1, v0
|
|
; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0
|
|
; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1
|
|
; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2
|
|
; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1
|
|
; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2
|
|
; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1
|
|
; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2
|
|
; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2
|
|
; GFX1032-NEXT: global_store_dword v1, v0, s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: fdiv_f32:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2
|
|
; GFX1064-NEXT: v_rcp_f32_e32 v1, v0
|
|
; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0
|
|
; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1
|
|
; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2
|
|
; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1
|
|
; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2
|
|
; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1
|
|
; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2
|
|
; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2
|
|
; GFX1064-NEXT: global_store_dword v1, v0, s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%fdiv = fdiv float %a, %b
|
|
store float %fdiv, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Branch on an f16 ordered-less-than compare of two loaded halves: %one stores
; %a.val to %r, %two stores %b.val to %r.
; The expected code uses the inverted compare v_cmp_nlt_f16_e32 and
; s_cbranch_vccnz to reach %two; the compare result is written to vcc_lo in
; the wave32 checks and to vcc in the wave64 checks.
; The parameter list follows the autogenerated check lines because the
; signature is split across several lines.
define amdgpu_kernel void @test_br_cc_f16(
|
|
; GFX1032-LABEL: test_br_cc_f16:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: global_load_ushort v1, v0, s[6:7]
|
|
; GFX1032-NEXT: global_load_ushort v2, v0, s[2:3]
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2
|
|
; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %one
|
|
; GFX1032-NEXT: global_store_short v0, v1, s[4:5]
|
|
; GFX1032-NEXT: s_endpgm
|
|
; GFX1032-NEXT: .LBB24_2: ; %two
|
|
; GFX1032-NEXT: global_store_short v0, v2, s[4:5]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_br_cc_f16:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: global_load_ushort v1, v0, s[6:7]
|
|
; GFX1064-NEXT: global_load_ushort v2, v0, s[2:3]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2
|
|
; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %one
|
|
; GFX1064-NEXT: global_store_short v0, v1, s[4:5]
|
|
; GFX1064-NEXT: s_endpgm
|
|
; GFX1064-NEXT: .LBB24_2: ; %two
|
|
; GFX1064-NEXT: global_store_short v0, v2, s[4:5]
|
|
; GFX1064-NEXT: s_endpgm
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %a,
|
|
ptr addrspace(1) %b) {
|
|
entry:
|
|
%a.val = load half, ptr addrspace(1) %a
|
|
%b.val = load half, ptr addrspace(1) %b
|
|
%fcmp = fcmp olt half %a.val, %b.val
|
|
br i1 %fcmp, label %one, label %two
|
|
|
|
one:
|
|
store half %a.val, ptr addrspace(1) %r
|
|
ret void
|
|
|
|
two:
|
|
store half %b.val, ptr addrspace(1) %r
|
|
ret void
|
|
|
|
}
|
|
|
|
; Branch on an i1 kernel argument: %store writes the constant 222 (0xde) to
; %out, %end just returns.
; The expected code tests bit 0 of the loaded argument with s_bitcmp0_b32 and
; branches with s_cbranch_scc1. It is checked under the shared prefix, so the
; same output is expected from every RUN line.
; NOTE(review): the %in argument is not referenced anywhere in the body.
define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 {
|
|
; GCN-LABEL: test_brcc_i1:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dword s2, s[0:1], 0x34
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_bitcmp0_b32 s2, 0
|
|
; GCN-NEXT: s_cbranch_scc1 .LBB25_2
|
|
; GCN-NEXT: ; %bb.1: ; %store
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0xde
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GCN-NEXT: .LBB25_2: ; %end
|
|
; GCN-NEXT: s_endpgm
|
|
%cmp0 = icmp ne i1 %val, 0
|
|
br i1 %cmp0, label %store, label %end
|
|
|
|
store:
|
|
store i32 222, ptr addrspace(1) %out
|
|
ret void
|
|
|
|
end:
|
|
ret void
|
|
}
|
|
|
|
; Branch on the AND of three float compares, one of which involves a select
; controlled by llvm.amdgcn.class.f32 called with undef operands.
; The expected code evaluates three inverted compares (v_cmp_nlt / v_cmp_nlt /
; v_cmp_ngt) into SGPRs, ORs them, ANDs the result with exec into vcc and
; skips %bb1 with s_cbranch_vccnz.
; The compare results and mask operations use single SGPRs and _b32 forms in
; the wave32 checks, SGPR pairs and _b64 forms in the wave64 checks.
define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 {
|
|
; GFX1032-LABEL: test_preserve_condition_undef_flag:
|
|
; GFX1032: ; %bb.0: ; %bb0
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
|
|
; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x24
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s2, 1.0
|
|
; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s3, 1.0
|
|
; GFX1032-NEXT: v_cmp_ngt_f32_e64 s2, s2, 0
|
|
; GFX1032-NEXT: s_or_b32 s0, s0, s1
|
|
; GFX1032-NEXT: s_or_b32 s0, s0, s2
|
|
; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0
|
|
; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %bb1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: global_store_dword v[0:1], v0, off
|
|
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1032-NEXT: .LBB26_2: ; %bb2
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_preserve_condition_undef_flag:
|
|
; GFX1064: ; %bb.0: ; %bb0
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x24
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, 1.0
|
|
; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0
|
|
; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s4, 0
|
|
; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
|
|
; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
|
|
; GFX1064-NEXT: s_and_b64 vcc, exec, s[0:1]
|
|
; GFX1064-NEXT: s_cbranch_vccnz .LBB26_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %bb1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: global_store_dword v[0:1], v0, off
|
|
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1064-NEXT: .LBB26_2: ; %bb2
|
|
; GFX1064-NEXT: s_endpgm
|
|
bb0:
|
|
; NOTE(review): %tmp is not referenced by any later instruction in this function.
%tmp = icmp sgt i32 %arg1, 4
|
|
; Condition computed from undef operands; it only feeds the select below.
%undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
|
|
%tmp4 = select i1 %undef, float %arg, float 1.000000e+00
|
|
%tmp5 = fcmp ogt float %arg2, 0.000000e+00
|
|
%tmp6 = fcmp olt float %arg2, 1.000000e+00
|
|
%tmp7 = fcmp olt float %arg, %tmp4
|
|
%tmp8 = and i1 %tmp5, %tmp6
|
|
%tmp9 = and i1 %tmp8, %tmp7
|
|
br i1 %tmp9, label %bb1, label %bb2
|
|
|
|
bb1:
|
|
store volatile i32 0, ptr addrspace(1) undef
|
|
br label %bb2
|
|
|
|
bb2:
|
|
ret void
|
|
}
|
|
|
|
; Loop whose back-edge condition is an i1 phi in %Flow: it is the constant
; true when coming straight from %bb1 and a per-lane compare (%cmp1, based on
; the workitem id) when coming from %bb4. The loop counter starts as undef.
; %bb4 does a volatile load through an undef pointer and %bb9 a volatile store
; of 7 through an undef LDS pointer.
; The expected output accumulates the break mask with scalar logic ops and
; leaves the loop through s_andn2 on exec plus s_cbranch_execz; wave32 uses
; 32-bit mask registers and wave64 uses 64-bit register pairs.
define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 {
|
|
; GFX1032-LABEL: test_invert_true_phi_cond_break_loop:
|
|
; GFX1032: ; %bb.0: ; %bb
|
|
; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x24
|
|
; GFX1032-NEXT: ; implicit-def: $sgpr1
|
|
; GFX1032-NEXT: ; implicit-def: $sgpr2
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_subrev_nc_u32_e32 v0, s0, v0
|
|
; GFX1032-NEXT: s_mov_b32 s0, 0
|
|
; GFX1032-NEXT: s_branch .LBB27_2
|
|
; GFX1032-NEXT: .LBB27_1: ; %Flow
|
|
; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1
|
|
; GFX1032-NEXT: s_xor_b32 s3, s1, -1
|
|
; GFX1032-NEXT: s_add_i32 s2, s2, 1
|
|
; GFX1032-NEXT: s_and_b32 s3, exec_lo, s3
|
|
; GFX1032-NEXT: s_or_b32 s0, s3, s0
|
|
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB27_4
|
|
; GFX1032-NEXT: .LBB27_2: ; %bb1
|
|
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo
|
|
; GFX1032-NEXT: s_cmp_gt_i32 s2, -1
|
|
; GFX1032-NEXT: s_cbranch_scc1 .LBB27_1
|
|
; GFX1032-NEXT: ; %bb.3: ; %bb4
|
|
; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1
|
|
; GFX1032-NEXT: global_load_dword v1, v[0:1], off glc dlc
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo
|
|
; GFX1032-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1
|
|
; GFX1032-NEXT: s_and_b32 s3, vcc_lo, exec_lo
|
|
; GFX1032-NEXT: s_or_b32 s1, s1, s3
|
|
; GFX1032-NEXT: s_branch .LBB27_1
|
|
; GFX1032-NEXT: .LBB27_4: ; %bb9
|
|
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 7
|
|
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1032-NEXT: ds_write_b32 v0, v0
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_invert_true_phi_cond_break_loop:
|
|
; GFX1064: ; %bb.0: ; %bb
|
|
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x24
|
|
; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3
|
|
; GFX1064-NEXT: ; implicit-def: $sgpr4
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_subrev_nc_u32_e32 v0, s0, v0
|
|
; GFX1064-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX1064-NEXT: s_branch .LBB27_2
|
|
; GFX1064-NEXT: .LBB27_1: ; %Flow
|
|
; GFX1064-NEXT: ; in Loop: Header=BB27_2 Depth=1
|
|
; GFX1064-NEXT: s_xor_b64 s[6:7], s[2:3], -1
|
|
; GFX1064-NEXT: s_add_i32 s4, s4, 1
|
|
; GFX1064-NEXT: s_and_b64 s[6:7], exec, s[6:7]
|
|
; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
|
|
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB27_4
|
|
; GFX1064-NEXT: .LBB27_2: ; %bb1
|
|
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], exec
|
|
; GFX1064-NEXT: s_cmp_gt_i32 s4, -1
|
|
; GFX1064-NEXT: s_cbranch_scc1 .LBB27_1
|
|
; GFX1064-NEXT: ; %bb.3: ; %bb4
|
|
; GFX1064-NEXT: ; in Loop: Header=BB27_2 Depth=1
|
|
; GFX1064-NEXT: global_load_dword v1, v[0:1], off glc dlc
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
|
|
; GFX1064-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
|
|
; GFX1064-NEXT: s_and_b64 s[6:7], vcc, exec
|
|
; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
|
|
; GFX1064-NEXT: s_branch .LBB27_1
|
|
; GFX1064-NEXT: .LBB27_4: ; %bb9
|
|
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 7
|
|
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX1064-NEXT: ds_write_b32 v0, v0
|
|
; GFX1064-NEXT: s_endpgm
|
|
bb:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%tmp = sub i32 %id, %arg
|
|
br label %bb1
|
|
|
|
bb1: ; preds = %Flow, %bb
|
|
%lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
|
|
%lsr.iv.next = add i32 %lsr.iv, 1
|
|
%cmp0 = icmp slt i32 %lsr.iv.next, 0
|
|
br i1 %cmp0, label %bb4, label %Flow
|
|
|
|
bb4: ; preds = %bb1
|
|
%load = load volatile i32, ptr addrspace(1) undef, align 4
|
|
%cmp1 = icmp sge i32 %tmp, %load
|
|
br label %Flow
|
|
|
|
Flow: ; preds = %bb4, %bb1
|
|
%tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
|
|
%tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
|
|
br i1 %tmp3, label %bb1, label %bb9
|
|
|
|
bb9: ; preds = %Flow
|
|
store volatile i32 7, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; Dynamic extractelement from the constant vector <0, 1, 2, 3> using a
; per-lane index (workitem id x minus 512); the selected element is stored to
; %out. The expected output implements the extract as a chain of compare and
; conditional-move instructions, with the condition held in vcc_lo for wave32
; and in vcc for wave64.
define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) %out) #0 {
|
|
; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
|
|
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
|
|
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 2, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc_lo
|
|
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc_lo
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
|
|
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
|
|
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc
|
|
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
|
|
%index = add i32 %id, -512
|
|
%value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
|
store i32 %value, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Calls llvm.amdgcn.set.inactive.i32 with the kernel argument %in and the
; constant 42, then stores the result to %out.
; The expected output writes %in under the current exec mask, inverts exec
; with s_not, writes 42, and inverts exec back; wave32 operates on exec_lo and
; wave64 on the full exec register.
define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 {
|
|
; GFX1032-LABEL: test_set_inactive:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 42
|
|
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
|
|
; GFX1032-NEXT: global_store_dword v1, v0, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_set_inactive:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX1064-NEXT: s_not_b64 exec, exec
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 42
|
|
; GFX1064-NEXT: s_not_b64 exec, exec
|
|
; GFX1064-NEXT: global_store_dword v1, v0, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
|
|
store i32 %tmp, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; 64-bit variant of the set.inactive test: calls llvm.amdgcn.set.inactive.i64
; with the kernel argument %in and the constant 0, then stores the i64 result
; to %out.
; The expected output moves both dwords of %in, inverts exec, writes 0 to both
; dwords, and inverts exec back before the store.
define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
|
|
; GFX1032-LABEL: test_set_inactive_64:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_set_inactive_64:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX1064-NEXT: s_not_b64 exec, exec
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1064-NEXT: s_not_b64 exec, exec
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
|
|
store i64 %tmp, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Pixel shader that calls llvm.amdgcn.kill with the constant i1 false and
; returns. NOTE(review): despite "float" in the name, the body contains no
; floating-point operation.
; The expected output clears exec with s_andn2 (exec and-not exec), branches
; on SCC, and on the early-exit path zeroes exec and issues a null export
; with done/vm set before ending the program.
define amdgpu_ps void @test_kill_i1_terminator_float() #0 {
|
|
; GFX1032-LABEL: test_kill_i1_terminator_float:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo
|
|
; GFX1032-NEXT: s_cbranch_scc0 .LBB31_1
|
|
; GFX1032-NEXT: s_endpgm
|
|
; GFX1032-NEXT: .LBB31_1:
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, 0
|
|
; GFX1032-NEXT: exp null off, off, off, off done vm
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_kill_i1_terminator_float:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_andn2_b64 exec, exec, exec
|
|
; GFX1064-NEXT: s_cbranch_scc0 .LBB31_1
|
|
; GFX1064-NEXT: s_endpgm
|
|
; GFX1064-NEXT: .LBB31_1:
|
|
; GFX1064-NEXT: s_mov_b64 exec, 0
|
|
; GFX1064-NEXT: exp null off, off, off, off done vm
|
|
; GFX1064-NEXT: s_endpgm
|
|
call void @llvm.amdgcn.kill(i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Geometry-shader entry point that calls llvm.amdgcn.kill on a per-lane i1
; (the OR of two signed less-than compares of the VGPR arguments) and then
; issues an export of zeros.
; The expected output ORs the two compare masks, XORs the result with exec to
; get the lanes to kill, removes them from a saved copy of exec, and ANDs that
; into exec before the export.
define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
|
|
; GFX1032-LABEL: test_kill_i1_terminator_i1:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v1
|
|
; GFX1032-NEXT: v_cmp_lt_i32_e64 s0, v2, v3
|
|
; GFX1032-NEXT: s_mov_b32 s1, exec_lo
|
|
; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: s_xor_b32 s0, s0, exec_lo
|
|
; GFX1032-NEXT: s_andn2_b32 s1, s1, s0
|
|
; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: exp mrt0 off, off, off, off
|
|
; GFX1032-NEXT: s_endpgm
|
|
; GFX1032-NEXT: ; %bb.1:
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, 0
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_kill_i1_terminator_i1:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
|
|
; GFX1064-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
|
|
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
|
|
; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: s_xor_b64 s[0:1], s[0:1], exec
|
|
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
|
|
; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: exp mrt0 off, off, off, off
|
|
; GFX1064-NEXT: s_endpgm
|
|
; GFX1064-NEXT: ; %bb.1:
|
|
; GFX1064-NEXT: s_mov_b64 exec, 0
|
|
; GFX1064-NEXT: s_endpgm
|
|
%c1 = icmp slt i32 %a, %b
|
|
%c2 = icmp slt i32 %c, %d
|
|
%x = or i1 %c1, %c2
|
|
call void @llvm.amdgcn.kill(i1 %x)
|
|
call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Pixel shader with a loop driven by a float counter that starts at 0.0 and is
; incremented by 2.0 until it exceeds 7.0. Each iteration samples a 1D image
; (undef resource and sampler) at the first component of the previous result;
; the value carried into %break is returned.
; The expected output saves exec and enters whole-quad mode with s_wqm at
; function entry, tests the loop condition through a VALU compare into VCC
; with s_cbranch_vccz, and restores the saved lanes with s_and on exec in
; %break.
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
|
|
; GFX1032-LABEL: test_loop_vcc:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
|
|
; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo
|
|
; GFX1032-NEXT: v_mov_b32_e32 v7, v3
|
|
; GFX1032-NEXT: v_mov_b32_e32 v6, v2
|
|
; GFX1032-NEXT: v_mov_b32_e32 v5, v1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v4, v0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v8, 0
|
|
; GFX1032-NEXT: s_branch .LBB33_2
|
|
; GFX1032-NEXT: .LBB33_1: ; %body
|
|
; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1
|
|
; GFX1032-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
|
|
; GFX1032-NEXT: v_add_f32_e32 v8, 2.0, v8
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB33_4
|
|
; GFX1032-NEXT: .LBB33_2: ; %loop
|
|
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1032-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, v6
|
|
; GFX1032-NEXT: v_mov_b32_e32 v3, v7
|
|
; GFX1032-NEXT: s_cbranch_vccz .LBB33_1
|
|
; GFX1032-NEXT: ; %bb.3:
|
|
; GFX1032-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
|
|
; GFX1032-NEXT: ; implicit-def: $vgpr8
|
|
; GFX1032-NEXT: .LBB33_4: ; %break
|
|
; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_loop_vcc:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
|
|
; GFX1064-NEXT: s_wqm_b64 exec, exec
|
|
; GFX1064-NEXT: v_mov_b32_e32 v7, v3
|
|
; GFX1064-NEXT: v_mov_b32_e32 v6, v2
|
|
; GFX1064-NEXT: v_mov_b32_e32 v5, v1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v4, v0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v8, 0
|
|
; GFX1064-NEXT: s_branch .LBB33_2
|
|
; GFX1064-NEXT: .LBB33_1: ; %body
|
|
; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1
|
|
; GFX1064-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
|
|
; GFX1064-NEXT: v_add_f32_e32 v8, 2.0, v8
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB33_4
|
|
; GFX1064-NEXT: .LBB33_2: ; %loop
|
|
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1064-NEXT: v_cmp_lt_f32_e32 vcc, 0x40e00000, v8
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, v6
|
|
; GFX1064-NEXT: v_mov_b32_e32 v3, v7
|
|
; GFX1064-NEXT: s_cbranch_vccz .LBB33_1
|
|
; GFX1064-NEXT: ; %bb.3:
|
|
; GFX1064-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
|
|
; GFX1064-NEXT: ; implicit-def: $vgpr8
|
|
; GFX1064-NEXT: .LBB33_4: ; %break
|
|
; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1]
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
|
|
%c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
|
|
%cc = fcmp ogt float %ctr.iv, 7.0
|
|
br i1 %cc, label %break, label %body
|
|
|
|
body:
|
|
%c.iv0 = extractelement <4 x float> %c.iv, i32 0
|
|
%c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
|
|
%ctr.next = fadd float %ctr.iv, 2.0
|
|
br label %loop
|
|
|
|
break:
|
|
ret <4 x float> %c.iv
|
|
}
|
|
|
|
; Adds the two VGPR float arguments and passes the sum through
; llvm.amdgcn.wwm.f32 before returning it. The two inreg arguments are unused.
; The expected output starts with s_or_saveexec ... -1 (all lanes enabled,
; old exec saved), copies both arguments into fresh registers and adds them
; inside that region, then restores exec and copies the result to v0.
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
|
|
; GFX1032-LABEL: test_wwm1:
|
|
; GFX1032: ; %bb.0: ; %main_body
|
|
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v3, v0
|
|
; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_wwm1:
|
|
; GFX1064: ; %bb.0: ; %main_body
|
|
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v3, v0
|
|
; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2
|
|
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
main_body:
|
|
%out = fadd float %src0, %src1
|
|
%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
|
|
ret float %out.0
|
|
}
|
|
|
|
; llvm.amdgcn.wwm.f32 used inside a divergent branch. The branch condition is
; derived from mbcnt so that it differs per lane; the %if block loads a float
; from a buffer (undef resource), doubles it, passes the doubled value through
; wwm, and adds the loaded value to the result. %endif returns that value or
; 0.0 for lanes that skipped %if.
; The expected output wraps the load and the first add in an
; s_or_saveexec ... -1 / exec-restore pair nested inside the
; s_and_saveexec region that implements the divergent branch.
define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
|
|
; GFX1032-LABEL: test_wwm2:
|
|
; GFX1032: ; %bb.0: ; %main_body
|
|
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
|
|
; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
|
|
; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB35_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %if
|
|
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s0
|
|
; GFX1032-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_add_f32_e32 v2, v1, v1
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0
|
|
; GFX1032-NEXT: .LBB35_2: ; %endif
|
|
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_wwm2:
|
|
; GFX1064: ; %bb.0: ; %main_body
|
|
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
|
|
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
|
|
; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB35_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %if
|
|
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s0
|
|
; GFX1064-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_add_f32_e32 v2, v1, v1
|
|
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0
|
|
; GFX1064-NEXT: .LBB35_2: ; %endif
|
|
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
main_body:
|
|
; use mbcnt to make sure the branch is divergent
|
|
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
|
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
|
|
%cc = icmp uge i32 %hi, 16
|
|
br i1 %cc, label %endif, label %if
|
|
|
|
if:
|
|
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
|
|
%out = fadd float %src, %src
|
|
%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
|
|
%out.1 = fadd float %src, %out.0
|
|
br label %endif
|
|
|
|
endif:
|
|
%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
|
|
ret float %out.2
|
|
}
|
|
|
|
; Same shape as test_wwm1 but using llvm.amdgcn.strict.wwm.f32: the sum of the
; two VGPR float arguments is passed through the intrinsic and returned.
; The expected instruction sequence is identical to the one recorded for
; test_wwm1 (s_or_saveexec ... -1, copies, add, exec restore, result copy).
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
|
|
; GFX1032-LABEL: test_strict_wwm1:
|
|
; GFX1032: ; %bb.0: ; %main_body
|
|
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v3, v0
|
|
; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_strict_wwm1:
|
|
; GFX1064: ; %bb.0: ; %main_body
|
|
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v3, v0
|
|
; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2
|
|
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
main_body:
|
|
%out = fadd float %src0, %src1
|
|
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
|
ret float %out.0
|
|
}
|
|
|
|
; Same shape as test_wwm2 but using llvm.amdgcn.strict.wwm.f32 inside the
; divergent %if block: buffer load, doubling add, strict.wwm, then a final add
; with the loaded value; %endif merges that with 0.0 for lanes that skipped.
; The expected output nests an s_or_saveexec ... -1 / exec-restore pair inside
; the s_and_saveexec region of the divergent branch.
define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) {
|
|
; GFX1032-LABEL: test_strict_wwm2:
|
|
; GFX1032: ; %bb.0: ; %main_body
|
|
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
|
|
; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
|
|
; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
|
|
; GFX1032-NEXT: s_cbranch_execz .LBB37_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %if
|
|
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s0
|
|
; GFX1032-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_add_f32_e32 v2, v1, v1
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0
|
|
; GFX1032-NEXT: .LBB37_2: ; %endif
|
|
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_strict_wwm2:
|
|
; GFX1064: ; %bb.0: ; %main_body
|
|
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
|
|
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
|
|
; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX1064-NEXT: s_cbranch_execz .LBB37_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %if
|
|
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s0
|
|
; GFX1064-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_add_f32_e32 v2, v1, v1
|
|
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, v2
|
|
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0
|
|
; GFX1064-NEXT: .LBB37_2: ; %endif
|
|
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
main_body:
|
|
; use mbcnt to make sure the branch is divergent
|
|
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
|
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
|
|
%cc = icmp uge i32 %hi, 16
|
|
br i1 %cc, label %endif, label %if
|
|
|
|
if:
|
|
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
|
|
%out = fadd float %src, %src
|
|
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
|
%out.1 = fadd float %src, %out.0
|
|
br label %endif
|
|
|
|
endif:
|
|
%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
|
|
ret float %out.2
|
|
}
|
|
|
|
|
|
; Pixel shader that interpolates two attribute channels (interp.p1 followed by
; interp.p2 for channels 0 and 1, using %pos and %m0) and uses the results as
; the coordinates of a 2D image sample, whose result is returned.
; The expected output saves exec and enters whole-quad mode with s_wqm at
; entry, runs the interpolation there, and ANDs exec with the saved mask
; immediately before the image_sample.
define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 {
|
|
; GFX1032-LABEL: test_wqm1:
|
|
; GFX1032: ; %bb.0: ; %main_body
|
|
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
|
|
; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo
|
|
; GFX1032-NEXT: s_mov_b32 m0, s3
|
|
; GFX1032-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
|
|
; GFX1032-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
|
|
; GFX1032-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
|
|
; GFX1032-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
|
|
; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0
|
|
; GFX1032-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_wqm1:
|
|
; GFX1064: ; %bb.0: ; %main_body
|
|
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
|
|
; GFX1064-NEXT: s_wqm_b64 exec, exec
|
|
; GFX1064-NEXT: s_mov_b32 m0, s3
|
|
; GFX1064-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
|
|
; GFX1064-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
|
|
; GFX1064-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
|
|
; GFX1064-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
|
|
; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1]
|
|
; GFX1064-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
main_body:
|
|
%inst23 = extractelement <2 x float> %pos, i32 0
|
|
%inst24 = extractelement <2 x float> %pos, i32 1
|
|
%inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
|
|
%inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
|
|
%inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
|
|
%inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
|
|
%tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0)
|
|
ret <4 x float> %tex
|
|
}
|
|
|
|
; Loads two floats from a buffer (undef resource) at the two inreg indices,
; adds them, and passes the sum (bitcast to i32) through llvm.amdgcn.wqm.i32
; before returning it as a float.
; The expected output saves exec and enters whole-quad mode with s_wqm at
; entry, performs both loads and the add there, and ANDs exec with the saved
; mask at the end.
define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 {
|
|
; GFX1032-LABEL: test_wqm2:
|
|
; GFX1032: ; %bb.0: ; %main_body
|
|
; GFX1032-NEXT: s_mov_b32 s2, exec_lo
|
|
; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
|
|
; GFX1032-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: v_add_f32_e32 v0, v2, v3
|
|
; GFX1032-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
|
; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s2
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_wqm2:
|
|
; GFX1064: ; %bb.0: ; %main_body
|
|
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
|
|
; GFX1064-NEXT: s_wqm_b64 exec, exec
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
|
|
; GFX1064-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: v_add_f32_e32 v0, v2, v3
|
|
; GFX1064-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
|
; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3]
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
main_body:
|
|
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
|
|
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
|
|
%out = fadd float %src0, %src1
|
|
%out.0 = bitcast float %out to i32
|
|
%out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
|
|
%out.2 = bitcast i32 %out.1 to float
|
|
ret float %out.2
|
|
}
|
|
|
|
; Calls llvm.amdgcn.fcmp with an i64 result on %src and fabs(%a), using
; condition code 1, and stores the i64 lane mask to %out.
; In the wave32 expectation the compare writes a single 32-bit SGPR and the
; high dword of the stored value is the zero already held in v1; in the
; wave64 expectation the compare writes an SGPR pair and both halves are
; copied to VGPRs. The fabs is folded into the compare as a |...| operand
; modifier in both.
define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
|
|
; GFX1032-LABEL: test_intr_fcmp_i64:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3|
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_intr_fcmp_i64:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3|
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%temp = call float @llvm.fabs.f32(float %a)
|
|
%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
|
|
store i64 %result, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Calls llvm.amdgcn.icmp with an i64 result comparing %src against 100 with
; condition code 32, and stores the i64 lane mask to %out.
; Both expectations use v_cmp_eq_u32 against 0x64. In wave32 the result is a
; single 32-bit SGPR and the stored high dword is the zero held in v1; in
; wave64 the result is an SGPR pair and both halves are copied to VGPRs.
define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
|
|
; GFX1032-LABEL: test_intr_icmp_i64:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_intr_icmp_i64:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
|
|
store i64 %result, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Calls llvm.amdgcn.fcmp with an i32 result on %src and fabs(%a), using
; condition code 1, and stores the i32 lane mask to %out.
; In the wave32 expectation the compare writes a single 32-bit SGPR which is
; stored directly; in the wave64 expectation the compare writes an SGPR pair
; and only its low half (s2) is copied and stored.
define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
|
|
; GFX1032-LABEL: test_intr_fcmp_i32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3|
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s2
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_intr_fcmp_i32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3|
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s2
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%temp = call float @llvm.fabs.f32(float %a)
|
|
%result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
|
|
store i32 %result, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Calls llvm.amdgcn.icmp with an i32 result comparing %src against 100 with
; condition code 32, and stores the i32 lane mask to %out.
; Both expectations use v_cmp_eq_u32 against 0x64. In wave32 the result is a
; single 32-bit SGPR; in wave64 it is an SGPR pair of which only the low half
; (s0) is copied and stored.
define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) {
|
|
; GFX1032-LABEL: test_intr_icmp_i32:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_clause 0x1
|
|
; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s0
|
|
; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_intr_icmp_i32:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_clause 0x1
|
|
; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s0
|
|
; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
|
|
; GFX1064-NEXT: s_endpgm
|
|
%result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
|
|
store i32 %result, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Pixel shader that compares %a against 0.0 (unordered-or-not-equal), passes
; the i1 through llvm.amdgcn.wqm.vote, feeds the result to llvm.amdgcn.kill,
; and then issues an export of zeros.
; The expected output applies s_wqm to the compare mask, XORs it with exec to
; get the lanes to kill, removes those from a saved copy of exec, and takes an
; early-exit path (exec zeroed, null export with done/vm) when the SCC branch
; after the s_andn2 is taken.
define amdgpu_ps void @test_wqm_vote(float %a) {
|
|
; GFX1032-LABEL: test_wqm_vote:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1032-NEXT: s_wqm_b32 s1, vcc_lo
|
|
; GFX1032-NEXT: s_xor_b32 s1, s1, exec_lo
|
|
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
|
; GFX1032-NEXT: s_cbranch_scc0 .LBB44_2
|
|
; GFX1032-NEXT: ; %bb.1:
|
|
; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0
|
|
; GFX1032-NEXT: exp mrt0 off, off, off, off
|
|
; GFX1032-NEXT: s_endpgm
|
|
; GFX1032-NEXT: .LBB44_2:
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, 0
|
|
; GFX1032-NEXT: exp null off, off, off, off done vm
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_wqm_vote:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX1064-NEXT: s_wqm_b64 s[2:3], vcc
|
|
; GFX1064-NEXT: s_xor_b64 s[2:3], s[2:3], exec
|
|
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
|
; GFX1064-NEXT: s_cbranch_scc0 .LBB44_2
|
|
; GFX1064-NEXT: ; %bb.1:
|
|
; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1]
|
|
; GFX1064-NEXT: exp mrt0 off, off, off, off
|
|
; GFX1064-NEXT: s_endpgm
|
|
; GFX1064-NEXT: .LBB44_2:
|
|
; GFX1064-NEXT: s_mov_b64 exec, 0
|
|
; GFX1064-NEXT: exp null off, off, off, off done vm
|
|
; GFX1064-NEXT: s_endpgm
|
|
%c1 = fcmp une float %a, 0.0
|
|
%c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
|
|
call void @llvm.amdgcn.kill(i1 %c2)
|
|
call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Branch on a constant-true condition in a function carrying attribute group #2
; (optnone, noinline). The expected output still contains every IR block and
; starts with a copy of exec into vcc (s_mov_b32 vcc_lo, exec_lo for wave32,
; s_mov_b64 vcc, exec for wave64) followed by s_cbranch_execnz.
; The loop body only branches on undef; it exists to give the CFG its shape.
define amdgpu_kernel void @test_branch_true() #2 {
|
|
; GFX1032-LABEL: test_branch_true:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_mov_b32 vcc_lo, exec_lo
|
|
; GFX1032-NEXT: s_cbranch_execnz .LBB45_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %for.body.lr.ph
|
|
; GFX1032-NEXT: s_branch .LBB45_3
|
|
; GFX1032-NEXT: .LBB45_2: ; %Flow
|
|
; GFX1032-NEXT: s_branch .LBB45_5
|
|
; GFX1032-NEXT: .LBB45_3: ; %for.body
|
|
; GFX1032-NEXT: s_mov_b32 vcc_lo, 0
|
|
; GFX1032-NEXT: ; %bb.4: ; %for.end.loopexit
|
|
; GFX1032-NEXT: s_branch .LBB45_2
|
|
; GFX1032-NEXT: .LBB45_5: ; %for.end
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_branch_true:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_mov_b64 vcc, exec
|
|
; GFX1064-NEXT: s_cbranch_execnz .LBB45_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %for.body.lr.ph
|
|
; GFX1064-NEXT: s_branch .LBB45_3
|
|
; GFX1064-NEXT: .LBB45_2: ; %Flow
|
|
; GFX1064-NEXT: s_branch .LBB45_5
|
|
; GFX1064-NEXT: .LBB45_3: ; %for.body
|
|
; GFX1064-NEXT: s_mov_b64 vcc, 0
|
|
; GFX1064-NEXT: ; %bb.4: ; %for.end.loopexit
|
|
; GFX1064-NEXT: s_branch .LBB45_2
|
|
; GFX1064-NEXT: .LBB45_5: ; %for.end
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
br i1 true, label %for.end, label %for.body.lr.ph
|
|
|
|
for.body.lr.ph: ; preds = %entry
|
|
br label %for.body
|
|
|
|
for.body: ; preds = %for.body, %for.body.lr.ph
|
|
br i1 undef, label %for.end, label %for.body
|
|
|
|
for.end: ; preds = %for.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; Exercises @llvm.amdgcn.ps.live: the i1 result is zero-extended to i32,
; bitcast to float and returned. The expected code copies exec into an SGPR
; (a single register for wave32, a register pair for wave64) and uses that
; copy as the select mask of v_cndmask_b32 to produce 0 or 1 per lane.
define amdgpu_ps float @test_ps_live() #0 {
|
|
; GFX1032-LABEL: test_ps_live:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
|
|
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
|
; GFX1032-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1064-LABEL: test_ps_live:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
|
|
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
|
; GFX1064-NEXT: ; return to shader part epilog
|
|
%live = call i1 @llvm.amdgcn.ps.live()
|
|
%live.32 = zext i1 %live to i32
|
|
%r = bitcast i32 %live.32 to float
|
|
ret float %r
|
|
}
|
|
|
|
; Triangle-shaped control flow on a double: %v is loaded from %in, and when it
; compares equal to 1.0 the 'if' block computes %v + %v; the phi of the two
; values is stored to %out. The expected output keeps a conditional branch
; (s_cbranch_vccnz) and forms vcc by AND-ing the compare result with exec at
; the width of the current wave size.
; NOTE(review): the name suggests this guards early if-conversion of 64-bit
; values; that intent is not stated in the test body itself -- confirm.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GFX1032-LABEL: test_vccnz_ifcvt_triangle64:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cmp_neq_f64_e64 s4, s[2:3], 1.0
|
|
; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s4
|
|
; GFX1032-NEXT: s_cbranch_vccnz .LBB47_2
|
|
; GFX1032-NEXT: ; %bb.1: ; %if
|
|
; GFX1032-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
|
|
; GFX1032-NEXT: s_branch .LBB47_3
|
|
; GFX1032-NEXT: .LBB47_2:
|
|
; GFX1032-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX1032-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX1032-NEXT: .LBB47_3: ; %endif
|
|
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: test_vccnz_ifcvt_triangle64:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0
|
|
; GFX1064-NEXT: s_and_b64 vcc, exec, s[4:5]
|
|
; GFX1064-NEXT: s_cbranch_vccnz .LBB47_2
|
|
; GFX1064-NEXT: ; %bb.1: ; %if
|
|
; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
|
|
; GFX1064-NEXT: s_branch .LBB47_3
|
|
; GFX1064-NEXT: .LBB47_2:
|
|
; GFX1064-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX1064-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX1064-NEXT: .LBB47_3: ; %endif
|
|
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%v = load double, ptr addrspace(1) %in
|
|
%cc = fcmp oeq double %v, 1.000000e+00
|
|
br i1 %cc, label %if, label %endif
|
|
|
|
if:
|
|
%u = fadd double %v, %v
|
|
br label %endif
|
|
|
|
endif:
|
|
%r = phi double [ %v, %entry ], [ %u, %if ]
|
|
store double %r, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Sums twelve float arguments with a left-to-right chain of fadd instructions
; and returns the total. Attribute group #3 requests "+wavefrontsize32" for
; this function through "target-features". The expectations use the shared GCN
; prefix, so a single instruction sequence is expected for every llc
; invocation that enables that prefix.
; NOTE(review): the name implies the point of interest is the reported VGPR
; block count; no such check is present in these lines -- confirm.
define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e,
|
|
; GCN-LABEL: test_vgprblocks_w32_attr:
|
|
; GCN: ; %bb.0: ; %main_body
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v2
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v3
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v4
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v5
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v6
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v7
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v8
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v9
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v10
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v11
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 {
|
|
main_body:
|
|
%s = fadd float %a, %b
|
|
%s.1 = fadd float %s, %c
|
|
%s.2 = fadd float %s.1, %d
|
|
%s.3 = fadd float %s.2, %e
|
|
%s.4 = fadd float %s.3, %f
|
|
%s.5 = fadd float %s.4, %g
|
|
%s.6 = fadd float %s.5, %h
|
|
%s.7 = fadd float %s.6, %i
|
|
%s.8 = fadd float %s.7, %j
|
|
%s.9 = fadd float %s.8, %k
|
|
%s.10 = fadd float %s.9, %l
|
|
ret float %s.10
|
|
}
|
|
|
|
; Same twelve-argument fadd chain as the wave32-attribute variant, but this
; function carries attribute group #4, which requests "+wavefrontsize64"
; through "target-features". The expectations again use the shared GCN prefix,
; so one instruction sequence is expected for every llc invocation that
; enables that prefix.
; NOTE(review): the name implies the point of interest is the reported VGPR
; block count; no such check is present in these lines -- confirm.
define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e,
|
|
; GCN-LABEL: test_vgprblocks_w64_attr:
|
|
; GCN: ; %bb.0: ; %main_body
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v2
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v3
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v4
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v5
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v6
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v7
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v8
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v9
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v10
|
|
; GCN-NEXT: v_add_f32_e32 v0, v0, v11
|
|
; GCN-NEXT: ; return to shader part epilog
|
|
float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 {
|
|
main_body:
|
|
%s = fadd float %a, %b
|
|
%s.1 = fadd float %s, %c
|
|
%s.2 = fadd float %s.1, %d
|
|
%s.3 = fadd float %s.2, %e
|
|
%s.4 = fadd float %s.3, %f
|
|
%s.5 = fadd float %s.4, %g
|
|
%s.6 = fadd float %s.5, %h
|
|
%s.7 = fadd float %s.6, %i
|
|
%s.8 = fadd float %s.7, %j
|
|
%s.9 = fadd float %s.8, %k
|
|
%s.10 = fadd float %s.9, %l
|
|
ret float %s.10
|
|
}
|
|
|
|
; Uses the i64 result of @llvm.amdgcn.icmp.i64.i32 as ordinary integer data:
; (workitem.id.x urem %s) is compared with 0 (predicate code 32, an equality
; compare in the expected assembly), the result is shifted right by one, has
; bit 63 forced on, and is passed through cttz; the count together with
; (%rem != 0) selects between a plain return and an unreachable block.
; In the wave32 expectations the upper half of the 64-bit value is handled as
; a constant (s_ff1_i32_b32 of 0x80000000 plus 32) while only vcc_lo is
; shifted; the wave64 expectations shift the full 64-bit vcc.
; %mul4 and %cmp in the entry block have no uses.
define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
|
|
; GFX1032-LABEL: icmp64:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0
|
|
; GFX1032-NEXT: s_sub_i32 s1, 0, s0
|
|
; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
|
|
; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1
|
|
; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000
|
|
; GFX1032-NEXT: s_add_i32 s1, s1, 32
|
|
; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2
|
|
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
|
|
; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1
|
|
; GFX1032-NEXT: v_mul_lo_u32 v1, v1, s0
|
|
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, v0, v1
|
|
; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
|
|
; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
|
|
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
|
|
; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1032-NEXT: s_min_u32 s0, s0, s1
|
|
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
|
|
; GFX1032-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1032-NEXT: ; divergent unreachable
|
|
; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: icmp64:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0
|
|
; GFX1064-NEXT: s_sub_i32 s1, 0, s0
|
|
; GFX1064-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
|
|
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1
|
|
; GFX1064-NEXT: v_mul_hi_u32 v2, v1, v2
|
|
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
|
|
; GFX1064-NEXT: v_mul_hi_u32 v1, v0, v1
|
|
; GFX1064-NEXT: v_mul_lo_u32 v1, v1, s0
|
|
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, v0, v1
|
|
; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
|
; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
|
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1
|
|
; GFX1064-NEXT: s_bitset1_b32 s1, 31
|
|
; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1064-NEXT: s_ff1_i32_b32 s1, s1
|
|
; GFX1064-NEXT: s_add_i32 s1, s1, 32
|
|
; GFX1064-NEXT: s_min_u32 s0, s0, s1
|
|
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
|
|
; GFX1064-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1064-NEXT: ; divergent unreachable
|
|
; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%mul4 = mul nsw i32 %s, %n
|
|
%cmp = icmp slt i32 0, %mul4
|
|
br label %if.end
|
|
|
|
if.end: ; preds = %entry
|
|
%rem = urem i32 %id, %s
|
|
%icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32)
|
|
%shr = lshr i64 %icmp, 1
|
|
%notmask = shl nsw i64 -1, 0
|
|
%and = and i64 %notmask, %shr
|
|
%or = or i64 %and, -9223372036854775808
|
|
%cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
|
|
%cast = trunc i64 %cttz to i32
|
|
%cmp3 = icmp ugt i32 10, %cast
|
|
%cmp6 = icmp ne i32 %rem, 0
|
|
%brmerge = or i1 %cmp6, %cmp3
|
|
br i1 %brmerge, label %if.end2, label %if.then
|
|
|
|
if.then: ; preds = %if.end
|
|
unreachable
|
|
|
|
if.end2: ; preds = %if.end
|
|
ret void
|
|
}
|
|
|
|
; Floating-point counterpart of the icmp64 test: the i64 result of
; @llvm.amdgcn.fcmp.i64.f32 on (float(workitem.id.x) frem %s) versus 0.0
; (predicate code 1, v_cmp_eq_f32 in the expected assembly) is shifted right by
; one, has bit 63 forced on, and goes through cttz; the count together with
; (%rem.f one 0.0) selects between a plain return and an unreachable block.
; In the wave32 expectations the upper half of the 64-bit value is handled as
; a constant (s_ff1_i32_b32 of 0x80000000 plus 32) while only vcc_lo is
; shifted; the wave64 expectations shift the full 64-bit vcc.
; %mul4 and %cmp in the entry block have no uses.
define amdgpu_kernel void @fcmp64(float %n, float %s) {
|
|
; GFX1032-LABEL: fcmp64:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28
|
|
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0
|
|
; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0
|
|
; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000
|
|
; GFX1032-NEXT: v_rcp_f32_e32 v2, v1
|
|
; GFX1032-NEXT: s_add_i32 s1, s1, 32
|
|
; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0
|
|
; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2
|
|
; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2
|
|
; GFX1032-NEXT: v_fma_f32 v5, -v1, v3, v4
|
|
; GFX1032-NEXT: v_fmac_f32_e32 v3, v5, v2
|
|
; GFX1032-NEXT: v_fma_f32 v1, -v1, v3, v4
|
|
; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3
|
|
; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0
|
|
; GFX1032-NEXT: v_trunc_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0
|
|
; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
|
|
; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1032-NEXT: s_min_u32 s0, s0, s1
|
|
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
|
|
; GFX1032-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1032-NEXT: ; divergent unreachable
|
|
; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: fcmp64:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28
|
|
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0
|
|
; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0
|
|
; GFX1064-NEXT: v_rcp_f32_e32 v2, v1
|
|
; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0
|
|
; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2
|
|
; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2
|
|
; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4
|
|
; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2
|
|
; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4
|
|
; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3
|
|
; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0
|
|
; GFX1064-NEXT: v_trunc_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0
|
|
; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1
|
|
; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_bitset1_b32 s1, 31
|
|
; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1064-NEXT: s_ff1_i32_b32 s1, s1
|
|
; GFX1064-NEXT: s_add_i32 s1, s1, 32
|
|
; GFX1064-NEXT: s_min_u32 s0, s0, s1
|
|
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
|
|
; GFX1064-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1064-NEXT: ; divergent unreachable
|
|
; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%id.f = uitofp i32 %id to float
|
|
%mul4 = fmul float %s, %n
|
|
%cmp = fcmp ult float 0.0, %mul4
|
|
br label %if.end
|
|
|
|
if.end: ; preds = %entry
|
|
%rem.f = frem float %id.f, %s
|
|
%fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1)
|
|
%shr = lshr i64 %fcmp, 1
|
|
%notmask = shl nsw i64 -1, 0
|
|
%and = and i64 %notmask, %shr
|
|
%or = or i64 %and, -9223372036854775808
|
|
%cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
|
|
%cast = trunc i64 %cttz to i32
|
|
%cmp3 = icmp ugt i32 10, %cast
|
|
%cmp6 = fcmp one float %rem.f, 0.0
|
|
%brmerge = or i1 %cmp6, %cmp3
|
|
br i1 %brmerge, label %if.end2, label %if.then
|
|
|
|
if.then: ; preds = %if.end
|
|
unreachable
|
|
|
|
if.end2: ; preds = %if.end
|
|
ret void
|
|
}
|
|
|
|
; 32-bit counterpart of the icmp64 test: the i32 result of
; @llvm.amdgcn.icmp.i32.i32 on (workitem.id.x urem %s) versus 0 (predicate code
; 32, an equality compare in the expected assembly) is shifted right by one,
; has bit 31 forced on, and goes through cttz; the count together with
; (%rem != 0) selects between a plain return and an unreachable block.
; In the wave64 expectations only the low half of the compare mask (vcc_lo) is
; fed into the scalar shift, while the final branch mask is still 64 bits wide.
; %mul4 and %cmp in the entry block have no uses.
define amdgpu_kernel void @icmp32(i32 %n, i32 %s) {
|
|
; GFX1032-LABEL: icmp32:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0
|
|
; GFX1032-NEXT: s_sub_i32 s1, 0, s0
|
|
; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
|
|
; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1
|
|
; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2
|
|
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
|
|
; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1
|
|
; GFX1032-NEXT: v_mul_lo_u32 v1, v1, s0
|
|
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, v0, v1
|
|
; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
|
|
; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0
|
|
; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
|
|
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
|
|
; GFX1032-NEXT: s_bitset1_b32 s0, 31
|
|
; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
|
|
; GFX1032-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1032-NEXT: ; divergent unreachable
|
|
; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: icmp32:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0
|
|
; GFX1064-NEXT: s_sub_i32 s1, 0, s0
|
|
; GFX1064-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
|
|
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1
|
|
; GFX1064-NEXT: v_mul_hi_u32 v2, v1, v2
|
|
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
|
|
; GFX1064-NEXT: v_mul_hi_u32 v1, v0, v1
|
|
; GFX1064-NEXT: v_mul_lo_u32 v1, v1, s0
|
|
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, v0, v1
|
|
; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
|
; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0
|
|
; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0
|
|
; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
|
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_lshr_b32 s0, vcc_lo, 1
|
|
; GFX1064-NEXT: s_bitset1_b32 s0, 31
|
|
; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
|
|
; GFX1064-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1064-NEXT: ; divergent unreachable
|
|
; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%mul4 = mul nsw i32 %s, %n
|
|
%cmp = icmp slt i32 0, %mul4
|
|
br label %if.end
|
|
|
|
if.end: ; preds = %entry
|
|
%rem = urem i32 %id, %s
|
|
%icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32)
|
|
%shr = lshr i32 %icmp, 1
|
|
%notmask = shl nsw i32 -1, 0
|
|
%and = and i32 %notmask, %shr
|
|
%or = or i32 %and, 2147483648
|
|
%cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
|
|
%cmp3 = icmp ugt i32 10, %cttz
|
|
%cmp6 = icmp ne i32 %rem, 0
|
|
%brmerge = or i1 %cmp6, %cmp3
|
|
br i1 %brmerge, label %if.end2, label %if.then
|
|
|
|
if.then: ; preds = %if.end
|
|
unreachable
|
|
|
|
if.end2: ; preds = %if.end
|
|
ret void
|
|
}
|
|
|
|
; 32-bit counterpart of the fcmp64 test: the i32 result of
; @llvm.amdgcn.fcmp.i32.f32 on (float(workitem.id.x) frem %s) versus 0.0
; (predicate code 1, v_cmp_eq_f32 in the expected assembly) is shifted right by
; one, has bit 31 forced on, and goes through cttz; the count together with
; (%rem.f one 0.0) selects between a plain return and an unreachable block.
; In the wave64 expectations only the low half of the compare mask (vcc_lo) is
; fed into the scalar shift, while the final branch mask is still 64 bits wide.
; %mul4 and %cmp in the entry block have no uses.
define amdgpu_kernel void @fcmp32(float %n, float %s) {
|
|
; GFX1032-LABEL: fcmp32:
|
|
; GFX1032: ; %bb.0: ; %entry
|
|
; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28
|
|
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0
|
|
; GFX1032-NEXT: v_rcp_f32_e32 v2, v1
|
|
; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0
|
|
; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2
|
|
; GFX1032-NEXT: v_div_scale_f32 v3, vcc_lo, v0, s0, v0
|
|
; GFX1032-NEXT: v_mul_f32_e32 v4, v3, v2
|
|
; GFX1032-NEXT: v_fma_f32 v5, -v1, v4, v3
|
|
; GFX1032-NEXT: v_fmac_f32_e32 v4, v5, v2
|
|
; GFX1032-NEXT: v_fma_f32 v1, -v1, v4, v3
|
|
; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v4
|
|
; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0
|
|
; GFX1032-NEXT: v_trunc_f32_e32 v1, v1
|
|
; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0
|
|
; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
|
|
; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0
|
|
; GFX1032-NEXT: s_bitset1_b32 s0, 31
|
|
; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
|
|
; GFX1032-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1032-NEXT: ; divergent unreachable
|
|
; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1032-NEXT: s_endpgm
|
|
;
|
|
; GFX1064-LABEL: fcmp32:
|
|
; GFX1064: ; %bb.0: ; %entry
|
|
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28
|
|
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0
|
|
; GFX1064-NEXT: v_rcp_f32_e32 v2, v1
|
|
; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0
|
|
; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2
|
|
; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0
|
|
; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2
|
|
; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3
|
|
; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2
|
|
; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3
|
|
; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4
|
|
; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0
|
|
; GFX1064-NEXT: v_trunc_f32_e32 v1, v1
|
|
; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0
|
|
; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_lshr_b32 s0, vcc_lo, 1
|
|
; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
|
|
; GFX1064-NEXT: s_bitset1_b32 s0, 31
|
|
; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
|
|
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
|
|
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
|
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
|
|
; GFX1064-NEXT: ; %bb.1: ; %if.then
|
|
; GFX1064-NEXT: ; divergent unreachable
|
|
; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock
|
|
; GFX1064-NEXT: s_endpgm
|
|
entry:
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%id.f = uitofp i32 %id to float
|
|
%mul4 = fmul float %s, %n
|
|
%cmp = fcmp ult float 0.0, %mul4
|
|
br label %if.end
|
|
|
|
if.end: ; preds = %entry
|
|
%rem.f = frem float %id.f, %s
|
|
%fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1)
|
|
%shr = lshr i32 %fcmp, 1
|
|
%notmask = shl nsw i32 -1, 0
|
|
%and = and i32 %notmask, %shr
|
|
%or = or i32 %and, 2147483648
|
|
%cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
|
|
%cmp3 = icmp ugt i32 10, %cttz
|
|
%cmp6 = fcmp one float %rem.f, 0.0
|
|
%brmerge = or i1 %cmp6, %cmp3
|
|
br i1 %brmerge, label %if.end2, label %if.then
|
|
|
|
if.then: ; preds = %if.end
|
|
unreachable
|
|
|
|
if.end2: ; preds = %if.end
|
|
ret void
|
|
}
|
|
|
|
; External function with no body in this file; it is the call target of
; callee_no_stack_with_call below.
declare void @external_void_func_void() #1
|
|
|
|
|
|
; Non-kernel function whose body is a single call to an external function.
; The expected prologue and epilogue differ with the wave size:
; s_or_saveexec_b32 versus s_or_saveexec_b64 around the spill and reload of
; v40, exec restored with s_mov_b32 exec_lo versus s_mov_b64 exec, and a stack
; pointer (s32) adjustment of 0x200 versus 0x400.
define void @callee_no_stack_with_call() #1 {
|
|
; GFX1032-LABEL: callee_no_stack_with_call:
|
|
; GFX1032: ; %bb.0:
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1032-NEXT: s_mov_b32 s16, s33
|
|
; GFX1032-NEXT: s_mov_b32 s33, s32
|
|
; GFX1032-NEXT: s_or_saveexec_b32 s17, -1
|
|
; GFX1032-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, s17
|
|
; GFX1032-NEXT: s_addk_i32 s32, 0x200
|
|
; GFX1032-NEXT: v_writelane_b32 v40, s16, 2
|
|
; GFX1032-NEXT: s_getpc_b64 s[16:17]
|
|
; GFX1032-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4
|
|
; GFX1032-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12
|
|
; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
|
|
; GFX1032-NEXT: v_writelane_b32 v40, s30, 0
|
|
; GFX1032-NEXT: v_writelane_b32 v40, s31, 1
|
|
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
|
; GFX1032-NEXT: v_readlane_b32 s31, v40, 1
|
|
; GFX1032-NEXT: v_readlane_b32 s30, v40, 0
|
|
; GFX1032-NEXT: v_readlane_b32 s4, v40, 2
|
|
; GFX1032-NEXT: s_or_saveexec_b32 s5, -1
|
|
; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1032-NEXT: s_mov_b32 exec_lo, s5
|
|
; GFX1032-NEXT: s_addk_i32 s32, 0xfe00
|
|
; GFX1032-NEXT: s_mov_b32 s33, s4
|
|
; GFX1032-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1032-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1064-LABEL: callee_no_stack_with_call:
|
|
; GFX1064: ; %bb.0:
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX1064-NEXT: s_mov_b32 s16, s33
|
|
; GFX1064-NEXT: s_mov_b32 s33, s32
|
|
; GFX1064-NEXT: s_or_saveexec_b64 s[18:19], -1
|
|
; GFX1064-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
|
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1064-NEXT: s_mov_b64 exec, s[18:19]
|
|
; GFX1064-NEXT: s_addk_i32 s32, 0x400
|
|
; GFX1064-NEXT: v_writelane_b32 v40, s16, 2
|
|
; GFX1064-NEXT: s_getpc_b64 s[16:17]
|
|
; GFX1064-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4
|
|
; GFX1064-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12
|
|
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
|
|
; GFX1064-NEXT: v_writelane_b32 v40, s30, 0
|
|
; GFX1064-NEXT: v_writelane_b32 v40, s31, 1
|
|
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
|
; GFX1064-NEXT: v_readlane_b32 s31, v40, 1
|
|
; GFX1064-NEXT: v_readlane_b32 s30, v40, 0
|
|
; GFX1064-NEXT: v_readlane_b32 s4, v40, 2
|
|
; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1
|
|
; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
|
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX1064-NEXT: s_mov_b64 exec, s[6:7]
|
|
; GFX1064-NEXT: s_addk_i32 s32, 0xfc00
|
|
; GFX1064-NEXT: s_mov_b32 s33, s4
|
|
; GFX1064-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX1064-NEXT: s_setpc_b64 s[30:31]
|
|
call void @external_void_func_void()
|
|
ret void
|
|
}
|
|
|
|
|
|
; Declarations of the intrinsics available to the test functions in this file.
; Only @llvm.amdgcn.exp.f32 carries an explicit attribute group (#5); the
; others are declared without attributes.
declare i32 @llvm.amdgcn.workitem.id.x()
|
|
declare float @llvm.fabs.f32(float)
|
|
declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
|
|
declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1)
|
|
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
|
|
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1)
|
|
declare i1 @llvm.amdgcn.class.f32(float, i32)
|
|
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
|
|
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
|
|
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
|
|
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
|
|
declare float @llvm.amdgcn.strict.wwm.f32(float)
|
|
declare float @llvm.amdgcn.wwm.f32(float)
|
|
declare i32 @llvm.amdgcn.wqm.i32(i32)
|
|
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
|
|
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32)
|
|
declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg)
|
|
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
|
|
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
|
|
declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32)
|
|
declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32)
|
|
declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32)
|
|
declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32)
|
|
declare void @llvm.amdgcn.kill(i1)
|
|
declare i1 @llvm.amdgcn.wqm.vote(i1)
|
|
declare i1 @llvm.amdgcn.ps.live()
|
|
declare i64 @llvm.cttz.i64(i64, i1)
|
|
declare i32 @llvm.cttz.i32(i32, i1)
|
|
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5
|
|
|
|
; Attribute groups referenced by the functions and declarations above.
; #2 adds optnone/noinline (used by test_branch_true).
; #3 and #4 request a specific wave size per function through the
; "target-features" string attribute (used by the two test_vgprblocks_* tests).
; #5 is attached to the @llvm.amdgcn.exp.f32 declaration.
attributes #0 = { nounwind readnone speculatable }
|
|
attributes #1 = { nounwind }
|
|
attributes #2 = { nounwind readnone optnone noinline }
|
|
attributes #3 = { "target-features"="+wavefrontsize32" }
|
|
attributes #4 = { "target-features"="+wavefrontsize64" }
|
|
attributes #5 = { inaccessiblememonly nounwind }
|
|
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
|
; GFX10DEFWAVE: {{.*}}
|