Files
clang-p2996/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
Brox Chen 8a0c2e7567 [AMDGPU][True16][MC][CodeGen] true16 for v_cndmask_b16 (#119736)
Support true16 format for v_cndmask_b16 in MC and CodeGen in true16 and
fake16 flow.

Since we are replacing `v_cndmask_b16` with `v_cndmask_b16_t16/fake16`, we
have to at least update the fake16 codegen to get the codegen tests passing.
In this case, we have to update true16 and fake16 together;
otherwise some of the true16 tests will fail.
2025-01-16 17:18:28 -05:00

2523 lines
103 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
; All nan values are converted to 0xffffffff
;
; Selects between a NaN constant and a float loaded from %fptr at the
; workitem's index, keyed on %c != 0, and stores the result to %out.
; Neither select value is a scalar register here; the loaded value is in a
; VGPR. The checks expect the NaN as the inline operand -1 of
; v_cndmask_b32_e32, the loaded VGPR as the other source, and the scalar
; compare emitted in inverted form (s_cmp_eq_u32).
define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cnd_nan_nosgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s8, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_cmp_eq_u32 s8, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cnd_nan_nosgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cnd_nan_nosgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cnd_nan_nosgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_cnd_nan_nosgpr:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s2, 0
; GFX12-NEXT: s_cselect_b64 vcc, -1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_endpgm
; Per-workitem load, so the non-constant select value differs per lane.
%idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
%f = load float, ptr addrspace(1) %f.gep
%setcc = icmp ne i32 %c, 0
; The true value is the NaN constant; the checks show it as the inline -1.
%select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
store float %select, ptr addrspace(1) %out
ret void
}
; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However on GFX10 constant bus is limited to 2 scalar operands, not one.
; All nan values are converted to 0xffffffff
;
; Here the non-constant select value %f is a kernel argument, which the
; checks show arriving in s3. Expected lowering per target in the checks
; below:
;   - SI and VI copy s3 into a VGPR (v_mov_b32) and use v_cndmask_b32_e32
;     with vcc.
;   - GFX10 and GFX11 pass s3 directly to v_cndmask_b32_e64 together with an
;     SGPR-pair condition.
;   - GFX12 performs the whole select on the scalar unit with s_cselect_b32.
define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 {
; SI-LABEL: v_cnd_nan:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cnd_nan:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cnd_nan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cnd_nan:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_cnd_nan:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s2, 0
; GFX12-NEXT: s_cselect_b32 s2, s3, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%setcc = icmp ne i32 %c, 0
; The true value is the NaN constant; the checks show it as the inline -1.
%select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
store float %select, ptr addrspace(1) %out
ret void
}
; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)
; Compares kernel argument %x against 0.0 (fcmp one) and selects between the
; constant 1.0 and a second kernel argument %z; the result is stored to
; %out[workitem id]. The [8 x i32] argument separates %out from %x and %z in
; the kernel argument layout (the checks load them from a different offset).
; Expected lowering in the checks below:
;   - SI and VI use v_cmp_nlg_f32_e64 into vcc, copy %z into a VGPR, then
;     v_cndmask_b32_e32 with inline 1.0.
;   - GFX10 and GFX11 use %z's SGPR directly in v_cndmask_b32_e64.
;   - GFX12 uses s_cmp_nlg_f32 followed by s_cselect_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s5
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_nlg_f32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, s1, 1.0
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
; Only the store address depends on the workitem id; both select inputs are
; uniform kernel arguments.
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%setcc = fcmp one float %x, 0.0
%select = select i1 %setcc, float 1.0, float %z
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Compares kernel argument %x against 0.0 (fcmp one) and selects between the
; constant 1.0 and %x itself, so the same scalar value feeds both the compare
; and the select; the result is stored to %out[workitem id].
; In the GFX10 and GFX11 checks the same SGPR (s6) is an operand of both
; v_cmp_nlg_f32_e64 and v_cndmask_b32_e64. The SI and VI checks copy it into
; a VGPR for the select, and the GFX12 checks use s_cmp_nlg_f32 followed by
; s_cselect_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_nlg_f32 s2, 0
; GFX12-NEXT: s_cselect_b32 s2, s2, 1.0
; GFX12-NEXT: v_mov_b32_e32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%setcc = fcmp one float %x, 0.0
; %x is reused as the false value of the select.
%select = select i1 %setcc, float 1.0, float %x
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Compares kernel argument %x against 0.0 (fcmp one) and selects between the
; constant 0.0 and a second kernel argument %z, so the compare and the select
; use the same constant; the result is stored to %out[workitem id].
; The checks expect the constant as an inline 0 in both the compare and the
; select instruction on every target. The SI and VI checks copy %z into a
; VGPR, the GFX10 and GFX11 checks use its SGPR directly in
; v_cndmask_b32_e64, and the GFX12 checks use s_cmp_nlg_f32 followed by
; s_cselect_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s5
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5]
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5]
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_nlg_f32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, s1, 0
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%setcc = fcmp one float %x, 0.0
; Same 0.0 constant as the compare above.
%select = select i1 %setcc, float 0.0, float %z
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Compares kernel argument %x against 0.0 (fcmp one) and selects between the
; constant 0.0 and %x itself: both the constant and the scalar value are
; shared by the compare and the select. The result is stored to
; %out[workitem id].
; The checks expect an inline 0 in both instructions. The GFX10 and GFX11
; checks use the same SGPR (s6) in v_cmp_nlg_f32_e64 and v_cndmask_b32_e64,
; the SI and VI checks copy it into a VGPR for the select, and the GFX12
; checks use s_cmp_nlg_f32 followed by s_cselect_b32.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_nlg_f32 s2, 0
; GFX12-NEXT: s_cselect_b32 s2, s2, 0
; GFX12-NEXT: v_mov_b32_e32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%setcc = fcmp one float %x, 0.0
; Both the 0.0 constant and %x are reused from the compare.
%select = select i1 %setcc, float 0.0, float %x
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Compares kernel argument %x against 0.0 (fcmp one) and selects between the
; constant 0.0 and %z, where %z is loaded from %z.ptr[workitem id] and is
; therefore held in a VGPR in the checks. The result is stored to
; %out[workitem id].
; Every target's checks expect v_cndmask_b32_e32 with an inline 0, the loaded
; VGPR and vcc. The condition is produced by v_cmp_nlg_f32_e64 writing vcc,
; except in the GFX12 checks, which build vcc with s_cmp_nlg_f32 followed by
; s_cselect_b64.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_nlg_f32 s2, 0
; GFX12-NEXT: s_cselect_b64 vcc, -1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Per-workitem load, so %z differs per lane.
%z = load float, ptr addrspace(1) %z.gep
%setcc = fcmp one float %x, 0.0
%select = select i1 %setcc, float 0.0, float %z
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Compares kernel argument %x against 0.0 (fcmp one) and selects between the
; constant 1.0 and %z, where %z is loaded from %z.ptr[workitem id] and is
; therefore held in a VGPR in the checks. The result is stored to
; %out[workitem id].
; Every target's checks expect v_cndmask_b32_e32 with an inline 1.0, the
; loaded VGPR and vcc. The condition is produced by v_cmp_nlg_f32_e64 writing
; vcc, except in the GFX12 checks, which build vcc with s_cmp_nlg_f32
; followed by s_cselect_b64.
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_nlg_f32 s2, 0
; GFX12-NEXT: s_cselect_b64 vcc, -1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Per-workitem load, so %z differs per lane.
%z = load float, ptr addrspace(1) %z.gep
%setcc = fcmp one float %x, 0.0
%select = select i1 %setcc, float 1.0, float %z
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Compares %x, loaded from %x.ptr[workitem id] (a VGPR in the checks),
; against 0.0 and selects between the constant 1.0 and kernel argument %z;
; the result is stored to %out[workitem id].
; The checks expect the compare in its e32 form writing vcc
; (v_cmp_ngt_f32_e32 with 0 as the first operand). The SI and VI checks copy
; %z into a VGPR for v_cndmask_b32_e32, while the GFX10, GFX11 and GFX12
; checks use %z's SGPR directly in v_cndmask_b32_e64 with vcc.
; NOTE(review) - this test uses the olt predicate while the sgprX tests in
; this file use one; presumably intentional, confirm before regenerating.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s8, s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_mov_b32_e32 v3, s8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX12-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Per-workitem load, so the compared value differs per lane.
%x = load float, ptr addrspace(1) %x.gep
%setcc = fcmp olt float %x, 0.0
%select = select i1 %setcc, float 1.0, float %z
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Selects between the constant 1.0 (true value) and a loaded float %z (false
; value), keyed on "fcmp ult %x, 0.0". Both %x and %z are volatile loads
; indexed by the workitem id. On every checked target the expected output is
; the inverted compare (v_cmp_le_f32 with 0 as the first operand) feeding a
; single v_cndmask_b32_e32 that has 1.0 as its first source operand.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile float, ptr addrspace(1) %x.gep
%z = load volatile float, ptr addrspace(1) %z.gep
%setcc = fcmp ult float %x, 0.0
%select = select i1 %setcc, float 1.0, float %z
store float %select, ptr addrspace(1) %out.gep
ret void
}
; Integer variant: selects between the constant 2 (true value) and a loaded
; i32 %z (false value), keyed on "icmp slt %x, 0". Both %x and %z are volatile
; loads indexed by the workitem id. On every checked target the expected
; output is the inverted compare (v_cmp_lt_i32 with -1 as the first operand)
; feeding a single v_cndmask_b32_e32 that has 2 as its first source operand.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX12-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile i32, ptr addrspace(1) %x.gep
%z = load volatile i32, ptr addrspace(1) %z.gep
%setcc = icmp slt i32 %x, 0
%select = select i1 %setcc, i32 2, i32 %z
store i32 %select, ptr addrspace(1) %out.gep
ret void
}
; 64-bit variant: selects between the constant i64 2 (true value) and a loaded
; i64 %z (false value), keyed on "icmp slt %x, 0". Both %x and %z are volatile
; loads indexed by the workitem id. On every checked target the expected
; output is one inverted compare (v_cmp_lt_i64 with -1 as the first operand)
; followed by two v_cndmask_b32_e32, one per register of the 64-bit pair:
; constant 0 for the upper register and constant 2 for the lower one.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[0:1], v4, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile i64, ptr addrspace(1) %x.gep
%z = load volatile i64, ptr addrspace(1) %z.gep
%setcc = icmp slt i64 %x, 0
%select = select i1 %setcc, i64 2, i64 %z
store i64 %select, ptr addrspace(1) %out.gep
ret void
}
; Vector select keyed on a scalar compare: "fcmp ugt %x, 4.0" chooses between
; a loaded <4 x float> %z (true value) and the constant vector
; <1.0, 2.0, -0.5, 4.0> (false value). %x and %z are volatile loads indexed by
; the workitem id. On every checked target the expected output is a single
; v_cmp_nge_f32 with 4.0 as the first operand, followed by four
; v_cndmask_b32_e32 (one per element) that each take the matching constant as
; the first source operand.
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_mov_b32_e32 v5, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile float, ptr addrspace(1) %x.gep
%z = load volatile <4 x float>, ptr addrspace(1) %z.gep
%setcc = fcmp ugt float %x, 4.0
%select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
store <4 x float> %select, ptr addrspace(1) %out.gep
ret void
}
; Vector select keyed on a scalar compare, with the constant vector on the
; true side: "fcmp ugt %x, 4.0" chooses between <1.0, 2.0, -0.5, 4.0> (true
; value) and a loaded <4 x float> %z (false value). %x and %z are volatile
; loads indexed by the workitem id. On every checked target the expected
; output is a single inverted compare (v_cmp_ge_f32 with 4.0 as the first
; operand) followed by four v_cndmask_b32_e32 (one per element) that each take
; the matching constant as the first source operand.
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_mov_b32_e32 v5, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile float, ptr addrspace(1) %x.gep
%z = load volatile <4 x float>, ptr addrspace(1) %z.gep
%setcc = fcmp ugt float %x, 4.0
%select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
store <4 x float> %select, ptr addrspace(1) %out.gep
ret void
}
; This must be swapped as a vector type before the condition has
; multiple uses.
;
; The compare has the constant on the left ("fcmp ugt 4.0, %x") and the select
; has the constant vector <1.0, 2.0, -0.5, 4.0> as its true value and a loaded
; <4 x float> %z as its false value. On every checked target the expected
; output is a single inverted compare (v_cmp_le_f32 with 4.0 as the first
; operand) followed by four v_cndmask_b32_e32 (one per element) that share the
; same vcc and each take the matching constant as the first source operand.
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_mov_b32_e32 v5, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile float, ptr addrspace(1) %x.gep
%z = load volatile <4 x float>, ptr addrspace(1) %z.gep
%setcc = fcmp ugt float 4.0, %x
%select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
store <4 x float> %select, ptr addrspace(1) %out.gep
ret void
}
; i1 variant: "select (icmp slt %x, 0), true, %z" where %z is a volatile i1
; load and %x a volatile i32 load, both indexed by the workitem id. On every
; checked target the expected output contains no select between the two
; values: the compare result (v_cmp_gt_i32 with 0 as the first operand) is
; OR'ed via s_or_b64 with the loaded bit (v_and_b32 with 1, then
; v_cmp_eq_u32 against 1), and a v_cndmask_b32_e64 with operands 0 and 1
; turns the combined mask into the byte that is stored.
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
; SI-NEXT: v_and_b32_e32 v3, 1, v3
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_load_dword v2, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ubyte v3, v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT: v_and_b32_e32 v3, 1, v3
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v2, v1, s[10:11] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GFX10-NEXT: v_and_b32_e32 v1, 1, v3
; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX10-NEXT: global_store_byte v0, v1, s[8:9]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v1, s[8:9]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v1, s[10:11] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_u8 v2, v0, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX12-NEXT: global_store_b8 v0, v1, s[8:9]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile i32, ptr addrspace(1) %x.gep
%z = load volatile i1, ptr addrspace(1) %z.gep
%setcc = icmp slt i32 %x, 0
%select = select i1 %setcc, i1 true, i1 %z
store i1 %select, ptr addrspace(1) %out.gep
ret void
}
; Different types compared vs. selected
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_mov_b32_e32 v4, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
; IR under test. Each work-item indexes %x.ptr (float), %z.ptr (double) and
; %out (double) by its workitem id. Both loads are volatile.
; The select picks double 1.0 when %x is unordered or less than 0.0, else %z.
; The expected code above uses one v_cmp_le_f32 against 0 (the inverse of the
; IR predicate) feeding two v_cndmask_b32, one per 32-bit half of the double;
; 0x3ff00000 is the high half of double 1.0 and 0 is its low half.
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile float, ptr addrspace(1) %x.gep
%z = load volatile double, ptr addrspace(1) %z.gep
%setcc = fcmp ult float %x, 0.0
%select = select i1 %setcc, double 1.0, double %z
store double %select, ptr addrspace(1) %out.gep
ret void
}
; Different types compared vs. selected
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_mov_b32_e32 v4, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v6
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
; IR under test. Each work-item indexes %x.ptr (float), %z.ptr (i64) and
; %out (i64) by its workitem id. Both loads are volatile.
; The select picks i64 3 when %x is ordered and not equal to 0.0, else %z.
; The expected code above uses one v_cmp_nlg_f32 against 0 (the inverse of the
; IR predicate) feeding two v_cndmask_b32, one per 32-bit half of the i64;
; the constant 3 goes in the low half and 0 in the high half.
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile float, ptr addrspace(1) %x.gep
%z = load volatile i64, ptr addrspace(1) %z.gep
%setcc = fcmp one float %x, 0.0
%select = select i1 %setcc, i64 3, i64 %z
store i64 %select, ptr addrspace(1) %out.gep
ret void
}
; Different types compared vs. selected
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 4.0, v3, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v5
; VI-NEXT: v_cndmask_b32_e32 v2, 4.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
; GFX12-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
; IR under test. Each work-item indexes %x.ptr (i32), %z.ptr (float) and
; %out (float) by its workitem id. Both loads are volatile.
; The select picks float 4.0 when %x is unsigned-greater-than 1, else %z.
; The expected code above tests the opposite condition (2 > %x, unsigned) and
; emits a single v_cndmask_b32_e32 with 4.0 as its first source operand.
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile i32, ptr addrspace(1) %x.gep
%z = load volatile float, ptr addrspace(1) %z.gep
%setcc = icmp ugt i32 %x, 1
%select = select i1 %setcc, float 4.0, float %z
store float %select, ptr addrspace(1) %out.gep
ret void
}
; FIXME: Should be able to handle multiple uses
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v2
; SI-NEXT: v_cndmask_b32_e64 v2, v3, -1.0, vcc
; SI-NEXT: v_cndmask_b32_e64 v3, v3, -2.0, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v5
; VI-NEXT: v_cndmask_b32_e64 v3, v2, -1.0, vcc
; VI-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX12-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
; IR under test. Each work-item indexes %x.ptr, %z.ptr and %out (all float)
; by its workitem id. Both loads and both stores are volatile.
; One compare (4.0 unordered-or-greater-than %x) feeds two selects that pick
; -1.0 and -2.0 respectively, each falling back to %z. Both results are
; stored to the same %out element, one after the other.
; The expected code above keeps the compare in the IR's sense (v_cmp_nle_f32)
; and uses v_cndmask_b32_e64 with the constant as the second source operand.
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
%z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%x = load volatile float, ptr addrspace(1) %x.gep
%z = load volatile float, ptr addrspace(1) %z.gep
%setcc = fcmp ugt float 4.0, %x
%select0 = select i1 %setcc, float -1.0, float %z
%select1 = select i1 %setcc, float -2.0, float %z
store volatile float %select0, ptr addrspace(1) %out.gep
store volatile float %select1, ptr addrspace(1) %out.gep
ret void
}
; Source modifiers abs/neg only work for f32
define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v0, v0, s[0:1]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX10-NEXT: global_store_short v2, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_cndmask_abs_neg_f16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b64 vcc, -1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX12-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX12-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX12-NEXT: s_endpgm
; IR under test. Loads a half from %fptr indexed by the workitem id and
; stores either its fabs (when %c is nonzero) or its fneg (when %c is zero)
; to %out, which is not indexed.
; In the expected code above, SI converts to f32 with the |x| and -x modifiers
; on v_cvt_f32_f16 and converts back after the select; the other targets
; compute abs and neg with v_and_b32 0x7fff and v_xor_b32 0x8000. No target
; puts a source modifier on the v_cndmask_b32 itself.
%idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
%f = load half, ptr addrspace(1) %f.gep
%f.abs = call half @llvm.fabs.f16(half %f)
%f.neg = fneg half %f
%setcc = icmp ne i32 %c, 0
%select = select i1 %setcc, half %f.abs, half %f.neg
store half %select, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s8, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_cndmask_abs_neg_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_endpgm
; IR under test. Loads a float from %fptr indexed by the workitem id and
; stores either its fabs (when %c is nonzero) or its fneg (when %c is zero)
; to %out, which is not indexed.
; In the expected code above, every target folds both operations into source
; modifiers on a single v_cndmask_b32_e64 (-v0 and |v0|), with no separate
; abs or neg instruction.
%idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
%f = load float, ptr addrspace(1) %f.gep
%f.abs = call float @llvm.fabs.f32(float %f)
%f.neg = fneg float %f
%setcc = icmp ne i32 %c, 0
%select = select i1 %setcc, float %f.abs, float %f.neg
store float %select, ptr addrspace(1) %out
ret void
}
; Select between fabs(f) and fneg(f) of a double loaded from %fptr, choosing
; fabs when the kernel argument %c is non-zero and fneg otherwise, and store
; the result to %out.
;
; In the expected output below, fabs and fneg are emitted as a separate
; v_and_b32 (mask 0x7fffffff) and v_xor_b32 (mask 0x80000000) on the high
; dword (v1), and the 64-bit select is split into one v_cndmask_b32 per dword.
;
; NOTE(review): the low-dword v_cndmask_b32 uses the same register for both
; sources (v0, v0, v0) in every check block, so it cannot change the value.
; This looks like a missed fold that the autogenerated checks are pinning;
; confirm whether that is intended before relying on it.
;
; The assertion lines are generated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of editing by hand.
define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_cndmask_abs_neg_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[0:1], v0, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b64 vcc, -1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; GFX12-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
; Load the double at index workitem.id.x from %fptr.
%idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
%f = load double, ptr addrspace(1) %f.gep
; Both select operands come from the same loaded value: |f| and -f.
%f.abs = call double @llvm.fabs.f64(double %f)
%f.neg = fneg double %f
; %c != 0 picks fabs, %c == 0 picks fneg.
%setcc = icmp ne i32 %c, 0
%select = select i1 %setcc, double %f.abs, double %f.neg
store double %select, ptr addrspace(1) %out
ret void
}
; Attribute groups used by this file: #0 is attached to the kernel
; definitions, #1 to the workitem-id intrinsic declaration and its calls.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }