; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
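;
; All targets expand @llvm.round.f32 the same way, as the checks below show:
; trunc the input, compare |x - trunc(x)| against 0.5, select 0.0 or 1.0, copy
; the sign of x onto that increment with a bit-field insert (mask 0x7fffffff;
; on GFX6-GFX9 the mask is materialized with s_brev_b32 sN, -2, the bit-reverse
; of 0xfffffffe), and add it back to the truncated value. As a reference only,
; not part of the test, a minimal C sketch of that expansion assuming standard
; <math.h> semantics:
;
;   float round_ref(float x) {
;     float t = truncf(x);                               // v_trunc_f32
;     float inc = (fabsf(x - t) >= 0.5f) ? 1.0f : 0.0f;  // v_sub / v_cmp_ge / v_cndmask
;     return t + copysignf(inc, x);                      // v_bfi_b32 + v_add_f32
;   }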
define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX6-LABEL: round_f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e32 v0, s6
; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: round_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_trunc_f32_e32 v0, s6
; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
; GFX8-NEXT: s_brev_b32 s4, -2
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: round_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; R600-LABEL: round_f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: TRUNC * T0.W, KC0[2].Z,
; R600-NEXT: ADD * T1.W, KC0[2].Z, -PV.W,
; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5,
; R600-NEXT: BFI_INT * T1.W, literal.x, PV.W, KC0[2].Z,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T0.X, T0.W, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call float @llvm.round.f32(float %x) #1
store float %result, ptr addrspace(1) %out
ret void
}
; The vector tests are difficult to verify by hand, since it is hard to
; predict how the scheduler will order the instructions. The scalar case above
; already exercises the expansion, so the vector tests primarily guard against
; crashes; their full output is nonetheless pinned down by the autogenerated
; assertions.
define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 {
; GFX6-LABEL: round_v2f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX6-NEXT: s_brev_b32 s8, -2
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e32 v0, s3
; GFX6-NEXT: v_sub_f32_e32 v1, s3, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v2, s3
; GFX6-NEXT: v_bfi_b32 v1, s8, v1, v2
; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s2
; GFX6-NEXT: v_sub_f32_e32 v2, s2, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: v_bfi_b32 v2, s8, v2, v3
; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: round_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_brev_b32 s8, -2
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_trunc_f32_e32 v0, s3
; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s2
; GFX8-NEXT: v_sub_f32_e32 v2, s2, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_brev_b32 s8, -2
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e32 v0, s7
; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v2, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: round_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s3
; GFX11-NEXT: v_trunc_f32_e32 v2, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_f32_e32 v1, s3, v0
; GFX11-NEXT: v_sub_f32_e32 v3, s2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v1|, 0.5
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: v_add_f32_e32 v1, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s2
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; R600-LABEL: round_v2f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: TRUNC * T0.W, KC0[3].X,
; R600-NEXT: ADD T1.W, KC0[3].X, -PV.W,
; R600-NEXT: TRUNC * T2.W, KC0[2].W,
; R600-NEXT: ADD T3.W, KC0[2].W, -PS,
; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5,
; R600-NEXT: BFI_INT T1.W, literal.x, PS, KC0[3].X,
; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T0.Y, T0.W, PV.W,
; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[2].W,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T0.X, T2.W, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
store <2 x float> %result, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 {
; GFX6-LABEL: round_v4f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX6-NEXT: s_brev_b32 s10, -2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e32 v0, s7
; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
; GFX6-NEXT: v_mov_b32_e32 v2, s7
; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX6-NEXT: v_add_f32_e32 v3, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s6
; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX6-NEXT: v_add_f32_e32 v2, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s5
; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v4, s5
; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v4
; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s4
; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v5, s4
; GFX6-NEXT: v_bfi_b32 v4, s10, v4, v5
; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: round_v4f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_brev_b32 s10, -2
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_trunc_f32_e32 v0, s7
; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s6
; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s5
; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4
; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s4
; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_brev_b32 s10, -2
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e32 v0, s7
; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2
; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s5
; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v4
; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s4
; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v5, s4
; GFX9-NEXT: v_bfi_b32 v4, s10, v4, v5
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: round_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
; GFX11-NEXT: v_trunc_f32_e32 v4, s5
; GFX11-NEXT: v_trunc_f32_e32 v5, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1
; GFX11-NEXT: v_dual_sub_f32 v6, s5, v4 :: v_dual_sub_f32 v7, s4, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v6|, 0.5
; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s5
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4
; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; R600-LABEL: round_v4f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: TRUNC * T0.W, KC0[4].X,
; R600-NEXT: ADD T1.W, KC0[4].X, -PV.W,
; R600-NEXT: TRUNC * T2.W, KC0[3].W,
; R600-NEXT: TRUNC T0.Z, KC0[3].Z,
; R600-NEXT: ADD T3.W, KC0[3].W, -PS,
; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5,
; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[4].X,
; R600-NEXT: SETGE T1.Z, |PV.W|, 0.5,
; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.Z,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: TRUNC * T3.W, KC0[3].Y,
; R600-NEXT: ADD T1.Y, KC0[3].Y, -PV.W,
; R600-NEXT: SETGE T2.Z, |T1.W|, 0.5,
; R600-NEXT: BFI_INT T1.W, literal.x, T1.Z, KC0[3].W,
; R600-NEXT: ADD * T4.W, T0.W, T0.Y,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T4.Z, T2.W, PV.W,
; R600-NEXT: BFI_INT T0.W, literal.x, PV.Z, KC0[3].Z,
; R600-NEXT: SETGE * T1.W, |PV.Y|, 0.5,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T4.Y, T0.Z, PV.W,
; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[3].Y,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T4.X, T3.W, PV.W,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
store <4 x float> %result, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 {
; GFX6-LABEL: round_v8f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11
; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: s_mov_b32 s15, 0xf000
; GFX6-NEXT: s_mov_b32 s14, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e32 v0, s7
; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v2, s7
; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX6-NEXT: v_add_f32_e32 v3, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s6
; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX6-NEXT: v_add_f32_e32 v2, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s5
; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v4, s5
; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4
; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
; GFX6-NEXT: v_trunc_f32_e32 v0, s4
; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v5, s4
; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5
; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
; GFX6-NEXT: v_trunc_f32_e32 v4, s11
; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v6, s11
; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX6-NEXT: v_add_f32_e32 v7, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v4, s10
; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v6, s10
; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX6-NEXT: v_add_f32_e32 v6, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v4, s9
; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v8, s9
; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8
; GFX6-NEXT: v_add_f32_e32 v5, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v4, s8
; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v9, s8
; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9
; GFX6-NEXT: v_add_f32_e32 v4, v4, v8
; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: round_v8f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24
; GFX8-NEXT: s_brev_b32 s2, -2
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_trunc_f32_e32 v0, s7
; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s6
; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s5
; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v4
; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
; GFX8-NEXT: v_trunc_f32_e32 v0, s4
; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_bfi_b32 v4, s2, v4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
; GFX8-NEXT: v_trunc_f32_e32 v4, s11
; GFX8-NEXT: v_sub_f32_e32 v5, s11, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v6, s11
; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX8-NEXT: v_add_f32_e32 v7, v4, v5
; GFX8-NEXT: v_trunc_f32_e32 v4, s10
; GFX8-NEXT: v_sub_f32_e32 v5, s10, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX8-NEXT: v_add_f32_e32 v6, v4, v5
; GFX8-NEXT: v_trunc_f32_e32 v4, s9
; GFX8-NEXT: v_sub_f32_e32 v5, s9, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v8, s9
; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v8
; GFX8-NEXT: v_add_f32_e32 v5, v4, v5
; GFX8-NEXT: v_trunc_f32_e32 v4, s8
; GFX8-NEXT: v_sub_f32_e32 v8, s8, v4
; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v9, s8
; GFX8-NEXT: v_bfi_b32 v8, s2, v8, v9
; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
; GFX8-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_v8f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: s_mov_b32 s15, 0xf000
; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e32 v0, s7
; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s6
; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s5
; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4
; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
; GFX9-NEXT: v_trunc_f32_e32 v0, s4
; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, s4
; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, s11
; GFX9-NEXT: v_sub_f32_e32 v5, s11, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v6, s11
; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX9-NEXT: v_add_f32_e32 v7, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v4, s10
; GFX9-NEXT: v_sub_f32_e32 v5, s10, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v6, s10
; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX9-NEXT: v_add_f32_e32 v6, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v4, s9
; GFX9-NEXT: v_sub_f32_e32 v5, s9, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v8, s9
; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v8
; GFX9-NEXT: v_add_f32_e32 v5, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v4, s8
; GFX9-NEXT: v_sub_f32_e32 v8, s8, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v9, s8
; GFX9-NEXT: v_bfi_b32 v8, s2, v8, v9
; GFX9-NEXT: v_add_f32_e32 v4, v4, v8
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: round_v8f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
; GFX11-NEXT: v_trunc_f32_e32 v4, s5
; GFX11-NEXT: v_trunc_f32_e32 v8, s4
; GFX11-NEXT: v_trunc_f32_e32 v5, s11
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1
; GFX11-NEXT: v_sub_f32_e32 v7, s5, v4
; GFX11-NEXT: v_trunc_f32_e32 v9, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_f32_e32 v12, s11, v5
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v11, s4, v8
; GFX11-NEXT: v_trunc_f32_e32 v6, s10
; GFX11-NEXT: v_sub_f32_e32 v14, s9, v9
; GFX11-NEXT: v_trunc_f32_e32 v10, s8
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s6
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v13, s10, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5
; GFX11-NEXT: v_add_f32_e32 v1, v4, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s4
; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s11
; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v7, v5, v12
; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s10
; GFX11-NEXT: v_sub_f32_e32 v15, s8, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2
; GFX11-NEXT: v_add_f32_e32 v6, v6, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2
; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s8
; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; R600-LABEL: round_v8f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 50, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: TRUNC * T0.W, KC0[6].X,
; R600-NEXT: ADD T0.Z, KC0[6].X, -PV.W,
; R600-NEXT: TRUNC * T1.W, KC0[5].X,
; R600-NEXT: TRUNC * T2.W, KC0[4].W,
; R600-NEXT: ADD T1.Z, KC0[4].W, -PV.W,
; R600-NEXT: ADD T3.W, KC0[5].X, -T1.W,
; R600-NEXT: SETGE * T4.W, |T0.Z|, 0.5,
; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[6].X,
; R600-NEXT: SETGE T0.Z, |PV.W|, 0.5,
; R600-NEXT: SETGE T3.W, |PV.Z|, 0.5,
; R600-NEXT: TRUNC * T4.W, KC0[5].Y,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T1.Y, KC0[5].Y, -PS,
; R600-NEXT: BFI_INT T1.Z, literal.x, PV.W, KC0[4].W,
; R600-NEXT: BFI_INT T3.W, literal.x, PV.Z, KC0[5].X,
; R600-NEXT: TRUNC * T5.W, KC0[4].Z,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: TRUNC T0.Z, KC0[4].Y,
; R600-NEXT: TRUNC * T6.W, KC0[5].W,
; R600-NEXT: ADD * T7.W, KC0[4].Z, -T5.W,
; R600-NEXT: TRUNC T0.X, KC0[5].Z,
; R600-NEXT: SETGE T2.Y, |PV.W|, 0.5,
; R600-NEXT: ADD T2.Z, KC0[5].W, -T6.W, BS:VEC_102/SCL_221
; R600-NEXT: ADD T7.W, KC0[4].Y, -T0.Z,
; R600-NEXT: ADD * T3.W, T1.W, T3.W,
; R600-NEXT: SETGE T1.X, |PV.W|, 0.5,
; R600-NEXT: SETGE T4.Y, |PV.Z|, 0.5,
; R600-NEXT: ADD T3.Z, T2.W, T1.Z,
; R600-NEXT: BFI_INT T1.W, literal.x, PV.Y, KC0[4].Z,
; R600-NEXT: ADD * T2.W, KC0[5].Z, -PV.X,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: SETGE T2.X, |PS|, 0.5,
; R600-NEXT: ADD T3.Y, T5.W, PV.W,
; R600-NEXT: BFI_INT T1.Z, literal.x, PV.Y, KC0[5].W,
; R600-NEXT: BFI_INT T1.W, literal.x, PV.X, KC0[4].Y,
; R600-NEXT: ADD * T0.W, T0.W, T0.Y,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: ADD T3.X, T0.Z, PV.W,
; R600-NEXT: ADD T0.Z, T6.W, PV.Z,
; R600-NEXT: BFI_INT T1.W, literal.x, PV.X, KC0[5].Z,
; R600-NEXT: SETGE * T2.W, |T1.Y|, 0.5,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; R600-NEXT: ADD T0.Y, T0.X, PV.W,
; R600-NEXT: BFI_INT * T1.W, literal.y, PS, KC0[5].Y,
; R600-NEXT: 2(2.802597e-45), 2147483647(nan)
; R600-NEXT: ADD T0.X, T4.W, PV.W,
; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT: LSHR * T2.X, PV.W, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
store <8 x float> %result, ptr addrspace(1) %out
ret void
}
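;
; For f16, GFX6 has no native f16 arithmetic, so the checks below show the
; value being promoted with v_cvt_f32_f16, rounded with the f32 sequence, and
; converted back with v_cvt_f16_f32. GFX8 and later use native f16 instructions
; directly; there the select constant 0x3c00 is half-precision 1.0 and the
; v_bfi_b32 mask 0x7fff clears only the f16 sign bit.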
define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
; GFX6-LABEL: round_f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX6-NEXT: v_trunc_f32_e32 v1, v0
; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: v_bfi_b32 v0, s2, v2, v0
; GFX6-NEXT: v_add_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: round_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX8-NEXT: s_movk_i32 s5, 0x7fff
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_trunc_f16_e32 v1, s4
; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_add_f16_e32 v0, v1, v0
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: s_movk_i32 s5, 0x7fff
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f16_e32 v1, s4
; GFX9-NEXT: v_sub_f16_e32 v2, s4, v1
; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v2
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_add_f16_e32 v0, v1, v0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: round_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f16_e32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f16_e32 v1, s4, v0
; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v1|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s2
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; R600-LABEL: round_f16:
; R600: ; %bb.0:
; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: FLT16_TO_FLT32 * T0.W, KC0[2].Z,
; R600-NEXT: TRUNC * T1.W, PV.W,
; R600-NEXT: ADD * T2.W, T0.W, -PV.W,
; R600-NEXT: SETGE * T2.W, |PV.W|, 0.5,
; R600-NEXT: BFI_INT T0.W, literal.x, PV.W, T0.W,
; R600-NEXT: AND_INT * T2.W, KC0[2].Y, literal.y,
; R600-NEXT: 2147483647(nan), 3(4.203895e-45)
; R600-NEXT: ADD * T0.W, T1.W, PV.W,
; R600-NEXT: FLT32_TO_FLT16 T0.W, PV.W,
; R600-NEXT: LSHL * T1.W, T2.W, literal.x,
; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; R600-NEXT: LSHL T0.X, PV.W, PS,
; R600-NEXT: LSHL * T0.W, literal.x, PS,
; R600-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; R600-NEXT: MOV T0.Y, 0.0,
; R600-NEXT: MOV * T0.Z, 0.0,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%x.arg.trunc = trunc i32 %x.arg to i16
%x = bitcast i16 %x.arg.trunc to half
%result = call half @llvm.round.f16(half %x) #1
store half %result, ptr addrspace(1) %out
ret void
}
; Should be scalarized
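; GFX6 rounds each half with the f32 sequence and repacks with a shift/or;
; GFX8 handles the high half through an SDWA add (v_add_f16_sdwa) before
; or-ing the halves together, while GFX9 and GFX11 round each half with scalar
; f16 ops and recombine the results with v_pack_b32_f16.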
define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX6-LABEL: round_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GFX6-NEXT: v_trunc_f32_e32 v3, v1
; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3
; GFX6-NEXT: v_trunc_f32_e32 v2, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, 0.5
; GFX6-NEXT: v_sub_f32_e32 v4, v0, v2
; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[2:3]
; GFX6-NEXT: v_bfi_b32 v1, s4, v5, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5
; GFX6-NEXT: v_add_f32_e32 v1, v3, v1
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[2:3]
; GFX6-NEXT: v_bfi_b32 v0, s4, v3, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_add_f32_e32 v0, v2, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: round_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX8-NEXT: s_movk_i32 s6, 0x7fff
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s5, s4, 16
; GFX8-NEXT: v_trunc_f16_e32 v1, s5
; GFX8-NEXT: v_sub_f16_e32 v2, s5, v1
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_bfi_b32 v2, s6, v2, v3
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_trunc_f16_e32 v2, s4
; GFX8-NEXT: v_sub_f16_e32 v3, s4, v2
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_bfi_b32 v0, s6, v0, v3
; GFX8-NEXT: v_add_f16_e32 v0, v2, v0
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: s_movk_i32 s6, 0x7fff
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: v_trunc_f16_e32 v1, s5
; GFX9-NEXT: v_sub_f16_e32 v2, s5, v1
; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_bfi_b32 v2, s6, v2, v3
; GFX9-NEXT: v_add_f16_e32 v1, v1, v2
; GFX9-NEXT: v_trunc_f16_e32 v2, s4
; GFX9-NEXT: v_sub_f16_e32 v3, s4, v2
; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_bfi_b32 v0, s6, v0, v3
; GFX9-NEXT: v_add_f16_e32 v0, v2, v0
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: round_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s5, s4, 16
; GFX11-NEXT: v_trunc_f16_e32 v1, s4
; GFX11-NEXT: v_trunc_f16_e32 v0, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_f16_e32 v3, s4, v1
; GFX11-NEXT: v_sub_f16_e32 v2, s5, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v2|, 0.5
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s2
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s4
; GFX11-NEXT: v_add_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; R600-LABEL: round_v2f16:
; R600: ; %bb.0:
; R600-NEXT: ALU 22, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT: FLT16_TO_FLT32 * T0.W, PV.W,
; R600-NEXT: FLT16_TO_FLT32 T1.W, KC0[2].Z,
; R600-NEXT: TRUNC * T2.W, PV.W,
; R600-NEXT: ADD T3.W, T0.W, -PS,
; R600-NEXT: TRUNC * T4.W, PV.W,
; R600-NEXT: ADD T5.W, T1.W, -PS,
; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5,
; R600-NEXT: BFI_INT T0.W, literal.x, PS, T0.W,
; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: BFI_INT T1.W, literal.x, PS, T1.W, BS:VEC_021/SCL_122
; R600-NEXT: ADD * T0.W, T2.W, PV.W,
; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
; R600-NEXT: FLT32_TO_FLT16 T0.W, PS,
; R600-NEXT: ADD * T1.W, T4.W, PV.W,
; R600-NEXT: FLT32_TO_FLT16 T1.W, PS,
; R600-NEXT: LSHL * T0.W, PV.W, literal.x,
; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT: OR_INT T0.X, PV.W, PS,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%in = bitcast i32 %in.arg to <2 x half>
%result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)
store <2 x half> %result, ptr addrspace(1) %out
ret void
}
declare float @llvm.round.f32(float) #1
declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
declare half @llvm.round.f16(half) #1
declare <2 x half> @llvm.round.v2f16(<2 x half>) #1
declare <4 x half> @llvm.round.v4f16(<4 x half>) #1
declare <8 x half> @llvm.round.v8f16(<8 x half>) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }