gfx11 chips may, in some conditions, behave incorrectly with S_CLAUSE instructions (hard clauses) containing more than 32 operations (that is, whose arguments exceed 0x1f). However, gfx10 targets will work successfully with clauses of up to length 63. Therefore, define the MaxHardClauseLength property on GCNSubtarget and make it a subtarget feature via tablegen, thus allowing us to specify, both now and in the future, the maximum viable size of clauses on various hardware from the tablegen definition. If MaxHardClauseLength is 0, which is the default, the hardware does not support hard clauses.
3135 lines
131 KiB
LLVM
3135 lines
131 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
|
|
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
|
|
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
|
|
|
|
; Scalar half select driven by an ordered less-than compare.
; The kernel loads %a, %b, %c and %d as volatile half values, evaluates
; (a < b) ? c : d with fcmp olt + select, and stores the result through %r.
; The expected ISA below is autogenerated (see the NOTE at the top of the file)
; for the tahiti, fiji and gfx1100 RUN lines; regenerate it instead of editing
; it by hand.
define amdgpu_kernel void @select_f16(
|
|
; SI-LABEL: select_f16:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
|
|
; SI-NEXT: s_mov_b32 s18, s2
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s16, s6
|
|
; SI-NEXT: s_mov_b32 s17, s7
|
|
; SI-NEXT: s_mov_b32 s19, s3
|
|
; SI-NEXT: s_mov_b32 s20, s8
|
|
; SI-NEXT: s_mov_b32 s21, s9
|
|
; SI-NEXT: s_mov_b32 s22, s2
|
|
; SI-NEXT: s_mov_b32 s23, s3
|
|
; SI-NEXT: s_mov_b32 s8, s10
|
|
; SI-NEXT: s_mov_b32 s9, s11
|
|
; SI-NEXT: s_mov_b32 s10, s2
|
|
; SI-NEXT: s_mov_b32 s11, s3
|
|
; SI-NEXT: s_mov_b32 s14, s2
|
|
; SI-NEXT: s_mov_b32 s15, s3
|
|
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s0, s4
|
|
; SI-NEXT: s_mov_b32 s1, s5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: select_f16:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
|
|
; VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
|
|
; VI-NEXT: s_mov_b32 s18, s2
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s16, s6
|
|
; VI-NEXT: s_mov_b32 s17, s7
|
|
; VI-NEXT: s_mov_b32 s19, s3
|
|
; VI-NEXT: s_mov_b32 s20, s8
|
|
; VI-NEXT: s_mov_b32 s21, s9
|
|
; VI-NEXT: s_mov_b32 s22, s2
|
|
; VI-NEXT: s_mov_b32 s23, s3
|
|
; VI-NEXT: s_mov_b32 s8, s10
|
|
; VI-NEXT: s_mov_b32 s9, s11
|
|
; VI-NEXT: s_mov_b32 s10, s2
|
|
; VI-NEXT: s_mov_b32 s11, s3
|
|
; VI-NEXT: s_mov_b32 s14, s2
|
|
; VI-NEXT: s_mov_b32 s15, s3
|
|
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s0, s4
|
|
; VI-NEXT: s_mov_b32 s1, s5
|
|
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
|
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: select_f16:
|
|
; GFX11: ; %bb.0: ; %entry
|
|
; GFX11-NEXT: s_clause 0x1
|
|
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
|
|
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
|
|
; GFX11-NEXT: s_mov_b32 s14, -1
|
|
; GFX11-NEXT: s_mov_b32 s15, 0x31016000
|
|
; GFX11-NEXT: s_mov_b32 s18, s14
|
|
; GFX11-NEXT: s_mov_b32 s19, s15
|
|
; GFX11-NEXT: s_mov_b32 s22, s14
|
|
; GFX11-NEXT: s_mov_b32 s23, s15
|
|
; GFX11-NEXT: s_mov_b32 s26, s14
|
|
; GFX11-NEXT: s_mov_b32 s27, s15
|
|
; GFX11-NEXT: s_mov_b32 s2, s14
|
|
; GFX11-NEXT: s_mov_b32 s3, s15
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s16, s6
|
|
; GFX11-NEXT: s_mov_b32 s17, s7
|
|
; GFX11-NEXT: s_mov_b32 s20, s8
|
|
; GFX11-NEXT: s_mov_b32 s21, s9
|
|
; GFX11-NEXT: s_mov_b32 s24, s10
|
|
; GFX11-NEXT: s_mov_b32 s25, s11
|
|
; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s12, s4
|
|
; GFX11-NEXT: s_mov_b32 s13, s5
|
|
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
|
|
; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0
|
|
; GFX11-NEXT: s_nop 0
|
|
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
; GFX11-NEXT: s_endpgm
|
|
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %a,
|
|
ptr addrspace(1) %b,
|
|
ptr addrspace(1) %c,
|
|
ptr addrspace(1) %d) {
|
|
entry:
|
|
; The loads are volatile; the expected output above keeps four separate
; loads, each followed by its own s_waitcnt vmcnt(0).
%a.val = load volatile half, ptr addrspace(1) %a
|
|
%b.val = load volatile half, ptr addrspace(1) %b
|
|
%c.val = load volatile half, ptr addrspace(1) %c
|
|
%d.val = load volatile half, ptr addrspace(1) %d
|
|
%fcmp = fcmp olt half %a.val, %b.val
|
|
%r.val = select i1 %fcmp, half %c.val, half %d.val
|
|
store half %r.val, ptr addrspace(1) %r
|
|
ret void
|
|
}
|
|
|
|
; Scalar half select where the left compare operand is a constant.
; The kernel loads %b, %c and %d as volatile half values, evaluates
; (0xH3800 < b) ? c : d with fcmp olt + select, and stores the result
; through %r. In the expected ISA the constant appears as the 0.5 inline
; operand of the compare. The expected ISA is autogenerated; regenerate it
; instead of editing it by hand.
define amdgpu_kernel void @select_f16_imm_a(
|
|
; SI-LABEL: select_f16_imm_a:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
|
|
; SI-NEXT: s_mov_b32 s11, 0xf000
|
|
; SI-NEXT: s_mov_b32 s10, -1
|
|
; SI-NEXT: s_mov_b32 s14, s10
|
|
; SI-NEXT: s_mov_b32 s15, s11
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s12, s2
|
|
; SI-NEXT: s_mov_b32 s13, s3
|
|
; SI-NEXT: s_mov_b32 s16, s4
|
|
; SI-NEXT: s_mov_b32 s17, s5
|
|
; SI-NEXT: s_mov_b32 s18, s10
|
|
; SI-NEXT: s_mov_b32 s19, s11
|
|
; SI-NEXT: s_mov_b32 s4, s6
|
|
; SI-NEXT: s_mov_b32 s5, s7
|
|
; SI-NEXT: s_mov_b32 s6, s10
|
|
; SI-NEXT: s_mov_b32 s7, s11
|
|
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s0
|
|
; SI-NEXT: s_mov_b32 s9, s1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: select_f16_imm_a:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
|
; VI-NEXT: s_mov_b32 s11, 0xf000
|
|
; VI-NEXT: s_mov_b32 s10, -1
|
|
; VI-NEXT: s_mov_b32 s14, s10
|
|
; VI-NEXT: s_mov_b32 s15, s11
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s12, s2
|
|
; VI-NEXT: s_mov_b32 s13, s3
|
|
; VI-NEXT: s_mov_b32 s16, s4
|
|
; VI-NEXT: s_mov_b32 s17, s5
|
|
; VI-NEXT: s_mov_b32 s18, s10
|
|
; VI-NEXT: s_mov_b32 s19, s11
|
|
; VI-NEXT: s_mov_b32 s4, s6
|
|
; VI-NEXT: s_mov_b32 s5, s7
|
|
; VI-NEXT: s_mov_b32 s6, s10
|
|
; VI-NEXT: s_mov_b32 s7, s11
|
|
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s8, s0
|
|
; VI-NEXT: s_mov_b32 s9, s1
|
|
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: select_f16_imm_a:
|
|
; GFX11: ; %bb.0: ; %entry
|
|
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
|
; GFX11-NEXT: s_mov_b32 s10, -1
|
|
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
|
|
; GFX11-NEXT: s_mov_b32 s14, s10
|
|
; GFX11-NEXT: s_mov_b32 s15, s11
|
|
; GFX11-NEXT: s_mov_b32 s18, s10
|
|
; GFX11-NEXT: s_mov_b32 s19, s11
|
|
; GFX11-NEXT: s_mov_b32 s22, s10
|
|
; GFX11-NEXT: s_mov_b32 s23, s11
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s12, s2
|
|
; GFX11-NEXT: s_mov_b32 s13, s3
|
|
; GFX11-NEXT: s_mov_b32 s16, s4
|
|
; GFX11-NEXT: s_mov_b32 s17, s5
|
|
; GFX11-NEXT: s_mov_b32 s20, s6
|
|
; GFX11-NEXT: s_mov_b32 s21, s7
|
|
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s8, s0
|
|
; GFX11-NEXT: s_mov_b32 s9, s1
|
|
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
|
|
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
|
|
; GFX11-NEXT: s_nop 0
|
|
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
; GFX11-NEXT: s_endpgm
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %b,
|
|
ptr addrspace(1) %c,
|
|
ptr addrspace(1) %d) {
|
|
entry:
|
|
%b.val = load volatile half, ptr addrspace(1) %b
|
|
%c.val = load volatile half, ptr addrspace(1) %c
|
|
%d.val = load volatile half, ptr addrspace(1) %d
|
|
; 0xH3800 is the constant the expected ISA prints as 0.5.
%fcmp = fcmp olt half 0xH3800, %b.val
|
|
%r.val = select i1 %fcmp, half %c.val, half %d.val
|
|
store half %r.val, ptr addrspace(1) %r
|
|
ret void
|
|
}
|
|
|
|
; Scalar half select where the right compare operand is a constant.
; The kernel loads %a, %c and %d as volatile half values, evaluates
; (a < 0xH3800) ? c : d with fcmp olt + select, and stores the result
; through %r. In the expected ISA the compare is emitted as a greater-than
; with 0.5 as its first operand (v_cmp_gt ... 0.5, v0). The expected ISA is
; autogenerated; regenerate it instead of editing it by hand.
define amdgpu_kernel void @select_f16_imm_b(
|
|
; SI-LABEL: select_f16_imm_b:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
|
|
; SI-NEXT: s_mov_b32 s11, 0xf000
|
|
; SI-NEXT: s_mov_b32 s10, -1
|
|
; SI-NEXT: s_mov_b32 s14, s10
|
|
; SI-NEXT: s_mov_b32 s15, s11
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s12, s2
|
|
; SI-NEXT: s_mov_b32 s13, s3
|
|
; SI-NEXT: s_mov_b32 s16, s4
|
|
; SI-NEXT: s_mov_b32 s17, s5
|
|
; SI-NEXT: s_mov_b32 s18, s10
|
|
; SI-NEXT: s_mov_b32 s19, s11
|
|
; SI-NEXT: s_mov_b32 s4, s6
|
|
; SI-NEXT: s_mov_b32 s5, s7
|
|
; SI-NEXT: s_mov_b32 s6, s10
|
|
; SI-NEXT: s_mov_b32 s7, s11
|
|
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s0
|
|
; SI-NEXT: s_mov_b32 s9, s1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: select_f16_imm_b:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
|
; VI-NEXT: s_mov_b32 s11, 0xf000
|
|
; VI-NEXT: s_mov_b32 s10, -1
|
|
; VI-NEXT: s_mov_b32 s14, s10
|
|
; VI-NEXT: s_mov_b32 s15, s11
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s12, s2
|
|
; VI-NEXT: s_mov_b32 s13, s3
|
|
; VI-NEXT: s_mov_b32 s16, s4
|
|
; VI-NEXT: s_mov_b32 s17, s5
|
|
; VI-NEXT: s_mov_b32 s18, s10
|
|
; VI-NEXT: s_mov_b32 s19, s11
|
|
; VI-NEXT: s_mov_b32 s4, s6
|
|
; VI-NEXT: s_mov_b32 s5, s7
|
|
; VI-NEXT: s_mov_b32 s6, s10
|
|
; VI-NEXT: s_mov_b32 s7, s11
|
|
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s8, s0
|
|
; VI-NEXT: s_mov_b32 s9, s1
|
|
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: select_f16_imm_b:
|
|
; GFX11: ; %bb.0: ; %entry
|
|
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
|
; GFX11-NEXT: s_mov_b32 s10, -1
|
|
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
|
|
; GFX11-NEXT: s_mov_b32 s14, s10
|
|
; GFX11-NEXT: s_mov_b32 s15, s11
|
|
; GFX11-NEXT: s_mov_b32 s18, s10
|
|
; GFX11-NEXT: s_mov_b32 s19, s11
|
|
; GFX11-NEXT: s_mov_b32 s22, s10
|
|
; GFX11-NEXT: s_mov_b32 s23, s11
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s12, s2
|
|
; GFX11-NEXT: s_mov_b32 s13, s3
|
|
; GFX11-NEXT: s_mov_b32 s16, s4
|
|
; GFX11-NEXT: s_mov_b32 s17, s5
|
|
; GFX11-NEXT: s_mov_b32 s20, s6
|
|
; GFX11-NEXT: s_mov_b32 s21, s7
|
|
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s8, s0
|
|
; GFX11-NEXT: s_mov_b32 s9, s1
|
|
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
|
|
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
|
|
; GFX11-NEXT: s_nop 0
|
|
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
; GFX11-NEXT: s_endpgm
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %a,
|
|
ptr addrspace(1) %c,
|
|
ptr addrspace(1) %d) {
|
|
entry:
|
|
%a.val = load volatile half, ptr addrspace(1) %a
|
|
%c.val = load volatile half, ptr addrspace(1) %c
|
|
%d.val = load volatile half, ptr addrspace(1) %d
|
|
; 0xH3800 is the constant the expected ISA prints as 0.5.
%fcmp = fcmp olt half %a.val, 0xH3800
|
|
%r.val = select i1 %fcmp, half %c.val, half %d.val
|
|
store half %r.val, ptr addrspace(1) %r
|
|
ret void
|
|
}
|
|
|
|
; Scalar half select where the value chosen on a true compare is a constant.
; The kernel loads %a, %b and %d as volatile half values, evaluates
; (a < b) ? 0xH3800 : d with fcmp olt + select, and stores the result
; through %r. The expected ISA uses the inverted compare (v_cmp_nlt) and puts
; the constant in the first v_cndmask source: the tahiti output uses the 0.5
; inline constant, the fiji output first moves 0x3800 into v3, and the gfx1100
; output uses 0x3800 directly. The expected ISA is autogenerated; regenerate
; it instead of editing it by hand.
define amdgpu_kernel void @select_f16_imm_c(
|
|
; SI-LABEL: select_f16_imm_c:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
|
|
; SI-NEXT: s_mov_b32 s11, 0xf000
|
|
; SI-NEXT: s_mov_b32 s10, -1
|
|
; SI-NEXT: s_mov_b32 s14, s10
|
|
; SI-NEXT: s_mov_b32 s15, s11
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s12, s2
|
|
; SI-NEXT: s_mov_b32 s13, s3
|
|
; SI-NEXT: s_mov_b32 s16, s4
|
|
; SI-NEXT: s_mov_b32 s17, s5
|
|
; SI-NEXT: s_mov_b32 s18, s10
|
|
; SI-NEXT: s_mov_b32 s19, s11
|
|
; SI-NEXT: s_mov_b32 s4, s6
|
|
; SI-NEXT: s_mov_b32 s5, s7
|
|
; SI-NEXT: s_mov_b32 s6, s10
|
|
; SI-NEXT: s_mov_b32 s7, s11
|
|
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s0
|
|
; SI-NEXT: s_mov_b32 s9, s1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: select_f16_imm_c:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
|
; VI-NEXT: s_mov_b32 s11, 0xf000
|
|
; VI-NEXT: s_mov_b32 s10, -1
|
|
; VI-NEXT: s_mov_b32 s14, s10
|
|
; VI-NEXT: s_mov_b32 s15, s11
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s12, s2
|
|
; VI-NEXT: s_mov_b32 s13, s3
|
|
; VI-NEXT: s_mov_b32 s16, s4
|
|
; VI-NEXT: s_mov_b32 s17, s5
|
|
; VI-NEXT: s_mov_b32 s18, s10
|
|
; VI-NEXT: s_mov_b32 s19, s11
|
|
; VI-NEXT: s_mov_b32 s4, s6
|
|
; VI-NEXT: s_mov_b32 s5, s7
|
|
; VI-NEXT: s_mov_b32 s6, s10
|
|
; VI-NEXT: s_mov_b32 s7, s11
|
|
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
|
|
; VI-NEXT: s_mov_b32 s8, s0
|
|
; VI-NEXT: s_mov_b32 s9, s1
|
|
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
|
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: select_f16_imm_c:
|
|
; GFX11: ; %bb.0: ; %entry
|
|
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
|
; GFX11-NEXT: s_mov_b32 s10, -1
|
|
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
|
|
; GFX11-NEXT: s_mov_b32 s14, s10
|
|
; GFX11-NEXT: s_mov_b32 s15, s11
|
|
; GFX11-NEXT: s_mov_b32 s18, s10
|
|
; GFX11-NEXT: s_mov_b32 s19, s11
|
|
; GFX11-NEXT: s_mov_b32 s22, s10
|
|
; GFX11-NEXT: s_mov_b32 s23, s11
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s12, s2
|
|
; GFX11-NEXT: s_mov_b32 s13, s3
|
|
; GFX11-NEXT: s_mov_b32 s16, s4
|
|
; GFX11-NEXT: s_mov_b32 s17, s5
|
|
; GFX11-NEXT: s_mov_b32 s20, s6
|
|
; GFX11-NEXT: s_mov_b32 s21, s7
|
|
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s8, s0
|
|
; GFX11-NEXT: s_mov_b32 s9, s1
|
|
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
|
|
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
|
|
; GFX11-NEXT: s_nop 0
|
|
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
; GFX11-NEXT: s_endpgm
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %a,
|
|
ptr addrspace(1) %b,
|
|
ptr addrspace(1) %d) {
|
|
entry:
|
|
%a.val = load volatile half, ptr addrspace(1) %a
|
|
%b.val = load volatile half, ptr addrspace(1) %b
|
|
%d.val = load volatile half, ptr addrspace(1) %d
|
|
%fcmp = fcmp olt half %a.val, %b.val
|
|
; The constant is the value selected when the compare is true.
%r.val = select i1 %fcmp, half 0xH3800, half %d.val
|
|
store half %r.val, ptr addrspace(1) %r
|
|
ret void
|
|
}
|
|
|
|
; Scalar half select where the value chosen on a false compare is a constant.
; The kernel loads %a, %b and %c as volatile half values, evaluates
; (a < b) ? c : 0xH3800 with fcmp olt + select, and stores the result
; through %r. The expected ISA keeps the v_cmp_lt compare and puts the
; constant in the first v_cndmask source: the tahiti output uses the 0.5
; inline constant, the fiji output first moves 0x3800 into v3, and the gfx1100
; output uses 0x3800 directly. The expected ISA is autogenerated; regenerate
; it instead of editing it by hand.
define amdgpu_kernel void @select_f16_imm_d(
|
|
; SI-LABEL: select_f16_imm_d:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
|
|
; SI-NEXT: s_mov_b32 s11, 0xf000
|
|
; SI-NEXT: s_mov_b32 s10, -1
|
|
; SI-NEXT: s_mov_b32 s14, s10
|
|
; SI-NEXT: s_mov_b32 s15, s11
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s12, s2
|
|
; SI-NEXT: s_mov_b32 s13, s3
|
|
; SI-NEXT: s_mov_b32 s16, s4
|
|
; SI-NEXT: s_mov_b32 s17, s5
|
|
; SI-NEXT: s_mov_b32 s18, s10
|
|
; SI-NEXT: s_mov_b32 s19, s11
|
|
; SI-NEXT: s_mov_b32 s4, s6
|
|
; SI-NEXT: s_mov_b32 s5, s7
|
|
; SI-NEXT: s_mov_b32 s6, s10
|
|
; SI-NEXT: s_mov_b32 s7, s11
|
|
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s0
|
|
; SI-NEXT: s_mov_b32 s9, s1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: select_f16_imm_d:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
|
; VI-NEXT: s_mov_b32 s11, 0xf000
|
|
; VI-NEXT: s_mov_b32 s10, -1
|
|
; VI-NEXT: s_mov_b32 s14, s10
|
|
; VI-NEXT: s_mov_b32 s15, s11
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s12, s2
|
|
; VI-NEXT: s_mov_b32 s13, s3
|
|
; VI-NEXT: s_mov_b32 s16, s4
|
|
; VI-NEXT: s_mov_b32 s17, s5
|
|
; VI-NEXT: s_mov_b32 s18, s10
|
|
; VI-NEXT: s_mov_b32 s19, s11
|
|
; VI-NEXT: s_mov_b32 s4, s6
|
|
; VI-NEXT: s_mov_b32 s5, s7
|
|
; VI-NEXT: s_mov_b32 s6, s10
|
|
; VI-NEXT: s_mov_b32 s7, s11
|
|
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
|
|
; VI-NEXT: s_mov_b32 s8, s0
|
|
; VI-NEXT: s_mov_b32 s9, s1
|
|
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
|
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: select_f16_imm_d:
|
|
; GFX11: ; %bb.0: ; %entry
|
|
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
|
; GFX11-NEXT: s_mov_b32 s10, -1
|
|
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
|
|
; GFX11-NEXT: s_mov_b32 s14, s10
|
|
; GFX11-NEXT: s_mov_b32 s15, s11
|
|
; GFX11-NEXT: s_mov_b32 s18, s10
|
|
; GFX11-NEXT: s_mov_b32 s19, s11
|
|
; GFX11-NEXT: s_mov_b32 s22, s10
|
|
; GFX11-NEXT: s_mov_b32 s23, s11
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s12, s2
|
|
; GFX11-NEXT: s_mov_b32 s13, s3
|
|
; GFX11-NEXT: s_mov_b32 s16, s4
|
|
; GFX11-NEXT: s_mov_b32 s17, s5
|
|
; GFX11-NEXT: s_mov_b32 s20, s6
|
|
; GFX11-NEXT: s_mov_b32 s21, s7
|
|
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s8, s0
|
|
; GFX11-NEXT: s_mov_b32 s9, s1
|
|
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
|
|
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
|
|
; GFX11-NEXT: s_nop 0
|
|
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
; GFX11-NEXT: s_endpgm
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %a,
|
|
ptr addrspace(1) %b,
|
|
ptr addrspace(1) %c) {
|
|
entry:
|
|
%a.val = load volatile half, ptr addrspace(1) %a
|
|
%b.val = load volatile half, ptr addrspace(1) %b
|
|
%c.val = load volatile half, ptr addrspace(1) %c
|
|
%fcmp = fcmp olt half %a.val, %b.val
|
|
; The constant is the value selected when the compare is false.
%r.val = select i1 %fcmp, half %c.val, half 0xH3800
|
|
store half %r.val, ptr addrspace(1) %r
|
|
ret void
|
|
}
|
|
|
|
; Two-element half vector select driven by an element-wise ordered less-than
; compare. The kernel loads %a, %b, %c and %d as <2 x half> (non-volatile),
; evaluates (a < b) ? c : d per element with fcmp olt + select, and stores
; the <2 x half> result through %r. In the expected ISA each vector is loaded
; as one dword, the high halves are extracted with a 16-bit right shift, the
; low and high elements are compared and selected separately, and the two
; results are packed back into one dword before the store. The expected ISA
; is autogenerated; regenerate it instead of editing it by hand.
define amdgpu_kernel void @select_v2f16(
|
|
; SI-LABEL: select_v2f16:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
|
|
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: s_mov_b32 s18, s2
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s16, s6
|
|
; SI-NEXT: s_mov_b32 s17, s7
|
|
; SI-NEXT: s_mov_b32 s19, s3
|
|
; SI-NEXT: s_mov_b32 s20, s8
|
|
; SI-NEXT: s_mov_b32 s21, s9
|
|
; SI-NEXT: s_mov_b32 s22, s2
|
|
; SI-NEXT: s_mov_b32 s23, s3
|
|
; SI-NEXT: s_mov_b32 s14, s2
|
|
; SI-NEXT: s_mov_b32 s15, s3
|
|
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
|
|
; SI-NEXT: s_mov_b32 s8, s10
|
|
; SI-NEXT: s_mov_b32 s9, s11
|
|
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
|
; SI-NEXT: s_mov_b32 s10, s2
|
|
; SI-NEXT: s_mov_b32 s11, s3
|
|
; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0
|
|
; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0
|
|
; SI-NEXT: s_mov_b32 s0, s4
|
|
; SI-NEXT: s_mov_b32 s1, s5
|
|
; SI-NEXT: s_waitcnt vmcnt(3)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: s_waitcnt vmcnt(2)
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
|
|
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
|
|
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
|
; SI-NEXT: v_or_b32_e32 v0, v1, v0
|
|
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: select_v2f16:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
|
|
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
|
|
; VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
; VI-NEXT: s_mov_b32 s14, s2
|
|
; VI-NEXT: s_mov_b32 s15, s3
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s16, s6
|
|
; VI-NEXT: s_mov_b32 s17, s7
|
|
; VI-NEXT: s_mov_b32 s18, s2
|
|
; VI-NEXT: s_mov_b32 s19, s3
|
|
; VI-NEXT: s_mov_b32 s20, s8
|
|
; VI-NEXT: s_mov_b32 s21, s9
|
|
; VI-NEXT: s_mov_b32 s22, s2
|
|
; VI-NEXT: s_mov_b32 s23, s3
|
|
; VI-NEXT: s_mov_b32 s8, s10
|
|
; VI-NEXT: s_mov_b32 s9, s11
|
|
; VI-NEXT: s_mov_b32 s10, s2
|
|
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
|
; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0
|
|
; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0
|
|
; VI-NEXT: s_mov_b32 s11, s3
|
|
; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0
|
|
; VI-NEXT: s_mov_b32 s0, s4
|
|
; VI-NEXT: s_mov_b32 s1, s5
|
|
; VI-NEXT: s_waitcnt vmcnt(3)
|
|
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
|
; VI-NEXT: s_waitcnt vmcnt(2)
|
|
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; VI-NEXT: s_waitcnt vmcnt(1)
|
|
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
|
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v2, v1
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
|
|
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
|
|
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
|
|
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
|
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: select_v2f16:
|
|
; GFX11: ; %bb.0: ; %entry
|
|
; GFX11-NEXT: s_clause 0x1
|
|
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
|
|
; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
|
|
; GFX11-NEXT: s_mov_b32 s2, -1
|
|
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
|
|
; GFX11-NEXT: s_mov_b32 s14, s2
|
|
; GFX11-NEXT: s_mov_b32 s15, s3
|
|
; GFX11-NEXT: s_mov_b32 s22, s2
|
|
; GFX11-NEXT: s_mov_b32 s23, s3
|
|
; GFX11-NEXT: s_mov_b32 s18, s2
|
|
; GFX11-NEXT: s_mov_b32 s19, s3
|
|
; GFX11-NEXT: s_mov_b32 s26, s2
|
|
; GFX11-NEXT: s_mov_b32 s27, s3
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s20, s8
|
|
; GFX11-NEXT: s_mov_b32 s21, s9
|
|
; GFX11-NEXT: s_mov_b32 s16, s6
|
|
; GFX11-NEXT: s_mov_b32 s17, s7
|
|
; GFX11-NEXT: s_mov_b32 s24, s10
|
|
; GFX11-NEXT: s_mov_b32 s25, s11
|
|
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
|
|
; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0
|
|
; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0
|
|
; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0
|
|
; GFX11-NEXT: s_mov_b32 s0, s4
|
|
; GFX11-NEXT: s_mov_b32 s1, s5
|
|
; GFX11-NEXT: s_waitcnt vmcnt(3)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
|
; GFX11-NEXT: s_waitcnt vmcnt(2)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
|
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
|
|
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
|
|
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
|
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
|
|
; GFX11-NEXT: s_nop 0
|
|
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
; GFX11-NEXT: s_endpgm
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %a,
|
|
ptr addrspace(1) %b,
|
|
ptr addrspace(1) %c,
|
|
ptr addrspace(1) %d) {
|
|
entry:
|
|
; These loads are not volatile; the expected output above issues the four
; dword loads first and then waits with decreasing vmcnt values.
%a.val = load <2 x half>, ptr addrspace(1) %a
|
|
%b.val = load <2 x half>, ptr addrspace(1) %b
|
|
%c.val = load <2 x half>, ptr addrspace(1) %c
|
|
%d.val = load <2 x half>, ptr addrspace(1) %d
|
|
%fcmp = fcmp olt <2 x half> %a.val, %b.val
|
|
%r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
|
|
store <2 x half> %r.val, ptr addrspace(1) %r
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @select_v2f16_imm_a(
|
|
; SI-LABEL: select_v2f16_imm_a:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
|
|
; SI-NEXT: s_mov_b32 s11, 0xf000
|
|
; SI-NEXT: s_mov_b32 s10, -1
|
|
; SI-NEXT: s_mov_b32 s14, s10
|
|
; SI-NEXT: s_mov_b32 s15, s11
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s12, s2
|
|
; SI-NEXT: s_mov_b32 s13, s3
|
|
; SI-NEXT: s_mov_b32 s16, s4
|
|
; SI-NEXT: s_mov_b32 s17, s5
|
|
; SI-NEXT: s_mov_b32 s18, s10
|
|
; SI-NEXT: s_mov_b32 s19, s11
|
|
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
|
; SI-NEXT: s_mov_b32 s4, s6
|
|
; SI-NEXT: s_mov_b32 s5, s7
|
|
; SI-NEXT: s_mov_b32 s6, s10
|
|
; SI-NEXT: s_mov_b32 s7, s11
|
|
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
|
|
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
|
|
; SI-NEXT: s_mov_b32 s2, 0x3f200000
|
|
; SI-NEXT: s_mov_b32 s8, s0
|
|
; SI-NEXT: s_mov_b32 s9, s1
|
|
; SI-NEXT: s_waitcnt vmcnt(2)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3
|
|
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
|
|
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
|
|
; SI-NEXT: v_or_b32_e32 v0, v0, v1
|
|
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: select_v2f16_imm_a:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
|
; VI-NEXT: s_mov_b32 s11, 0xf000
|
|
; VI-NEXT: s_mov_b32 s10, -1
|
|
; VI-NEXT: s_mov_b32 s14, s10
|
|
; VI-NEXT: s_mov_b32 s15, s11
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s12, s2
|
|
; VI-NEXT: s_mov_b32 s13, s3
|
|
; VI-NEXT: s_mov_b32 s16, s4
|
|
; VI-NEXT: s_mov_b32 s17, s5
|
|
; VI-NEXT: s_mov_b32 s18, s10
|
|
; VI-NEXT: s_mov_b32 s19, s11
|
|
; VI-NEXT: s_mov_b32 s4, s6
|
|
; VI-NEXT: s_mov_b32 s5, s7
|
|
; VI-NEXT: s_mov_b32 s6, s10
|
|
; VI-NEXT: s_mov_b32 s7, s11
|
|
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
|
|
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
|
|
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
|
|
; VI-NEXT: s_movk_i32 s2, 0x3900
|
|
; VI-NEXT: s_mov_b32 s8, s0
|
|
; VI-NEXT: s_mov_b32 s9, s1
|
|
; VI-NEXT: s_waitcnt vmcnt(2)
|
|
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
|
|
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
|
|
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
|
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
|
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: select_v2f16_imm_a:
|
|
; GFX11: ; %bb.0: ; %entry
|
|
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
|
|
; GFX11-NEXT: s_mov_b32 s10, -1
|
|
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
|
|
; GFX11-NEXT: s_mov_b32 s14, s10
|
|
; GFX11-NEXT: s_mov_b32 s15, s11
|
|
; GFX11-NEXT: s_mov_b32 s18, s10
|
|
; GFX11-NEXT: s_mov_b32 s19, s11
|
|
; GFX11-NEXT: s_mov_b32 s22, s10
|
|
; GFX11-NEXT: s_mov_b32 s23, s11
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mov_b32 s12, s2
|
|
; GFX11-NEXT: s_mov_b32 s13, s3
|
|
; GFX11-NEXT: s_mov_b32 s16, s4
|
|
; GFX11-NEXT: s_mov_b32 s17, s5
|
|
; GFX11-NEXT: s_mov_b32 s20, s6
|
|
; GFX11-NEXT: s_mov_b32 s21, s7
|
|
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
|
|
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
|
|
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
|
|
; GFX11-NEXT: s_mov_b32 s8, s0
|
|
; GFX11-NEXT: s_mov_b32 s9, s1
|
|
; GFX11-NEXT: s_waitcnt vmcnt(2)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
|
|
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
|
|
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
|
|
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
|
|
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
|
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
|
|
; GFX11-NEXT: s_nop 0
|
|
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
; GFX11-NEXT: s_endpgm
|
|
ptr addrspace(1) %r,
|
|
ptr addrspace(1) %b,
|
|
ptr addrspace(1) %c,
|
|
ptr addrspace(1) %d) {
|
|
entry:
|
|
%b.val = load <2 x half>, ptr addrspace(1) %b
|
|
%c.val = load <2 x half>, ptr addrspace(1) %c
|
|
%d.val = load <2 x half>, ptr addrspace(1) %d
|
|
%fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
|
|
%r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
|
|
store <2 x half> %r.val, ptr addrspace(1) %r
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: per-lane select on <2 x half> where the SECOND fcmp operand is
; the constant vector <0xH3800, 0xH3900> (0.5, 0.625):
;   r = (a olt <0.5, 0.625>) ? c : d
; In the checked output the constant is the first source of a v_cmp_gt
; (0.5 as an inline constant; 0.625 through s2 on the SI and VI runs, and as
; the literal 0x3900 on the GFX11 run). The SI run converts every half to f32
; before comparing, so 0.625 shows up there as 0x3f200000.
define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s2, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_movk_i32 s2, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_mov_b32 s18, s10
; GFX11-NEXT: s_mov_b32 s19, s11
; GFX11-NEXT: s_mov_b32 s22, s10
; GFX11-NEXT: s_mov_b32 s23, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: s_mov_b32 s16, s4
; GFX11-NEXT: s_mov_b32 s17, s5
; GFX11-NEXT: s_mov_b32 s20, s6
; GFX11-NEXT: s_mov_b32 s21, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %c,
    ptr addrspace(1) %d) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %c.val = load <2 x half>, ptr addrspace(1) %c
  %d.val = load <2 x half>, ptr addrspace(1) %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}
|
|
|
|
; Kernel test: per-lane select on <2 x half> where the TRUE value of the
; select is the constant vector <0xH3800, 0xH3900> (0.5, 0.625):
;   r = (a olt b) ? <0.5, 0.625> : d
; The checked output uses the inverted compare v_cmp_nlt and puts the constant
; in the first source of v_cndmask (materialized in VGPRs on the SI and VI
; runs, used as literals 0x3800 / 0x3900 on the GFX11 run). The SI run
; converts every half to f32 first, so 0.625 appears there as 0x3f200000.
define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_c:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s18, s10
; GFX11-NEXT: s_mov_b32 s19, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_mov_b32 s22, s10
; GFX11-NEXT: s_mov_b32 s23, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s4
; GFX11-NEXT: s_mov_b32 s17, s5
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: s_mov_b32 s20, s6
; GFX11-NEXT: s_mov_b32 s21, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %d) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %d.val = load <2 x half>, ptr addrspace(1) %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}
|
|
|
|
; Kernel test: per-lane select on <2 x half> where the FALSE value of the
; select is the constant vector <0xH3800, 0xH3900> (0.5, 0.625):
;   r = (a olt b) ? c : <0.5, 0.625>
; The checked output keeps the compare as v_cmp_lt and puts the constant in
; the first source of v_cndmask (materialized in VGPRs on the SI and VI runs,
; used as literals 0x3800 / 0x3900 on the GFX11 run). The SI run converts
; every half to f32 first, so 0.625 appears there as 0x3f200000.
define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_d:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s18, s10
; GFX11-NEXT: s_mov_b32 s19, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_mov_b32 s22, s10
; GFX11-NEXT: s_mov_b32 s23, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s4
; GFX11-NEXT: s_mov_b32 s17, s5
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: s_mov_b32 s20, s6
; GFX11-NEXT: s_mov_b32 s21, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %c.val = load <2 x half>, ptr addrspace(1) %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}
|
|
|
|
; Function test: whole-vector select of <4 x half> driven by ONE scalar
; condition (%cond == 0 picks %a, otherwise %b).
; The VI and GFX11 runs check a single v_cmp_eq_u32 followed by two v_cndmask
; on the packed registers (fused into one v_dual_cndmask pair on GFX11).
; The SI run checks v_cvt_f16_f32 on v0..v7, packing of the halves into dwords
; with shift/or, two v_cndmask on the packed dwords, then unpacking and
; v_cvt_f32_f16 on the results.
define <4 x half> @v_select_v4f16(<4 x half> %a, <4 x half> %b, i32 %cond) {
; SI-LABEL: v_select_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_select_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %cond, 0
  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
  ret <4 x half> %select
}
|
|
|
|
; Function test: PER-LANE select of <4 x half>; lane i takes %a[i] when
; %cond[i] == 0 and %b[i] otherwise.
; The SI run checks a v_cvt_f16_f32 / v_cvt_f32_f16 pair on each of v0..v7
; followed by four independent v_cmp_eq_u32 + v_cndmask pairs.
; The VI and GFX11 runs extract the high halves with 16-bit right shifts,
; select low and high halves separately, and re-pack the result
; (v_or_b32_sdwa on VI, v_perm_b32 with selector 0x5040100 on GFX11).
define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond) {
; SI-LABEL: v_vselect_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_vselect_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_vselect_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %cmp = icmp eq <4 x i32> %cond, zeroinitializer
  %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
  ret <4 x half> %select
}
|
|
|
|
; Function test: whole-vector select of <8 x half> driven by ONE scalar
; condition (%cond == 0 picks %a, otherwise %b).
; The VI and GFX11 runs check a single v_cmp_eq_u32 followed by four
; v_cndmask on the packed registers (two v_dual_cndmask pairs on GFX11).
; The SI run checks v_cvt_f16_f32 on v0..v15, packing of the halves into
; dwords with shift/or, four v_cndmask on the packed dwords, then unpacking
; and v_cvt_f32_f16 on the results.
define <8 x half> @v_select_v8f16(<8 x half> %a, <8 x half> %b, i32 %cond) {
; SI-LABEL: v_select_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; SI-NEXT: v_or_b32_e32 v7, v14, v7
; SI-NEXT: v_or_b32_e32 v5, v12, v5
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v1, v8, v1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_select_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v8f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %cond, 0
  %select = select i1 %cmp, <8 x half> %a, <8 x half> %b
  ret <8 x half> %select
}
|
|
|
|
define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond) {
|
|
; SI-LABEL: v_vselect_v8f16:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
|
|
; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
|
|
; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
|
|
; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
|
; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
|
|
; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
|
|
; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
|
|
; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
|
|
; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
|
|
; SI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; VI-LABEL: v_vselect_v8f16:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3
|
|
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
|
|
; VI-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
|
|
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
|
|
; VI-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1
|
|
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
|
|
; VI-NEXT: v_cndmask_b32_e32 v11, v17, v16, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
|
|
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
|
|
; VI-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
|
|
; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
|
|
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
|
|
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
|
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v9
|
|
; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v11
|
|
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v13
|
|
; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v15
|
|
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_vselect_v8f16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v3
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v6
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10
|
|
; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
|
|
; GFX11-NEXT: v_perm_b32 v3, v15, v3, 0x5040100
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%cmp = icmp eq <8 x i32> %cond, zeroinitializer
|
|
%select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
|
|
ret <8 x half> %select
|
|
}
|
|
|
|
; v_select_v16f16: whole-vector select of <16 x half> driven by one scalar
; condition. Returns %a when %cond == 0, otherwise %b.
; The SI/VI/GFX11 check lines below are autogenerated (see the NOTE at the top
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) {
|
|
; SI-LABEL: v_select_v16f16:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
|
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
|
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
|
|
; SI-NEXT: v_or_b32_e32 v12, v12, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v13, v29
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
|
|
; SI-NEXT: v_or_b32_e32 v14, v14, v15
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v28
|
|
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
|
|
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
|
|
; SI-NEXT: v_or_b32_e32 v10, v10, v11
|
|
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v11, v27
|
|
; SI-NEXT: v_or_b32_e32 v13, v15, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v26
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
|
|
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
|
|
; SI-NEXT: v_or_b32_e32 v11, v15, v11
|
|
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
|
|
; SI-NEXT: v_or_b32_e32 v6, v6, v7
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v26, v30
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
|
|
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
|
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
|
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
|
|
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
|
|
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
|
|
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
|
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
|
|
; SI-NEXT: v_or_b32_e32 v2, v2, v3
|
|
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
|
|
; SI-NEXT: v_or_b32_e32 v0, v0, v1
|
|
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17
|
|
; SI-NEXT: v_or_b32_e32 v8, v8, v9
|
|
; SI-NEXT: v_or_b32_e32 v9, v24, v25
|
|
; SI-NEXT: v_or_b32_e32 v22, v22, v23
|
|
; SI-NEXT: v_or_b32_e32 v4, v4, v5
|
|
; SI-NEXT: v_or_b32_e32 v5, v20, v21
|
|
; SI-NEXT: v_or_b32_e32 v3, v18, v3
|
|
; SI-NEXT: v_or_b32_e32 v1, v16, v1
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v7, v28
|
|
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
|
|
; SI-NEXT: v_or_b32_e32 v7, v26, v7
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
|
|
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v15, v22, v6, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v16, v7, v14, vcc
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v6, v15
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v8, v9
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v10, v11
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v12, v13
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
|
|
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15
|
|
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
|
|
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
|
|
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v14, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
|
; SI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; VI-LABEL: v_select_v16f16:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
|
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_select_v16f16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v2 :: v_dual_cndmask_b32 v3, v11, v3
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v4, v12, v4 :: v_dual_cndmask_b32 v5, v13, v5
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%cmp = icmp eq i32 %cond, 0
|
|
%select = select i1 %cmp, <16 x half> %a, <16 x half> %b
|
|
ret <16 x half> %select
|
|
}
|
|
|
|
; v_vselect_v16f16: per-element select of <16 x half> driven by a <16 x i32>
; condition vector. Element i of the result is %a[i] when %cond[i] == 0,
; otherwise %b[i].
; The SI/VI/GFX11 check lines below are autogenerated (see the NOTE at the top
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): the VI expectations use s[30:31] as a compare destination and
; save/restore s30/s31 through lanes of v31 (which is itself spilled to the
; stack) -- presumably SGPR pressure from the 16 lane masks; confirm this is
; the intended codegen.
define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> %cond) {
|
|
; SI-LABEL: v_vselect_v16f16:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
|
|
; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7]
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v17
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v17, v27
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
|
|
; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9]
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v18
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v18, v28
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
|
|
; SI-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[10:11]
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v19
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v19, v29
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
|
|
; SI-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[12:13]
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v20
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v20, v30
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
|
|
; SI-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[14:15]
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v21
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cndmask_b32_e64 v5, v16, v5, s[4:5]
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v22
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v23
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[16:17]
|
|
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v24
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
|
|
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v25
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc
|
|
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v26
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc
|
|
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
|
|
; SI-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
|
|
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; SI-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc
|
|
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
|
|
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32
|
|
; SI-NEXT: s_waitcnt vmcnt(2)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
|
|
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64
|
|
; SI-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
|
|
; SI-NEXT: s_waitcnt vmcnt(2)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
|
|
; SI-NEXT: v_cndmask_b32_e32 v14, v20, v14, vcc
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
|
|
; SI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; VI-LABEL: v_vselect_v16f16:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
|
|
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
|
; VI-NEXT: s_mov_b64 exec, s[4:5]
|
|
; VI-NEXT: v_writelane_b32 v31, s30, 0
|
|
; VI-NEXT: v_writelane_b32 v31, s31, 1
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29
|
|
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6
|
|
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v24
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27
|
|
; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31]
|
|
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
|
|
; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19
|
|
; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29]
|
|
; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
|
|
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
|
|
; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11]
|
|
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v20
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23
|
|
; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27]
|
|
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3
|
|
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v11
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[22:23], 0, v21
|
|
; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25]
|
|
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
|
|
; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v22
|
|
; VI-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[22:23]
|
|
; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v1
|
|
; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9
|
|
; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21]
|
|
; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
|
|
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21
|
|
; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0
|
|
; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8
|
|
; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7]
|
|
; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v26
|
|
; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19]
|
|
; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[8:9]
|
|
; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19
|
|
; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13]
|
|
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v7
|
|
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
|
|
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22
|
|
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v28
|
|
; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v30
|
|
; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v17
|
|
; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[14:15]
|
|
; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[16:17]
|
|
; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16
|
|
; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_readlane_b32 s31, v31, 1
|
|
; VI-NEXT: v_readlane_b32 s30, v31, 0
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
|
|
; VI-NEXT: v_cndmask_b32_e32 v8, v13, v11, vcc
|
|
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
|
|
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
|
|
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
|
; VI-NEXT: s_mov_b64 exec, s[4:5]
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_vselect_v16f16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: scratch_load_b32 v31, off, s32
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v13
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v9
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
|
|
; GFX11-NEXT: v_perm_b32 v5, v9, v5, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_perm_b32 v4, v10, v4, 0x5040100
|
|
; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
|
|
; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo
|
|
; GFX11-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX11-NEXT: v_perm_b32 v7, v11, v7, 0x5040100
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%cmp = icmp eq <16 x i32> %cond, zeroinitializer
|
|
%select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b
|
|
ret <16 x half> %select
|
|
}
|
|
|
|
define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) {
|
|
; SI-LABEL: v_select_v32f16:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
|
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
|
|
; SI-NEXT: v_or_b32_e32 v20, v20, v21
|
|
; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
|
|
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
|
|
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
|
|
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
|
|
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
|
|
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
|
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
|
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
|
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
|
|
; SI-NEXT: v_or_b32_e32 v12, v12, v13
|
|
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
|
|
; SI-NEXT: v_or_b32_e32 v10, v10, v11
|
|
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
|
|
; SI-NEXT: v_or_b32_e32 v8, v8, v9
|
|
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
|
|
; SI-NEXT: v_or_b32_e32 v6, v6, v7
|
|
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
|
|
; SI-NEXT: v_or_b32_e32 v4, v4, v5
|
|
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
|
|
; SI-NEXT: v_or_b32_e32 v2, v2, v3
|
|
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
|
|
; SI-NEXT: v_or_b32_e32 v0, v0, v1
|
|
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
|
|
; SI-NEXT: v_or_b32_e32 v22, v22, v23
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v23, v30
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
|
|
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
|
|
; SI-NEXT: v_or_b32_e32 v24, v24, v25
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
|
|
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
|
|
; SI-NEXT: v_or_b32_e32 v26, v26, v27
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
|
|
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
|
|
; SI-NEXT: v_or_b32_e32 v28, v28, v29
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
|
|
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
|
|
; SI-NEXT: v_or_b32_e32 v18, v18, v19
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
|
|
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
|
; SI-NEXT: v_or_b32_e32 v16, v16, v17
|
|
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124
|
|
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
|
|
; SI-NEXT: v_or_b32_e32 v14, v14, v15
|
|
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116
|
|
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
|
|
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
|
|
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
|
|
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
|
|
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
|
|
; SI-NEXT: s_waitcnt vmcnt(14)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
|
|
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
|
|
; SI-NEXT: v_or_b32_e32 v21, v23, v21
|
|
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
|
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
|
|
; SI-NEXT: s_waitcnt vmcnt(14)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
|
|
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
|
|
; SI-NEXT: s_waitcnt vmcnt(13)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
|
|
; SI-NEXT: s_waitcnt vmcnt(12)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
|
|
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
|
|
; SI-NEXT: s_waitcnt vmcnt(11)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
|
|
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
|
; SI-NEXT: s_waitcnt vmcnt(10)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
|
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
|
; SI-NEXT: s_waitcnt vmcnt(9)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
|
|
; SI-NEXT: s_waitcnt vmcnt(8)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
|
|
; SI-NEXT: s_waitcnt vmcnt(7)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
|
|
; SI-NEXT: s_waitcnt vmcnt(6)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
|
|
; SI-NEXT: s_waitcnt vmcnt(5)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: s_waitcnt vmcnt(4)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
|
|
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
|
|
; SI-NEXT: v_or_b32_e32 v23, v25, v23
|
|
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
|
|
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
|
|
; SI-NEXT: v_or_b32_e32 v25, v27, v25
|
|
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
|
|
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
|
|
; SI-NEXT: v_or_b32_e32 v27, v29, v27
|
|
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
|
|
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
|
|
; SI-NEXT: v_or_b32_e32 v29, v30, v29
|
|
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
|
|
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
|
|
; SI-NEXT: v_or_b32_e32 v30, v31, v30
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
|
|
; SI-NEXT: v_or_b32_e32 v31, v32, v31
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
|
|
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v19, v32, v19
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
|
|
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v17, v32, v17
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
|
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v15, v32, v15
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v13, v32, v13
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v11, v32, v11
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v9, v32, v9
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v7, v32, v7
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v5, v32, v5
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v3, v32, v3
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_or_b32_e32 v1, v32, v1
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
|
|
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v15, v15, v14, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v31, v31, v20, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v30, v30, v22, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v29, v29, v24, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v27, v27, v26, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v32, v25, v28, vcc
|
|
; SI-NEXT: v_cndmask_b32_e32 v33, v23, v21, vcc
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v8, v9
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
|
|
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
|
|
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v10, v11
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v12, v13
|
|
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v14, v15
|
|
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v17
|
|
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v18, v19
|
|
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v20, v31
|
|
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v31
|
|
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v24, v29
|
|
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v26, v27
|
|
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
|
|
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v32
|
|
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v22, v30
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v28, v32
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v30, v33
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; VI-LABEL: v_select_v32f16:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
|
|
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32
|
|
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
|
|
; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
|
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_select_v32f16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: s_clause 0x1
|
|
; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4
|
|
; GFX11-NEXT: scratch_load_b32 v32, off, s32
|
|
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v3
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v5, v21, v5
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v9, v25, v9
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v13, v29, v13
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v14, v30, v14 :: v_dual_cndmask_b32 v15, v32, v15
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%cmp = icmp eq i32 %cond, 0
|
|
%select = select i1 %cmp, <32 x half> %a, <32 x half> %b
|
|
ret <32 x half> %select
|
|
}
|
|
|
|
define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> %cond) {
|
|
; SI-LABEL: v_vselect_v32f16:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
|
|
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v0, v31, v0, s[16:17]
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v1, v31, v1, s[14:15]
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v2, v31, v2, s[12:13]
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v3, v31, v3, s[10:11]
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v4, v31, v4, s[8:9]
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v5, v31, v5, s[6:7]
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e64 v6, v31, v6, s[4:5]
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v17, v31, v17, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
|
|
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
|
|
; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
|
|
; SI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; VI-LABEL: v_vselect_v32f16:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
|
|
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120
|
|
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112
|
|
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
|
|
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
|
|
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
|
|
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
|
|
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72
|
|
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32
|
|
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128
|
|
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64
|
|
; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
|
|
; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
|
|
; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40
|
|
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32
|
|
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
|
|
; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
|
|
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
|
|
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124
|
|
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116
|
|
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30
|
|
; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v13
|
|
; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v29
|
|
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v12
|
|
; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v28
|
|
; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v11
|
|
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v27
|
|
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v10
|
|
; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v26
|
|
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
|
|
; VI-NEXT: s_waitcnt vmcnt(14)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36
|
|
; VI-NEXT: v_cndmask_b32_e32 v36, v43, v38, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35
|
|
; VI-NEXT: v_cndmask_b32_e32 v35, v45, v44, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
|
|
; VI-NEXT: v_cndmask_b32_e32 v34, v47, v46, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33
|
|
; VI-NEXT: v_cndmask_b32_e32 v33, v57, v56, vcc
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
|
|
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25
|
|
; VI-NEXT: v_cndmask_b32_e32 v32, v59, v58, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(13)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8
|
|
; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v24
|
|
; VI-NEXT: v_cndmask_b32_e32 v38, v38, v60, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(12)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39
|
|
; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v15
|
|
; VI-NEXT: v_cndmask_b32_e32 v39, v44, v43, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(11)
|
|
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
|
|
; VI-NEXT: s_waitcnt vmcnt(10)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55
|
|
; VI-NEXT: v_cndmask_b32_e32 v31, v31, v45, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23
|
|
; VI-NEXT: s_waitcnt vmcnt(9)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50
|
|
; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22
|
|
; VI-NEXT: s_waitcnt vmcnt(8)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53
|
|
; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v5
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21
|
|
; VI-NEXT: s_waitcnt vmcnt(7)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54
|
|
; VI-NEXT: v_cndmask_b32_e32 v54, v43, v55, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v4
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20
|
|
; VI-NEXT: s_waitcnt vmcnt(6)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52
|
|
; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
|
|
; VI-NEXT: s_waitcnt vmcnt(5)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51
|
|
; VI-NEXT: v_cndmask_b32_e32 v51, v43, v55, vcc
|
|
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18
|
|
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108
|
|
; VI-NEXT: s_waitcnt vmcnt(5)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49
|
|
; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc
|
|
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100
|
|
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92
|
|
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1
|
|
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17
|
|
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
|
|
; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76
|
|
; VI-NEXT: s_waitcnt vmcnt(8)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48
|
|
; VI-NEXT: v_cndmask_b32_e32 v48, v46, v43, vcc
|
|
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
|
|
; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60
|
|
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
|
|
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v0
|
|
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v16
|
|
; VI-NEXT: s_waitcnt vmcnt(10)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40
|
|
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44
|
|
; VI-NEXT: v_cndmask_b32_e32 v46, v58, v46, vcc
|
|
; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36
|
|
; VI-NEXT: s_waitcnt vmcnt(11)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41
|
|
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28
|
|
; VI-NEXT: v_cndmask_b32_e32 v15, v37, v15, vcc
|
|
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
|
|
; VI-NEXT: s_waitcnt vmcnt(12)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42
|
|
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
|
|
; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
|
|
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
|
|
; VI-NEXT: s_waitcnt vmcnt(13)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44
|
|
; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(12)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55
|
|
; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(11)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45
|
|
; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(10)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47
|
|
; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(9)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56
|
|
; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(8)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43
|
|
; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(7)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57
|
|
; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(6)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59
|
|
; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(5)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40
|
|
; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(4)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v58
|
|
; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(3)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41
|
|
; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(2)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37
|
|
; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(1)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42
|
|
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
|
|
; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v46
|
|
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
|
|
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
|
|
; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48
|
|
; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49
|
|
; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v51
|
|
; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52
|
|
; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v54
|
|
; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53
|
|
; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50
|
|
; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v39
|
|
; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v38
|
|
; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32
|
|
; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v33
|
|
; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34
|
|
; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v35
|
|
; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v36
|
|
; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v31
|
|
; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
; VI-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_vselect_v32f16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: s_clause 0x1f
|
|
; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:120
|
|
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:112
|
|
; GFX11-NEXT: scratch_load_b32 v33, off, s32
|
|
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:104
|
|
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:96
|
|
; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:88
|
|
; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:80
|
|
; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72
|
|
; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64
|
|
; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:56
|
|
; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:48
|
|
; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:40
|
|
; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
|
|
; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:24
|
|
; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:16
|
|
; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:8
|
|
; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:124
|
|
; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:116
|
|
; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:108
|
|
; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:100
|
|
; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:92
|
|
; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84
|
|
; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:76
|
|
; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:68
|
|
; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:60
|
|
; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52
|
|
; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:44
|
|
; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:36
|
|
; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:28
|
|
; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12
|
|
; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:4
|
|
; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:20
|
|
; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v30
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v13
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v29
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v12
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v28
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v11
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v27
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v10
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v26
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v9
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v25
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v8
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v24
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v7
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v23
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v6
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v22
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v5
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v21
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v4
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v20
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v3
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v19
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v2
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v18
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v1
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15
|
|
; GFX11-NEXT: s_waitcnt vmcnt(32)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(31)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v32
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v0
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(29)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v33
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(28)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(27)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(26)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(25)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(24)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(23)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(22)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(21)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(20)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(19)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(18)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(17)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(16)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(15)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(14)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX11-NEXT: v_perm_b32 v14, v97, v14, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(13)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(12)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX11-NEXT: v_perm_b32 v12, v34, v12, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(11)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(10)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69
|
|
; GFX11-NEXT: v_perm_b32 v13, v99, v13, 0x5040100
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
|
; GFX11-NEXT: v_perm_b32 v10, v36, v10, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(9)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(8)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71
|
|
; GFX11-NEXT: v_perm_b32 v11, v35, v11, 0x5040100
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
|
; GFX11-NEXT: v_perm_b32 v8, v38, v8, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(7)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(6)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81
|
|
; GFX11-NEXT: v_perm_b32 v9, v37, v9, 0x5040100
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
|
; GFX11-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(5)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(4)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83
|
|
; GFX11-NEXT: v_perm_b32 v7, v39, v7, 0x5040100
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
|
; GFX11-NEXT: v_perm_b32 v4, v50, v4, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(3)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(2)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86
|
|
; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_perm_b32 v0, v32, v0, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87
|
|
; GFX11-NEXT: v_perm_b32 v3, v51, v3, 0x5040100
|
|
; GFX11-NEXT: v_perm_b32 v2, v52, v2, 0x5040100
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo
|
|
; GFX11-NEXT: v_perm_b32 v1, v31, v1, 0x5040100
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
%cmp = icmp eq <32 x i32> %cond, zeroinitializer
|
|
%select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b
|
|
ret <32 x half> %select
|
|
}
|