Files
clang-p2996/llvm/test/CodeGen/AMDGPU/select.f16.ll
Ruiling, Song 0487db1f13 MachineScheduler: Improve instruction clustering (#137784)
Previously, clustered nodes were managed by adding weak edges between
neighbouring cluster nodes, which forms a sort of ordered queue. This is
later recorded as `NextClusterPred` or `NextClusterSucc` in
`ScheduleDAGMI`.

In practice, however, the instructions may not be picked in the exact
order of the queue. For example, suppose we have a queue of cluster nodes
A B C. During scheduling, node B might be picked first, and then it is
very likely that we only cluster B and C for top-down scheduling
(leaving A alone).

Another issue is:
```
   if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
      std::swap(SUa, SUb);
   if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
```
may break the cluster queue.

For example, we want to cluster nodes (ordered as in `MemOpRecords`): 1 3
2. Normally 1(SUa) will be a pred of 3(SUb). But when it comes to (3, 2),
as 3(SUa) > 2(SUb), we would swap the two nodes, which makes 2 a
pred of 3. Both 1 and 2 then become preds of 3, but there is no
edge between 1 and 2. Thus we get a broken cluster chain.

To fix both issues, this change introduces an unordered set. This
can help improve clustering in some hard cases.

One key reason the change causes so many test check changes is that the
cluster candidates are no longer ordered, so the candidates might be
picked in a different order than before.

The most affected targets are: AMDGPU, AArch64, RISCV.

For RISCV, most of the changes seem to be just minor instruction
reordering; I don't see any obvious regression.

For AArch64, some combining of ldr into ldp was affected, with two
cases regressed and two improved. The deeper reason is that the machine
scheduler cannot cluster them well either before or after the change,
and the load-combining algorithm that runs later is also not smart
enough.

For AMDGPU, some cases use more v_dual instructions while others
regress. This seems less critical. The test `v_vselect_v32bf16` appears
to get more buffer_load instructions grouped into clauses.
2025-06-05 15:28:04 +08:00

3668 lines
163 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-TRUE16
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
; Baseline case: every select input comes from memory. The kernel reads
; %a, %b, %c and %d with volatile half loads, compares %a.val with %b.val
; using an ordered less-than, and stores select(%fcmp, %c.val, %d.val) to %r.
; The tahiti checks expect the halves to be converted to f32 before the
; compare and converted back after the select; the fiji and gfx1100 checks
; expect an f16 compare. The check lines below are autogenerated (see the
; NOTE at the top of the file) and should be regenerated, not hand-edited.
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s10
; SI-NEXT: s_mov_b32 s17, s11
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s12
; SI-NEXT: s_mov_b32 s21, s13
; SI-NEXT: s_mov_b32 s22, s2
; SI-NEXT: s_mov_b32 s23, s3
; SI-NEXT: s_mov_b32 s12, s14
; SI-NEXT: s_mov_b32 s13, s15
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x44
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s10
; VI-NEXT: s_mov_b32 s17, s11
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s20, s12
; VI-NEXT: s_mov_b32 s21, s13
; VI-NEXT: s_mov_b32 s22, s2
; VI-NEXT: s_mov_b32 s23, s3
; VI-NEXT: s_mov_b32 s12, s14
; VI-NEXT: s_mov_b32 s13, s15
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s0, s8
; VI-NEXT: s_mov_b32 s1, s9
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s26, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s27, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13
; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v3, off, s[24:27], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s7
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s7
; GFX11-FAKE16-NEXT: s_mov_b32 s26, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s27, s7
; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13
; GFX11-FAKE16-NEXT: s_mov_b32 s24, s14
; GFX11-FAKE16-NEXT: s_mov_b32 s25, s15
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
ptr addrspace(1) %c,
ptr addrspace(1) %d) {
entry:
; Volatile loads: the expected output issues one buffer load per operand, each
; followed by a wait on vmcnt(0), in the order a, b, c, d.
%a.val = load volatile half, ptr addrspace(1) %a
%b.val = load volatile half, ptr addrspace(1) %b
%c.val = load volatile half, ptr addrspace(1) %c
%d.val = load volatile half, ptr addrspace(1) %d
; Ordered less-than compare drives the select between the two loaded values.
%fcmp = fcmp olt half %a.val, %b.val
%r.val = select i1 %fcmp, half %c.val, half %d.val
store half %r.val, ptr addrspace(1) %r
ret void
}
; Variant where the left-hand compare operand is the half constant 0xH3800
; (printed as 0.5 in the expected output). %b.val, %c.val and %d.val are read
; with volatile loads. All check prefixes expect the constant to appear as the
; first source operand of a "lt" compare, with no separate register holding it.
define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_f16_imm_a:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s11
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s7
; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_f16_imm_a:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b,
ptr addrspace(1) %c,
ptr addrspace(1) %d) {
entry:
%b.val = load volatile half, ptr addrspace(1) %b
%c.val = load volatile half, ptr addrspace(1) %c
%d.val = load volatile half, ptr addrspace(1) %d
; Constant on the left of the ordered less-than compare.
%fcmp = fcmp olt half 0xH3800, %b.val
%r.val = select i1 %fcmp, half %c.val, half %d.val
store half %r.val, ptr addrspace(1) %r
ret void
}
; Variant where the right-hand compare operand is the half constant 0xH3800
; (printed as 0.5 in the expected output). %a.val, %c.val and %d.val are read
; with volatile loads. All check prefixes expect the compare to be emitted in
; commuted "gt" form so that the constant is still the first source operand.
define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_f16_imm_b:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s11
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s7
; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_f16_imm_b:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %c,
ptr addrspace(1) %d) {
entry:
%a.val = load volatile half, ptr addrspace(1) %a
%c.val = load volatile half, ptr addrspace(1) %c
%d.val = load volatile half, ptr addrspace(1) %d
; Constant on the right of the ordered less-than compare.
%fcmp = fcmp olt half %a.val, 0xH3800
%r.val = select i1 %fcmp, half %c.val, half %d.val
store half %r.val, ptr addrspace(1) %r
ret void
}
; Variant where the select's true value is the half constant 0xH3800 and the
; false value is the loaded %d.val. %a.val, %b.val and %d.val are read with
; volatile loads. All check prefixes expect an inverted compare ("nlt") so that
; the constant becomes the first source of the conditional move. The tahiti
; checks use the inline constant 0.5 there, the fiji checks first move 0x3800
; into a vector register, and the gfx1100 checks use the literal 0x3800
; directly in the conditional move.
define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_f16_imm_c:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[12:15], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_f16_imm_c:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
ptr addrspace(1) %d) {
entry:
%a.val = load volatile half, ptr addrspace(1) %a
%b.val = load volatile half, ptr addrspace(1) %b
%d.val = load volatile half, ptr addrspace(1) %d
%fcmp = fcmp olt half %a.val, %b.val
; Constant is the value chosen when the compare is true.
%r.val = select i1 %fcmp, half 0xH3800, half %d.val
store half %r.val, ptr addrspace(1) %r
ret void
}
; Variant where the select's false value is the half constant 0xH3800 and the
; true value is the loaded %c.val. %a.val, %b.val and %c.val are read with
; volatile loads. All check prefixes expect a plain "lt" compare with the
; constant as the first source of the conditional move. The tahiti checks use
; the inline constant 0.5 there, the fiji checks first move 0x3800 into a
; vector register, and the gfx1100 checks use the literal 0x3800 directly in
; the conditional move.
define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_f16_imm_d:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[12:15], 0 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_f16_imm_d:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
ptr addrspace(1) %c) {
entry:
%a.val = load volatile half, ptr addrspace(1) %a
%b.val = load volatile half, ptr addrspace(1) %b
%c.val = load volatile half, ptr addrspace(1) %c
%fcmp = fcmp olt half %a.val, %b.val
; Constant is the value chosen when the compare is false.
%r.val = select i1 %fcmp, half %c.val, half 0xH3800
store half %r.val, ptr addrspace(1) %r
ret void
}
; Kernel select_v2f16 loads four <2 x half> values (%a, %b, %c, %d), compares
; %a.val against %b.val lanewise with an ordered less-than, takes %c.val in the
; lanes where the compare holds and %d.val elsewhere, and stores the result
; through %r.
; The assertion lines inside the definition pin the expected instruction
; sequence for each check prefix. In the SI assertions every half is widened
; with v_cvt_f32_f16 and compared as f32; the other prefixes compare as f16.
; NOTE(review): the assertions look autogenerated (update_llc_test_checks.py
; style) - regenerate rather than hand-edit; confirm against the file header.
define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s10
; SI-NEXT: s_mov_b32 s17, s11
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s12
; SI-NEXT: s_mov_b32 s21, s13
; SI-NEXT: s_mov_b32 s22, s2
; SI-NEXT: s_mov_b32 s23, s3
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s12, s14
; SI-NEXT: s_mov_b32 s13, s15
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0
; SI-NEXT: buffer_load_dword v3, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s22, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s20, s12
; VI-NEXT: s_mov_b32 s21, s13
; VI-NEXT: s_mov_b32 s23, s7
; VI-NEXT: s_mov_b32 s16, s10
; VI-NEXT: s_mov_b32 s17, s11
; VI-NEXT: s_mov_b32 s18, s6
; VI-NEXT: s_mov_b32 s19, s7
; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_mov_b32 s12, s14
; VI-NEXT: s_mov_b32 s13, s15
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v2, off, s[0:3], 0
; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s8
; VI-NEXT: s_mov_b32 s5, s9
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4
; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v1, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1]
; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_v2f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[20:23], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[4:7], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44
; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s6, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s7, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s26, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s27, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s24, s14
; GFX11-FAKE16-NEXT: s_mov_b32 s25, s15
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, s8
; GFX11-FAKE16-NEXT: s_mov_b32 s1, s9
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
ptr addrspace(1) %c,
ptr addrspace(1) %d) {
entry:
; Load the two compare operands (%a, %b) and the two select inputs (%c, %d).
%a.val = load <2 x half>, ptr addrspace(1) %a
%b.val = load <2 x half>, ptr addrspace(1) %b
%c.val = load <2 x half>, ptr addrspace(1) %c
%d.val = load <2 x half>, ptr addrspace(1) %d
; Per-lane ordered less-than; the result is a <2 x i1> mask.
%fcmp = fcmp olt <2 x half> %a.val, %b.val
; Each lane takes %c.val where the mask is true and %d.val otherwise.
%r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
; Kernel select_v2f16_imm_a is the <2 x half> select with the left-hand compare
; operand replaced by the constant <0xH3800, 0xH3900>; only %b, %c and %d are
; loaded. The select picks %c.val where constant < %b.val and %d.val elsewhere.
; In the assertions the low-lane constant appears as the inline operand 0.5,
; while the high-lane constant appears as 0x3900 in the f16 compares and as
; 0x3f200000 in the f32 compare of the SI assertions.
; NOTE(review): the assertions look autogenerated (update_llc_test_checks.py
; style) - regenerate rather than hand-edit; confirm against the file header.
define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s2, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_movk_i32 s2, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_v2f16_imm_a:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_a:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b,
ptr addrspace(1) %c,
ptr addrspace(1) %d) {
entry:
; No %a load here; the left compare operand is the constant vector below.
%b.val = load <2 x half>, ptr addrspace(1) %b
%c.val = load <2 x half>, ptr addrspace(1) %c
%d.val = load <2 x half>, ptr addrspace(1) %d
; Constant on the left-hand side of the ordered less-than compare.
%fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
; Each lane takes %c.val where the mask is true and %d.val otherwise.
%r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
; Kernel select_v2f16_imm_b is the <2 x half> select with the right-hand compare
; operand replaced by the constant <0xH3800, 0xH3900>; only %a, %c and %d are
; loaded. The select picks %c.val where %a.val < constant and %d.val elsewhere.
; The assertions expect the commuted form of that compare, v_cmp_gt with the
; constant as the first source operand (0.5 for the low lane; 0x3900, or
; 0x3f200000 in the f32 compare of the SI assertions, for the high lane).
; NOTE(review): the assertions look autogenerated (update_llc_test_checks.py
; style) - regenerate rather than hand-edit; confirm against the file header.
define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s2, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_movk_i32 s2, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cmp_gt_f16_e64 s[0:1], 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_v2f16_imm_b:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_b:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %c,
ptr addrspace(1) %d) {
entry:
; No %b load here; the right compare operand is the constant vector below.
%a.val = load <2 x half>, ptr addrspace(1) %a
%c.val = load <2 x half>, ptr addrspace(1) %c
%d.val = load <2 x half>, ptr addrspace(1) %d
; Constant on the right-hand side of the ordered less-than compare.
%fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
; Each lane takes %c.val where the mask is true and %d.val otherwise.
%r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
; Kernel select_v2f16_imm_c is the <2 x half> select whose true-value is the
; constant <0xH3800, 0xH3900>; only %a, %b and %d are loaded. Lanes where
; %a.val < %b.val receive the constant, all other lanes receive %d.val.
; The assertions expect the negated compare v_cmp_nlt, with the constant lane
; as the first v_cndmask source and the %d lane as the second.
; NOTE(review): the assertions look autogenerated (update_llc_test_checks.py
; style) - regenerate rather than hand-edit; confirm against the file header.
define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4
; VI-NEXT: v_mov_b32_e32 v1, 0x3900
; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_v2f16_imm_c:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_c:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
ptr addrspace(1) %d) {
entry:
; No %c load here; the select's true-value is the constant vector below.
%a.val = load <2 x half>, ptr addrspace(1) %a
%b.val = load <2 x half>, ptr addrspace(1) %b
%d.val = load <2 x half>, ptr addrspace(1) %d
; Per-lane ordered less-than; the result is a <2 x i1> mask.
%fcmp = fcmp olt <2 x half> %a.val, %b.val
; Each lane takes the constant where the mask is true and %d.val otherwise.
%r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4
; VI-NEXT: v_mov_b32_e32 v1, 0x3900
; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: select_v2f16_imm_d:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_d:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10
; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4
; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6
; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
ptr addrspace(1) %c) {
entry:
%a.val = load <2 x half>, ptr addrspace(1) %a
%b.val = load <2 x half>, ptr addrspace(1) %b
%c.val = load <2 x half>, ptr addrspace(1) %c
%fcmp = fcmp olt <2 x half> %a.val, %b.val
%r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
; Whole-vector select on <4 x half>: one scalar condition (%cond == 0) chooses
; all of %a or all of %b. The VI checks expect a single compare feeding two
; 32-bit conditional moves, and the GFX11 checks expect the same two moves fused
; into one v_dual_cndmask_b32. The SI checks additionally show f16/f32
; conversions and shift/or packing around the two conditional moves.
define <4 x half> @v_select_v4f16(<4 x half> %a, <4 x half> %b, i32 %cond) {
; SI-LABEL: v_select_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_select_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
%select = select i1 %cmp, <4 x half> %a, <4 x half> %b
ret <4 x half> %select
}
; Per-lane select on <4 x half>: each lane of the <4 x i32> %cond is compared
; with zero and independently chooses the matching lane of %a or %b. Every
; target's checks contain four compares. The GFX11-TRUE16 checks select directly
; on 16-bit register halves with v_cndmask_b16, while the VI and GFX11-FAKE16
; checks shift out the high halves, select in 32 bits and repack the result
; (v_or_b32_sdwa / v_perm_b32).
define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond) {
; SI-LABEL: v_vselect_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_vselect_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_vselect_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v5
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v4
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_vselect_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v3
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq <4 x i32> %cond, zeroinitializer
%select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
ret <4 x half> %select
}
; Whole-vector select on <8 x half>: one scalar condition (%cond == 0) chooses
; all of %a or all of %b. The VI checks expect a single compare feeding four
; 32-bit conditional moves, and the GFX11 checks expect those four moves paired
; into two v_dual_cndmask_b32 instructions. The SI checks additionally show
; f16/f32 conversions and shift/or packing around the four conditional moves.
define <8 x half> @v_select_v8f16(<8 x half> %a, <8 x half> %b, i32 %cond) {
; SI-LABEL: v_select_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; SI-NEXT: v_or_b32_e32 v7, v14, v7
; SI-NEXT: v_or_b32_e32 v5, v12, v5
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v1, v8, v1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_select_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v8f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
%select = select i1 %cmp, <8 x half> %a, <8 x half> %b
ret <8 x half> %select
}
; Per-lane select on <8 x half>: each lane of the <8 x i32> %cond is compared
; with zero and independently chooses the matching lane of %a or %b. Every
; target's checks contain eight compares. The GFX11-TRUE16 checks select directly
; on 16-bit register halves with v_cndmask_b16, while the VI and GFX11-FAKE16
; checks shift out the high halves, select in 32 bits and repack the result
; (v_or_b32_sdwa / v_perm_b32).
define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond) {
; SI-LABEL: v_vselect_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v8, v9
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v8, v15
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_vselect_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v17, v16, vcc
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v9
; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v11
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v13
; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v15
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_vselect_v8f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v10
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v12
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v15
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v13
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v11
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v9
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v14
; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s3
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s4
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s1
; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_vselect_v8f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v3
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v7
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v1
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v4
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v6
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v15, v3, 0x5040100
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq <8 x i32> %cond, zeroinitializer
%select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
ret <8 x half> %select
}
; Whole-vector select on <16 x half>: one scalar condition (%cond == 0) chooses
; all of %a or all of %b. The VI checks expect a single compare feeding eight
; 32-bit conditional moves, and the GFX11 checks expect those eight moves paired
; into four v_dual_cndmask_b32 instructions. The SI checks additionally contain
; two buffer_load_dword instructions relative to s32 (presumably arguments that
; did not fit in registers; confirm against the calling convention); the value
; loaded from offset 4 is the one compared with zero.
define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) {
; SI-LABEL: v_select_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v29
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v28
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32
; SI-NEXT: v_or_b32_e32 v13, v15, v13
; SI-NEXT: v_cvt_f16_f32_e32 v15, v27
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_or_b32_e32 v15, v26, v15
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v25
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v23
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v21
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17
; SI-NEXT: v_or_b32_e32 v9, v24, v9
; SI-NEXT: v_or_b32_e32 v7, v22, v7
; SI-NEXT: v_or_b32_e32 v5, v20, v5
; SI-NEXT: v_or_b32_e32 v1, v16, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v30
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v11, v3
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19
; SI-NEXT: v_or_b32_e32 v11, v18, v11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_cndmask_b32_e32 v11, v11, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
; SI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc
; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
; SI-NEXT: v_cndmask_b32_e32 v16, v3, v14, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v11
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
; SI-NEXT: v_cvt_f32_f16_e32 v8, v9
; SI-NEXT: v_cvt_f32_f16_e32 v10, v15
; SI-NEXT: v_cvt_f32_f16_e32 v12, v13
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v14, v16
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_select_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
; VI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v16f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v2 :: v_dual_cndmask_b32 v3, v11, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v12, v4 :: v_dual_cndmask_b32 v5, v13, v5
; GFX11-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
%select = select i1 %cmp, <16 x half> %a, <16 x half> %b
ret <16 x half> %select
}
define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> %cond) {
; SI-LABEL: v_vselect_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v33
; SI-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7]
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; SI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
; SI-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5]
; SI-NEXT: v_cvt_f16_f32_e32 v19, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v22
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v19, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v23
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35
; SI-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v19, v21
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31
; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9]
; SI-NEXT: v_cvt_f16_f32_e32 v16, v17
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v32
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9]
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56
; SI-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; SI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v17, v25
; SI-NEXT: v_cvt_f16_f32_e32 v24, v26
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v29
; SI-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v17, v24
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v28
; SI-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v17, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v27
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
; SI-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
; SI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v30
; SI-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v17, v23
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; SI-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_vselect_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32
; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v22
; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v24
; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v14
; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v29
; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v26
; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v28
; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v27
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v4
; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12
; VI-NEXT: v_cndmask_b32_e64 v22, v24, v22, s[20:21]
; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v20
; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v13
; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[14:15]
; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13]
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11
; VI-NEXT: v_cndmask_b32_e32 v25, v27, v26, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11]
; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[8:9]
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7]
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; VI-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; VI-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v11
; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v30
; VI-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v13
; VI-NEXT: v_cndmask_b32_e64 v18, v20, v18, s[18:19]
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v7
; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[16:17]
; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v25
; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18
; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22
; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; VI-NEXT: v_cndmask_b32_e32 v8, v15, v20, vcc
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_vselect_v16f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v24
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v18
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v20
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v22
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v26
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v28
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v30
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 0, v17
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 0, v19
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 0, v29
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 0, v27
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 0, v25
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 0, v23
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 0, v21
; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v12.l, v4.l, s3
; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v14.h, v6.h, s9
; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.h, v5.h, s10
; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v12.h, v4.h, s11
; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v3.h, s12
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v10.h, v2.h, s13
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v9.h, v1.h, s8
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.h, v0.h, s7
; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v15.l, v7.l, s6
; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v14.l, v6.l, s5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v13.l, v5.l, s4
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.l, v1.l, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v10.l, v2.l, s1
; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v11.l, v3.l, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v31
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v15.h, v7.h, s3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_vselect_v16f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v13
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v9
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v5, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v4, v10, v4, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v14, v3, 0x5040100
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v11, v7, 0x5040100
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq <16 x i32> %cond, zeroinitializer
%select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b
ret <16 x half> %select
}
; Select between two <32 x half> vectors using one scalar condition. %cond is
; compared against zero and the single i1 result picks the whole of %a when
; %cond == 0, or the whole of %b otherwise.
;
; The assertion comments inside the body are grouped by check prefix (SI, VI
; and GFX11) and pin the expected instruction sequence for each of them. In the
; VI and GFX11 expectations one v_cmp_eq_u32 on the loaded condition sets
; vcc / vcc_lo and every following v_cndmask reuses it.
; NOTE(review): these assertions look autogenerated - regenerate them with the
; update script instead of hand-editing; confirm against the file header.
define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) {
; SI-LABEL: v_select_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v20, v20, v21
; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_or_b32_e32 v22, v22, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v30
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_or_b32_e32 v24, v24, v25
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_or_b32_e32 v26, v26, v27
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_or_b32_e32 v28, v28, v29
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_or_b32_e32 v18, v18, v19
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_or_b32_e32 v16, v16, v17
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v21, v23, v21
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_or_b32_e32 v23, v25, v23
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_or_b32_e32 v25, v27, v25
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_or_b32_e32 v27, v29, v27
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_or_b32_e32 v29, v30, v29
; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; SI-NEXT: v_or_b32_e32 v30, v31, v30
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; SI-NEXT: v_or_b32_e32 v31, v32, v31
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v19, v32, v19
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v17, v32, v17
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v15, v32, v15
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v13, v32, v13
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v11, v32, v11
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v9, v32, v9
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v7, v32, v7
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v5, v32, v5
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v3, v32, v3
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_or_b32_e32 v1, v32, v1
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
; SI-NEXT: v_cndmask_b32_e32 v15, v15, v14, vcc
; SI-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
; SI-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
; SI-NEXT: v_cndmask_b32_e32 v31, v31, v20, vcc
; SI-NEXT: v_cndmask_b32_e32 v30, v30, v22, vcc
; SI-NEXT: v_cndmask_b32_e32 v29, v29, v24, vcc
; SI-NEXT: v_cndmask_b32_e32 v27, v27, v26, vcc
; SI-NEXT: v_cndmask_b32_e32 v32, v25, v28, vcc
; SI-NEXT: v_cndmask_b32_e32 v33, v23, v21, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
; SI-NEXT: v_cvt_f32_f16_e32 v8, v9
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_cvt_f32_f16_e32 v10, v11
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; SI-NEXT: v_cvt_f32_f16_e32 v12, v13
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_cvt_f32_f16_e32 v14, v15
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_cvt_f32_f16_e32 v16, v17
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; SI-NEXT: v_cvt_f32_f16_e32 v18, v19
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v20, v31
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v31
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30
; SI-NEXT: v_cvt_f32_f16_e32 v24, v29
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29
; SI-NEXT: v_cvt_f32_f16_e32 v26, v27
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v32
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v22, v30
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v28, v32
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v30, v33
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_select_v32f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v32f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v32, off, s32
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1
; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v3
; GFX11-NEXT: v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v5, v21, v5
; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7
; GFX11-NEXT: v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v9, v25, v9
; GFX11-NEXT: v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11
; GFX11-NEXT: v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v13, v29, v13
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_cndmask_b32 v14, v30, v14 :: v_dual_cndmask_b32 v15, v32, v15
; GFX11-NEXT: s_setpc_b64 s[30:31]
; A single scalar compare produces one i1 that drives the whole-vector select.
%cmp = icmp eq i32 %cond, 0
%select = select i1 %cmp, <32 x half> %a, <32 x half> %b
ret <32 x half> %select
}
define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> %cond) {
; SI-LABEL: v_vselect_v32f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e64 v0, v31, v0, s[16:17]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e64 v1, v31, v1, s[14:15]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e64 v2, v31, v2, s[12:13]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e64 v3, v31, v3, s[10:11]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e64 v4, v31, v4, s[8:9]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e64 v5, v31, v5, s[6:7]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e64 v6, v31, v6, s[4:5]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v17, v31, v17, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_vselect_v32f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128
; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64
; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40
; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32
; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30
; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v13
; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v29
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v12
; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v28
; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v11
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v27
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v10
; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v26
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36
; VI-NEXT: v_cndmask_b32_e32 v36, v43, v38, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35
; VI-NEXT: v_cndmask_b32_e32 v35, v45, v44, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
; VI-NEXT: v_cndmask_b32_e32 v34, v47, v46, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33
; VI-NEXT: v_cndmask_b32_e32 v33, v57, v56, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25
; VI-NEXT: v_cndmask_b32_e32 v32, v59, v58, vcc
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8
; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v24
; VI-NEXT: v_cndmask_b32_e32 v38, v38, v60, vcc
; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39
; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v15
; VI-NEXT: v_cndmask_b32_e32 v39, v44, v43, vcc
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55
; VI-NEXT: v_cndmask_b32_e32 v31, v31, v45, vcc
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50
; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53
; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v5
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54
; VI-NEXT: v_cndmask_b32_e32 v54, v43, v55, vcc
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v4
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52
; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51
; VI-NEXT: v_cndmask_b32_e32 v51, v43, v55, vcc
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49
; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc
; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48
; VI-NEXT: v_cndmask_b32_e32 v48, v46, v43, vcc
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v16
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44
; VI-NEXT: v_cndmask_b32_e32 v46, v58, v46, vcc
; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28
; VI-NEXT: v_cndmask_b32_e32 v15, v37, v15, vcc
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
; VI-NEXT: s_waitcnt vmcnt(12)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44
; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55
; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45
; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
; VI-NEXT: s_waitcnt vmcnt(10)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47
; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
; VI-NEXT: s_waitcnt vmcnt(9)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56
; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43
; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57
; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59
; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40
; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v58
; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41
; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37
; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v46
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48
; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49
; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v51
; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52
; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v54
; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53
; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50
; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v39
; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v38
; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32
; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v33
; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34
; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v35
; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v36
; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v31
; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_vselect_v32f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:28
; GFX11-TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:36
; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:44
; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:68
; GFX11-TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:76
; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:56
; GFX11-TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:64
; GFX11-TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:72
; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_b32 v87, off, s32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v33
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v34
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v35
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v36
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v37
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v38
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 0, v39
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 0, v48
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 0, v49
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 0, v50
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 0, v51
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 0, v52
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 0, v53
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 0, v54
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 0, v55
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 0, v64
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 0, v65
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 0, v66
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 0, v67
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 0, v68
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 0, v69
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 0, v70
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 0, v71
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 0, v80
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 0, v81
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 0, v82
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 0, v83
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 0, v84
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 0, v85
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s40, 0, v86
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v87.h, v15.h, s26
; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v30.h, v14.h, s27
; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v29.h, v13.h, s28
; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v28.h, v12.h, s29
; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v27.h, v11.h, s40
; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v26.h, v10.h, s25
; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v25.h, v9.h, s24
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v8.h, s23
; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v23.h, v7.h, s22
; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v22.h, v6.h, s21
; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v21.h, v5.h, s20
; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v20.h, v4.h, s19
; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v19.h, v3.h, s18
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v18.h, v2.h, s17
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v17.h, v1.h, s16
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v16.h, v0.h, s15
; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v87.l, v15.l, s14
; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v30.l, v14.l, s13
; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v29.l, v13.l, s12
; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v28.l, v12.l, s11
; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v27.l, v11.l, s10
; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v26.l, v10.l, s9
; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v25.l, v9.l, s8
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v24.l, v8.l, s7
; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v23.l, v7.l, s6
; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v6.l, s5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, s4
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v17.l, v1.l, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v18.l, v2.l, s1
; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v19.l, v3.l, s2
; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v20.l, v4.l, s3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_vselect_v32f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1f
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:112
; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:120
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32
; GFX11-FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:104
; GFX11-FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:96
; GFX11-FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:88
; GFX11-FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:80
; GFX11-FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:72
; GFX11-FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:64
; GFX11-FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:56
; GFX11-FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
; GFX11-FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:40
; GFX11-FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:32
; GFX11-FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:24
; GFX11-FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:16
; GFX11-FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:8
; GFX11-FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:124
; GFX11-FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:116
; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:108
; GFX11-FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:100
; GFX11-FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:92
; GFX11-FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:84
; GFX11-FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:76
; GFX11-FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:68
; GFX11-FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:60
; GFX11-FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:52
; GFX11-FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:44
; GFX11-FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:36
; GFX11-FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:28
; GFX11-FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:12
; GFX11-FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:20
; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:128
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v13
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v29
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v12
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v28
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v27
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v10
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v26
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v9
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v25
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v8
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v24
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v23
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v6
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v22
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v21
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v4
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v20
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v3
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v19
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v18
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v30
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v1
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v15
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v32
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v97, v98, v97, s0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v16
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v33
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v98, v32, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v14, v97, v14, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67
; GFX11-FAKE16-NEXT: v_perm_b32 v13, v99, v13, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69
; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71
; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81
; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v4, v50, v4, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v51, v3, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v32, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v31, v1, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v52, v2, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq <32 x i32> %cond, zeroinitializer
%select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b
ret <32 x half> %select
}