Files
clang-p2996/llvm/test/CodeGen/AMDGPU/bf16.ll
Ruiling, Song 0487db1f13 MachineScheduler: Improve instruction clustering (#137784)
The existing way of managing clustered nodes was done through adding
weak edges between the neighbouring cluster nodes, which is a sort of
ordered queue. And this will be later recorded as `NextClusterPred` or
`NextClusterSucc` in `ScheduleDAGMI`.

But actually the instruction may be picked not in the exact order of the
queue. For example, we have a queue of cluster nodes A B C. But during
scheduling, node B might be picked first, then it will be very likely
that we only cluster B and C for Top-Down scheduling (leaving A alone).

Another issue is:
```
   if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
      std::swap(SUa, SUb);
   if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
```
may break the cluster queue.

For example, we want to cluster nodes in the order 1, 3, 2 (the order in
`MemOpRecords`). Normally 1(SUa) will be a pred of 3(SUb). But when it
comes to (3, 2), since 3(SUa) > 2(SUb), we would reorder the two nodes,
which makes 2 a pred of 3. This makes both 1 and 2 become preds of 3, but
there is no edge between 1 and 2. Thus we get a broken cluster chain.

To fix both issues, we introduce an unordered set in the change. This
could help improve clustering in some hard cases.

One key reason the change causes so many test check changes is: As the
cluster candidates are not ordered now, the candidates might be picked
in different order from before.

The most affected targets are: AMDGPU, AArch64, RISCV.

For RISCV, it seems to me that most changes are just minor instruction
reordering; I don't see any obvious regression.

For AArch64, some combining of ldr into ldp is affected, with two cases
regressed and two improved. The deeper reason is that the machine
scheduler cannot cluster them well either before or after the change,
and the load combine algorithm that runs later is also not smart enough.

For AMDGPU, some cases have more v_dual instructions used while some are
regressed. It seems less critical. It seems that test `v_vselect_v32bf16`
gets more buffer_load instructions claused.
2025-06-05 15:28:04 +08:00

44631 lines
2.0 MiB

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
; Scalar bfloat round trip: load one bfloat from %in and store it to %out,
; both addrspace(1) pointers. The autogenerated assertions below expect
; exactly one load instruction followed by one store instruction for every
; RUN configuration.
define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_load_store:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_load_store:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
store bfloat %val, ptr addrspace(1) %out
ret void
}
; Load a <2 x bfloat> from an addrspace(1) pointer and return it. The
; assertions expect a single dword load. For the first two RUN lines (no
; -mcpu, and hawaii) the loaded dword is then split into two result
; registers with a shift left by 16 and a mask with 0xffff0000; the
; remaining RUN lines return right after the load and its waitcnt.
define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <2 x bfloat>, ptr addrspace(1) %ptr
ret <2 x bfloat> %load
}
; Load a <3 x bfloat> from an addrspace(1) pointer and return it. The
; assertions expect a single two-dword load. For the first two RUN lines
; (no -mcpu, and hawaii) the loaded dwords are expanded into three result
; registers (shift, mask, shift); the remaining RUN lines return right
; after the load and its waitcnt.
define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v3bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x bfloat>, ptr addrspace(1) %ptr
ret <3 x bfloat> %load
}
; Load a <4 x bfloat> from an addrspace(1) pointer and return it. The
; assertions expect a single two-dword load. For the first two RUN lines
; (no -mcpu, and hawaii) each loaded dword is split into two result
; registers (shift left by 16, mask with 0xffff0000), giving four
; registers; the remaining RUN lines return right after the load and its
; waitcnt.
define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x bfloat>, ptr addrspace(1) %ptr
ret <4 x bfloat> %load
}
; Load a <6 x bfloat> from an addrspace(1) pointer and return it. Note the
; difference in the expected load width: the RUN line without -mcpu expects
; a four-dword load (only three of the loaded registers are used
; afterwards), while every other RUN line expects a three-dword load. For
; the first two RUN lines the loaded dwords are expanded into six result
; registers with shift/mask pairs.
define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v6bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v6bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v6bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v6bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v6bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v6bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <6 x bfloat>, ptr addrspace(1) %ptr
ret <6 x bfloat> %load
}
; Load an <8 x bfloat> from an addrspace(1) pointer and return it. The
; assertions expect a single four-dword load. For the first two RUN lines
; (no -mcpu, and hawaii) each loaded dword is split into two result
; registers (shift left by 16, mask with 0xffff0000), giving eight
; registers; the remaining RUN lines return right after the load and its
; waitcnt.
define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <8 x bfloat>, ptr addrspace(1) %ptr
ret <8 x bfloat> %load
}
; Load a <16 x bfloat> from an addrspace(1) pointer and return it. The
; assertions expect two four-dword loads, the second one 16 past the first.
; Points of interest in the expected output:
;  - the first two RUN lines (no -mcpu, and hawaii) expand the loaded
;    dwords into sixteen result registers with shift/mask pairs, waiting on
;    each load separately (vmcnt(1), then vmcnt(0));
;  - the tonga RUN line forms the second address with an add/add-with-carry
;    pair instead of an offset operand on the load;
;  - the gfx1010 and gfx1100 RUN lines expect the two loads to be preceded
;    by s_clause 0x1.
define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v9, v1
; GFX10-NEXT: v_mov_b32_e32 v8, v0
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v[4:5], off
; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
ret <16 x bfloat> %load
}
; Load a <32 x bfloat> from an addrspace(1) pointer and return it. The
; assertions expect four four-dword loads at offsets 0, 16, 32 and 48 from
; the pointer. Points of interest in the expected output:
;  - the first two RUN lines (no -mcpu, and hawaii) issue all four loads
;    back to back and then expand each loaded group into result registers
;    with shift/mask pairs, waiting with a decreasing vmcnt (3, 2, 1, 0);
;  - the tonga RUN line forms the three non-zero addresses with
;    add/add-with-carry pairs instead of offset operands on the loads;
;  - the gfx1010 and gfx1100 RUN lines expect the four loads to be preceded
;    by s_clause 0x3.
define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v12, v0
; GFX8-NEXT: v_mov_b32_e32 v13, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v12
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v13, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v12
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[12:13]
; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v12
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v17, v1
; GFX9-NEXT: v_mov_b32_e32 v16, v0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v17, v1
; GFX10-NEXT: v_mov_b32_e32 v16, v0
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
; GFX10-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
; GFX10-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[0:3], v[12:13], off
; GFX11-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
ret <32 x bfloat> %load
}
define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-LABEL: v_load_global_v64bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0
; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0
; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0
; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0
; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0
; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0
; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0
; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0
; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0
; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_global_v64bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0
; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0
; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0
; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-NEXT: s_waitcnt vmcnt(9)
; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(12)
; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_load_global_v64bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v28, v0
; GFX8-NEXT: v_mov_b32_e32 v29, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v28
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v29, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v28
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v29, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28
; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc
; GFX8-NEXT: s_movk_i32 s4, 0x50
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28
; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc
; GFX8-NEXT: s_movk_i32 s4, 0x60
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
; GFX8-NEXT: s_movk_i32 s4, 0x70
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_global_v64bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v29, v1
; GFX9-NEXT: v_mov_b32_e32 v28, v0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v64bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v33, v1
; GFX10-NEXT: v_mov_b32_e32 v32, v0
; GFX10-NEXT: s_clause 0x7
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[32:33], off
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[32:33], off offset:16
; GFX10-NEXT: global_load_dwordx4 v[8:11], v[32:33], off offset:32
; GFX10-NEXT: global_load_dwordx4 v[12:15], v[32:33], off offset:48
; GFX10-NEXT: global_load_dwordx4 v[16:19], v[32:33], off offset:64
; GFX10-NEXT: global_load_dwordx4 v[20:23], v[32:33], off offset:80
; GFX10-NEXT: global_load_dwordx4 v[24:27], v[32:33], off offset:96
; GFX10-NEXT: global_load_dwordx4 v[28:31], v[32:33], off offset:112
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_load_global_v64bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_load_b128 v[0:3], v[28:29], off
; GFX11-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
; GFX11-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
; GFX11-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
; GFX11-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
; GFX11-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <64 x bfloat>, ptr addrspace(1) %ptr
ret <64 x bfloat> %load
}
; Stores a <2 x bfloat> value through a global (addrspace(1)) pointer.
; Expected code, by check prefix:
;  - GCN and GFX7 prefixes - v0 and v1 are each multiplied by 1.0, v1 is
;    shifted right by 16, and v_alignbit_b32 packs the pair into one dword
;    that buffer_store_dword writes through the address in v[2:3] (addr64).
;    NOTE(review): this suggests every bfloat element arrives in its own
;    32-bit VGPR for these prefixes - confirm against the calling convention.
;  - GFX8 prefix - one flat_store_dword of v0 to the address in v[1:2].
;  - GFX9 and GFX10 prefixes - one global_store_dword of v0 to v[1:2].
;  - GFX11 prefix - one global_store_b32 of v0 to v[1:2].
; The GFX10 and GFX11 sequences have no s_waitcnt between the store and the
; return; all other sequences wait for vmcnt(0) first.
; The -NEXT directives pin the exact instruction order, so any scheduling
; change in the backend is expected to require updating these lines.
define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_store_global_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_store_global_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dword v[1:2], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_store_global_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dword v[1:2], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_store_global_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <2 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
; Stores a <3 x bfloat> value through a global (addrspace(1)) pointer.
; A three-element vector does not fill a whole number of dwords, so every
; expected sequence splits the store into a 32-bit part and a 16-bit part.
; Expected code, by check prefix:
;  - GCN and GFX7 prefixes - v0, v1 and v2 are multiplied by 1.0; v0/v1 are
;    packed into one dword with v_lshrrev_b32 + v_alignbit_b32, and the third
;    element is shifted right by 16 on its own. buffer_store_short writes the
;    third element at offset:4 and buffer_store_dword writes the packed pair
;    at offset 0, both through the address in v[3:4]. The short store comes
;    first in the expected order.
;  - GFX8 prefix - flat_store_dword of v0 to v[2:3], then the 64-bit address
;    is advanced by 4 with v_add_u32/v_addc_u32 before flat_store_short of v1.
;  - GFX9 and GFX10 prefixes - global_store_short of v1 at offset:4 followed
;    by global_store_dword of v0, both using v[2:3] unchanged.
;  - GFX11 prefix - the same pair spelled global_store_b16/global_store_b32,
;    preceded by s_clause 0x1.
; The GFX10 and GFX11 sequences return without a trailing s_waitcnt.
define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_store_global_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_store_global_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dword v[2:3], v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_store_short v[2:3], v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_store_global_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_store_global_v3bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <3 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
; Stores a <4 x bfloat> value through a global (addrspace(1)) pointer.
; Expected code, by check prefix:
;  - GCN and GFX7 prefixes - v0..v3 are each multiplied by 1.0, the odd
;    elements (v1, v3) are shifted right by 16, and two v_alignbit_b32
;    instructions pack the four elements into a consecutive register pair
;    (v[0:1] for GCN, v[1:2] for GFX7). One buffer_store_dwordx2 writes that
;    pair through the address in v[4:5] (addr64).
;  - GFX8 prefix - one flat_store_dwordx2 of v[0:1] to the address in v[2:3].
;  - GFX9 and GFX10 prefixes - one global_store_dwordx2 of v[0:1] to v[2:3].
;  - GFX11 prefix - one global_store_b64 of v[0:1] to v[2:3].
; The GFX10 and GFX11 sequences return without a trailing s_waitcnt.
define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_store_global_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_store_global_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_store_global_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_store_global_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <4 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
; Stores an <8 x bfloat> value through a global (addrspace(1)) pointer.
; Expected code, by check prefix:
;  - GCN and GFX7 prefixes - v0..v7 are each multiplied by 1.0, the odd
;    elements are shifted right by 16, and four v_alignbit_b32 instructions
;    pack the eight elements into four consecutive registers (v[0:3] for
;    GCN, v[3:6] for GFX7). One buffer_store_dwordx4 writes them through the
;    address in v[8:9] (addr64).
;  - GFX8 prefix - one flat_store_dwordx4 of v[0:3] to the address in v[4:5].
;  - GFX9 and GFX10 prefixes - one global_store_dwordx4 of v[0:3] to v[4:5].
;  - GFX11 prefix - one global_store_b128 of v[0:3] to v[4:5].
; The GFX10 and GFX11 sequences return without a trailing s_waitcnt.
define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_store_global_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_store_global_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_store_global_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_store_global_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <8 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
; Stores a <16 x bfloat> value through a global (addrspace(1)) pointer.
; The 32-byte value is written as two 16-byte stores in every sequence.
; Expected code, by check prefix:
;  - GCN and GFX7 prefixes - v0..v15 are each multiplied by 1.0, the odd
;    elements are shifted right by 16, and eight v_alignbit_b32 instructions
;    pack the pairs into two groups of four registers. Two
;    buffer_store_dwordx4 instructions write the upper half at offset:16 and
;    the lower half at offset 0 through the address in v[16:17] (addr64);
;    the offset:16 store comes first.
;  - GFX8 prefix - flat_store_dwordx4 of v[0:3] to v[8:9], an s_nop 0, then
;    the address plus 16 is formed in v[0:1] with v_add_u32/v_addc_u32 and
;    used for flat_store_dwordx4 of v[4:7].
;  - GFX9 and GFX10 prefixes - global_store_dwordx4 of v[4:7] at offset:16
;    followed by global_store_dwordx4 of v[0:3], both through v[8:9].
;  - GFX11 prefix - the same pair spelled global_store_b128, preceded by
;    s_clause 0x1.
; The GFX10 and GFX11 sequences return without a trailing s_waitcnt.
define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_store_global_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_store_global_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_store_global_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_store_global_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <16 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
; Stores a <32 x bfloat> value through a global (addrspace(1)) pointer.
; The 64-byte value is written as four 16-byte stores in every sequence.
; Expected code, by check prefix:
;  - GCN and GFX7 prefixes - three buffer_load_dword instructions read from
;    the s32-relative slots at offsets 0, 4 and 8. The offset-0 dword is
;    multiplied by 1.0, shifted right by 16 and packed with the scaled v30,
;    i.e. it is treated like one more vector element; the dwords at offsets
;    4 and 8 land in v0 and v1 and form the store address v[0:1].
;    NOTE(review): this looks like the last element and the pointer being
;    passed on the stack once the argument VGPRs run out - confirm against
;    the calling convention.
;    The remaining elements are scaled by 1.0, shifted and packed with
;    v_alignbit_b32 as in the smaller vector tests, then written with four
;    buffer_store_dwordx4 instructions at offsets 0, 16, 32 and 48. The
;    s_waitcnt vmcnt(N) lines gate the uses of the loaded values.
;  - GFX8 prefix - flat_store_dwordx4 of v[0:3] to v[16:17], an s_nop 0, then
;    three more flat stores whose addresses (base + 48, + 32, + 16) are each
;    formed in v[0:1] with v_add_u32/v_addc_u32.
;  - GFX9 and GFX10 prefixes - four global_store_dwordx4 instructions through
;    v[16:17] at offsets 48, 32, 16 and 0, in that order.
;  - GFX11 prefix - the same four stores spelled global_store_b128, preceded
;    by s_clause 0x3.
; The GFX10 and GFX11 sequences return without a trailing s_waitcnt.
define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5
; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16
; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16
; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16
; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26
; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16
; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_store_global_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v13
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_alignbit_b32 v12, v7, v12, 16
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27
; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX7-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_alignbit_b32 v28, v6, v7, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_store_global_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_store_global_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_store_global_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48
; GFX11-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32
; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16
; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <32 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
; Codegen test: store a <64 x bfloat> function argument to a global
; (addrspace 1) pointer using a single IR vector store.
; In the expected output below, every target first reads some values back
; through s32-relative buffer/scratch loads before issuing the 128-bit stores;
; on the newer targets these loads fill v31 (last data dword) and v[32:33]
; (the store address).
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated rather than edited by hand -- confirm against the file header.
define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v64bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16
; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16
; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16
; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: s_waitcnt vmcnt(6)
; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(13)
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(12)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(11)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(10)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(6)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13
; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16
; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16
; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16
; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16
; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16
; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(6)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21
; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16
; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16
; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16
; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
; GCN-NEXT: s_waitcnt vmcnt(6)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16
; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_store_global_v64bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108
; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104
; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16
; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92
; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84
; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80
; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39
; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48
; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64
; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60
; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48
; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20
; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22
; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18
; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26
; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24
; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_store_global_v64bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX8-NEXT: s_movk_i32 s4, 0x70
; GFX8-NEXT: s_movk_i32 s5, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc
; GFX8-NEXT: s_movk_i32 s4, 0x60
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31]
; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32
; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v32
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc
; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32
; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32
; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_store_global_v64bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_store_global_v64bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_store_global_v64bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112
; GFX11-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96
; GFX11-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80
; GFX11-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64
; GFX11-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48
; GFX11-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32
; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <64 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
; Codegen test: store two bfloat floating-point constants (1.0 and 42.0) to
; two separate global (addrspace 1) pointers.
; The expected output materializes each constant as a 16-bit immediate and
; emits one 16-bit store per pointer; the true16 variant places both
; immediates in the low/high halves of v4 and uses a d16_hi store for the
; second pointer.
define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
; GCN-LABEL: test_store_fpimm:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_store_fpimm:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_store_fpimm:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
; GFX8-NEXT: flat_store_short v[0:1], v4
; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_store_fpimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
; GFX9-NEXT: global_store_short v[0:1], v4, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_store_fpimm:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80
; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228
; GFX10-NEXT: global_store_short v[0:1], v4, off
; GFX10-NEXT: global_store_short v[2:3], v5, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_store_fpimm:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v4, off
; GFX11TRUE16-NEXT: global_store_d16_hi_b16 v[2:3], v4, off
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_store_fpimm:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80
; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; The bfloat constants 1.0 and 42.0 correspond to the 0x3f80 and 0x4228
; immediates that appear in the expected output above.
store bfloat 1.0, ptr addrspace(1) %ptr0
store bfloat 42.0, ptr addrspace(1) %ptr1
ret void
}
; Codegen test: load a float from global pointer %in, convert it to bfloat
; with fptrunc, and store the 16-bit result to global pointer %out.
; Two lowering shapes are visible in the expected output:
;   - the first two check groups multiply by 1.0 and shift right by 16;
;   - the remaining groups use a bfe / add 0x7fff / or 0x400000 / unordered
;     compare / cndmask sequence before taking the high 16 bits (presumably
;     round-to-nearest-even with NaN quieting -- TODO confirm).
define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_f32_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_f32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_f32_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_f32_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_f32_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_f32_to_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load float, ptr addrspace(1) %in
%val.bf16 = fptrunc float %val to bfloat
store bfloat %val.bf16, ptr addrspace(1) %out
ret void
}
define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_f64_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_f64_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_f64_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX8-NEXT: v_and_b32_e32 v7, 1, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7
; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]|
; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], v6, v4
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_f64_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_movk_i32 s8, 0x7fff
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7
; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]|
; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_add3_u32 v4, v5, v4, s8
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_f64_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX10-NEXT: v_and_b32_e32 v7, 1, v6
; GFX10-NEXT: v_cmp_gt_f64_e64 s4, |v[0:1]|, |v[4:5]|
; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v7
; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
; GFX10-NEXT: s_or_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc_lo
; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_f64_to_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX11-NEXT: v_and_b32_e32 v7, 1, v6
; GFX11-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
; GFX11-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load double, ptr addrspace(1) %in
%val.bf16 = fptrunc double %val to bfloat
store bfloat %val.bf16, ptr addrspace(1) %out
ret void
}
; Loads one bfloat through the addrspace(1) pointer %in, widens it to float
; with fpext and stores the float through %out.
; Every expected sequence below performs the widening without a conversion
; instruction: a 16-bit unsigned load followed by a left shift by 16
; (v_lshlrev_b32 ..., 16, ...), then a 32-bit store.
; The variants differ in the memory instruction family they expect
; (buffer_*, flat_* or global_*) and in the trailing s_waitcnt.
define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_bf16_to_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_bf16_to_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_bf16_to_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: flat_store_dword v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_bf16_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_bf16_to_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_bf16_to_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
%val.f32 = fpext bfloat %val to float
store float %val.f32, ptr addrspace(1) %out
ret void
}
; Loads one bfloat through the addrspace(1) pointer %in, widens it to double
; with fpext and stores the double through %out.
; Every expected sequence below is a 16-bit unsigned load, a left shift by 16
; and a v_cvt_f64_f32 on the shifted value, followed by a 64-bit store.
; The GFX11 variant additionally expects an s_delay_alu between the shift and
; the conversion.
define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_bf16_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_bf16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_bf16_to_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_bf16_to_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_bf16_to_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_bf16_to_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
%val.f64 = fpext bfloat %val to double
store double %val.f64, ptr addrspace(1) %out
ret void
}
; Copies a <2 x bfloat> value from the addrspace(1) pointer %in to %out with
; no arithmetic in between.
; Every expected sequence below is a single 32-bit load followed by a single
; 32-bit store of the same register; no ALU instruction touches the data.
define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load <2 x bfloat>, ptr addrspace(1) %in
store <2 x bfloat> %val, ptr addrspace(1) %out
ret void
}
; Copies a <4 x bfloat> value from the addrspace(1) pointer %in to %out with
; no arithmetic in between.
; Every expected sequence below is a single 64-bit load (dwordx2 / b64) into
; v[0:1] followed by a single 64-bit store of that register pair.
define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load <4 x bfloat>, ptr addrspace(1) %in
store <4 x bfloat> %val, ptr addrspace(1) %out
ret void
}
; Copies an <8 x bfloat> value from the addrspace(1) pointer %in to %out with
; no arithmetic in between.
; Every expected sequence below is a single 128-bit load (dwordx4 / b128)
; into v[4:7] followed by a single 128-bit store of that register quad.
define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load <8 x bfloat>, ptr addrspace(1) %in
store <8 x bfloat> %val, ptr addrspace(1) %out
ret void
}
; Copies a <16 x bfloat> value (32 bytes) from the addrspace(1) pointer %in to
; %out with no arithmetic on the data.
; Every expected sequence below splits the copy into two 128-bit loads and two
; 128-bit stores, one pair at byte offset 0 and one at byte offset 16.
; Things to watch when these checks change (for example after a scheduler
; change that reorders clustered memory operations):
;  - In the buffer_* and global_* variants the offset-16 access is expected
;    first, for both the loads and the stores.
;  - The flat_* variant has no offset operand in these checks; it computes
;    the +16 addresses with v_add_u32 / v_addc_u32 and expects the offset-0
;    access first.
;  - The GFX10 and GFX11 variants expect the two loads wrapped in
;    s_clause 0x1 and wait with vmcnt(1) then vmcnt(0) before the stores,
;    while the older variants expect vmcnt(1) before each store.
define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_load_store_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_load_store_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_load_store_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_load_store_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_load_store_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load <16 x bfloat>, ptr addrspace(1) %in
store <16 x bfloat> %val, ptr addrspace(1) %out
ret void
}
; Stores a scalar bfloat function argument through the addrspace(1) pointer
; %out.
; The expected code falls into two shapes:
;  - The two buffer_* variants expect v0 to be multiplied by 1.0
;    (v_mul_f32) and shifted right by 16 before a 16-bit store.
;  - The flat_* and global_* variants expect v0 to be stored directly with a
;    16-bit store and no ALU instruction.
; In every variant the pointer is expected in v[1:2].
define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_arg_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_arg_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_short v[1:2], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_arg_store:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_short v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_short v[1:2], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_arg_store:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b16 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store bfloat %in, ptr addrspace(1) %out
ret void
}
; Stores a <2 x bfloat> function argument through the addrspace(1) pointer
; %out.
; The expected code falls into two shapes:
;  - The two buffer_* variants operate on v0 and v1 separately: each is
;    multiplied by 1.0, v1 is shifted right by 16, and v_alignbit_b32 packs
;    both into one dword that is stored through v[2:3].
;  - The flat_* and global_* variants expect a single 32-bit store of v0
;    through v[1:2] with no ALU instruction.
define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_arg_store_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_arg_store_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dword v[1:2], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_arg_store_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dword v[1:2], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_arg_store_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Stores a <3 x bfloat> function argument through the addrspace(1) pointer
; %out.
; Every expected sequence below writes the vector as two stores: a 32-bit
; store at byte offset 0 and a 16-bit store at byte offset 4.
;  - The two buffer_* variants operate on v0, v1 and v2 separately (each
;    multiplied by 1.0, then shifted / packed with v_alignbit_b32), address
;    through v[3:4], and expect the 16-bit offset-4 store first.
;  - The flat_* variant stores v0 as a dword first, then adds 4 to the
;    address in v[2:3] and stores v1 as a short.
;  - The global_* variants store v1 as a short at offset 4 first, then v0 as
;    a dword; the GFX11 variant additionally wraps both stores in
;    s_clause 0x1.
define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_arg_store_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_arg_store_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dword v[2:3], v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_store_short v[2:3], v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_arg_store_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_arg_store_v3bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Stores a <4 x bfloat> function argument through the addrspace(1) pointer
; %out.
; The expected code falls into two shapes:
;  - The two buffer_* variants operate on v0 through v3 separately: four
;    v_mul_f32 by 1.0, two right shifts by 16 and two v_alignbit_b32 that
;    pack the values into a register pair, stored as one dwordx2 through
;    v[4:5]. The two variants differ in instruction order and in which
;    registers hold the packed result (v[0:1] versus v[1:2]).
;  - The flat_* and global_* variants expect a single 64-bit store of v[0:1]
;    through v[2:3] with no ALU instruction.
define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_arg_store_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_arg_store_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_arg_store_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_arg_store_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <4 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Stores an <8 x bfloat> function argument through the addrspace(1) pointer
; %out.
; The expected code falls into two shapes:
;  - The two buffer_* variants operate on v0 through v7 separately: eight
;    v_mul_f32 by 1.0, four right shifts by 16 and four v_alignbit_b32 that
;    pack the values into four dwords, stored as one dwordx4 through v[8:9].
;    The two variants differ in instruction order and in which registers
;    hold the packed result (v[0:3] versus v[3:6]).
;  - The flat_* and global_* variants expect a single 128-bit store of
;    v[0:3] through v[4:5] with no ALU instruction.
define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_arg_store_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_arg_store_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_arg_store_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_arg_store_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <8 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Store a <16 x bfloat> argument to an addrspace(1) pointer.
; The GFX8 and newer expectations store the incoming v[0:7] directly as two
; 128-bit halves (offset 0 and offset 16). The GCN and GFX7 expectations
; instead build each stored dword from a pair of input registers with
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32 before the two buffer stores.
define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_arg_store_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_arg_store_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_arg_store_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_arg_store_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_arg_store_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <16 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Store a bfloat passed as an `inreg` argument under the amdgpu_gfx calling
; convention. In every expected sequence the value is read from s4 and placed
; in v2 (through v_mul_f32 plus a 16-bit right shift on GCN and GFX7, through a
; plain move on the other targets) before the 16-bit store via v[0:1].
define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_inreg_arg_store:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s39, 0xf000
; GCN-NEXT: s_mov_b32 s38, 0
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GCN-NEXT: s_mov_b32 s36, s38
; GCN-NEXT: s_mov_b32 s37, s38
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_inreg_arg_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s38, 0
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7-NEXT: s_mov_b32 s39, 0xf000
; GFX7-NEXT: s_mov_b32 s36, s38
; GFX7-NEXT: s_mov_b32 s37, s38
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_inreg_arg_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_inreg_arg_store:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_inreg_arg_store:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_inreg_arg_store:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s4
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_inreg_arg_store:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_mov_b32_e32 v2, s4
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
store bfloat %in, ptr addrspace(1) %out
ret void
}
; Store %val through a byval(bfloat) addrspace(5) pointer, reload it, and
; return the reloaded value. Every expected sequence contains a single 16-bit
; store addressed relative to s32 and no load instruction.
define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
; GCN-LABEL: test_byval:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_byval:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_byval:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_byval:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_byval:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_byval:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11TRUE16-NEXT: scratch_store_b16 off, v1, s32
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_byval:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
store bfloat %val, ptr addrspace(5) %bv
%retval = load bfloat, ptr addrspace(5) %bv
ret bfloat %retval
}
; Store %val through an sret(bfloat) addrspace(5) pointer; the function itself
; returns void. In the expected sequences the pointer is in v0 and the value
; to store is in v1.
define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
; GCN-LABEL: test_sret:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_sret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_sret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_sret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_sret:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_sret:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_b16 v0, v1, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store bfloat %val, ptr addrspace(5) %sret
ret void
}
; Load a bfloat from %in, bitcast it to i16, and store the i16 to %out.
; Apart from the buffer descriptor setup on GCN and GFX7, the expected
; sequences contain only a 16-bit load, a wait, and a 16-bit store; no
; instruction is expected for the bitcast itself.
define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_bitcast_from_bfloat:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_bitcast_from_bfloat:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_bitcast_from_bfloat:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_bitcast_from_bfloat:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_bitcast_from_bfloat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_bitcast_from_bfloat:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_bitcast_from_bfloat:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
%val_int = bitcast bfloat %val to i16
store i16 %val_int, ptr addrspace(1) %out
ret void
}
; Load an i16 from %in, bitcast it to bfloat, and store the bfloat to %out.
; Note the argument order here is (%out, %in), so the expected sequences load
; through v[2:3] and store through v[0:1]. As with the from-bfloat direction,
; no instruction is expected for the bitcast itself.
define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: test_bitcast_to_bfloat:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_bitcast_to_bfloat:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_bitcast_to_bfloat:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_bitcast_to_bfloat:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_bitcast_to_bfloat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v2, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_bitcast_to_bfloat:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_bitcast_to_bfloat:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: global_load_u16 v2, v[2:3], off
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%val = load i16, ptr addrspace(1) %in
%val_fp = bitcast i16 %val to bfloat
store bfloat %val_fp, ptr addrspace(1) %out
ret void
}
; Return the scalar bfloat argument unchanged. Every expected sequence is just
; the entry s_waitcnt followed by s_setpc_b64; no other instructions are
; expected.
define bfloat @test_ret(bfloat %in) {
; GCN-LABEL: test_ret:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_ret:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_ret:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret bfloat %in
}
; Return a <2 x bfloat> argument unchanged. Every expected sequence is just the
; entry s_waitcnt followed by s_setpc_b64; no other instructions are expected.
define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
; GCN-LABEL: test_ret_v2bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_ret_v2bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_ret_v2bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_ret_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_ret_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <2 x bfloat> %in
}
; Return a <3 x bfloat> argument unchanged. Every expected sequence is just the
; entry s_waitcnt followed by s_setpc_b64; no other instructions are expected.
define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
; GCN-LABEL: test_ret_v3bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_ret_v3bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_ret_v3bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_ret_v3bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret_v3bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_ret_v3bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <3 x bfloat> %in
}
; Return a <4 x bfloat> argument unchanged. Every expected sequence is just the
; entry s_waitcnt followed by s_setpc_b64; no other instructions are expected.
define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
; GCN-LABEL: test_ret_v4bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_ret_v4bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_ret_v4bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_ret_v4bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret_v4bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_ret_v4bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <4 x bfloat> %in
}
; Return an <8 x bfloat> argument unchanged. Every expected sequence is just
; the entry s_waitcnt followed by s_setpc_b64; no other instructions are
; expected.
define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
; GCN-LABEL: test_ret_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_ret_v8bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_ret_v8bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_ret_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret_v8bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_ret_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <8 x bfloat> %in
}
; Return a <16 x bfloat> argument unchanged. Every expected sequence is just
; the entry s_waitcnt followed by s_setpc_b64; no other instructions are
; expected.
define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
; GCN-LABEL: test_ret_v16bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_ret_v16bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_ret_v16bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_ret_v16bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_ret_v16bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_ret_v16bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <16 x bfloat> %in
}
; Call @test_arg_store with a bfloat argument and do a volatile store of the
; returned bfloat to the addrspace(5) pointer %out.
; The expected sequences form the callee address with s_getpc_b64 plus
; gotpcrel32 relocations and a scalar load, save s30/s31 into lanes 0 and 1 of
; v2 with v_writelane_b32 before the s_swappc_b64 (restoring them afterwards
; with v_readlane_b32), and spill/reload v2 itself at the address in s33.
; NOTE(review): @test_arg_store is defined outside this chunk; confirm that
; calling it as `bfloat (bfloat)` is the intended signature.
define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-LABEL: test_call:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v2, s30, 0
; GCN-NEXT: v_writelane_b32 v2, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: v_readlane_b32 s30, v2, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_call:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s18, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX7-NEXT: v_writelane_b32 v2, s30, 0
; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
; GFX7-NEXT: v_readlane_b32 s30, v2, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_call:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s18, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX8-NEXT: v_writelane_b32 v2, s30, 0
; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_call:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s18, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_writelane_b32 v2, s30, 0
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s18, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s16
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v2, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12
; GFX11-NEXT: v_writelane_b32 v2, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%result = call bfloat @test_arg_store(bfloat %in)
store volatile bfloat %result, ptr addrspace(5) %out
ret void
}
define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-LABEL: test_call_v2bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v4, s30, 0
; GCN-NEXT: v_writelane_b32 v4, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: v_readlane_b32 s30, v4, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_call_v2bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s18, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX7-NEXT: v_writelane_b32 v4, s30, 0
; GFX7-NEXT: v_writelane_b32 v4, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_call_v2bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s18, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX8-NEXT: v_writelane_b32 v2, s30, 0
; GFX8-NEXT: v_writelane_b32 v2, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_call_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s18, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_writelane_b32 v2, s30, 0
; GFX9-NEXT: v_writelane_b32 v2, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s18, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s16
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v2, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: v_writelane_b32 v2, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX11-NEXT: v_writelane_b32 v2, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v2, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
store volatile <2 x bfloat> %result, ptr addrspace(5) %out
ret void
}
; test_call_v3bf16 passes the <3 x bfloat> argument %in to a call and
; volatile-stores the returned <3 x bfloat> through the addrspace(5) pointer %out.
; The per-target expectations that follow pin the register spill/reload around
; the call, the gotpcrel load of the callee address, the s_swappc_b64 call and
; the stores of the returned value.
; In the GCN and GFX7 expectations the result is written as a short at offset 4
; plus a dword at offset 0 after v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32
; fix-ups; from GFX8 on, v1 (short, offset 4) and v0 (dword) are stored without
; those fix-ups.
; NOTE(review): the callee symbol is @test_arg_store_v2bf16 even though the
; vector type here is <3 x bfloat>; presumably it only serves as an external
; call target. The symbol name is embedded in the expectations below, so confirm
; and regenerate them if it is ever renamed.
define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-LABEL: test_call_v3bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v5, s30, 0
; GCN-NEXT: v_writelane_b32 v5, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v5, 1
; GCN-NEXT: v_readlane_b32 s30, v5, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_call_v3bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s18, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX7-NEXT: v_writelane_b32 v4, s30, 0
; GFX7-NEXT: v_writelane_b32 v4, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
; GFX7-NEXT: v_readlane_b32 s30, v4, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_call_v3bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s18, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX8-NEXT: v_writelane_b32 v4, s30, 0
; GFX8-NEXT: v_writelane_b32 v4, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_call_v3bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s18, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v3bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s18, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s16
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_v3bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX11-NEXT: v_writelane_b32 v3, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; The call result is written to %out with a volatile store.
%result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
store volatile <3 x bfloat> %result, ptr addrspace(5) %out
ret void
}
; test_call_v4bf16 passes the <4 x bfloat> argument %in to a call and
; volatile-stores the returned <4 x bfloat> through the addrspace(5) pointer %out.
; The per-target expectations that follow pin the register spill/reload around
; the call, the gotpcrel load of the callee address, the s_swappc_b64 call and
; the stores of the returned value.
; In the GCN and GFX7 expectations the result is written as four short stores
; (offsets 6, 4, 2 and 0), each preceded by a v_mul_f32 by 1.0 and a 16-bit
; right shift of the source register; the GFX8, GFX9 and GFX10 expectations use
; two dword stores (offsets 4 and 0), and the GFX11 expectations use a single
; scratch_store_b64.
; NOTE(review): the callee symbol is @test_arg_store_v2bf16 even though the
; vector type here is <4 x bfloat>; presumably it only serves as an external
; call target. The symbol name is embedded in the expectations below, so confirm
; and regenerate them if it is ever renamed.
define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-LABEL: test_call_v4bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v8, s30, 0
; GCN-NEXT: v_writelane_b32 v8, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4
; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4
; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v1, v7, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v8, 1
; GCN-NEXT: v_readlane_b32 s30, v8, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_call_v4bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s18, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX7-NEXT: v_writelane_b32 v6, s30, 0
; GFX7-NEXT: v_writelane_b32 v6, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v6, 1
; GFX7-NEXT: v_readlane_b32 s30, v6, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_call_v4bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s18, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX8-NEXT: v_writelane_b32 v4, s30, 0
; GFX8-NEXT: v_writelane_b32 v4, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2
; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
; GFX8-NEXT: v_readlane_b32 s30, v4, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_call_v4bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s18, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v4bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s18, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s16
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v3, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: v_writelane_b32 v3, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_v4bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX11-NEXT: v_writelane_b32 v3, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v3, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; The call result is written to %out with a volatile store.
%result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
store volatile <4 x bfloat> %result, ptr addrspace(5) %out
ret void
}
; test_call_v8bf16 passes the <8 x bfloat> argument %in to a call and
; volatile-stores the returned <8 x bfloat> through the addrspace(5) pointer %out.
; The per-target expectations that follow pin the register spill/reload around
; the call, the gotpcrel load of the callee address, the s_swappc_b64 call and
; the stores of the returned value.
; In the GCN and GFX7 expectations the result is written as eight short stores
; (offsets 14 down to 0), each preceded by a v_mul_f32 by 1.0 and a 16-bit
; right shift of the source register; the GFX8, GFX9 and GFX10 expectations use
; four dword stores (offsets 12, 8, 4 and 0), and the GFX11 expectations use a
; single scratch_store_b128.
; NOTE(review): the callee symbol is @test_arg_store_v2bf16 even though the
; vector type here is <8 x bfloat>; presumably it only serves as an external
; call target. The symbol name is embedded in the expectations below, so confirm
; and regenerate them if it is ever renamed.
define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-LABEL: test_call_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v16, s30, 0
; GCN-NEXT: v_writelane_b32 v16, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8
; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8
; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8
; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8
; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8
; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8
; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v5, v11, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v4, v12, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v3, v13, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v1, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v16, 1
; GCN-NEXT: v_readlane_b32 s30, v16, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_call_v8bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s18, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX7-NEXT: v_writelane_b32 v10, s30, 0
; GFX7-NEXT: v_writelane_b32 v10, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 14, v8
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v10, 1
; GFX7-NEXT: v_readlane_b32 s30, v10, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_call_v8bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s18, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX8-NEXT: v_writelane_b32 v6, s30, 0
; GFX8-NEXT: v_writelane_b32 v6, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4
; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4
; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v6, 1
; GFX8-NEXT: v_readlane_b32 s30, v6, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_call_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s18, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_writelane_b32 v5, s30, 0
; GFX9-NEXT: v_writelane_b32 v5, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v5, 1
; GFX9-NEXT: v_readlane_b32 s30, v5, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v8bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s18, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s16
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v5, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: v_writelane_b32 v5, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v5, 1
; GFX10-NEXT: v_readlane_b32 s30, v5, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX11-NEXT: v_writelane_b32 v5, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v5, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v5, 1
; GFX11-NEXT: v_readlane_b32 s30, v5, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; The call result is written to %out with a volatile store.
%result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
store volatile <8 x bfloat> %result, ptr addrspace(5) %out
ret void
}
; Call/return lowering test for a <16 x bfloat> value. %in is forwarded to the
; callee and the returned vector is written through %out (an addrspace(5)
; pointer) with a volatile store.
; The expected sequences below differ by target. The GCN and GFX7 variants
; write the result with sixteen buffer_store_short, the GFX8, GFX9 and GFX10
; variants with eight buffer_store_dword, and the GFX11 variant with two
; scratch_store_b128.
; NOTE(review): the callee @test_arg_store_v2bf16 is used here with a
; <16 x bfloat> argument and return type. Its definition is not visible in this
; part of the file and its name suggests a <2 x bfloat> signature -- confirm the
; mismatch is intentional.
; NOTE(review): the assertion lines look machine-generated; presumably they
; should be regenerated with the update script rather than edited by hand --
; confirm against the file header.
define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-LABEL: test_call_v16bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_writelane_b32 v20, s30, 0
; GCN-NEXT: v_writelane_b32 v20, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16
; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16
; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16
; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16
; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16
; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16
; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16
; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16
; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v20, 1
; GCN-NEXT: v_readlane_b32 s30, v20, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_call_v16bf16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s18, s33
; GFX7-NEXT: s_mov_b32 s33, s32
; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[16:17]
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: s_getpc_b64 s[16:17]
; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX7-NEXT: v_writelane_b32 v18, s30, 0
; GFX7-NEXT: v_writelane_b32 v18, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v18, 1
; GFX7-NEXT: v_readlane_b32 s30, v18, 0
; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_call_v16bf16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s18, s33
; GFX8-NEXT: s_mov_b32 s33, s32
; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[16:17]
; GFX8-NEXT: s_addk_i32 s32, 0x400
; GFX8-NEXT: s_getpc_b64 s[16:17]
; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX8-NEXT: v_writelane_b32 v10, s30, 0
; GFX8-NEXT: v_writelane_b32 v10, s31, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8
; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8
; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8
; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8
; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8
; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8
; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v10, 1
; GFX8-NEXT: v_readlane_b32 s30, v10, 0
; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_call_v16bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s18, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_writelane_b32 v9, s30, 0
; GFX9-NEXT: v_writelane_b32 v9, s31, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v9, 1
; GFX9-NEXT: v_readlane_b32 s30, v9, 0
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_v16bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s18, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s16
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v9, s30, 0
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: v_writelane_b32 v9, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v9, 1
; GFX10-NEXT: v_readlane_b32 s30, v9, 0
; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_v16bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12
; GFX11-NEXT: v_writelane_b32 v9, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v9, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b128 v8, v[0:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v9, 1
; GFX11-NEXT: v_readlane_b32 s30, v9, 0
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Forward the incoming vector to the callee and capture its vector result.
%result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
; The store is volatile; every expected sequence above contains explicit store
; instructions for the returned value after the call.
store volatile <16 x bfloat> %result, ptr addrspace(5) %out
ret void
}
; Round-trips a scalar bfloat through a stack slot. The argument is written to
; a 2-byte-aligned addrspace(5) alloca with a volatile store, read back with a
; volatile load, and the loaded value is returned.
; Every expected sequence below contains one 16-bit store followed by one
; 16-bit load. The GCN and GFX7 variants additionally convert the value with
; v_mul_f32 and 16-bit shifts around the memory accesses, and the two GFX11
; variants differ only in the load instruction they expect
; (scratch_load_d16_b16 versus scratch_load_u16).
define bfloat @test_alloca_load_store_ret(bfloat %in) {
; GCN-LABEL: test_alloca_load_store_ret:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_alloca_load_store_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_alloca_load_store_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_alloca_load_store_ret:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_alloca_load_store_ret:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_alloca_load_store_ret:
; GFX11TRUE16: ; %bb.0: ; %entry
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GFX11TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_alloca_load_store_ret:
; GFX11FAKE16: ; %bb.0: ; %entry
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GFX11FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
; Stack slot for one bfloat in addrspace(5).
%in.addr = alloca bfloat, align 2, addrspace(5)
; Both accesses are volatile; the expected output above keeps the store and the
; reload as separate memory instructions instead of returning %in directly.
store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
%loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
ret bfloat %loaded
}
define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GCN-LABEL: test_overflow_stack:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v0
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: buffer_store_dword v31, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0
; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0
; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0
; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0
; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0
; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0
; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0
; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0
; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0
; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0
; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0
; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0
; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0
; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0
; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0
; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0
; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0
; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_overflow_stack:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32
; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; GFX7-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_overflow_stack:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32
; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0
; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0
; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0
; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0
; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_overflow_stack:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(25)
; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: s_waitcnt vmcnt(25)
; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
; GFX9-NEXT: s_waitcnt vmcnt(25)
; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_overflow_stack:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: test_overflow_stack:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: s_clause 0x2
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11TRUE16-NEXT: s_clause 0x3
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
; GFX11TRUE16-NEXT: s_clause 0x1
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: s_clause 0x2
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
; GFX11TRUE16-NEXT: scratch_store_b16 v0, v1, off offset:128
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: test_overflow_stack:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: s_clause 0x2
; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11FAKE16-NEXT: s_clause 0x5
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: s_clause 0x2
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
; GFX11FAKE16-NEXT: scratch_store_b16 v0, v1, off offset:128
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
ret { <32 x i32>, bfloat } %ins.1
}
; Extending load of two bfloat elements: load <2 x bfloat> from a global
; (addrspace 1) pointer and fpext it to <2 x float>.
; Every check prefix below expects a single 32-bit load, after which the two
; results are formed from the loaded dword with plain integer ops: one by
; shifting it left by 16, the other by masking it with 0xffff0000.
define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v2bf16_to_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v2bf16_to_v2f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v2bf16_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v2bf16_to_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v2bf16_to_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v2bf16_to_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <2 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <2 x bfloat> %load to <2 x float>
ret <2 x float> %fpext
}
; Extending load of three bfloat elements (odd element count): load
; <3 x bfloat> from a global (addrspace 1) pointer and fpext it to <3 x float>.
; Every check prefix below expects a single 64-bit load into v[1:2]. The first
; dword yields two results (shift left by 16, and mask with 0xffff0000); the
; second dword yields only the shifted result.
define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v3bf16_to_v3f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v3bf16_to_v3f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v3bf16_to_v3f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v3bf16_to_v3f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <3 x bfloat> %load to <3 x float>
ret <3 x float> %fpext
}
; Extending load of four bfloat elements: load <4 x bfloat> from a global
; (addrspace 1) pointer and fpext it to <4 x float>.
; Every check prefix below expects a single 64-bit load into v[2:3]; each of
; the two loaded dwords is then expanded into two results, one by shifting it
; left by 16 and one by masking it with 0xffff0000.
define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v4bf16_to_v4f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v4bf16_to_v4f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v4bf16_to_v4f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v4bf16_to_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <4 x bfloat> %load to <4 x float>
ret <4 x float> %fpext
}
; Extending load of five bfloat elements (odd element count): load
; <5 x bfloat> from a global (addrspace 1) pointer and fpext it to <5 x float>.
; The expected load sequence differs between check prefixes:
;  - the GCN and GFX7 checks expect two loads, a 16-bit load at offset 8 for
;    the last element and a 64-bit load for the first four, each followed by
;    its own vmcnt wait before the loaded value is used;
;  - the GFX8 through GFX11 checks expect a single 128-bit load into v[2:5],
;    of which only v2, v3 and v4 are read afterwards.
; In all cases the results are formed by shifting a loaded dword left by 16 or
; masking it with 0xffff0000.
define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v5bf16_to_v5f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v5bf16_to_v5f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v5bf16_to_v5f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <5 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <5 x bfloat> %load to <5 x float>
ret <5 x float> %fpext
}
; Extending load of six bfloat elements: load <6 x bfloat> from a global
; (addrspace 1) pointer and fpext it to <6 x float>.
; The expected load width differs between check prefixes:
;  - the GCN checks expect a 128-bit load into v[3:6], of which only v3, v4
;    and v5 are read afterwards;
;  - all other check prefixes expect a 96-bit load into v[3:5].
; Each of the three used dwords is then expanded into two results, one by
; shifting it left by 16 and one by masking it with 0xffff0000.
define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v6bf16_to_v6f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v6bf16_to_v6f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v6bf16_to_v6f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx3 v[3:5], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx3 v[3:5], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b96 v[3:5], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <6 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <6 x bfloat> %load to <6 x float>
ret <6 x float> %fpext
}
; Extending load of eight bfloat elements: load <8 x bfloat> from a global
; (addrspace 1) pointer and fpext it to <8 x float>.
; Every check prefix below expects a single 128-bit load into v[4:7]; each of
; the four loaded dwords is then expanded into two results, one by shifting it
; left by 16 and one by masking it with 0xffff0000, filling v0 through v7.
define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v8bf16_to_v8f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v8bf16_to_v8f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v8bf16_to_v8f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <8 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <8 x bfloat> %load to <8 x float>
ret <8 x float> %fpext
}
define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v16bf16_to_v16f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v16bf16_to_v16f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v16bf16_to_v16f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <16 x bfloat> %load to <16 x float>
ret <16 x float> %fpext
}
; Extending load: reads a <32 x bfloat> through an addrspace(1) pointer and
; fpext's it to <32 x float>.
; Every expected sequence below issues four 128-bit loads and then widens each
; loaded dword into two 32-bit results: a left shift by 16 for the low 16 bits
; and a mask with 0xffff0000 for the high 16 bits. A vmcnt wait precedes each
; group of eight results so work on earlier loads can start before later
; loads have returned.
define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v32bf16_to_v32f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v32bf16_to_v32f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v32bf16_to_v32f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[2:3]
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[2:3]
; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
; GFX9-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16
; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
; GFX10-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16
; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32
; GFX11-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v21
; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v28
; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v29
; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v30
; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x float>
ret <32 x float> %fpext
}
; Extending load: reads a <2 x bfloat> through an addrspace(1) pointer and
; fpext's it to <2 x double>.
; Every expected sequence below loads a single dword, splits it into two
; 32-bit values (left shift by 16 for the low 16 bits, mask with 0xffff0000
; for the high 16 bits), and converts each with v_cvt_f64_f32.
define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v2bf16_to_v2f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v2bf16_to_v2f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v2bf16_to_v2f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v2bf16_to_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <2 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <2 x bfloat> %load to <2 x double>
ret <2 x double> %fpext
}
; Extending load: reads a <3 x bfloat> through an addrspace(1) pointer and
; fpext's it to <3 x double>.
; Every expected sequence below performs one 64-bit load, derives three 32-bit
; values from it (shift and mask on the first dword, shift only on the second),
; and converts each with v_cvt_f64_f32. The upper 16 bits of the second dword
; are never used.
define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v3bf16_to_v3f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v3bf16_to_v3f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v3bf16_to_v3f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v3bf16_to_v3f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <3 x bfloat> %load to <3 x double>
ret <3 x double> %fpext
}
; Extending load: reads a <4 x bfloat> through an addrspace(1) pointer and
; fpext's it to <4 x double>.
; Every expected sequence below performs one 64-bit load, splits each of the
; two dwords into a shifted low half and a masked high half, and converts the
; four resulting 32-bit values with v_cvt_f64_f32.
define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v4bf16_to_v4f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v4bf16_to_v4f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v4bf16_to_v4f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v4bf16_to_v4f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v4bf16_to_v4f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <4 x bfloat> %load to <4 x double>
ret <4 x double> %fpext
}
; Extending load: reads a <5 x bfloat> through an addrspace(1) pointer and
; fpext's it to <5 x double>.
; The expected load shape differs between targets. The first two check blocks
; fetch the fifth element with a separate 16-bit load at offset 8 alongside a
; 64-bit load for the first four. The remaining blocks use a single 128-bit
; load and take the fifth element from the low half of the third dword.
; In every case the five elements are widened by shift/mask and then converted
; with v_cvt_f64_f32.
define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v5bf16_to_v5f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v5bf16_to_v5f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <5 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <5 x bfloat> %load to <5 x double>
ret <5 x double> %fpext
}
; Extending load test - reads a <6 x bfloat> through an addrspace(1) pointer
; and widens it to <6 x double> with a single fpext.
;
; Every check group below expects the same shape of code
;   * one vector load of the whole value (dwordx4 under the GCN prefix, a
;     three-dword load, spelled dwordx3 or b96, under the other prefixes),
;   * per element, the 16 source bits end up in the upper half of a 32-bit
;     register (v_lshlrev_b32 by 16 for the low half of a loaded dword,
;     v_and_b32 with 0xffff0000 for the high half),
;   * one v_cvt_f64_f32 per element, writing consecutive 64-bit register pairs
;     v[0-1] through v[10-11].
;
; NOTE(review) the assertion lines look machine generated - presumably they
; should be regenerated with the check-update script instead of being edited
; by hand; confirm against the header of this file.
define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v6bf16_to_v6f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v6bf16_to_v6f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v6bf16_to_v6f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v6bf16_to_v6f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v6bf16_to_v6f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <6 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <6 x bfloat> %load to <6 x double>
ret <6 x double> %fpext
}
; Extending load test - reads a <8 x bfloat> through an addrspace(1) pointer
; and widens it to <8 x double> with a single fpext.
;
; Every check group below expects
;   * one four-dword vector load of the whole value (spelled dwordx4 or b128,
;     as a buffer, flat or global load depending on the prefix),
;   * per element, the 16 source bits placed in the upper half of a 32-bit
;     register (v_lshlrev_b32 by 16 for the low half of a loaded dword,
;     v_and_b32 with 0xffff0000 for the high half),
;   * eight v_cvt_f64_f32 instructions writing consecutive 64-bit register
;     pairs v[0-1] through v[14-15].
; Under the GFX10 and GFX11 prefixes the load lands in v[7-10] rather than
; v[0-3]; the instruction sequence is otherwise the same.
;
; NOTE(review) the assertion lines look machine generated - presumably they
; should be regenerated with the check-update script instead of being edited
; by hand; confirm against the header of this file.
define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v8bf16_to_v8f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v8bf16_to_v8f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v8bf16_to_v8f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v8bf16_to_v8f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v10
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v8bf16_to_v8f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <8 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <8 x bfloat> %load to <8 x double>
ret <8 x double> %fpext
}
; Extending load test - reads a <16 x bfloat> through an addrspace(1) pointer
; and widens it to <16 x double> with a single fpext.
;
; Every check group below expects
;   * two four-dword vector loads covering bytes 0-15 and 16-31 of the value,
;   * s_waitcnt vmcnt(1) before the first load's elements are unpacked and
;     s_waitcnt vmcnt(0) before the second load's elements are unpacked,
;   * per element, the 16 source bits placed in the upper half of a 32-bit
;     register (v_lshlrev_b32 by 16 for the low half of a loaded dword,
;     v_and_b32 with 0xffff0000 for the high half),
;   * sixteen v_cvt_f64_f32 instructions writing consecutive 64-bit register
;     pairs v[0-1] through v[30-31].
; Differences between the groups that are visible in the checks
;   * under the GFX8 prefix the second address is formed with an explicit
;     v_add_u32 / v_addc_u32 pair instead of an offset of 16 on the load,
;   * under the GFX10 and GFX11 prefixes the two loads are preceded by
;     s_clause 0x1.
;
; NOTE(review) the assertion lines look machine generated - presumably they
; should be regenerated with the check-update script instead of being edited
; by hand; confirm against the header of this file.
define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v16bf16_to_v16f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v16bf16_to_v16f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v4
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v16bf16_to_v16f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v16bf16_to_v16f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v7
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v9
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v9
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v11
; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v9
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v10
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v11
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v12
; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v14
; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v26
; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v16bf16_to_v16f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off
; GFX11-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v23
; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v24
; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v24
; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25
; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v26
; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v26
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v24
; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v27
; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28
; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <16 x bfloat> %load to <16 x double>
ret <16 x double> %fpext
}
define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GCN-LABEL: global_extload_v32bf16_to_v32f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2
; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4
; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6
; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8
; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
; GCN-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12
; GCN-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14
; GCN-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18
; GCN-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20
; GCN-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22
; GCN-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24
; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0
; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0
; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0
; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0
; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0
; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0
; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0
; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0
; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0
; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0
; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0
; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0
; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0
; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0
; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0
; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0
; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0
; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0
; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0
; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0
; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0
; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0
; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0
; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9
; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12
; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13
; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_extload_v32bf16_to_v32f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62
; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60
; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58
; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56
; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54
; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52
; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50
; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46
; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18
; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20
; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22
; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24
; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26
; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28
; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
; GFX7-NEXT: s_waitcnt vmcnt(14)
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0
; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0
; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0
; GFX7-NEXT: s_waitcnt vmcnt(14)
; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23
; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0
; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0
; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31
; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0
; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0
; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29
; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0
; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0
; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27
; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0
; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0
; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25
; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0
; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0
; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32
; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16
; GFX7-NEXT: s_waitcnt vmcnt(14)
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34
; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0
; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33
; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0
; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13
; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14
; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0
; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0
; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7
; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0
; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0
; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0
; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0
; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0
; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0
; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0
; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0
; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1
; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1
; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1
; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1
; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
; GFX8-NEXT: flat_load_ushort v44, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v45, v[50:51]
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v46, v[50:51]
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v47, v[52:53]
; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v56, v[54:55]
; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v57, v[39:40]
; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v58, v[39:40]
; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1
; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1
; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v42, v[42:43]
; GFX8-NEXT: flat_load_ushort v34, v[33:34]
; GFX8-NEXT: flat_load_ushort v36, v[35:36]
; GFX8-NEXT: flat_load_ushort v38, v[37:38]
; GFX8-NEXT: flat_load_ushort v39, v[48:49]
; GFX8-NEXT: flat_load_ushort v48, v[50:51]
; GFX8-NEXT: flat_load_ushort v51, v[52:53]
; GFX8-NEXT: flat_load_ushort v52, v[54:55]
; GFX8-NEXT: flat_load_ushort v53, v[40:41]
; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1
; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v37, v[3:4]
; GFX8-NEXT: flat_load_ushort v35, v[5:6]
; GFX8-NEXT: flat_load_ushort v33, v[7:8]
; GFX8-NEXT: flat_load_ushort v8, v[9:10]
; GFX8-NEXT: flat_load_ushort v6, v[11:12]
; GFX8-NEXT: flat_load_ushort v4, v[13:14]
; GFX8-NEXT: flat_load_ushort v2, v[15:16]
; GFX8-NEXT: flat_load_ushort v1, v[19:20]
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0
; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0
; GFX8-NEXT: s_waitcnt vmcnt(14)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
; GFX8-NEXT: flat_load_ushort v3, v[17:18]
; GFX8-NEXT: flat_load_ushort v5, v[21:22]
; GFX8-NEXT: flat_load_ushort v7, v[23:24]
; GFX8-NEXT: flat_load_ushort v9, v[25:26]
; GFX8-NEXT: flat_load_ushort v10, v[27:28]
; GFX8-NEXT: flat_load_ushort v11, v[29:30]
; GFX8-NEXT: flat_load_ushort v12, v[31:32]
; GFX8-NEXT: flat_load_ushort v13, v[49:50]
; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(14)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38
; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0
; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0
; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0
; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0
; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0
; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0
; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0
; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0
; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0
; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0
; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0
; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0
; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0
; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0
; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0
; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0
; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0
; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0
; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0
; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0
; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0
; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0
; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0
; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0
; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0
; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0
; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen
; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:62
; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:60
; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:58
; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:56
; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:54
; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:52
; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:50
; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:48
; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:46
; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:44
; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:42
; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:40
; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:38
; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:36
; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:34
; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:32
; GFX9-NEXT: global_load_ushort v26, v[1:2], off
; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:2
; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16
; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18
; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20
; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22
; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24
; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:30
; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26
; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28
; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4
; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6
; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8
; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10
; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12
; GFX9-NEXT: s_waitcnt vmcnt(29)
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11
; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v12
; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16
; GFX9-NEXT: s_waitcnt vmcnt(30)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
; GFX9-NEXT: s_waitcnt vmcnt(31)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18
; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v14
; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v15
; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
; GFX9-NEXT: s_waitcnt vmcnt(32)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19
; GFX9-NEXT: s_waitcnt vmcnt(30)
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v23
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204
; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200
; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196
; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192
; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v21
; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24
; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v19
; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v20
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172
; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168
; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164
; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152
; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2
; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144
; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v25
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11
; GFX9-NEXT: s_waitcnt vmcnt(38)
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
; GFX9-NEXT: s_waitcnt vmcnt(38)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112
; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v17
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26
; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v27
; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v2
; GFX9-NEXT: s_waitcnt vmcnt(41)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v2
; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32
; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104
; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v2
; GFX9-NEXT: s_waitcnt vmcnt(41)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33
; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v2
; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2
; GFX9-NEXT: s_waitcnt vmcnt(41)
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7
; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v10
; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44
; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_extload_v32bf16_to_v32f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1f
; GFX10-NEXT: global_load_ushort v3, v[1:2], off
; GFX10-NEXT: global_load_ushort v4, v[1:2], off offset:2
; GFX10-NEXT: global_load_ushort v5, v[1:2], off offset:4
; GFX10-NEXT: global_load_ushort v6, v[1:2], off offset:6
; GFX10-NEXT: global_load_ushort v7, v[1:2], off offset:8
; GFX10-NEXT: global_load_ushort v8, v[1:2], off offset:10
; GFX10-NEXT: global_load_ushort v9, v[1:2], off offset:12
; GFX10-NEXT: global_load_ushort v10, v[1:2], off offset:14
; GFX10-NEXT: global_load_ushort v11, v[1:2], off offset:16
; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:18
; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:20
; GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:22
; GFX10-NEXT: global_load_ushort v15, v[1:2], off offset:24
; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26
; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28
; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30
; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62
; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32
; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34
; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36
; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60
; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38
; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40
; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58
; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42
; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44
; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56
; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46
; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48
; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54
; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50
; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52
; GFX10-NEXT: s_waitcnt vmcnt(31)
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3
; GFX10-NEXT: s_waitcnt vmcnt(30)
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
; GFX10-NEXT: s_waitcnt vmcnt(29)
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5
; GFX10-NEXT: s_waitcnt vmcnt(28)
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6
; GFX10-NEXT: s_waitcnt vmcnt(27)
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7
; GFX10-NEXT: s_waitcnt vmcnt(26)
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8
; GFX10-NEXT: s_waitcnt vmcnt(25)
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9
; GFX10-NEXT: s_waitcnt vmcnt(24)
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
; GFX10-NEXT: s_waitcnt vmcnt(23)
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11
; GFX10-NEXT: s_waitcnt vmcnt(22)
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12
; GFX10-NEXT: s_waitcnt vmcnt(21)
; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13
; GFX10-NEXT: s_waitcnt vmcnt(20)
; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v14
; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35
; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36
; GFX10-NEXT: s_waitcnt vmcnt(17)
; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17
; GFX10-NEXT: s_waitcnt vmcnt(16)
; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18
; GFX10-NEXT: s_waitcnt vmcnt(15)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GFX10-NEXT: s_waitcnt vmcnt(14)
; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20
; GFX10-NEXT: s_waitcnt vmcnt(13)
; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21
; GFX10-NEXT: s_waitcnt vmcnt(12)
; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22
; GFX10-NEXT: s_waitcnt vmcnt(11)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23
; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX10-NEXT: s_waitcnt vmcnt(9)
; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27
; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30
; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33
; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34
; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31
; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24
; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71
; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68
; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16
; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70
; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23
; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37
; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38
; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240
; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25
; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232
; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27
; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48
; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224
; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81
; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49
; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80
; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212
; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208
; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69
; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64
; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50
; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51
; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67
; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39
; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196
; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192
; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55
; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180
; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176
; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53
; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172
; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168
; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52
; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160
; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156
; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152
; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140
; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128
; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116
; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112
; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92
; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88
; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84
; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76
; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72
; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68
; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64
; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60
; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56
; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52
; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48
; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4
; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:2
; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:12
; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:8
; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:4
; GFX11-NEXT: global_load_u16 v7, v[1:2], off
; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6
; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10
; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14
; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:18
; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:28
; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:24
; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:20
; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16
; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22
; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26
; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30
; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:34
; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:44
; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:40
; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:36
; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32
; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38
; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42
; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46
; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:50
; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:60
; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:56
; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:52
; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48
; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4
; GFX11-NEXT: s_waitcnt vmcnt(29)
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_waitcnt vmcnt(28)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: s_waitcnt vmcnt(27)
; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7
; GFX11-NEXT: s_waitcnt vmcnt(26)
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8
; GFX11-NEXT: s_waitcnt vmcnt(25)
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-NEXT: s_waitcnt vmcnt(24)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: s_waitcnt vmcnt(23)
; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v11
; GFX11-NEXT: s_waitcnt vmcnt(22)
; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v12
; GFX11-NEXT: s_waitcnt vmcnt(21)
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-NEXT: s_waitcnt vmcnt(20)
; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-NEXT: s_waitcnt vmcnt(19)
; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v15
; GFX11-NEXT: s_waitcnt vmcnt(18)
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v16
; GFX11-NEXT: s_waitcnt vmcnt(17)
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11-NEXT: s_waitcnt vmcnt(16)
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GFX11-NEXT: s_waitcnt vmcnt(15)
; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19
; GFX11-NEXT: s_waitcnt vmcnt(14)
; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20
; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11-NEXT: s_waitcnt vmcnt(12)
; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-NEXT: s_waitcnt vmcnt(11)
; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v23
; GFX11-NEXT: s_waitcnt vmcnt(10)
; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v27
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v65
; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v29
; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64
; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33
; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1
; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v30
; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v52
; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53
; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26
; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v49
; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25
; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v21
; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48
; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v22
; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v34
; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v35
; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v36
; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v101
; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18
; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v100
; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17
; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v13
; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v14
; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v39
; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10
; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v38
; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v6
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x double>
ret <32 x double> %fpext
}
; Scalar bfloat add: returns %a + %b as a single bfloat.
;
; The check blocks inside the function pin the expected instruction sequence
; for each check prefix. They appear to be machine-generated (presumably by
; update_llc_test_checks.py -- confirm against the file header), so regenerate
; them instead of editing by hand.
;
; What the expected sequences show:
;  * GCN and GFX7 blocks: each input is multiplied by 1.0 and masked with
;    0xffff0000, the two are added as f32, and the sum is masked with
;    0xffff0000 again. No rounding step appears in these two blocks.
;  * GFX8 and later blocks: each input is shifted left by 16, the two are added
;    as f32, then bit 16 of the sum (v_bfe_u32 ..., 16, 1) plus 0x7fff is added
;    to the sum -- this looks like round-to-nearest-even, TODO confirm.
;    v_cmp_u_f32 of the sum against itself feeds v_cndmask, which picks the
;    sum OR'd with 0x400000 in the unordered case and the biased value
;    otherwise. The result is then moved down by 16 bits.
;  * The 0x7fff bias is added with two v_add_u32 in the GFX8 block, with
;    v_add3_u32 and an SGPR operand (s4) in the GFX9 block, and with
;    v_add3_u32 and a literal operand in the GFX10 and GFX11 blocks.
;  * The two GFX11 blocks differ only in the last step before the return:
;    v_mov_b16 v0.l, v0.h in one, v_lshrrev_b32 by 16 in the other.
define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fadd_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test: one bfloat fadd whose result is returned directly.
%op = fadd bfloat %a, %b
ret bfloat %op
}
define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-LABEL: v_fadd_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v1, v1, v3
; GCN-NEXT: v_add_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add_f32_e32 v2, v3, v2
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fadd <2 x bfloat> %a, %b
ret <2 x bfloat> %op
}
; Element-wise fadd of two <3 x bfloat> vectors, returning the <3 x bfloat> sum.
; The check blocks below pin the expected machine code for the GCN, GFX7, GFX8,
; GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16 prefixes.
;
; What the checked sequences show:
;  * GCN/GFX7 work on six VGPRs (v0-v5), mask each with 0xffff0000 and add them
;    pairwise (v0+v3, v1+v4, v2+v5), then mask the three results again.
;  * GFX8 and newer work on four VGPRs (v0-v3). v0/v2 are split into a low half
;    (v_lshlrev_b32 by 16) and a high half (v_and_b32 with 0xffff0000); for
;    v1/v3 only the low half is used. Each f32 sum is followed by a
;    v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32
;    group - presumably the f32 -> bf16 rounding with NaN handling; TODO confirm.
;
; NOTE(review): on GFX9, GFX10 and GFX11FAKE16 the last v_alignbit_b32 writing
; v1 takes an SGPR (s4 / s0) as its first source; on GFX10 and GFX11FAKE16 no
; checked instruction writes that SGPR. Presumably this feeds the unused padding
; lane of the <3 x bfloat> result - confirm before treating it as a defect.
; NOTE(review): the check lines look script-generated (update_llc_test_checks.py
; style); regenerate them rather than hand-editing - confirm against the file
; header.
define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-LABEL: v_fadd_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v2, v2, v5
; GCN-NEXT: v_add_f32_e32 v1, v1, v4
; GCN-NEXT: v_add_f32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add_f32_e32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fadd <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
; Element-wise fadd of two <4 x bfloat> vectors, returning the <4 x bfloat> sum.
; The check blocks below pin the expected machine code for the GCN, GFX7, GFX8,
; GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16 prefixes.
;
; What the checked sequences show:
;  * GCN/GFX7 work on eight VGPRs (v0-v7), mask each with 0xffff0000 and add
;    them pairwise (v0+v4, v1+v5, v2+v6, v3+v7), then mask the four results.
;  * GFX8 and newer work on four VGPRs (v0-v3). Each register is split into a
;    low half (v_lshlrev_b32 by 16) and a high half (v_and_b32 with 0xffff0000);
;    halves of v0 are added to halves of v2, and halves of v1 to halves of v3.
;    Each f32 sum is followed by a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
;    v_cmp_u_f32 / v_cndmask_b32 group - presumably the f32 -> bf16 rounding
;    with NaN handling; TODO confirm.
;  * The two results per register are repacked into v0/v1 with v_alignbit_b32
;    (GFX8), v_perm_b32 with selector 0x7060302 (GFX9, GFX10, GFX11FAKE16) or
;    v_mov_b16 + v_bfi_b32 (GFX11TRUE16).
;
; NOTE(review): the check lines look script-generated (update_llc_test_checks.py
; style); regenerate them rather than hand-editing - confirm against the file
; header.
define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_fadd_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v3, v3, v7
; GCN-NEXT: v_add_f32_e32 v2, v2, v6
; GCN-NEXT: v_add_f32_e32 v1, v1, v5
; GCN-NEXT: v_add_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_add_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_add_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_add_f32_e32 v3, v7, v6
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fadd <4 x bfloat> %a, %b
ret <4 x bfloat> %op
}
define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-LABEL: v_fadd_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v7, v7, v15
; GCN-NEXT: v_add_f32_e32 v6, v6, v14
; GCN-NEXT: v_add_f32_e32 v5, v5, v13
; GCN-NEXT: v_add_f32_e32 v4, v4, v12
; GCN-NEXT: v_add_f32_e32 v3, v3, v11
; GCN-NEXT: v_add_f32_e32 v2, v2, v10
; GCN-NEXT: v_add_f32_e32 v1, v1, v9
; GCN-NEXT: v_add_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v7, v7, v15
; GFX7-NEXT: v_add_f32_e32 v6, v6, v14
; GFX7-NEXT: v_add_f32_e32 v5, v5, v13
; GFX7-NEXT: v_add_f32_e32 v4, v4, v12
; GFX7-NEXT: v_add_f32_e32 v3, v3, v11
; GFX7-NEXT: v_add_f32_e32 v2, v2, v10
; GFX7-NEXT: v_add_f32_e32 v1, v1, v9
; GFX7-NEXT: v_add_f32_e32 v0, v0, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_add_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_add_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_add_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_add_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_add_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX10-NEXT: v_add_f32_e32 v7, v10, v9
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_add_f32_e32 v6, v10, v6
; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
; GFX10-NEXT: v_add_f32_e32 v5, v15, v13
; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
; GFX11TRUE16-NEXT: v_add_f32_e32 v7, v10, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11TRUE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v10, v12, v7, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v7, v12, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v6, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
; GFX11TRUE16-NEXT: v_dual_add_f32 v5, v12, v11 :: v_dual_cndmask_b32 v2, v7, v10
; GFX11TRUE16-NEXT: v_add_f32_e32 v9, v14, v13
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v8
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v9, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v9, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v7, v13, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v5
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v8bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add_f32_e32 v3, v3, v7
; GFX11FAKE16-NEXT: v_add_f32_e32 v7, v10, v9
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add_f32_e32 v6, v10, v6
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v4
; GFX11FAKE16-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX11FAKE16-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11FAKE16-NEXT: v_add_f32_e32 v5, v15, v13
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %op
}
define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_fadd_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_add_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_add_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_add_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_add_f32_e32 v11, v11, v27
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_add_f32_e32 v10, v10, v26
; GCN-NEXT: v_add_f32_e32 v9, v9, v25
; GCN-NEXT: v_add_f32_e32 v8, v8, v24
; GCN-NEXT: v_add_f32_e32 v7, v7, v23
; GCN-NEXT: v_add_f32_e32 v6, v6, v22
; GCN-NEXT: v_add_f32_e32 v5, v5, v21
; GCN-NEXT: v_add_f32_e32 v4, v4, v20
; GCN-NEXT: v_add_f32_e32 v3, v3, v19
; GCN-NEXT: v_add_f32_e32 v2, v2, v18
; GCN-NEXT: v_add_f32_e32 v1, v1, v17
; GCN-NEXT: v_add_f32_e32 v0, v0, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_add_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v14, v14, v30
; GFX7-NEXT: v_add_f32_e32 v13, v13, v29
; GFX7-NEXT: v_add_f32_e32 v12, v12, v28
; GFX7-NEXT: v_add_f32_e32 v10, v10, v26
; GFX7-NEXT: v_add_f32_e32 v9, v9, v25
; GFX7-NEXT: v_add_f32_e32 v8, v8, v24
; GFX7-NEXT: v_add_f32_e32 v7, v7, v23
; GFX7-NEXT: v_add_f32_e32 v5, v5, v21
; GFX7-NEXT: v_add_f32_e32 v4, v4, v20
; GFX7-NEXT: v_add_f32_e32 v3, v3, v19
; GFX7-NEXT: v_add_f32_e32 v2, v2, v18
; GFX7-NEXT: v_add_f32_e32 v1, v1, v17
; GFX7-NEXT: v_add_f32_e32 v0, v0, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_add_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX8-NEXT: v_add_f32_e32 v16, v17, v16
; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v7, v7, v15
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_add_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_add_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_add_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_add_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_add_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_add_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_add_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_add_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_add_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_add_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_add_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX10-NEXT: v_add_f32_e32 v17, v20, v19
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX10-NEXT: v_add_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_add_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_add_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v18, v19, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v19, v22, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
; GFX10-NEXT: v_add_f32_e32 v9, v22, v20
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_add_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_v16bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_add_f32 v18, v19, v18 :: v_dual_and_b32 v23, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_add_f32 v7, v7, v15 :: v_dual_and_b32 v14, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v7, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v17, v17, v7, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v18, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo
; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_add_f32 v14, v17, v14 :: v_dual_lshlrev_b32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_add_f32_e32 v5, v5, v13
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v20, v6, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v14, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v5, 16, 1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v15
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v18, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v14
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v5, 0x7fff
; GFX11TRUE16-NEXT: v_add_f32_e32 v4, v4, v12
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_add_f32_e32 v18, v21, v20
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v4, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v18
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v5, v19, v20 :: v_dual_and_b32 v20, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v12, v21, v18, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v17, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX11TRUE16-NEXT: v_dual_add_f32 v18, v20, v19 :: v_dual_and_b32 v19, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v4, v16, v17 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX11TRUE16-NEXT: v_add_f32_e32 v16, v20, v19
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v16, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v3, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v16, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v11, v11, v18, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_add3_u32 v10, v17, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v10, v17, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_add3_u32 v16, v22, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v1, 16, 1
; GFX11TRUE16-NEXT: v_add_f32_e32 v9, v22, v21
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v8
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v1, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_add_f32_e32 v17, v24, v23
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: v_add3_u32 v19, v23, v17, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v10
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v23, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v9
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v8, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v11
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v16bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add_f32_e32 v17, v18, v17
; GFX11FAKE16-NEXT: v_add_f32_e32 v6, v6, v14
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v7, v7, v15
; GFX11FAKE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_add_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
; GFX11FAKE16-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v4, v4, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_add_f32_e32 v5, v5, v13
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18
; GFX11FAKE16-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_add_f32_e32 v12, v18, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add_f32_e32 v18, v19, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_add_f32_e32 v3, v3, v11
; GFX11FAKE16-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX11FAKE16-NEXT: v_add_f32_e32 v19, v22, v20
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_add_f32 v1, v1, v9
; GFX11FAKE16-NEXT: v_add_f32_e32 v9, v22, v20
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fadd <16 x bfloat> %a, %b
ret <16 x bfloat> %op
}
define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_fadd_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_add_f32_e32 v31, v31, v32
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_add_f32_e32 v30, v30, v32
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_add_f32_e32 v29, v29, v32
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_add_f32_e32 v28, v28, v32
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_add_f32_e32 v27, v27, v32
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_add_f32_e32 v26, v26, v32
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_add_f32_e32 v25, v25, v32
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_add_f32_e32 v24, v24, v32
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_add_f32_e32 v23, v23, v32
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_add_f32_e32 v22, v22, v32
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_add_f32_e32 v21, v21, v32
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_add_f32_e32 v20, v20, v32
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_add_f32_e32 v19, v19, v32
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_add_f32_e32 v18, v18, v32
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_add_f32_e32 v17, v17, v32
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_add_f32_e32 v16, v16, v32
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_add_f32_e32 v15, v15, v32
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_add_f32_e32 v14, v14, v32
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_add_f32_e32 v13, v13, v32
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_add_f32_e32 v12, v12, v32
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_add_f32_e32 v11, v11, v32
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_add_f32_e32 v10, v10, v32
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_add_f32_e32 v9, v9, v32
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_add_f32_e32 v8, v8, v32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_add_f32_e32 v7, v7, v32
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_add_f32_e32 v6, v6, v32
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_add_f32_e32 v5, v5, v32
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_add_f32_e32 v4, v4, v32
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_add_f32_e32 v3, v3, v32
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_add_f32_e32 v2, v2, v32
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_add_f32_e32 v1, v1, v32
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_add_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: v_add_f32_e32 v31, v31, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_add_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_add_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_add_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX8-NEXT: v_add_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_add_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_add_f32_e32 v33, v33, v34
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_add_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_add_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_add_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_add_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_add_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_add_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_add_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_add_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_add_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_add_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_add_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_add_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_add_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_add_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_add_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_add_f32_e32 v31, v32, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_add_f32_e32 v32, v32, v29
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
; GFX9-NEXT: v_add_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_add_f32_e32 v29, v15, v29
; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_add_f32_e32 v28, v33, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_add_f32_e32 v27, v33, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_add_f32_e32 v26, v33, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_add_f32_e32 v25, v33, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_add_f32_e32 v24, v33, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_add_f32_e32 v23, v33, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_add_f32_e32 v22, v33, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_add_f32_e32 v21, v33, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_add_f32_e32 v20, v33, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_add_f32_e32 v19, v33, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_add_f32_e32 v18, v33, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_add_f32_e32 v17, v33, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_add_f32_e32 v31, v32, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12
; GFX10-NEXT: v_add_f32_e32 v30, v14, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31
; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1
; GFX10-NEXT: v_add_f32_e32 v33, v33, v14
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v35, v13, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5
; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_add_f32_e32 v34, v36, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_add_f32_e32 v33, v12, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_add_f32_e32 v35, v36, v12
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_add_f32_e32 v34, v11, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_add_f32_e32 v33, v36, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3
; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_add_f32_e32 v35, v10, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2
; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_add_f32_e32 v34, v36, v10
; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24
; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1
; GFX10-NEXT: v_cndmask_b32_e32 v10, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_add_f32_e32 v33, v36, v33
; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7
; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_add_f32_e32 v24, v35, v24
; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1
; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24
; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v34, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7
; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_add_f32_e32 v8, v34, v8
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v35, v38, v37
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21
; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6
; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1
; GFX10-NEXT: v_add_f32_e32 v5, v5, v8
; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20
; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35
; GFX10-NEXT: v_add_f32_e32 v8, v21, v8
; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_add_f32_e32 v48, v49, v48
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4
; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1
; GFX10-NEXT: v_add_f32_e32 v49, v51, v49
; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff
; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8
; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17
; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9
; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10
; GFX10-NEXT: v_add_f32_e32 v49, v52, v49
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s11
; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1
; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49
; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13
; GFX10-NEXT: v_add_f32_e32 v17, v49, v17
; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo
; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17
; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo
; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302
; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302
; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302
; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302
; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v37, v7, 0x7060302
; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_add_f32_e32 v17, v33, v8
; GFX10-NEXT: v_add_f32_e32 v15, v15, v16
; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302
; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff
; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_v32bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11TRUE16-NEXT: v_add_f32_e32 v5, v5, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24
; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add_f32_e32 v3, v3, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11TRUE16-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v18
; GFX11TRUE16-NEXT: v_dual_add_f32 v18, v84, v83 :: v_dual_add_f32 v9, v9, v25
; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11TRUE16-NEXT: v_add_f32_e32 v17, v86, v85
; GFX11TRUE16-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v16
; GFX11TRUE16-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28
; GFX11TRUE16-NEXT: v_add_f32_e32 v7, v7, v23
; GFX11TRUE16-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28
; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_add_f32_e32 v4, v4, v20
; GFX11TRUE16-NEXT: v_add_f32_e32 v20, v80, v71
; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11TRUE16-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11TRUE16-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27
; GFX11TRUE16-NEXT: v_dual_add_f32 v26, v52, v51 :: v_dual_add_f32 v25, v54, v53
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add_f32_e32 v6, v6, v22
; GFX11TRUE16-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_add_f32_e32 v22, v68, v67
; GFX11TRUE16-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_add_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add_f32_e32 v14, v14, v30
; GFX11TRUE16-NEXT: v_add_f32_e32 v28, v48, v39
; GFX11TRUE16-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27
; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28
; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26
; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30
; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v32
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add_f32_e32 v15, v15, v33
; GFX11TRUE16-NEXT: v_add_f32_e32 v17, v31, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v17, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v32bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX11FAKE16-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19
; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16
; GFX11FAKE16-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4
; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v18
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v16
; GFX11FAKE16-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11FAKE16-NEXT: v_add_f32_e32 v7, v7, v23
; GFX11FAKE16-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83
; GFX11FAKE16-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20
; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_add_f32_e32 v4, v4, v20
; GFX11FAKE16-NEXT: v_add_f32_e32 v20, v80, v71
; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX11FAKE16-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11FAKE16-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX11FAKE16-NEXT: v_add_f32_e32 v26, v52, v51
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add_f32_e32 v6, v6, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
; GFX11FAKE16-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
; GFX11FAKE16-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add_f32_e32 v29, v38, v37
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX11FAKE16-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add_f32_e32 v14, v14, v30
; GFX11FAKE16-NEXT: v_add_f32_e32 v28, v48, v39
; GFX11FAKE16-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11FAKE16-NEXT: v_add_f32_e32 v15, v15, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11FAKE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fadd <32 x bfloat> %a, %b
ret <32 x bfloat> %op
}
; Scalar bfloat fadd of the argument with the floating-point immediate 1.0.
; The expectations below pin the emitted sequence for every check prefix used
; by this function. In all of them the constant is expected directly as the
; `1.0` source operand of v_add_f32_e32. The GCN and GFX7 expectations mask the
; value with 0xffff0000 before and after the add; the remaining prefixes shift
; the input left by 16, add, and then bring the upper half of the result back
; down (v_lshrrev_b32, or v_mov_b16 v0.l, v0.h for the true16 variant).
; NOTE(review) the check lines look script-generated (update_llc_test_checks
; style); presumably they should be regenerated rather than edited by hand,
; confirm against the header at the top of this file.
define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GCN-LABEL: v_fadd_bf16_fpimm_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16_fpimm_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16_fpimm_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_bf16_fpimm_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16_fpimm_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_0:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_bf16_fpimm_0:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%add = fadd bfloat %arg0, 1.0
ret bfloat %add
}
; Test fadd of a bfloat argument with the floating-point constant 42.0.
; In every check block below the constant shows up as the 32-bit literal
; 0x42280000 operand of v_add_f32.
;
; Expected shape per check prefix, as recorded in the assertions
;  - GCN and GFX7 multiply the input by 1.0, mask it with 0xffff0000, do the
;    f32 add, and mask the result with 0xffff0000 again.
;  - GFX8, GFX9 and GFX10 shift the input left by 16, do the f32 add, then run
;    a v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u / v_cndmask sequence
;    and shift the result right by 16. This looks like round-to-nearest-even
;    narrowing with NaN handling, but confirm against the bf16 lowering.
;  - GFX11FAKE16 matches GFX10 plus s_delay_alu hints. GFX11TRUE16 ends with
;    v_mov_b16 v0.l, v0.h instead of the 16-bit right shift.
;
; The assertion lines look autogenerated (presumably by
; update_llc_test_checks.py, to be confirmed against the file header), so
; regenerate them rather than editing by hand.
define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GCN-LABEL: v_fadd_bf16_fpimm_1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16_fpimm_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16_fpimm_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_bf16_fpimm_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fadd_bf16_fpimm_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_1:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_bf16_fpimm_1:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%add = fadd bfloat %arg0, 42.0
ret bfloat %add
}
; Test scalar bfloat subtraction, returning %a - %b.
;
; Expected shape per check prefix, as recorded in the assertions
;  - GCN and GFX7 multiply both inputs (v0 and v1) by 1.0, mask each with
;    0xffff0000, emit v_sub_f32 v0, v0, v1, and mask the result with
;    0xffff0000.
;  - GFX8, GFX9 and GFX10 shift both inputs left by 16, emit the same
;    v_sub_f32, then run a v_bfe_u32 / add 0x7fff / v_or 0x400000 / v_cmp_u /
;    v_cndmask sequence and shift the result right by 16. This looks like
;    round-to-nearest-even narrowing with NaN handling, but confirm against
;    the bf16 lowering.
;  - GFX11FAKE16 matches GFX10 plus s_delay_alu hints. GFX11TRUE16 ends with
;    v_mov_b16 v0.l, v0.h instead of the 16-bit right shift.
;
; The assertion lines look autogenerated (presumably by
; update_llc_test_checks.py, to be confirmed against the file header), so
; regenerate them rather than editing by hand.
define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fsub_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fsub_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fsub_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fsub bfloat %a, %b
ret bfloat %op
}
; Test element-wise subtraction of two <2 x bfloat> vectors, returning %a - %b.
;
; Expected shape per check prefix, as recorded in the assertions
;  - GCN and GFX7 use four input registers. The subtractions are
;    v_sub_f32 v0, v0, v2 and v_sub_f32 v1, v1, v3, each operand having been
;    multiplied by 1.0 and masked with 0xffff0000, and each result is masked
;    with 0xffff0000 again.
;  - GFX8 and later use only v0 and v1 as inputs. One element of each is
;    obtained with a left shift by 16 and the other with an and of
;    0xffff0000, so the two elements appear to be packed in one register.
;    Each f32 difference then goes through the v_bfe_u32 / add 0x7fff /
;    v_or 0x400000 / v_cmp_u / v_cndmask sequence (presumably rounding back to
;    bf16, confirm against the lowering).
;  - The two results are recombined into v0 with v_alignbit_b32 on GFX8, with
;    v_perm_b32 and selector 0x7060302 on GFX9, GFX10 and GFX11FAKE16, and
;    with v_mov_b16 plus v_bfi_b32 0xffff on GFX11TRUE16.
;
; The assertion lines look autogenerated (presumably by
; update_llc_test_checks.py, to be confirmed against the file header), so
; regenerate them rather than editing by hand. Instruction order within each
; block is whatever the scheduler produced and is part of what is checked.
define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-LABEL: v_fsub_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_sub_f32_e32 v1, v1, v3
; GCN-NEXT: v_sub_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fsub_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fsub_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fsub <2 x bfloat> %a, %b
ret <2 x bfloat> %op
}
; Test element-wise subtraction of two <3 x bfloat> vectors, returning %a - %b.
;
; Expected shape per check prefix, as recorded in the assertions
;  - GCN and GFX7 use six input registers. The subtractions are
;    v_sub_f32 v0, v0, v3, v_sub_f32 v1, v1, v4 and v_sub_f32 v2, v2, v5, each
;    operand having been multiplied by 1.0 and masked with 0xffff0000, and
;    each result is masked with 0xffff0000 again.
;  - GFX8 and later use v0..v3 as inputs. v0 and v2 are each split with a
;    left shift by 16 and an and of 0xffff0000, while v1 and v3 are only
;    shifted left by 16, so they appear to carry a single element each.
;    Every f32 difference goes through the v_bfe_u32 / add 0x7fff /
;    v_or 0x400000 / v_cmp_u / v_cndmask sequence (presumably rounding back to
;    bf16, confirm against the lowering).
;  - The two results derived from v0 and v2 are recombined into v0 with
;    v_alignbit_b32 on GFX8, v_perm_b32 with selector 0x7060302 on GFX9,
;    GFX10 and GFX11FAKE16, and v_mov_b16 plus v_bfi_b32 0xffff on
;    GFX11TRUE16.
;  - The remaining result is returned in v1. GFX9, GFX10 and GFX11FAKE16
;    finish with v_alignbit_b32 whose high source is an SGPR (s4 or s0). On
;    GFX9 that register last held the perm selector, and in the GFX10 and
;    GFX11FAKE16 blocks it is never written. Presumably the upper half of v1
;    is a don't-care lane, to be confirmed before relying on it.
;  - The GFX11 blocks also expect v_dual_* pairings, so they are sensitive to
;    scheduling changes.
;
; The assertion lines look autogenerated (presumably by
; update_llc_test_checks.py, to be confirmed against the file header), so
; regenerate them rather than editing by hand.
define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-LABEL: v_fsub_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
; GCN-NEXT: v_sub_f32_e32 v1, v1, v4
; GCN-NEXT: v_sub_f32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fsub_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fsub_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11FAKE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fsub <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_fsub_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v3, v7
; GCN-NEXT: v_sub_f32_e32 v2, v2, v6
; GCN-NEXT: v_sub_f32_e32 v1, v1, v5
; GCN-NEXT: v_sub_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7
; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fsub_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fsub_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fsub_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fsub <4 x bfloat> %a, %b
ret <4 x bfloat> %op
}
; Test case: scalar bfloat multiply (a single `fmul bfloat`), returned directly.
; The check lines below pin the expected ISA for the GCN, GFX7, GFX8, GFX9,
; GFX10, GFX11TRUE16 and GFX11FAKE16 prefixes.
;  * GCN and GFX7 expectations: each operand is multiplied by 1.0 and masked
;    with 0xffff0000, the f32 product is formed with v_mul_f32, and the result
;    is masked with 0xffff0000 again.
;  * GFX8 and later expectations: both operands are shifted left by 16, the
;    product is formed with v_mul_f32, then bit 16 of the product plus 0x7fff
;    is added to it; v_cmp_u of the product with itself selects
;    (product | 0x400000) instead, and the upper 16 bits are moved down
;    (v_lshrrev by 16, or v_mov_b16 v0.l, v0.h for GFX11TRUE16).
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated rather than edited by hand - confirm against the file header.
define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fmul_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmul_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul bfloat %a, %b
ret bfloat %op
}
; Test case: element-wise multiply of two <2 x bfloat> vectors (a single
; vector `fmul`), returned directly. The check lines below pin the expected
; ISA for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16
; prefixes.
;  * GCN and GFX7 expectations: operands appear one element per VGPR
;    (v0/v1 times v2/v3); each is multiplied by 1.0 and masked with
;    0xffff0000 before v_mul_f32, and each product is masked again.
;  * GFX8 and later expectations: both elements of a vector share one VGPR.
;    The low element is extracted with a left shift by 16 and the high element
;    with an AND of 0xffff0000; each f32 product then goes through the
;    bfe / add 0x7fff / or 0x400000 / v_cmp_u / v_cndmask sequence.
;  * The two results are repacked into v0 with v_lshrrev + v_alignbit (GFX8),
;    v_perm_b32 with selector 0x7060302 (GFX9, GFX10, GFX11FAKE16), or
;    v_mov_b16 + v_bfi_b32 (GFX11TRUE16).
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated rather than edited by hand - confirm against the file header.
define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-LABEL: v_fmul_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmul_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul <2 x bfloat> %a, %b
ret <2 x bfloat> %op
}
; Test case: element-wise multiply of two <3 x bfloat> vectors (a single
; vector `fmul`), returned directly. The check lines below pin the expected
; ISA for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16
; prefixes.
;  * GCN and GFX7 expectations: operands appear one element per VGPR
;    (v0..v2 times v3..v5); each is multiplied by 1.0 and masked with
;    0xffff0000 before v_mul_f32, and each product is masked again.
;  * GFX8 and later expectations: the first two elements share v0 (and v2 for
;    the second operand) and the third element is taken from the low half of
;    v1 / v3 via a left shift by 16. Each f32 product goes through the
;    bfe / add 0x7fff / or 0x400000 / v_cmp_u / v_cndmask sequence, the first
;    two results are repacked into v0, and the third result is moved to the
;    low half of v1.
; NOTE(review): in the GFX10 and GFX11FAKE16 sequences the final
; v_alignbit_b32 reads an SGPR (s4 / s0) that no earlier expected instruction
; writes; presumably the unused upper half of v1 is undefined - confirm.
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated rather than edited by hand - confirm against the file header.
define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-LABEL: v_fmul_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmul_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_fmul_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmul_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul <4 x bfloat> %a, %b
ret <4 x bfloat> %op
}
; v_fmul_v8bf16: element-wise fmul of two <8 x bfloat> vectors, returned as
; <8 x bfloat>. The assertion blocks below pin the expected instruction
; sequence for each check prefix (GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16
; and GFX11FAKE16); every block ends in s_setpc_b64 s[30:31].
; - GCN and GFX7 blocks: each of v0-v15 is first multiplied by 1.0 and masked
;   with 0xffff0000, the eight products are formed with v_mul_f32, and each
;   product is masked with 0xffff0000 again; results stay in v0-v7.
; - GFX8, GFX9, GFX10 and both GFX11 blocks: the two 16-bit halves of each
;   register are widened (v_lshlrev_b32 by 16 / v_and_b32 with 0xffff0000) and
;   multiplied with v_mul_f32. Each product then gets v_bfe_u32 bit 16 plus
;   0x7fff added, unless a v_cmp_u_f32 self-compare selects the 0x400000-OR'ed
;   value instead. Results are repacked into v0-v3 (v_alignbit_b32,
;   v_perm_b32 or v_bfi_b32).
; NOTE(review): the add/select sequence looks like round-to-nearest-even with
; NaN quieting, and these assertion lines look autogenerated -- confirm both,
; and regenerate the lines with the update script rather than editing by hand.
define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-LABEL: v_fmul_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v7, v10, v9
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_mul_f32_e32 v6, v10, v6
; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13
; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmul_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v2, v2, v6 :: v_dual_mul_f32 v3, v3, v7
; GFX11TRUE16-NEXT: v_mul_f32_e32 v7, v10, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11TRUE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v10, v12, v7, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v7, v12, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v6, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5
; GFX11TRUE16-NEXT: v_dual_mul_f32 v5, v12, v11 :: v_dual_cndmask_b32 v2, v7, v10
; GFX11TRUE16-NEXT: v_mul_f32_e32 v9, v14, v13
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v8
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v9, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v9, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v7, v13, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v5
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v8bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX11FAKE16-NEXT: v_mul_f32_e32 v7, v10, v9
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v6, v10, v6
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11FAKE16-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX11FAKE16-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11FAKE16-NEXT: v_mul_f32_e32 v5, v15, v13
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul <8 x bfloat> %a, %b
ret <8 x bfloat> %op
}
define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_fmul_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_mul_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_mul_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21
; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX10-NEXT: v_mul_f32_e32 v17, v20, v19
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v18, v19, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v19, v22, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmul_v16bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v18, v19, v18 :: v_dual_and_b32 v23, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v7, v7, v15 :: v_dual_and_b32 v14, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v7, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v17, v17, v7, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v18, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo
; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v14, v17, v14 :: v_dual_lshlrev_b32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v20, v6, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v14, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v5, 16, 1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v15
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v18, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v14
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v5, 0x7fff
; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_mul_f32_e32 v18, v21, v20
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v4, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v18
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v5, v19, v20 :: v_dual_and_b32 v20, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v12, v21, v18, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v17, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX11TRUE16-NEXT: v_dual_mul_f32 v18, v20, v19 :: v_dual_and_b32 v19, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v4, v16, v17 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX11TRUE16-NEXT: v_mul_f32_e32 v16, v20, v19
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_mul_f32 v3, v3, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v16, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v3, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v16, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v11, v11, v18, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_add3_u32 v10, v17, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v10, v17, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_add3_u32 v16, v22, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v1, 16, 1
; GFX11TRUE16-NEXT: v_mul_f32_e32 v9, v22, v21
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v1, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_mul_f32_e32 v17, v24, v23
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: v_add3_u32 v19, v23, v17, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v10
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v23, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v9
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v8, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v11
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v16bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v17, v18, v17
; GFX11FAKE16-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX11FAKE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
; GFX11FAKE16-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18
; GFX11FAKE16-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_mul_f32_e32 v12, v18, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v18, v19, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX11FAKE16-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX11FAKE16-NEXT: v_mul_f32_e32 v19, v22, v20
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_mul_f32 v1, v1, v9
; GFX11FAKE16-NEXT: v_mul_f32_e32 v9, v22, v20
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul <16 x bfloat> %a, %b
ret <16 x bfloat> %op
}
define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_fmul_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_mul_f32_e32 v31, v31, v32
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_mul_f32_e32 v30, v30, v32
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v29, v29, v32
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v27, v27, v32
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v25, v25, v32
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v23, v23, v32
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v21, v21, v32
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v19, v19, v32
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v17, v17, v32
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v15, v15, v32
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v13, v13, v32
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v11, v11, v32
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v9, v9, v32
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v7, v7, v32
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v5, v5, v32
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v3, v3, v32
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v1, v1, v32
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_mul_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX8-NEXT: v_mul_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_mul_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_mul_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_mul_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_mul_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_mul_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29
; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmul_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_mul_f32_e32 v31, v32, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12
; GFX10-NEXT: v_mul_f32_e32 v30, v14, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31
; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v33, v33, v14
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v35, v13, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5
; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v34, v36, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_mul_f32_e32 v33, v12, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_mul_f32_e32 v35, v36, v12
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_mul_f32_e32 v34, v11, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_mul_f32_e32 v33, v36, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3
; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_mul_f32_e32 v35, v10, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2
; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_mul_f32_e32 v34, v36, v10
; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24
; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1
; GFX10-NEXT: v_cndmask_b32_e32 v10, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_mul_f32_e32 v33, v36, v33
; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7
; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_mul_f32_e32 v24, v35, v24
; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1
; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24
; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v34, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7
; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_mul_f32_e32 v8, v34, v8
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v35, v38, v37
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21
; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6
; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v8
; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20
; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35
; GFX10-NEXT: v_mul_f32_e32 v8, v21, v8
; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v48, v49, v48
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4
; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v49, v51, v49
; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff
; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8
; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9
; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10
; GFX10-NEXT: v_mul_f32_e32 v49, v52, v49
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s11
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1
; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49
; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13
; GFX10-NEXT: v_mul_f32_e32 v17, v49, v17
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo
; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17
; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo
; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302
; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302
; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302
; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302
; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v37, v7, 0x7060302
; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_mul_f32_e32 v17, v33, v8
; GFX10-NEXT: v_mul_f32_e32 v15, v15, v16
; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302
; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff
; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmul_v32bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11TRUE16-NEXT: v_mul_f32_e32 v5, v5, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24
; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11TRUE16-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11TRUE16-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX11TRUE16-NEXT: v_dual_mul_f32 v18, v84, v83 :: v_dual_mul_f32 v9, v9, v25
; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11TRUE16-NEXT: v_mul_f32_e32 v17, v86, v85
; GFX11TRUE16-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX11TRUE16-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28
; GFX11TRUE16-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX11TRUE16-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28
; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX11TRUE16-NEXT: v_mul_f32_e32 v20, v80, v71
; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11TRUE16-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11TRUE16-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27
; GFX11TRUE16-NEXT: v_dual_mul_f32 v26, v52, v51 :: v_dual_mul_f32 v25, v54, v53
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX11TRUE16-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_mul_f32_e32 v22, v68, v67
; GFX11TRUE16-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX11TRUE16-NEXT: v_mul_f32_e32 v28, v48, v39
; GFX11TRUE16-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27
; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28
; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26
; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30
; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v32
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v15, v15, v33
; GFX11TRUE16-NEXT: v_mul_f32_e32 v17, v31, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v17, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v32bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX11FAKE16-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19
; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16
; GFX11FAKE16-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4
; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
; GFX11FAKE16-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX11FAKE16-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11FAKE16-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX11FAKE16-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83
; GFX11FAKE16-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20
; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX11FAKE16-NEXT: v_mul_f32_e32 v20, v80, v71
; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX11FAKE16-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11FAKE16-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX11FAKE16-NEXT: v_mul_f32_e32 v26, v52, v51
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
; GFX11FAKE16-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
; GFX11FAKE16-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v29, v38, v37
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX11FAKE16-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX11FAKE16-NEXT: v_mul_f32_e32 v28, v48, v39
; GFX11FAKE16-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11FAKE16-NEXT: v_mul_f32_e32 v15, v15, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11FAKE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul <32 x bfloat> %a, %b
ret <32 x bfloat> %op
}
; v_fdiv_bf16: scalar bfloat division (`fdiv bfloat %a, %b`) in a function
; with the default calling convention. The check lines show the two operands
; being read from v0 and v1 and the quotient being left in v0.
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - both operands are multiplied by 1.0 and masked
;    with 0xffff0000, the quotient is produced by the f32 division expansion
;    (v_div_scale, v_rcp, v_fma refinement steps, v_div_fmas, v_div_fixup),
;    and the result is masked with 0xffff0000 again.
;  * GFX8 and later prefixes - both operands are shifted left by 16 to form
;    f32 values, the same f32 expansion runs, and the result is narrowed
;    back: v_bfe_u32 extracts bit 16, 0x7fff is added, and v_cmp_u_f32 on
;    the result against itself selects the variant OR'ed with 0x400000
;    instead. This looks like round-to-nearest-even with NaN quieting -
;    NOTE(review): confirm against the bf16 lowering code.
;  * The FAKE16 flavour of GFX11 finishes with a 16-bit right shift, while
;    the TRUE16 flavour copies v0.h into v0.l with v_mov_b16.
;
; NOTE(review): the check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py - confirm against the file header), so
; regenerate them rather than editing by hand.
define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fdiv_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GCN-NEXT: v_rcp_f32_e32 v3, v2
; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GCN-NEXT: v_fma_f32 v3, v4, v3, v3
; GCN-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GCN-NEXT: v_mul_f32_e32 v5, v4, v3
; GCN-NEXT: v_fma_f32 v6, -v2, v5, v4
; GCN-NEXT: v_fma_f32 v5, v6, v3, v5
; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4
; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fdiv_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3
; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
; GFX8-NEXT: v_rcp_f32_e32 v4, v2
; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4
; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3
; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_rcp_f32_e32 v4, v2
; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3
; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3
; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3
; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5
; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fdiv_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11TRUE16-NEXT: v_rcp_f32_e32 v3, v2
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX11TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v5, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v6, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_fma_f32 v2, -v2, v4, v5
; GFX11TRUE16-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fdiv_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11FAKE16-NEXT: v_rcp_f32_e32 v3, v2
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX11FAKE16-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
; GFX11FAKE16-NEXT: v_mul_f32_e32 v4, v5, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_fma_f32 v2, -v2, v4, v5
; GFX11FAKE16-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fdiv bfloat %a, %b
ret bfloat %op
}
; Scalar bfloat fabs intrinsic; called by the fabs and fneg(fabs) tests in
; this part of the file.
declare bfloat @llvm.fabs.bf16(bfloat)
; v_fabs_bf16: absolute value of a bfloat via @llvm.fabs.bf16 in a function
; with the default calling convention. The check lines show the value being
; read from and returned in v0.
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - the value is multiplied by 1.0 and masked with
;    0xffff0000, the sign is cleared with a 0x7fffffff mask, and the result
;    is masked with 0xffff0000 once more.
;  * GFX8 and later prefixes - a single AND with 0x7fff, i.e. bit 15 of the
;    16-bit value is cleared. The TRUE16 flavour of GFX11 uses the 16-bit
;    v_and_b16 on v0.l, every other prefix uses the 32-bit v_and_b32.
define bfloat @v_fabs_bf16(bfloat %a) {
; GCN-LABEL: v_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fabs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fabs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fabs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fabs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fabs_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
ret bfloat %op
}
; s_fabs_bf16: fabs of a bfloat passed `inreg` to an amdgpu_ps function; the
; check lines show the argument arriving in s0. The fabs result is bitcast
; to i16, zero-extended to i32 and returned through
; @llvm.amdgcn.readfirstlane (declared further down in this file).
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - the value goes through the vector ALU: a
;    multiply by 1.0, then v_bfe_u32 extracting 15 bits starting at bit 16,
;    then v_readfirstlane_b32 back into s0.
;  * GFX8 and later prefixes - two scalar ANDs, first with 0x7fff and then
;    with 0xffff; no vector instruction is expected.
;  * A single GFX11 prefix is used here instead of separate TRUE16 and
;    FAKE16 ones - presumably both GFX11 runs produce identical output for
;    this function; confirm against the RUN lines.
;
; NOTE(review): this uses the same readfirstlane wrapper that s_fneg_bf16
; and s_fneg_fabs_bf16 tag with "FIXME: readfirstlane hack for other bugs";
; presumably the same workaround applies here - confirm.
define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fabs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fabs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fabs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fabs_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
%readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
ret i32 %readlane
}
; v_fneg_bf16: negation of a bfloat (`fneg bfloat %a`) in a function with
; the default calling convention. The check lines show the value being read
; from and returned in v0.
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - one XOR with 0x80000000 (bit 31 flipped); no
;    multiply or 0xffff0000 masking is expected around it.
;  * GFX8 and later prefixes - one XOR with 0x8000 (bit 15 flipped). The
;    TRUE16 flavour of GFX11 uses the 16-bit v_xor_b16 on v0.l, every other
;    prefix uses the 32-bit v_xor_b32.
define bfloat @v_fneg_bf16(bfloat %a) {
; GCN-LABEL: v_fneg_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fneg_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fneg_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fneg_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fneg_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fneg_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fneg_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fneg bfloat %a
ret bfloat %op
}
; Intrinsic used by the amdgpu_ps (s_*) tests in this part of the file to
; return their i32 result. Note that s_fabs_bf16 already calls it above this
; declaration.
declare i32 @llvm.amdgcn.readfirstlane(i32)
; FIXME: readfirstlane hack for other bugs
; s_fneg_bf16: negation of a bfloat passed `inreg` to an amdgpu_ps function;
; the check lines show the argument arriving in s0. The fneg result is
; bitcast to i16, zero-extended to i32 and returned through
; @llvm.amdgcn.readfirstlane.
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - the negation is a vector multiply by -1.0,
;    followed by a 16-bit right shift and v_readfirstlane_b32 back into s0.
;  * GFX8 and later prefixes - a scalar XOR with 0x8000 followed by a scalar
;    AND with 0xffff; no vector instruction is expected.
;  * A single GFX11 prefix is used here instead of separate TRUE16 and
;    FAKE16 ones - presumably both GFX11 runs produce identical output for
;    this function; confirm against the RUN lines.
define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fneg_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fneg_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fneg_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%op = fneg bfloat %a
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
%readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
ret i32 %readlane
}
; v_fneg_fabs_bf16: fneg applied to the result of @llvm.fabs.bf16, in a
; function with the default calling convention. The check lines show the
; value being read from and returned in v0.
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - the two operations stay separate: multiply by
;    1.0 and mask with 0xffff0000, AND with 0x7fffffff, mask again, XOR with
;    0x80000000, and a final 0xffff0000 mask.
;  * GFX8 and later prefixes - the pair is expected to fold into a single OR
;    with 0x8000 (bit 15 forced to one). The TRUE16 flavour of GFX11 uses
;    the 16-bit v_or_b16 on v0.l, every other prefix uses the 32-bit
;    v_or_b32.
define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN-LABEL: v_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fneg_fabs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fneg_fabs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fneg_fabs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fneg_fabs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fneg_fabs_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
ret bfloat %op
}
; FIXME: readfirstlane hack for other bugs
; s_fneg_fabs_bf16: fneg applied to the result of @llvm.fabs.bf16 for a
; bfloat passed `inreg` to an amdgpu_ps function; the check lines show the
; argument arriving in s0. The result is bitcast to i16, zero-extended to
; i32 and returned through @llvm.amdgcn.readfirstlane.
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - a vector multiply by 1.0 and an immediate
;    v_readfirstlane_b32 back into s0, after which everything is scalar:
;    mask with 0xffff0000, clear bit 31 (s_bitset0_b32), mask again, XOR
;    with 0x80000000, and a 16-bit right shift.
;  * GFX8 and later prefixes - the pair is expected to fold into setting
;    bit 15 (s_bitset1_b32) followed by a scalar AND with 0xffff.
;  * A single GFX11 prefix is used here instead of separate TRUE16 and
;    FAKE16 ones - presumably both GFX11 runs produce identical output for
;    this function; confirm against the RUN lines.
define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
; GCN-NEXT: s_bitset0_b32 s0, 31
; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
; GCN-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
; GFX7-NEXT: s_bitset0_b32 s0, 31
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_bitset1_b32 s0, 15
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fneg_fabs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_bitset1_b32 s0, 15
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fneg_fabs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_bitset1_b32 s0, 15
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fneg_fabs_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_bitset1_b32 s0, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
%readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
ret i32 %readlane
}
; llvm.minnum declarations for scalar bfloat and for 2-, 3-, 4-, 8-, 16- and
; 32-element bfloat vectors. Only the scalar form is called by v_minnum_bf16
; below; the vector forms are presumably used by the tests that follow.
declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
; v_minnum_bf16: scalar bfloat minimum via @llvm.minnum.bf16 in a function
; with the default calling convention. The check lines show the two operands
; being read from v0 and v1 and the result being left in v0.
;
; What the check lines pin down, by check prefix:
;  * GCN and GFX7 prefixes - both operands are multiplied by 1.0 and masked
;    with 0xffff0000, v_min_f32 computes the minimum, and the result is
;    masked with 0xffff0000 again.
;  * GFX8 and later prefixes - both operands are shifted left by 16 to form
;    f32 values, v_min_f32 computes the minimum, and the result is narrowed
;    back: v_bfe_u32 extracts bit 16, 0x7fff is added, and v_cmp_u_f32 on
;    the result against itself selects the variant OR'ed with 0x400000
;    instead. This looks like round-to-nearest-even with NaN quieting -
;    NOTE(review): confirm against the bf16 lowering code.
;  * The FAKE16 flavour of GFX11 finishes with a 16-bit right shift, while
;    the TRUE16 flavour copies v0.h into v0.l with v_mov_b16.
define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_minnum_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_minnum_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
ret bfloat %op
}
; Codegen test for llvm.minnum on <2 x bfloat>. The function passes %a and %b
; straight to @llvm.minnum.v2bf16 and returns the result; everything else in
; the body is expected-output check lines, one group per check prefix.
;
; What the expected assembly shows:
;  * Under the GCN and GFX7 prefixes every element is handled in its own VGPR
;    (v0-v3). Each input is multiplied by 1.0 (presumably a canonicalize step;
;    confirm), masked with 0xffff0000, fed to v_min_f32, and the result is
;    masked with 0xffff0000 again.
;  * Under the GFX8 and newer prefixes the two elements share one VGPR per
;    operand (v0 and v1). The low element is extracted with v_lshlrev_b32 by
;    16 and the high element with v_and_b32 0xffff0000, then each pair goes
;    through v_min_f32. The following v_bfe_u32 / add 0x7fff / v_or_b32
;    0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence reads as a
;    round-to-nearest-even narrowing back to bf16 with a quieted NaN selected
;    for unordered results. NOTE(review) confirm against the lowering code.
;  * The two halves are re-packed with v_alignbit_b32 (GFX8), v_perm_b32 with
;    selector 0x7060302 (GFX9, GFX10, GFX11FAKE16), or v_mov_b16 followed by
;    v_bfi_b32 0xffff (GFX11TRUE16).
;
; The check lines look machine-generated (presumably by
; update_llc_test_checks.py; confirm against the file header), so regenerate
; them instead of editing by hand.
define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-LABEL: v_minnum_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_min_f32_e32 v1, v1, v3
; GCN-NEXT: v_min_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_min_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_min_f32_e32 v2, v3, v2
; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_minnum_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_min_f32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_min_f32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %op
}
; Codegen test for llvm.minnum on <3 x bfloat>. The function passes %a and %b
; straight to @llvm.minnum.v3bf16 and returns the result; everything else in
; the body is expected-output check lines, one group per check prefix.
;
; What the expected assembly shows:
;  * Under the GCN and GFX7 prefixes every element is handled in its own VGPR
;    (v0-v5). Each input is multiplied by 1.0 (presumably a canonicalize step;
;    confirm), masked with 0xffff0000, fed to one of three v_min_f32
;    instructions, and the result is masked with 0xffff0000 again.
;  * Under the GFX8 and newer prefixes the operands occupy v0/v1 and v2/v3.
;    v0 and v2 are split into a low half (v_lshlrev_b32 by 16) and a high
;    half (v_and_b32 0xffff0000); v1 and v3 only contribute a low half, which
;    matches an odd third element. Each v_min_f32 result then goes through the
;    v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32
;    sequence, which reads as a round-to-nearest-even narrowing back to bf16
;    with a quieted NaN selected for unordered results. NOTE(review) confirm
;    against the lowering code.
;  * v0 is re-packed with v_alignbit_b32 (GFX8), v_perm_b32 with selector
;    0x7060302 (GFX9, GFX10, GFX11FAKE16), or v_mov_b16 plus v_bfi_b32 0xffff
;    (GFX11TRUE16).
;  * The third element is moved to the low half of v1. On GFX9, GFX10 and
;    GFX11FAKE16 this is a v_alignbit_b32 whose upper source is an SGPR (s4 or
;    s0). Under GFX10 and GFX11FAKE16 that SGPR is never written in the
;    expected code, and under GFX9 it merely still holds the v_perm selector.
;    Presumably the upper half of v1 is a don't-care lane for a 3-element
;    vector. NOTE(review) confirm; this is not a typo in the checks.
;
; The check lines look machine-generated (presumably by
; update_llc_test_checks.py; confirm against the file header), so regenerate
; them instead of editing by hand.
define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-LABEL: v_minnum_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_min_f32_e32 v2, v2, v5
; GCN-NEXT: v_min_f32_e32 v1, v1, v4
; GCN-NEXT: v_min_f32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_minnum_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_f32_e32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
ret <3 x bfloat> %op
}
; Codegen test for llvm.minnum on <4 x bfloat>. The function passes %a and %b
; straight to @llvm.minnum.v4bf16 and returns the result; everything else in
; the body is expected-output check lines, one group per check prefix.
;
; What the expected assembly shows:
;  * Under the GCN and GFX7 prefixes every element is handled in its own VGPR
;    (v0-v7). Each input is multiplied by 1.0 (presumably a canonicalize step;
;    confirm), masked with 0xffff0000, fed to one of four v_min_f32
;    instructions, and the result is masked with 0xffff0000 again.
;  * Under the GFX8 and newer prefixes the operands occupy v0/v1 and v2/v3,
;    two elements per register. Low halves are extracted with v_lshlrev_b32
;    by 16 and high halves with v_and_b32 0xffff0000, giving four f32 minimum
;    operations (some fused into v_dual_min_f32 on the GFX11 prefixes). Each
;    result then goes through the v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
;    v_cmp_u_f32 / v_cndmask_b32 sequence, which reads as a
;    round-to-nearest-even narrowing back to bf16 with a quieted NaN selected
;    for unordered results. NOTE(review) confirm against the lowering code.
;  * v0 and v1 are each re-packed with v_alignbit_b32 (GFX8), v_perm_b32 with
;    selector 0x7060302 (GFX9, GFX10, GFX11FAKE16), or v_mov_b16 plus
;    v_bfi_b32 0xffff (GFX11TRUE16).
;
; The check lines look machine-generated (presumably by
; update_llc_test_checks.py; confirm against the file header), so regenerate
; them instead of editing by hand.
define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_minnum_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_min_f32_e32 v3, v3, v7
; GCN-NEXT: v_min_f32_e32 v2, v2, v6
; GCN-NEXT: v_min_f32_e32 v1, v1, v5
; GCN-NEXT: v_min_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
; GFX10-NEXT: v_min_f32_e32 v3, v7, v6
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_minnum_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_min_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_min_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
}
; v_minnum_v8bf16 - applies the llvm.minnum intrinsic to two <8 x bfloat>
; arguments and returns the <8 x bfloat> result unchanged.
;
; What the expectation blocks below show, per check prefix:
;  * GCN and GFX7 blocks - operands are read from v0-v15; every register is
;    multiplied by 1.0 and masked with 0xffff0000, one v_min_f32 is issued per
;    register pair (vN, vN+8), and each result is masked with 0xffff0000 again.
;  * GFX8 block and later - operands are read from v0-v7; for each register
;    pair (vN, vN+4) both the low half (shifted left by 16) and the high half
;    (masked with 0xffff0000) go through v_min_f32. Each f32 result is then
;    narrowed by a bfe / add 0x7fff / or 0x400000 / v_cmp_u / v_cndmask
;    sequence (presumably round-to-nearest-even with NaN quieting - TODO
;    confirm against the bf16 lowering) and the halves are repacked with
;    v_alignbit_b32 (GFX8 block), v_perm_b32 0x7060302 (GFX9, GFX10 and
;    GFX11FAKE16 blocks) or v_mov_b16 + v_bfi_b32 (GFX11TRUE16 block).
;
; NOTE(review): the check lines look autogenerated (update_llc_test_checks.py
; layout) - confirm against the file header and regenerate instead of editing
; them by hand.
define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-LABEL: v_minnum_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_min_f32_e32 v7, v7, v15
; GCN-NEXT: v_min_f32_e32 v6, v6, v14
; GCN-NEXT: v_min_f32_e32 v5, v5, v13
; GCN-NEXT: v_min_f32_e32 v4, v4, v12
; GCN-NEXT: v_min_f32_e32 v3, v3, v11
; GCN-NEXT: v_min_f32_e32 v2, v2, v10
; GCN-NEXT: v_min_f32_e32 v1, v1, v9
; GCN-NEXT: v_min_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
; GFX7-NEXT: v_min_f32_e32 v4, v4, v12
; GFX7-NEXT: v_min_f32_e32 v3, v3, v11
; GFX7-NEXT: v_min_f32_e32 v2, v2, v10
; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX10-NEXT: v_min_f32_e32 v7, v10, v9
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_min_f32_e32 v6, v10, v6
; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
; GFX10-NEXT: v_min_f32_e32 v5, v15, v13
; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_minnum_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7
; GFX11TRUE16-NEXT: v_min_f32_e32 v7, v10, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11TRUE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v10, v12, v7, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v7, v12, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v6, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5
; GFX11TRUE16-NEXT: v_dual_min_f32 v5, v12, v11 :: v_dual_cndmask_b32 v2, v7, v10
; GFX11TRUE16-NEXT: v_min_f32_e32 v9, v14, v13
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v8
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v9, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v9, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v7, v13, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v5
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v8bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_f32_e32 v3, v3, v7
; GFX11FAKE16-NEXT: v_min_f32_e32 v7, v10, v9
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_f32_e32 v6, v10, v6
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_min_f32_e32 v0, v0, v4
; GFX11FAKE16-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX11FAKE16-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11FAKE16-NEXT: v_min_f32_e32 v5, v15, v13
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
}
define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_minnum_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_min_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_min_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_min_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_min_f32_e32 v11, v11, v27
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_min_f32_e32 v10, v10, v26
; GCN-NEXT: v_min_f32_e32 v9, v9, v25
; GCN-NEXT: v_min_f32_e32 v8, v8, v24
; GCN-NEXT: v_min_f32_e32 v7, v7, v23
; GCN-NEXT: v_min_f32_e32 v6, v6, v22
; GCN-NEXT: v_min_f32_e32 v5, v5, v21
; GCN-NEXT: v_min_f32_e32 v4, v4, v20
; GCN-NEXT: v_min_f32_e32 v3, v3, v19
; GCN-NEXT: v_min_f32_e32 v2, v2, v18
; GCN-NEXT: v_min_f32_e32 v1, v1, v17
; GCN-NEXT: v_min_f32_e32 v0, v0, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_min_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_min_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_min_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_min_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_min_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_min_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_min_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX10-NEXT: v_min_f32_e32 v17, v20, v19
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_min_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_min_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v18, v19, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v19, v22, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
; GFX10-NEXT: v_min_f32_e32 v9, v22, v20
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_minnum_v16bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_min_f32 v18, v19, v18 :: v_dual_and_b32 v23, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_min_f32 v7, v7, v15 :: v_dual_and_b32 v14, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11TRUE16-NEXT: v_min_f32_e32 v1, v1, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v7, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v17, v17, v7, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v18, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo
; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_min_f32 v14, v17, v14 :: v_dual_lshlrev_b32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_min_f32_e32 v5, v5, v13
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v20, v6, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v14, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v5, 16, 1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v15
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v18, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v14
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v5, 0x7fff
; GFX11TRUE16-NEXT: v_min_f32_e32 v4, v4, v12
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_min_f32_e32 v18, v21, v20
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v4, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v18
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v5, v19, v20 :: v_dual_and_b32 v20, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v12, v21, v18, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v17, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX11TRUE16-NEXT: v_dual_min_f32 v18, v20, v19 :: v_dual_and_b32 v19, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v4, v16, v17 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX11TRUE16-NEXT: v_min_f32_e32 v16, v20, v19
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_min_f32 v3, v3, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v16, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v3, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v16, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v11, v11, v18, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_add3_u32 v10, v17, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v10, v17, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_add3_u32 v16, v22, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v1, 16, 1
; GFX11TRUE16-NEXT: v_min_f32_e32 v9, v22, v21
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v8
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v1, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_min_f32_e32 v17, v24, v23
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: v_add3_u32 v19, v23, v17, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v10
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v23, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v9
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v8, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v11
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v16bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_f32_e32 v17, v18, v17
; GFX11FAKE16-NEXT: v_min_f32_e32 v6, v6, v14
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_min_f32_e32 v7, v7, v15
; GFX11FAKE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_min_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
; GFX11FAKE16-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_min_f32_e32 v4, v4, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_min_f32_e32 v5, v5, v13
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18
; GFX11FAKE16-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_min_f32_e32 v12, v18, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_min_f32_e32 v18, v19, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_min_f32_e32 v3, v3, v11
; GFX11FAKE16-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX11FAKE16-NEXT: v_min_f32_e32 v19, v22, v20
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v1, v1, v9
; GFX11FAKE16-NEXT: v_min_f32_e32 v9, v22, v20
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
ret <16 x bfloat> %op
}
define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_minnum_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_min_f32_e32 v31, v31, v32
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_min_f32_e32 v30, v30, v32
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_min_f32_e32 v29, v29, v32
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_min_f32_e32 v28, v28, v32
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_min_f32_e32 v27, v27, v32
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_min_f32_e32 v26, v26, v32
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_min_f32_e32 v25, v25, v32
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_min_f32_e32 v24, v24, v32
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_min_f32_e32 v23, v23, v32
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_min_f32_e32 v22, v22, v32
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_min_f32_e32 v21, v21, v32
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_min_f32_e32 v20, v20, v32
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_min_f32_e32 v19, v19, v32
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_min_f32_e32 v18, v18, v32
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_min_f32_e32 v17, v17, v32
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_min_f32_e32 v16, v16, v32
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_min_f32_e32 v15, v15, v32
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_min_f32_e32 v14, v14, v32
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_min_f32_e32 v13, v13, v32
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_min_f32_e32 v12, v12, v32
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_min_f32_e32 v11, v11, v32
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_min_f32_e32 v10, v10, v32
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_min_f32_e32 v9, v9, v32
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_min_f32_e32 v8, v8, v32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_min_f32_e32 v7, v7, v32
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_min_f32_e32 v6, v6, v32
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_min_f32_e32 v5, v5, v32
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_min_f32_e32 v4, v4, v32
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_min_f32_e32 v3, v3, v32
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_min_f32_e32 v2, v2, v32
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_min_f32_e32 v1, v1, v32
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_min_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_min_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_min_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX8-NEXT: v_min_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_min_f32_e32 v33, v33, v34
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_min_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_min_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_min_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_min_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_min_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_min_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_min_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_min_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_min_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_min_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_min_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_min_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_min_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_min_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v32bf16:
; GFX9: ; %bb.0:
;
; Expected instruction stream for this check prefix, matched line by line.
; NOTE(review): these directives look autogenerated (update_llc_test_checks.py
; style); prefer regenerating over hand-editing - confirm against the file header.
;
; Every 32-bit VGPR is handled as two 16-bit halves. The low half is moved to
; the top of a dword with a shift-left by 16, the high half is isolated with
; the mask 0xffff0000, and the same recipe is then applied to each half:
;   v_min_f32           minimum of the two widened halves
;   v_bfe_u32 .., 16, 1 bit 16 of that minimum
;   v_add3_u32 .., s4   minimum + bit 16 + 0x7fff (looks like the usual
;                       round-to-nearest-even bias before keeping bits 31:16)
;   v_or_b32 0x400000   the minimum with bit 22 forced on
;   v_cmp_u_f32 x, x    unordered self-compare, i.e. a NaN test
;   v_cndmask_b32       keeps the OR-ed value when vcc is set, else the sum
; The first operand's dwords are v0-v15; their partners are v16-v30 plus one
; dword loaded from memory (presumably the stack-passed last dword - confirm
; against the IR signature).
;
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; v14 with v30: low-half result ends in v31, high-half result in v14.
; s4 is set to the 0x7fff bias here and reused by every v_add3_u32 below.
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; v13 with v29: low-half result ends in v30, high-half result in v13.
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
; v12 with v28, interleaved with v15 and the dword loaded into v29. The
; unrounded low-half minimum waits in v32 and the biased high-half sum in v28
; while the loaded pair is processed.
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_min_f32_e32 v32, v32, v29
; v29 is free again at this point, so it receives the loaded partner of v15.
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; The load must have landed before v29 is read.
; GFX9-NEXT: s_waitcnt vmcnt(0)
; v15 with loaded v29: low-half result ends in v15, high-half result in v29.
; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
; GFX9-NEXT: v_min_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_min_f32_e32 v29, v15, v29
; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; Finish v12 with v28: low-half result ends in v32, high-half result in v12.
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; v11 with v27: low-half result ends in v28, high-half result in v11.
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_min_f32_e32 v28, v33, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; v10 with v26: low-half result ends in v27, high-half result in v10.
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_min_f32_e32 v27, v33, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; v9 with v25: low-half result ends in v26, high-half result in v9.
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_min_f32_e32 v26, v33, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; v8 with v24: low-half result ends in v25, high-half result in v8.
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_min_f32_e32 v25, v33, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; v7 with v23: low-half result ends in v24, high-half result in v7.
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_min_f32_e32 v24, v33, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; v6 with v22: low-half result ends in v23, high-half result in v6.
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_min_f32_e32 v23, v33, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; v5 with v21: low-half result ends in v22, high-half result in v5.
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_min_f32_e32 v22, v33, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; v4 with v20: low-half result ends in v21, high-half result in v4.
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_min_f32_e32 v21, v33, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; v3 with v19: low-half result ends in v20, high-half result in v3.
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_min_f32_e32 v20, v33, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; v2 with v18: low-half result ends in v19, high-half result in v2.
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_min_f32_e32 v19, v33, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; v1 with v17: low-half result ends in v18, high-half result in v1.
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_min_f32_e32 v18, v33, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; v0 with v16: low-half result ends in v17, high-half result in v0.
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_min_f32_e32 v17, v33, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; Repack into v0-v15. s4 is overwritten with the v_perm_b32 byte selector; with
; 0x7060302 each result should be {bits 31:16 of the first source, bits 31:16
; of the second source}, i.e. the high-half result over the low-half result
; (verify against the ISA byte-select table).
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
; Jump to the address held in s[30:31] (the function return).
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minnum_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_min_f32_e32 v31, v32, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12
; GFX10-NEXT: v_min_f32_e32 v30, v14, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31
; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1
; GFX10-NEXT: v_min_f32_e32 v33, v33, v14
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v35, v13, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5
; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_min_f32_e32 v34, v36, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_min_f32_e32 v33, v12, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_min_f32_e32 v35, v36, v12
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_min_f32_e32 v34, v11, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_min_f32_e32 v33, v36, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3
; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_min_f32_e32 v35, v10, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2
; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_min_f32_e32 v34, v36, v10
; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24
; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1
; GFX10-NEXT: v_cndmask_b32_e32 v10, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_min_f32_e32 v33, v36, v33
; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7
; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_min_f32_e32 v24, v35, v24
; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1
; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24
; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v34, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7
; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_min_f32_e32 v8, v34, v8
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v35, v38, v37
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21
; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6
; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1
; GFX10-NEXT: v_min_f32_e32 v5, v5, v8
; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20
; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35
; GFX10-NEXT: v_min_f32_e32 v8, v21, v8
; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_min_f32_e32 v48, v49, v48
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4
; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1
; GFX10-NEXT: v_min_f32_e32 v49, v51, v49
; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff
; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8
; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17
; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9
; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10
; GFX10-NEXT: v_min_f32_e32 v49, v52, v49
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s11
; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1
; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49
; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13
; GFX10-NEXT: v_min_f32_e32 v17, v49, v17
; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo
; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17
; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo
; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302
; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302
; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302
; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302
; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v37, v7, 0x7060302
; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_min_f32_e32 v17, v33, v8
; GFX10-NEXT: v_min_f32_e32 v15, v15, v16
; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302
; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff
; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_minnum_v32bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11TRUE16-NEXT: v_min_f32_e32 v5, v5, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24
; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_min_f32_e32 v3, v3, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11TRUE16-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11TRUE16-NEXT: v_min_f32_e32 v2, v2, v18
; GFX11TRUE16-NEXT: v_dual_min_f32 v18, v84, v83 :: v_dual_min_f32 v9, v9, v25
; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11TRUE16-NEXT: v_min_f32_e32 v17, v86, v85
; GFX11TRUE16-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v16
; GFX11TRUE16-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28
; GFX11TRUE16-NEXT: v_min_f32_e32 v7, v7, v23
; GFX11TRUE16-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28
; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_min_f32_e32 v4, v4, v20
; GFX11TRUE16-NEXT: v_min_f32_e32 v20, v80, v71
; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11TRUE16-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11TRUE16-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27
; GFX11TRUE16-NEXT: v_dual_min_f32 v26, v52, v51 :: v_dual_min_f32 v25, v54, v53
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_f32_e32 v6, v6, v22
; GFX11TRUE16-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_min_f32_e32 v22, v68, v67
; GFX11TRUE16-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_min_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_min_f32_e32 v14, v14, v30
; GFX11TRUE16-NEXT: v_min_f32_e32 v28, v48, v39
; GFX11TRUE16-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27
; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28
; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26
; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30
; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v32
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_min_f32_e32 v15, v15, v33
; GFX11TRUE16-NEXT: v_min_f32_e32 v17, v31, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v17, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v32bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX11FAKE16-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19
; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16
; GFX11FAKE16-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4
; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
; GFX11FAKE16-NEXT: v_min_f32_e32 v2, v2, v18
; GFX11FAKE16-NEXT: v_min_f32_e32 v0, v0, v16
; GFX11FAKE16-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11FAKE16-NEXT: v_min_f32_e32 v7, v7, v23
; GFX11FAKE16-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83
; GFX11FAKE16-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20
; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_min_f32_e32 v4, v4, v20
; GFX11FAKE16-NEXT: v_min_f32_e32 v20, v80, v71
; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX11FAKE16-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11FAKE16-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX11FAKE16-NEXT: v_min_f32_e32 v26, v52, v51
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_f32_e32 v6, v6, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
; GFX11FAKE16-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
; GFX11FAKE16-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_f32_e32 v29, v38, v37
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX11FAKE16-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_f32_e32 v14, v14, v30
; GFX11FAKE16-NEXT: v_min_f32_e32 v28, v48, v39
; GFX11FAKE16-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11FAKE16-NEXT: v_min_f32_e32 v15, v15, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11FAKE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
ret <32 x bfloat> %op
}
; Declarations of the llvm.maxnum intrinsic for a scalar bfloat and for
; <2/3/4/8/16/32 x bfloat> vectors. Every variant takes two operands of one
; type and returns a value of that same type.
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
; Scalar test: returns llvm.maxnum.bf16(%a, %b).
;
; The assertion comments inside the body pin the expected ISA, one block per
; check prefix:
;  * first two blocks: each operand goes through v_mul_f32 by 1.0 and a
;    0xffff0000 mask, then v_max_f32, then the result is masked again.
;  * remaining blocks: both operands are shifted left by 16, combined with
;    v_max_f32, and the result runs through a v_bfe_u32 / add 0x7fff /
;    or 0x400000 / v_cmp_u_f32 / v_cndmask sequence before its high half is
;    moved into the low half (shift right by 16, or a 16-bit move in the
;    true16 block).
; NOTE(review): that tail looks like an f32 to bf16 round-to-nearest-even with
; NaN quieting; confirm against the lowering code.
; NOTE(review): the assertion lines look machine-generated; presumably they
; should be regenerated with the update script rather than edited by hand.
define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_maxnum_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_maxnum_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
ret bfloat %op
}
; Two-element vector test: returns llvm.maxnum.v2bf16(%a, %b).
;
; The assertion comments inside the body pin the expected ISA, one block per
; check prefix:
;  * first two blocks: the four inputs sit in v0..v3; each goes through
;    v_mul_f32 by 1.0 and a 0xffff0000 mask, v_max_f32 pairs v0 with v2 and v1
;    with v3, and both results are masked again.
;  * remaining blocks: the inputs are v0 and v1 only. One v_max_f32 works on
;    the values shifted left by 16, the other on the values masked with
;    0xffff0000. Each result runs through the v_bfe_u32 / add 0x7fff /
;    or 0x400000 / v_cmp_u_f32 / v_cndmask sequence, and the two halves are
;    merged back into v0 with v_alignbit_b32, v_perm_b32 or v_bfi_b32
;    depending on the block.
; NOTE(review): presumably v0/v1 each carry both elements of one <2 x bfloat>
; operand in the later blocks; confirm against the calling convention.
define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_max_f32_e32 v1, v1, v3
; GCN-NEXT: v_max_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_max_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_max_f32_e32 v2, v3, v2
; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_maxnum_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_max_f32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_max_f32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
ret <2 x bfloat> %op
}
; Three-element vector test: returns llvm.maxnum.v3bf16(%a, %b).
;
; The assertion comments inside the body pin the expected ISA, one block per
; check prefix:
;  * first two blocks: the six inputs sit in v0..v5; each goes through
;    v_mul_f32 by 1.0 and a 0xffff0000 mask, v_max_f32 pairs v0/v3, v1/v4 and
;    v2/v5, and the three results are masked again.
;  * remaining blocks: v0 and v2 feed two v_max_f32 operations (one on the
;    values shifted left by 16, one on the values masked with 0xffff0000),
;    while v1 and v3 feed a single v_max_f32 on the shifted values only. Each
;    result runs through the v_bfe_u32 / add 0x7fff / or 0x400000 /
;    v_cmp_u_f32 / v_cndmask sequence; the pair is merged into v0 and the
;    remaining result is moved down into the low half of v1.
; NOTE(review): in the gfx10 and gfx11 fake16 expectations the closing
; v_alignbit_b32 reads an SGPR (s4 / s0) that nothing in the visible sequence
; writes; presumably that is the undefined upper half of the odd-length
; result. Confirm this is intended before relying on it.
define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_max_f32_e32 v2, v2, v5
; GCN-NEXT: v_max_f32_e32 v1, v1, v4
; GCN-NEXT: v_max_f32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_maxnum_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_max_f32_e32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
ret <3 x bfloat> %op
}
define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v7
; GCN-NEXT: v_max_f32_e32 v2, v2, v6
; GCN-NEXT: v_max_f32_e32 v1, v1, v5
; GCN-NEXT: v_max_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
; GFX10-NEXT: v_max_f32_e32 v3, v7, v6
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_maxnum_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_max_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_max_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
}
; Test v_maxnum_v8bf16 - element-wise llvm.maxnum on two <8 x bfloat> vectors.
; The IR body is a single intrinsic call whose result is returned. Everything
; else inside the function is FileCheck expectations, one block per check
; prefix, and the -NEXT directives pin exact instruction order and registers.
; These expectations look autogenerated (presumably by
; update_llc_test_checks.py, confirm from the file header), so regenerate them
; rather than editing them by hand.
;
; What the expected code shows
; - The GCN and GFX7 blocks work on one element per VGPR (presumably %a in
;   v0-v7 and %b in v8-v15). Each input goes through v_mul_f32 by 1.0 and a
;   v_and_b32 with 0xffff0000 before v_max_f32, and every result is masked
;   with 0xffff0000 again.
; - The GFX8 and newer blocks work on two elements packed per VGPR (v0-v3
;   against v4-v7). Halves are split with v_lshlrev_b32 by 16 and v_and_b32
;   with 0xffff0000, v_max_f32 runs per element, and each result then goes
;   through a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;   v_cndmask_b32 sequence (presumably round-to-nearest-even with NaN quieting
;   for the f32 to bf16 narrowing, confirm against the lowering code).
; - Results are repacked with v_alignbit_b32 on GFX8, v_perm_b32 0x7060302 on
;   GFX9, GFX10 and GFX11FAKE16, and v_mov_b16 plus v_bfi_b32 on GFX11TRUE16.
define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_max_f32_e32 v7, v7, v15
; GCN-NEXT: v_max_f32_e32 v6, v6, v14
; GCN-NEXT: v_max_f32_e32 v5, v5, v13
; GCN-NEXT: v_max_f32_e32 v4, v4, v12
; GCN-NEXT: v_max_f32_e32 v3, v3, v11
; GCN-NEXT: v_max_f32_e32 v2, v2, v10
; GCN-NEXT: v_max_f32_e32 v1, v1, v9
; GCN-NEXT: v_max_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX10-NEXT: v_max_f32_e32 v7, v10, v9
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_max_f32_e32 v6, v10, v6
; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
; GFX10-NEXT: v_max_f32_e32 v5, v15, v13
; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_maxnum_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, v7
; GFX11TRUE16-NEXT: v_max_f32_e32 v7, v10, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11TRUE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v11, v3, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v10, v12, v7, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v7, v12, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v6, v10, v11 :: v_dual_lshlrev_b32 v11, 16, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5
; GFX11TRUE16-NEXT: v_dual_max_f32 v5, v12, v11 :: v_dual_cndmask_b32 v2, v7, v10
; GFX11TRUE16-NEXT: v_max_f32_e32 v9, v14, v13
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v8
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v9, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v9, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v7, v13, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v5
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v8bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_max_f32_e32 v3, v3, v7
; GFX11FAKE16-NEXT: v_max_f32_e32 v7, v10, v9
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_max_f32_e32 v6, v10, v6
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_max_f32_e32 v0, v0, v4
; GFX11FAKE16-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX11FAKE16-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11FAKE16-NEXT: v_max_f32_e32 v5, v15, v13
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
}
define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_max_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_max_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_max_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_max_f32_e32 v11, v11, v27
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_max_f32_e32 v10, v10, v26
; GCN-NEXT: v_max_f32_e32 v9, v9, v25
; GCN-NEXT: v_max_f32_e32 v8, v8, v24
; GCN-NEXT: v_max_f32_e32 v7, v7, v23
; GCN-NEXT: v_max_f32_e32 v6, v6, v22
; GCN-NEXT: v_max_f32_e32 v5, v5, v21
; GCN-NEXT: v_max_f32_e32 v4, v4, v20
; GCN-NEXT: v_max_f32_e32 v3, v3, v19
; GCN-NEXT: v_max_f32_e32 v2, v2, v18
; GCN-NEXT: v_max_f32_e32 v1, v1, v17
; GCN-NEXT: v_max_f32_e32 v0, v0, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; Expected GFX7 code for v_maxnum_v16bf16. Each operand register is first
; multiplied by 1.0 (v_mul_f32) and masked with 0xffff0000, the pairs are then
; combined with v_max_f32, and every result register v0-v15 is masked with
; 0xffff0000 again before the return. One operand is read with
; buffer_load_dword into v27, so the v_max_f32 that writes v15 only appears
; after the s_waitcnt vmcnt(0).
; NOTE(review): these check lines look tool-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; Expected GFX8 code for v_maxnum_v16bf16. Registers are paired as (v0,v8)
; through (v7,v15), and both 16-bit halves of each pair are processed: the low
; halves are shifted up with v_lshlrev_b32 by 16, the high halves are isolated
; with v_and_b32 0xffff0000, and each goes through v_max_f32. Every f32 result
; is then adjusted with v_bfe_u32 (bit 16) plus two v_add_u32 (the second adds
; s4 = 0x7fff), while v_cmp_u_f32 of the result with itself makes
; v_cndmask_b32 pick the v_or_b32 0x400000 value instead. The final
; v_lshrrev_b32 / v_alignbit_b32 pairs pack the upper halves into v0-v7.
; NOTE(review): these check lines look tool-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
; GFX8-LABEL: v_maxnum_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX8-NEXT: v_max_f32_e32 v15, v17, v15
; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX8-NEXT: v_max_f32_e32 v14, v17, v14
; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX8-NEXT: v_max_f32_e32 v13, v17, v13
; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX8-NEXT: v_max_f32_e32 v12, v17, v12
; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX8-NEXT: v_max_f32_e32 v11, v17, v11
; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX8-NEXT: v_max_f32_e32 v10, v17, v10
; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX8-NEXT: v_max_f32_e32 v9, v17, v9
; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; Expected GFX9 code for v_maxnum_v16bf16. Registers are paired as (v0,v8)
; through (v7,v15); for each pair the low halves (v_lshlrev_b32 by 16) and the
; high halves (v_and_b32 0xffff0000) are combined separately with v_max_f32.
; Each f32 result is adjusted with v_bfe_u32 (bit 16) and a single v_add3_u32
; that also adds s4 = 0x7fff, and v_cmp_u_f32 of the result with itself makes
; v_cndmask_b32 pick the v_or_b32 0x400000 value instead. s4 is then reloaded
; with 0x7060302 and used as the v_perm_b32 selector that packs the two
; results of each pair into v0-v7.
; NOTE(review): these check lines look tool-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
; GFX9-LABEL: v_maxnum_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; Expected GFX10 code for v_maxnum_v16bf16. The same per-half steps appear as
; in the other targets' checks for this function (v_lshlrev_b32 by 16 or
; v_and_b32 0xffff0000, v_max_f32, v_bfe_u32 bit 16, v_add3_u32, v_or_b32
; 0x400000, v_cmp_u_f32 of the result with itself, v_cndmask_b32), but here
; the constants 0x7fff and 0x7060302 are literal operands of v_add3_u32 and
; v_perm_b32, the compares write vcc_lo, and the steps for different register
; pairs are interleaved. v_perm_b32 packs the results into v0-v7.
; NOTE(review): these check lines look tool-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
; GFX10-LABEL: v_maxnum_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX10-NEXT: v_max_f32_e32 v17, v20, v19
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_max_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_max_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v18, v19, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v19, v22, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
; GFX10-NEXT: v_max_f32_e32 v9, v22, v20
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; Expected GFX11TRUE16 code for v_maxnum_v16bf16. The per-half steps are the
; ones used by the GFX10 checks (v_lshlrev_b32 by 16 or v_and_b32 0xffff0000,
; v_max_f32, v_bfe_u32 bit 16, v_add3_u32 with 0x7fff, v_or_b32 0x400000,
; v_cmp_u_f32 of the result with itself, v_cndmask_b32). Differences visible
; here: some pairs of operations are fused into v_dual_* instructions,
; s_delay_alu lines are expected between instructions, and results are packed
; with v_mov_b16_e32 vN.l, vN.h followed by v_bfi_b32 with mask 0xffff rather
; than with v_perm_b32.
; NOTE(review): these check lines look tool-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
; GFX11TRUE16-LABEL: v_maxnum_v16bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_max_f32 v18, v19, v18 :: v_dual_and_b32 v23, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_max_f32 v7, v7, v15 :: v_dual_and_b32 v14, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11TRUE16-NEXT: v_max_f32_e32 v1, v1, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v7, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v17, v17, v7, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v18, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v20, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo
; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_max_f32 v14, v17, v14 :: v_dual_lshlrev_b32 v5, 16, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_max_f32_e32 v5, v5, v13
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v20, v6, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v14, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v5, 16, 1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v15
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v18, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v14
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v5, 0x7fff
; GFX11TRUE16-NEXT: v_max_f32_e32 v4, v4, v12
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_max_f32_e32 v18, v21, v20
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v4, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v18
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v5, v19, v20 :: v_dual_and_b32 v20, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v12, v21, v18, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v17, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX11TRUE16-NEXT: v_dual_max_f32 v18, v20, v19 :: v_dual_and_b32 v19, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v4, v16, v17 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX11TRUE16-NEXT: v_max_f32_e32 v16, v20, v19
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_max_f32 v3, v3, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v16, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v17, v3, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v16, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v11, v11, v18, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_add3_u32 v10, v17, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v10, v17, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11TRUE16-NEXT: v_add3_u32 v16, v22, v2, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v1, 16, 1
; GFX11TRUE16-NEXT: v_max_f32_e32 v9, v22, v21
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v8
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v1, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_max_f32_e32 v17, v24, v23
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: v_add3_u32 v19, v23, v17, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v23, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v10
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: v_add3_u32 v16, v23, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v9
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v16, v22, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v8, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v11
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v16bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_max_f32_e32 v17, v18, v17
; GFX11FAKE16-NEXT: v_max_f32_e32 v6, v6, v14
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_max_f32_e32 v7, v7, v15
; GFX11FAKE16-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v6, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v15, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_max_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16
; GFX11FAKE16-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_max_f32_e32 v4, v4, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_max_f32_e32 v5, v5, v13
; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18
; GFX11FAKE16-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_max_f32_e32 v12, v18, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v17, v17, v12, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_max_f32_e32 v18, v19, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_max_f32_e32 v3, v3, v11
; GFX11FAKE16-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
; GFX11FAKE16-NEXT: v_max_f32_e32 v19, v22, v20
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v1, v1, v9
; GFX11FAKE16-NEXT: v_max_f32_e32 v9, v22, v20
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v20, v20, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v10, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
ret <16 x bfloat> %op
}
define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_max_f32_e32 v31, v31, v32
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_max_f32_e32 v30, v30, v32
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_max_f32_e32 v29, v29, v32
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_max_f32_e32 v28, v28, v32
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_max_f32_e32 v27, v27, v32
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_max_f32_e32 v26, v26, v32
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_max_f32_e32 v25, v25, v32
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_max_f32_e32 v24, v24, v32
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_max_f32_e32 v23, v23, v32
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_max_f32_e32 v22, v22, v32
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_max_f32_e32 v21, v21, v32
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_max_f32_e32 v20, v20, v32
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_max_f32_e32 v19, v19, v32
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_max_f32_e32 v18, v18, v32
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_max_f32_e32 v17, v17, v32
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_max_f32_e32 v16, v16, v32
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_max_f32_e32 v15, v15, v32
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_max_f32_e32 v14, v14, v32
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_max_f32_e32 v13, v13, v32
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_max_f32_e32 v12, v12, v32
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_max_f32_e32 v11, v11, v32
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_max_f32_e32 v10, v10, v32
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_max_f32_e32 v9, v9, v32
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_max_f32_e32 v8, v8, v32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_max_f32_e32 v7, v7, v32
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_max_f32_e32 v6, v6, v32
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_max_f32_e32 v5, v5, v32
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_max_f32_e32 v4, v4, v32
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_max_f32_e32 v3, v3, v32
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_max_f32_e32 v2, v2, v32
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_max_f32_e32 v1, v1, v32
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_max_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX8-NEXT: v_max_f32_e32 v31, v32, v31
; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
; GFX8-NEXT: v_max_f32_e32 v14, v14, v30
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX8-NEXT: v_max_f32_e32 v32, v32, v30
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_max_f32_e32 v33, v33, v34
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_max_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_max_f32_e32 v29, v33, v29
; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX8-NEXT: v_max_f32_e32 v28, v33, v28
; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_max_f32_e32 v27, v33, v27
; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX8-NEXT: v_max_f32_e32 v26, v33, v26
; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_max_f32_e32 v25, v33, v25
; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX8-NEXT: v_max_f32_e32 v24, v33, v24
; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_max_f32_e32 v23, v33, v23
; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX8-NEXT: v_max_f32_e32 v22, v33, v22
; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_max_f32_e32 v21, v33, v21
; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX8-NEXT: v_max_f32_e32 v20, v33, v20
; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_max_f32_e32 v19, v33, v19
; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX8-NEXT: v_max_f32_e32 v18, v33, v18
; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_max_f32_e32 v17, v33, v17
; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
; GFX9-NEXT: v_max_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_max_f32_e32 v29, v15, v29
; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_max_f32_e32 v28, v33, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_max_f32_e32 v27, v33, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_max_f32_e32 v26, v33, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_max_f32_e32 v25, v33, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v13
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX10-NEXT: v_max_f32_e32 v31, v32, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v12
; GFX10-NEXT: v_max_f32_e32 v30, v14, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v29
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_bfe_u32 v32, v31, 16, 1
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v31
; GFX10-NEXT: v_bfe_u32 v35, v30, 16, 1
; GFX10-NEXT: v_max_f32_e32 v33, v33, v14
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
; GFX10-NEXT: v_add3_u32 v32, v32, v31, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_add3_u32 v31, v35, v30, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v35, v13, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX10-NEXT: v_cndmask_b32_e32 v14, v32, v34, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v30
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v21
; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v5
; GFX10-NEXT: v_add3_u32 v30, v34, v33, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_max_f32_e32 v34, v36, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_max_f32_e32 v33, v12, v28
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_max_f32_e32 v35, v36, v12
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v10
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX10-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_max_f32_e32 v34, v11, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v26
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_max_f32_e32 v33, v36, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v3
; GFX10-NEXT: v_cndmask_b32_e32 v27, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_max_f32_e32 v35, v10, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v25
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v2
; GFX10-NEXT: v_cndmask_b32_e32 v11, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v33, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_max_f32_e32 v34, v36, v10
; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v32, v35, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v24
; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v1
; GFX10-NEXT: v_cndmask_b32_e32 v10, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v35, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v35
; GFX10-NEXT: v_bfe_u32 v32, v34, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX10-NEXT: v_max_f32_e32 v33, v36, v33
; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v7
; GFX10-NEXT: v_cndmask_b32_e32 v25, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v30, v32, v34, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v31, 0x400000, v34
; GFX10-NEXT: v_bfe_u32 v32, v9, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_max_f32_e32 v24, v35, v24
; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v32, v9, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1
; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v24, v24
; GFX10-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v31, v34, v33, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v32, 0x400000, v33
; GFX10-NEXT: v_bfe_u32 v34, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX10-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v7, v7
; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo
; GFX10-NEXT: v_add3_u32 v32, v34, v8, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v22
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v6
; GFX10-NEXT: v_add3_u32 v24, v35, v7, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_max_f32_e32 v8, v34, v8
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v8, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v6, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX10-NEXT: v_add3_u32 v7, v35, v8, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v35, v38, v37
; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v21
; GFX10-NEXT: v_bfe_u32 v37, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v6
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v22, s6
; GFX10-NEXT: v_bfe_u32 v21, v35, 16, 1
; GFX10-NEXT: v_max_f32_e32 v5, v5, v8
; GFX10-NEXT: v_add3_u32 v37, v37, v6, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20
; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX10-NEXT: v_add3_u32 v6, v21, v35, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
; GFX10-NEXT: v_bfe_u32 v48, v5, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v35
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v35, v35
; GFX10-NEXT: v_max_f32_e32 v8, v21, v8
; GFX10-NEXT: v_add3_u32 v21, v48, v5, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v19
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v20, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v5, v5
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_max_f32_e32 v48, v49, v48
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_add3_u32 v20, v20, v8, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v8, v8
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v4, v4
; GFX10-NEXT: v_bfe_u32 v4, v48, 16, 1
; GFX10-NEXT: v_max_f32_e32 v49, v51, v49
; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v48
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v48, v48
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_add3_u32 v4, v4, v48, 0x7fff
; GFX10-NEXT: v_bfe_u32 v48, v49, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v49, v49
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v39, s8
; GFX10-NEXT: v_add3_u32 v19, v48, v49, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v49
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v17
; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v35, s9
; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v50, s10
; GFX10-NEXT: v_max_f32_e32 v49, v52, v49
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s11
; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_bfe_u32 v18, v49, 16, 1
; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v49
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v49, v49
; GFX10-NEXT: v_bfe_u32 v39, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v18, v18, v49, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add3_u32 v39, v39, v1, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v48, s13
; GFX10-NEXT: v_max_f32_e32 v17, v49, v17
; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX10-NEXT: v_cndmask_b32_e32 v1, v39, v35, vcc_lo
; GFX10-NEXT: v_bfe_u32 v22, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v49, v17, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v17
; GFX10-NEXT: v_bfe_u32 v50, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v49, v49, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_add3_u32 v50, v50, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v36, s4
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v49, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v38, s7
; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v22, v22, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v34, s5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v50, v48, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v52, s14
; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v38, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v51, s12
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX10-NEXT: v_perm_b32 v9, v9, v30, 0x7060302
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v36, v34, vcc_lo
; GFX10-NEXT: v_perm_b32 v10, v25, v10, 0x7060302
; GFX10-NEXT: v_perm_b32 v11, v26, v11, 0x7060302
; GFX10-NEXT: v_perm_b32 v12, v27, v12, 0x7060302
; GFX10-NEXT: v_perm_b32 v13, v28, v13, 0x7060302
; GFX10-NEXT: v_perm_b32 v3, v3, v4, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v5, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v5, v21, v6, 0x7060302
; GFX10-NEXT: v_perm_b32 v6, v37, v7, 0x7060302
; GFX10-NEXT: v_perm_b32 v7, v24, v23, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v29, v14, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v16
; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX10-NEXT: v_max_f32_e32 v17, v33, v8
; GFX10-NEXT: v_max_f32_e32 v15, v15, v16
; GFX10-NEXT: v_perm_b32 v8, v32, v31, 0x7060302
; GFX10-NEXT: v_bfe_u32 v16, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX10-NEXT: v_add3_u32 v16, v16, v17, 0x7fff
; GFX10-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX10-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_maxnum_v32bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11TRUE16-NEXT: v_max_f32_e32 v5, v5, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
; GFX11TRUE16-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v24, 16, v24
; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_max_f32_e32 v3, v3, v19
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
; GFX11TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11TRUE16-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v18, 16, v18
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GFX11TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11TRUE16-NEXT: v_max_f32_e32 v2, v2, v18
; GFX11TRUE16-NEXT: v_dual_max_f32 v18, v84, v83 :: v_dual_max_f32 v9, v9, v25
; GFX11TRUE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11TRUE16-NEXT: v_max_f32_e32 v17, v86, v85
; GFX11TRUE16-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v39, 0xffff0000, v27
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11TRUE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11TRUE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11TRUE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v16
; GFX11TRUE16-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_and_b32 v37, 0xffff0000, v28
; GFX11TRUE16-NEXT: v_max_f32_e32 v7, v7, v23
; GFX11TRUE16-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v28, 16, v28
; GFX11TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11TRUE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11TRUE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11TRUE16-NEXT: v_max_f32_e32 v4, v4, v20
; GFX11TRUE16-NEXT: v_max_f32_e32 v20, v80, v71
; GFX11TRUE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11TRUE16-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_lshlrev_b32 v10, 16, v10
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11TRUE16-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_lshlrev_b32 v27, 16, v27
; GFX11TRUE16-NEXT: v_dual_max_f32 v26, v52, v51 :: v_dual_max_f32 v25, v54, v53
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_max_f32_e32 v6, v6, v22
; GFX11TRUE16-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v36, 0xffff0000, v13
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
; GFX11TRUE16-NEXT: v_max_f32_e32 v22, v68, v67
; GFX11TRUE16-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v38, 0xffff0000, v12
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11TRUE16-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v12, 16, v12
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_max_f32 v29, v38, v37 :: v_dual_lshlrev_b32 v30, 16, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
; GFX11TRUE16-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v15, 16, v15
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_max_f32_e32 v14, v14, v30
; GFX11TRUE16-NEXT: v_max_f32_e32 v28, v48, v39
; GFX11TRUE16-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
; GFX11TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11TRUE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11TRUE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11TRUE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11TRUE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11TRUE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11TRUE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11TRUE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11TRUE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11TRUE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11TRUE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11TRUE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11TRUE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11TRUE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11TRUE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v27
; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v28
; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v26
; GFX11TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v29
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30
; GFX11TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v25
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v24
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v23
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v22
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v21
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v20
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v19
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v18
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v32
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v32
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_max_f32_e32 v15, v15, v33
; GFX11TRUE16-NEXT: v_max_f32_e32 v17, v31, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v18, v15, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
; GFX11TRUE16-NEXT: v_add3_u32 v18, v18, v15, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v17, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v21, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v17
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v32bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v21
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX11FAKE16-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v24
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v19
; GFX11FAKE16-NEXT: v_bfe_u32 v103, v5, 16, 1
; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v135, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v5
; GFX11FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v16
; GFX11FAKE16-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v129, v19, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v130, 0x400000, v19
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v119, v3, 16, 1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX11FAKE16-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
; GFX11FAKE16-NEXT: v_or_b32_e32 v128, 0x400000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v145, v17, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v146, 0x400000, v17
; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v4
; GFX11FAKE16-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v23
; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
; GFX11FAKE16-NEXT: v_max_f32_e32 v2, v2, v18
; GFX11FAKE16-NEXT: v_max_f32_e32 v0, v0, v16
; GFX11FAKE16-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
; GFX11FAKE16-NEXT: v_max_f32_e32 v7, v7, v23
; GFX11FAKE16-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83
; GFX11FAKE16-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v85, v24, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v97, v23, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v24
; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v23
; GFX11FAKE16-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v20
; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX11FAKE16-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v7
; GFX11FAKE16-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_max_f32_e32 v4, v4, v20
; GFX11FAKE16-NEXT: v_max_f32_e32 v20, v80, v71
; GFX11FAKE16-NEXT: v_bfe_u32 v71, v9, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX11FAKE16-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11FAKE16-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX11FAKE16-NEXT: v_max_f32_e32 v26, v52, v51
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_max_f32_e32 v6, v6, v22
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11FAKE16-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
; GFX11FAKE16-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
; GFX11FAKE16-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11FAKE16-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
; GFX11FAKE16-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_max_f32_e32 v29, v38, v37
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX11FAKE16-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_max_f32_e32 v14, v14, v30
; GFX11FAKE16-NEXT: v_max_f32_e32 v28, v48, v39
; GFX11FAKE16-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
; GFX11FAKE16-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v35, v14, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v16, v33, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11FAKE16-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11FAKE16-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v13
; GFX11FAKE16-NEXT: v_bfe_u32 v49, v29, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11FAKE16-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11FAKE16-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11FAKE16-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v26
; GFX11FAKE16-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11FAKE16-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v25
; GFX11FAKE16-NEXT: v_bfe_u32 v83, v8, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v8
; GFX11FAKE16-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v99, v6, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11FAKE16-NEXT: v_bfe_u32 v101, v22, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v22
; GFX11FAKE16-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11FAKE16-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v21
; GFX11FAKE16-NEXT: v_bfe_u32 v115, v4, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v117, v20, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11FAKE16-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v133, v18, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11FAKE16-NEXT: v_bfe_u32 v147, v0, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
; GFX11FAKE16-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11FAKE16-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11FAKE16-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
; GFX11FAKE16-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
; GFX11FAKE16-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11FAKE16-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11FAKE16-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11FAKE16-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11FAKE16-NEXT: v_max_f32_e32 v15, v15, v18
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11FAKE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
ret <32 x bfloat> %op
}
declare bfloat @llvm.sqrt.bf16(bfloat)
; Scalar bfloat square root. The IR body is a single call to llvm.sqrt.bf16
; whose result is returned unchanged; every other line in this function is the
; expected llc output for one check prefix.
;
; What the expectations below show
;  * GCN and GFX7 mask v0 with 0xffff0000 both before and after the f32
;    sequence, so the bfloat is handled as the high half of a 32-bit register.
;  * GFX8 and later shift the argument left by 16, run the f32 sequence, then
;    narrow the result with v_bfe_u32 (bit 16), an add of 0x7fff, v_or_b32 with
;    0x400000 selected by v_cmp_u_f32, and a final shift right by 16. The
;    GFX11TRUE16 output copies v0.h into v0.l instead of shifting.
;  * Under every prefix the input is multiplied by 0x4f800000 when it compares
;    below 0xf800000. The v_sqrt_f32 result is then compared against its
;    integer -1 and +1 neighbours through two v_fma_f32 residuals, multiplied
;    by 0x37800000 under the same condition, and replaced by the input when
;    v_cmp_class_f32 with mask 0x260 matches.
;
; NOTE(review) the narrowing sequence looks like round-to-nearest-even with a
; NaN fixup, and 0x260 looks like the zero and +inf classes - confirm against
; the AMDGPU lowering before relying on this description.
; NOTE(review) these check lines appear to be machine generated; presumably
; they should be regenerated with the update script rather than edited by
; hand - confirm against the file header, which is outside this chunk.
define bfloat @v_sqrt_bf16(bfloat %a) {
; GCN-LABEL: v_sqrt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0xf800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x260
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_sqrt_f32_e32 v2, v0
; GCN-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
; GCN-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
; GCN-NEXT: v_fma_f32 v5, -v3, v2, v0
; GCN-NEXT: v_fma_f32 v6, -v4, v2, v0
; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sqrt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0xf800000
; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX7-NEXT: v_sqrt_f32_e32 v1, v0
; GFX7-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
; GFX7-NEXT: v_fma_f32 v3, -v2, v1, v0
; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
; GFX7-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
; GFX7-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
; GFX7-NEXT: v_fma_f32 v1, -v3, v1, v0
; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
; GFX7-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7-NEXT: v_mov_b32_e32 v2, 0x260
; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sqrt_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0xf800000
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: v_sqrt_f32_e32 v1, v0
; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], -1, v1
; GFX8-NEXT: v_fma_f32 v3, -v2, v1, v0
; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], 1, v1
; GFX8-NEXT: v_fma_f32 v1, -v3, v1, v0
; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
; GFX8-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, 0x260
; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sqrt_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0xf800000
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_sqrt_f32_e32 v1, v0
; GFX9-NEXT: v_add_u32_e32 v2, -1, v1
; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
; GFX9-NEXT: v_add_u32_e32 v3, 1, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0
; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
; GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, 0x260
; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sqrt_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_sqrt_f32_e32 v1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v1
; GFX10-NEXT: v_fma_f32 v4, -v2, v1, v0
; GFX10-NEXT: v_fma_f32 v5, -v3, v1, v0
; GFX10-NEXT: v_cmp_ge_f32_e64 s4, 0, v4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4
; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sqrt_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_sqrt_f32_e32 v1, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v1
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v1
; GFX11TRUE16-NEXT: v_fma_f32 v4, -v2, v1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_fma_f32 v5, -v3, v1, v0
; GFX11TRUE16-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
; GFX11TRUE16-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
; GFX11TRUE16-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sqrt_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_sqrt_f32_e32 v1, v0
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v2, -1, v1
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v3, 1, v1
; GFX11FAKE16-NEXT: v_fma_f32 v4, -v2, v1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_fma_f32 v5, -v3, v1, v0
; GFX11FAKE16-NEXT: v_cmp_ge_f32_e64 s0, 0, v4
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
; GFX11FAKE16-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
; GFX11FAKE16-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.sqrt.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
; Scalar bfloat ldexp with an i32 exponent. The IR body is a single call to
; llvm.ldexp.bf16.i32 on (%a, %b) whose result is returned unchanged; every
; other line is the expected llc output for one check prefix.
;
; What the expectations below show
;  * Under every prefix the operation is a single v_ldexp_f32 with v0 as the
;    value operand and v1 as the exponent operand.
;  * GCN and GFX7 mask v0 with 0xffff0000 before and after that instruction.
;  * GFX8 and later shift v0 left by 16 first, then narrow the f32 result with
;    v_bfe_u32 (bit 16), an add of 0x7fff, v_or_b32 with 0x400000 selected by
;    v_cmp_u_f32, and a shift right by 16. The GFX11TRUE16 output copies v0.h
;    into v0.l instead of shifting.
;
; NOTE(review) the narrowing sequence looks like round-to-nearest-even with a
; NaN fixup - confirm against the AMDGPU lowering.
define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GCN-LABEL: v_ldexp_bf16_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_ldexp_bf16_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ldexp_bf16_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ldexp_bf16_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ldexp_bf16_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_ldexp_bf16_i32:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_ldexp_bf16_i32:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
ret bfloat %op
}
declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
; Scalar bfloat frexp returning a { bfloat, i16 } pair. The IR body is a single
; call to llvm.frexp.bf16.i16 whose aggregate result is returned unchanged;
; every other line is the expected llc output for one check prefix.
;
; What the expectations below show
;  * Under every prefix the fraction is produced by v_frexp_mant_f32 into v0
;    and the exponent by v_frexp_exp_i32_f32 into v1 (GCN routes them through
;    v1 and v2 before its final selects).
;  * Only the GCN output guards the results. It compares |v0| against
;    0x7f800000 and, when the compare fails, keeps the original input as the
;    fraction and 0 as the exponent. GFX7 and later have no such compare.
;  * GCN and GFX7 mask v0 with 0xffff0000 before and after the f32
;    instructions.
;  * GFX8 and later shift the input left by 16 into v1, then narrow the
;    fraction with v_bfe_u32 (bit 16), an add of 0x7fff, v_or_b32 with 0x400000
;    selected by v_cmp_u_f32, and a shift right by 16. The GFX11TRUE16 output
;    copies v0.h into v0.l instead of shifting.
;
; NOTE(review) the extra compare in the GCN output is presumably a workaround
; for how that subtarget handles inf and nan inputs - confirm against the
; AMDGPU backend before relying on this.
define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GCN-LABEL: v_frexp_bf16_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x7f800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0
; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_frexp_bf16_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_frexp_bf16_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_frexp_bf16_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_frexp_bf16_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_frexp_bf16_i16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_frexp_bf16_i16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
ret { bfloat, i16 } %op
}
declare bfloat @llvm.log.bf16(bfloat)
declare bfloat @llvm.log2.bf16(bfloat)
declare bfloat @llvm.log10.bf16(bfloat)
define bfloat @v_log_bf16(bfloat %a) {
; GCN-LABEL: v_log_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
; GCN-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; GCN-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v3, v5, v3
; GCN-NEXT: v_add_f32_e32 v2, v2, v3
; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3f317217
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
; GFX7-NEXT: s_mov_b32 s4, 0x3377d1cf
; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
; GFX7-NEXT: s_mov_b32 s4, 0x7f800000
; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX8-NEXT: v_mul_f32_e32 v3, 0x3f317000, v2
; GFX8-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2
; GFX8-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
; GFX8-NEXT: v_add_f32_e32 v2, v4, v2
; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_log_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf
; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_log_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_log_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_log_f32_e32 v0, v0
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX11FAKE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX11FAKE16-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log.bf16(bfloat %a)
ret bfloat %op
}
; Codegen test for llvm.log2.bf16 on a scalar bfloat argument.
;
; Shape of the expected code, as pinned by the checks below
;  * the bf16 input is widened to f32 (v_lshlrev_b32 by 16 on GFX8 and
;    newer prefixes; v_mul_f32 by 1.0 plus a 0xffff0000 mask on GCN/GFX7);
;  * inputs that compare below 0x800000 (smallest normal f32 bit pattern)
;    are scaled with v_ldexp_f32 by 32 before v_log_f32;
;  * in the scaled case 0x42000000 (32.0 as f32) is subtracted afterwards;
;  * GCN/GFX7 truncate the result with a 0xffff0000 mask, while the other
;    prefixes expect the v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask
;    sequence followed by a 16-bit shift (or v_mov_b16 for GFX11TRUE16).
;
; The assertions appear to be auto-generated; regenerate them with the
; update script instead of editing by hand (presumably
; update_llc_test_checks.py - confirm against the file header).
define bfloat @v_log2_bf16(bfloat %a) {
; GCN-LABEL: v_log2_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log2_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log2_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_log2_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log2_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_log2_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_log2_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_log_f32_e32 v0, v0
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test - a single intrinsic call whose result is returned directly.
%op = call bfloat @llvm.log2.bf16(bfloat %a)
ret bfloat %op
}
; Codegen test for llvm.log10.bf16 on a scalar bfloat argument.
;
; Shape of the expected code, as pinned by the checks below
;  * the bf16 input is widened to f32 and, when it compares below 0x800000,
;    scaled with v_ldexp_f32 by 32 before v_log_f32;
;  * the v_log_f32 result is multiplied by a constant split into a high and
;    a low part. GFX7, GFX9, GFX10 and both GFX11 prefixes use v_fma_f32 /
;    v_fmamk_f32 with 0x3e9a209a and 0x3284fbcf; GCN and GFX8 instead mask
;    the value with 0xfffff000 and use v_mul/v_add with 0x3e9a2000 and
;    0x369a84fb (presumably log10(2) in extended precision - confirm);
;  * the product is only selected when |x| compares below 0x7f800000,
;    otherwise the raw v_log_f32 result is kept;
;  * in the scaled case 0x411a209b is subtracted (presumably 32*log10(2));
;  * GCN/GFX7 truncate the result with a 0xffff0000 mask, while the other
;    prefixes expect the v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask
;    sequence followed by a 16-bit shift (or v_mov_b16 for GFX11TRUE16).
;
; The assertions appear to be auto-generated; regenerate them with the
; update script instead of editing by hand (presumably
; update_llc_test_checks.py - confirm against the file header).
define bfloat @v_log10_bf16(bfloat %a) {
; GCN-LABEL: v_log10_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
; GCN-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; GCN-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v3, v5, v3
; GCN-NEXT: v_add_f32_e32 v2, v2, v3
; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log10_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
; GFX7-NEXT: s_mov_b32 s4, 0x3284fbcf
; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
; GFX7-NEXT: s_mov_b32 s4, 0x7f800000
; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log10_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX8-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v2
; GFX8-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2
; GFX8-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
; GFX8-NEXT: v_add_f32_e32 v2, v4, v2
; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_log10_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf
; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log10_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_log10_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_log10_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_log_f32_e32 v0, v0
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX11FAKE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX11FAKE16-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test - a single intrinsic call whose result is returned directly.
%op = call bfloat @llvm.log10.bf16(bfloat %a)
ret bfloat %op
}
; Declarations of the bfloat overloads of the exp, exp2 and exp10 intrinsics,
; each taking one bfloat and returning a bfloat.
declare bfloat @llvm.exp.bf16(bfloat)
declare bfloat @llvm.exp2.bf16(bfloat)
declare bfloat @llvm.exp10.bf16(bfloat)
; Codegen test for llvm.exp.bf16 on a scalar bfloat argument.
;
; Shape of the expected code, as pinned by the checks below
;  * the bf16 input is widened to f32 (v_lshlrev_b32 by 16 on GFX8 and
;    newer prefixes; v_mul_f32 by 1.0 plus a 0xffff0000 mask on GCN/GFX7);
;  * the input is multiplied by a constant split into a high and a low
;    part. GFX7, GFX9, GFX10 and both GFX11 prefixes use v_fma_f32 /
;    v_fmamk_f32 with 0x3fb8aa3b and 0x32a5705f; GCN and GFX8 use
;    v_mul/v_add with 0x3fb8a000 and 0x39a3b295 (presumably log2(e) in
;    extended precision - confirm);
;  * the product is split with v_rndne_f32; the fractional part goes
;    through v_exp_f32 and the rounded part, converted by v_cvt_i32_f32,
;    is applied with v_ldexp_f32;
;  * the result is replaced by 0 when the input is below 0xc2ce8ed0 and by
;    0x7f800000 when it is above 0x42b17218 (v_cmp_ngt / v_cmp_nlt);
;  * GCN/GFX7 truncate the result with a 0xffff0000 mask, while the other
;    prefixes expect the v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask
;    sequence followed by a 16-bit shift (or v_mov_b16 for GFX11TRUE16).
;
; The assertions appear to be auto-generated; regenerate them with the
; update script instead of editing by hand (presumably
; update_llc_test_checks.py - confirm against the file header).
define bfloat @v_exp_bf16(bfloat %a) {
; GCN-LABEL: v_exp_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; GCN-NEXT: s_mov_b32 s5, 0x42b17218
; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
; GCN-NEXT: v_rndne_f32_e32 v5, v2
; GCN-NEXT: v_mul_f32_e32 v6, 0x39a3b295, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3
; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v6
; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v2, v2, v3
; GCN-NEXT: v_exp_f32_e32 v2, v2
; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
; GFX7-NEXT: s_mov_b32 s4, 0x32a5705f
; GFX7-NEXT: v_rndne_f32_e32 v3, v1
; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
; GFX7-NEXT: v_exp_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3
; GFX7-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GFX7-NEXT: s_mov_b32 s4, 0x42b17218
; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v0
; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v3
; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3
; GFX8-NEXT: v_rndne_f32_e32 v2, v1
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_exp_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX8-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GFX8-NEXT: s_mov_b32 s4, 0x42b17218
; GFX8-NEXT: v_ldexp_f32 v1, v1, v2
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GFX9-NEXT: v_rndne_f32_e32 v2, v1
; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f
; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
; GFX9-NEXT: v_exp_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GFX9-NEXT: s_mov_b32 s4, 0x42b17218
; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
; GFX10-NEXT: v_rndne_f32_e32 v2, v1
; GFX10-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_exp_f32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_exp_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1
; GFX11TRUE16-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_exp_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX11FAKE16-NEXT: v_rndne_f32_e32 v2, v1
; GFX11FAKE16-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX11FAKE16-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_exp_f32_e32 v1, v1
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; IR under test - a single intrinsic call whose result is returned directly.
%op = call bfloat @llvm.exp.bf16(bfloat %a)
ret bfloat %op
}
define bfloat @v_exp2_bf16(bfloat %a) {
; GCN-LABEL: v_exp2_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000
; GCN-NEXT: v_not_b32_e32 v2, 63
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp2_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-NEXT: v_exp_f32_e32 v0, v0
; GFX7-NEXT: v_not_b32_e32 v1, 63
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp2_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0xc2fc0000
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
; GFX8-NEXT: v_not_b32_e32 v1, 63
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp2_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_not_b32_e32 v1, 63
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp2_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_exp_f32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_exp2_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_exp2_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_exp_f32_e32 v0, v0
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.exp2.bf16(bfloat %a)
ret bfloat %op
}
; Codegen test for llvm.exp10 on a scalar bfloat argument. The expected output
; is pinned for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16
; check prefixes.
;
; What the expected code shows:
;  * GCN and GFX7 mask the value with 0xffff0000 before and after the f32
;    computation; GFX8 and later shift the input left by 16 instead.
;  * GCN and GFX8 use a v_mul/v_sub/v_add expansion with the constants
;    0x40549000 and 0x3a2784bc; GFX7, GFX9, GFX10 and GFX11 use v_fma_f32 (and
;    v_fmamk_f32 on GFX10/GFX11) with 0x40549a78 and 0x33979a37.
;  * Every target then runs v_exp_f32 + v_ldexp_f32 and selects 0 or
;    0x7f800000 based on compares of the input against 0xc23369f4 and
;    0x421a209b.
;  * GFX8 and later narrow the f32 result with v_bfe_u32 / add 0x7fff /
;    v_cmp_u_f32 / v_cndmask (this appears to be round-to-nearest-even with
;    NaNs quieted via the 0x400000 bit) and then move the upper 16 bits down.
;
; NOTE(review): the check lines look tool-generated (update_llc_test_checks.py
; style); presumably they should be regenerated rather than edited by hand.
; Confirm against the header and RUN lines at the top of the file.
define bfloat @v_exp10_bf16(bfloat %a) {
; GCN-LABEL: v_exp10_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0xc23369f4
; GCN-NEXT: s_mov_b32 s5, 0x421a209b
; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
; GCN-NEXT: v_rndne_f32_e32 v5, v2
; GCN-NEXT: v_mul_f32_e32 v6, 0x3a2784bc, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x40549000, v3
; GCN-NEXT: v_sub_f32_e32 v2, v2, v5
; GCN-NEXT: v_add_f32_e32 v3, v3, v6
; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v2, v2, v3
; GCN-NEXT: v_exp_f32_e32 v2, v2
; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp10_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x40549a78
; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
; GFX7-NEXT: s_mov_b32 s4, 0x33979a37
; GFX7-NEXT: v_rndne_f32_e32 v3, v1
; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2
; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX7-NEXT: v_add_f32_e32 v1, v1, v2
; GFX7-NEXT: v_exp_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3
; GFX7-NEXT: s_mov_b32 s4, 0xc23369f4
; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GFX7-NEXT: s_mov_b32 s4, 0x421a209b
; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp10_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x40549000, v0
; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v3
; GFX8-NEXT: v_mul_f32_e32 v3, 0x40549000, v3
; GFX8-NEXT: v_rndne_f32_e32 v2, v1
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX8-NEXT: v_add_f32_e32 v3, v4, v3
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_exp_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX8-NEXT: s_mov_b32 s4, 0xc23369f4
; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GFX8-NEXT: s_mov_b32 s4, 0x421a209b
; GFX8-NEXT: v_ldexp_f32 v1, v1, v2
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_exp10_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX9-NEXT: s_mov_b32 s4, 0x40549a78
; GFX9-NEXT: v_rndne_f32_e32 v2, v1
; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
; GFX9-NEXT: s_mov_b32 s4, 0x33979a37
; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
; GFX9-NEXT: v_exp_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; GFX9-NEXT: s_mov_b32 s4, 0x421a209b
; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp10_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
; GFX10-NEXT: v_rndne_f32_e32 v2, v1
; GFX10-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_exp_f32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_exp10_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1
; GFX11TRUE16-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_exp10_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX11FAKE16-NEXT: v_rndne_f32_e32 v2, v1
; GFX11FAKE16-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX11FAKE16-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_exp_f32_e32 v1, v1
; GFX11FAKE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.exp10.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.ceil.bf16(bfloat)
; Codegen test for llvm.ceil on a scalar bfloat argument. The expected output
; is pinned for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16
; check prefixes.
;
; What the expected code shows:
;  * Every target performs the operation with v_ceil_f32.
;  * GCN and GFX7 mask the value with 0xffff0000 before and after it.
;  * GFX8 and later shift the input left by 16, then narrow the f32 result
;    with v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask (this appears to be
;    round-to-nearest-even with NaNs quieted via the 0x400000 bit).
;  * The final move of the upper 16 bits is v_lshrrev_b32 by 16, except for
;    GFX11TRUE16 which uses v_mov_b16 v0.l, v0.h.
;
; NOTE(review): the check lines look tool-generated (update_llc_test_checks.py
; style); presumably they should be regenerated rather than edited by hand.
define bfloat @v_ceil_bf16(bfloat %a) {
; GCN-LABEL: v_ceil_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_ceil_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_ceil_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_ceil_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ceil_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_ceil_f32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ceil_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_ceil_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ceil_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_ceil_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_ceil_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_ceil_f32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_ceil_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_ceil_f32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.ceil.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.trunc.bf16(bfloat)
; Codegen test for llvm.trunc on a scalar bfloat argument. The expected output
; is pinned for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16
; check prefixes.
;
; What the expected code shows:
;  * Every target performs the operation with v_trunc_f32.
;  * GCN and GFX7 mask the value with 0xffff0000 before and after it.
;  * GFX8 and later shift the input left by 16, then narrow the f32 result
;    with v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask (this appears to be
;    round-to-nearest-even with NaNs quieted via the 0x400000 bit).
;  * The final move of the upper 16 bits is v_lshrrev_b32 by 16, except for
;    GFX11TRUE16 which uses v_mov_b16 v0.l, v0.h.
;
; NOTE(review): the check lines look tool-generated (update_llc_test_checks.py
; style); presumably they should be regenerated rather than edited by hand.
define bfloat @v_trunc_bf16(bfloat %a) {
; GCN-LABEL: v_trunc_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_trunc_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_trunc_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_trunc_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_trunc_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_trunc_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_trunc_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_trunc_f32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.trunc.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.rint.bf16(bfloat)
; Codegen test for llvm.rint on a scalar bfloat argument. The expected output
; is pinned for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16
; check prefixes.
;
; What the expected code shows:
;  * Every target performs the operation with v_rndne_f32.
;  * GCN and GFX7 mask the value with 0xffff0000 before and after it.
;  * GFX8 and later shift the input left by 16, then narrow the f32 result
;    with v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask (this appears to be
;    round-to-nearest-even with NaNs quieted via the 0x400000 bit).
;  * The final move of the upper 16 bits is v_lshrrev_b32 by 16, except for
;    GFX11TRUE16 which uses v_mov_b16 v0.l, v0.h.
;
; NOTE(review): the check lines look tool-generated (update_llc_test_checks.py
; style); presumably they should be regenerated rather than edited by hand.
define bfloat @v_rint_bf16(bfloat %a) {
; GCN-LABEL: v_rint_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_rint_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rint_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_rndne_f32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_rint_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rint_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_rint_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_rint_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_rndne_f32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.rint.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.nearbyint.bf16(bfloat)
; Codegen test for llvm.nearbyint on a scalar bfloat argument. The expected
; output is pinned for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and
; GFX11FAKE16 check prefixes.
;
; What the expected code shows:
;  * Every target performs the operation with v_rndne_f32.
;  * GCN and GFX7 mask the value with 0xffff0000 before and after it.
;  * GFX8 and later shift the input left by 16, then narrow the f32 result
;    with v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask (this appears to be
;    round-to-nearest-even with NaNs quieted via the 0x400000 bit).
;  * The final move of the upper 16 bits is v_lshrrev_b32 by 16, except for
;    GFX11TRUE16 which uses v_mov_b16 v0.l, v0.h.
;
; NOTE(review): the check lines look tool-generated (update_llc_test_checks.py
; style); presumably they should be regenerated rather than edited by hand.
define bfloat @v_nearbyint_bf16(bfloat %a) {
; GCN-LABEL: v_nearbyint_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_nearbyint_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_nearbyint_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_rndne_f32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_nearbyint_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_nearbyint_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_nearbyint_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_nearbyint_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_rndne_f32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.round.bf16(bfloat)
; Codegen test for llvm.round on a scalar bfloat argument. The expected output
; is pinned for the GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16
; check prefixes.
;
; What the expected code shows:
;  * Every target expands the operation the same way: v_trunc_f32 of the
;    input, subtract it from the input, compare the absolute difference
;    against 0.5 to pick 1.0 or 0, merge that with the input through v_bfi_b32
;    under a 0x7fffffff mask (presumably keeping the input's sign bit), and
;    add the merged value to the truncated one.
;  * GCN through GFX9 materialize the mask with s_brev_b32 s4, -2; GFX10 and
;    GFX11 use the 0x7fffffff literal directly.
;  * GCN and GFX7 mask the value with 0xffff0000 before and after the
;    computation; GFX8 and later shift the input left by 16 and narrow the
;    result with v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask (this
;    appears to be round-to-nearest-even with NaNs quieted via 0x400000).
;  * The final move of the upper 16 bits is v_lshrrev_b32 by 16, except for
;    GFX11TRUE16 which uses v_mov_b16 v0.l, v0.h.
;
; NOTE(review): the check lines look tool-generated (update_llc_test_checks.py
; style); presumably they should be regenerated rather than edited by hand.
define bfloat @v_round_bf16(bfloat %a) {
; GCN-LABEL: v_round_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v0
; GCN-NEXT: v_sub_f32_e32 v2, v0, v1
; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
; GCN-NEXT: s_brev_b32 s4, -2
; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0
; GCN-NEXT: v_add_f32_e32 v0, v1, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_round_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_trunc_f32_e32 v1, v0
; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
; GFX7-NEXT: s_brev_b32 s4, -2
; GFX7-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX7-NEXT: v_add_f32_e32 v0, v1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_round_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v1, v0
; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
; GFX8-NEXT: s_brev_b32 s4, -2
; GFX8-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_round_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v0
; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_round_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f32_e32 v1, v0
; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_round_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_trunc_f32_e32 v1, v0
; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_round_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_trunc_f32_e32 v1, v0
; GFX11FAKE16-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.round.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.roundeven.bf16(bfloat)
; Codegen test for llvm.roundeven on a scalar bfloat argument.
; Expected output, per the assertions below: the GCN and GFX7 runs apply
; v_rndne_f32 to the input after v_mul_f32 by 1.0 and a 0xffff0000 mask, then
; mask the result again. GFX8 and later shift the input left by 16, apply
; v_rndne_f32, and narrow the f32 result with a v_bfe_u32 / add 0x7fff /
; v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably the
; usual f32-to-bf16 rounding with NaN handling), ending in a 16-bit right
; shift, or a v_mov_b16 from v0.h to v0.l in the GFX11TRUE16 run.
define bfloat @v_roundeven_bf16(bfloat %a) {
; GCN-LABEL: v_roundeven_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_rndne_f32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_roundeven_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_roundeven_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_roundeven_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_rndne_f32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.roundeven.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.floor.bf16(bfloat)
; Codegen test for llvm.floor on a scalar bfloat argument.
; Expected output, per the assertions below: the GCN and GFX7 runs apply
; v_floor_f32 to the input after v_mul_f32 by 1.0 and a 0xffff0000 mask, then
; mask the result again. GFX8 and later shift the input left by 16, apply
; v_floor_f32, and narrow the f32 result with a v_bfe_u32 / add 0x7fff /
; v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably the
; usual f32-to-bf16 rounding with NaN handling), ending in a 16-bit right
; shift, or a v_mov_b16 from v0.h to v0.l in the GFX11TRUE16 run.
define bfloat @v_floor_bf16(bfloat %a) {
; GCN-LABEL: v_floor_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_floor_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_floor_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_floor_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_floor_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_floor_f32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_floor_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_floor_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_floor_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_floor_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_floor_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_floor_f32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_floor_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_floor_f32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.floor.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.canonicalize.bf16(bfloat)
; Codegen test for llvm.canonicalize on a scalar bfloat argument.
; Expected output, per the assertions below: the GCN and GFX7 runs emit
; v_mul_f32 by 1.0 followed by two 0xffff0000 masks. The GFX8 run shifts the
; input left by 16 and multiplies by 1.0, while GFX9 and later use
; v_max_f32 v0, v0, v0 instead of the multiply. GFX8 and later then narrow
; the f32 result with a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
; v_cmp_u_f32 / v_cndmask_b32 sequence (presumably the usual f32-to-bf16
; rounding with NaN handling), ending in a 16-bit right shift, or a
; v_mov_b16 from v0.h to v0.l in the GFX11TRUE16 run.
define bfloat @v_canonicalize_bf16(bfloat %a) {
; GCN-LABEL: v_canonicalize_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_canonicalize_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_canonicalize_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_canonicalize_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_canonicalize_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_canonicalize_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_canonicalize_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_max_f32_e32 v0, v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
ret bfloat %op
}
declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
; FIXME: Promotion is broken for llvm.arithmetic.fence.bf16, so the test below
; is left commented out.
; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
; %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
; ret bfloat %op
; }
; Codegen test for "fcmp false" on two scalar bfloat arguments.
; Every run below expects only a v_mov_b32 of the constant 0 into v0 before
; the return; no comparison instruction and no use of the operands appears in
; the expected output.
define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_false_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_false_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_false_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_false_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_false_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_false_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp false bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp oeq" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_eq_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_oeq_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_oeq_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_oeq_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_oeq_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oeq_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_oeq_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp oeq bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp ogt" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_gt_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ogt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ogt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ogt_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ogt_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ogt_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ogt_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ogt bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp oge" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_ge_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_oge_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_oge_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_oge_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_oge_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oge_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_oge_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp oge bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp olt" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_lt_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_olt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_olt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_olt_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_olt_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_olt_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_olt_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp olt bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp ole" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_le_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ole_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ole_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ole_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ole_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ole_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ole_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ole bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp one" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_lg_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_one_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_one_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_one_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_one_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_one_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_one_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp one bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp uno" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_u_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_uno_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_uno_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_uno_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_uno_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uno_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_uno_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp uno bfloat %a, %b
ret i1 %op
}
; Codegen test for "fcmp ueq" on two scalar bfloat arguments.
; Expected output, per the assertions below: the GCN and GFX7 runs multiply
; each operand by 1.0 and mask it with 0xffff0000; GFX8 and later shift each
; operand left by 16 instead. All runs then compare with v_cmp_nlg_f32 and
; turn the condition into 0 or 1 in v0 with v_cndmask_b32.
define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ueq_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ueq_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ueq_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ueq_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ueq_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ueq_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ueq bfloat %a, %b
ret i1 %op
}
; Codegen test for `fcmp ugt` on two scalar bfloat arguments, returning i1.
; Every check prefix expects the compare to be carried out as a 32-bit float
; v_cmp_nle_f32, with v_cndmask_b32 turning the condition into 0/1 in v0.
; Input preparation differs by prefix: the GCN and GFX7 checks apply
; v_mul_f32 by 1.0 and then mask with 0xffff0000, while the GFX8, GFX9,
; GFX10 and GFX11 checks shift each input left by 16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ugt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ugt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ugt_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ugt_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ugt_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ugt_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ugt bfloat %a, %b
ret i1 %op
}
; Codegen test for `fcmp uge` on two scalar bfloat arguments, returning i1.
; Every check prefix expects the compare to be carried out as a 32-bit float
; v_cmp_nlt_f32, with v_cndmask_b32 turning the condition into 0/1 in v0.
; Input preparation differs by prefix: the GCN and GFX7 checks apply
; v_mul_f32 by 1.0 and then mask with 0xffff0000, while the GFX8, GFX9,
; GFX10 and GFX11 checks shift each input left by 16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_uge_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_uge_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_uge_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_uge_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_uge_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_uge_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp uge bfloat %a, %b
ret i1 %op
}
; Codegen test for `fcmp ult` on two scalar bfloat arguments, returning i1.
; Every check prefix expects the compare to be carried out as a 32-bit float
; v_cmp_nge_f32, with v_cndmask_b32 turning the condition into 0/1 in v0.
; Input preparation differs by prefix: the GCN and GFX7 checks apply
; v_mul_f32 by 1.0 and then mask with 0xffff0000, while the GFX8, GFX9,
; GFX10 and GFX11 checks shift each input left by 16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ult_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ult_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ult_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ult_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ult_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ult_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ult bfloat %a, %b
ret i1 %op
}
; Codegen test for `fcmp ule` on two scalar bfloat arguments, returning i1.
; Every check prefix expects the compare to be carried out as a 32-bit float
; v_cmp_ngt_f32, with v_cndmask_b32 turning the condition into 0/1 in v0.
; Input preparation differs by prefix: the GCN and GFX7 checks apply
; v_mul_f32 by 1.0 and then mask with 0xffff0000, while the GFX8, GFX9,
; GFX10 and GFX11 checks shift each input left by 16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ule_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_ule_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_ule_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_ule_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_ule_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_ule_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ule bfloat %a, %b
ret i1 %op
}
; Codegen test for `fcmp une` on two scalar bfloat arguments, returning i1.
; Every check prefix expects the compare to be carried out as a 32-bit float
; v_cmp_neq_f32, with v_cndmask_b32 turning the condition into 0/1 in v0.
; Input preparation differs by prefix: the GCN and GFX7 checks apply
; v_mul_f32 by 1.0 and then mask with 0xffff0000, while the GFX8, GFX9,
; GFX10 and GFX11 checks shift each input left by 16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_une_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_une_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_une_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_une_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_une_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_une_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp une bfloat %a, %b
ret i1 %op
}
; Codegen test for `fcmp true` on two scalar bfloat arguments, returning i1.
; No compare instruction is expected under any check prefix: the body is
; only v_mov_b32 v0, 1 followed by the return, and neither input register
; is read.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_true_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fcmp_true_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, 1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fcmp_true_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, 1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fcmp_true_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_true_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fcmp_true_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fcmp true bfloat %a, %b
ret i1 %op
}
; Codegen test for `fptosi` from a scalar bfloat argument to i16.
; Every check prefix expects one v_cvt_i32_f32 on v0 and no further
; instruction to narrow the result before the return.
; Input preparation differs by prefix: the GCN and GFX7 checks apply
; v_mul_f32 by 1.0 and then mask with 0xffff0000, while the GFX8, GFX9,
; GFX10 and GFX11 checks shift the input left by 16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
; GCN-LABEL: v_fptosi_bf16_to_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_bf16_to_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_bf16_to_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_bf16_to_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_bf16_to_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_bf16_to_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi bfloat %x to i16
ret i16 %op
}
; Codegen test for `fptosi` from <2 x bfloat> to <2 x i16>.
; In the GCN and GFX7 checks the two elements are read from v0 and v1; each
; is converted with v_cvt_i32_f32, v0 receives (low & 0xffff) | (high << 16)
; and v1 is additionally masked to 0xffff.
; In the remaining checks both elements are read from v0 (shift left by 16
; for one, mask with 0xffff0000 for the other) and the two converted values
; are merged back into v0. The merge differs per prefix: SDWA forms on GFX8,
; v_perm_b32 with selector 0x5040100 on GFX9 (via s4), GFX10 and GFX11FAKE16
; (as a literal), and v_mov_b16 into v0.h on GFX11TRUE16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v2bf16_to_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i16>
ret <2 x i16> %op
}
; Codegen test for `fptosi` from <3 x bfloat> to <3 x i16>.
; In the GCN and GFX7 checks the three elements are read from v0, v1 and v2.
; After conversion, v0 is built with v_or_b32 from (v0 & 0xffff) and
; (v1 << 16), v2 is the third result masked to 0xffff, and v1 is produced by
; v_alignbit_b32 from the third result and the shifted second result. The
; two prefixes expect the same instructions in a different order.
; In the remaining checks the first two elements are read from v0 and the
; third from v1. The first two results are merged into v0 (SDWA on GFX8,
; v_perm_b32 with selector 0x5040100 on GFX9, GFX10 and GFX11FAKE16,
; v_mov_b16 into v0.h on GFX11TRUE16), while the third result stays in v1.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v3bf16_to_v3i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i16>
ret <3 x i16> %op
}
; Codegen test for `fptosi` from <4 x bfloat> to <4 x i16>.
; In the GCN and GFX7 checks the four elements are read from v0 through v3.
; After conversion, v0 and v2 are each built with v_or_b32 from a masked
; even-indexed result and a shifted odd-indexed result, v1 is produced by
; v_alignbit_b32 from v2 and the shifted second result, and v3 is masked to
; 0xffff.
; In the remaining checks the elements are read as pairs from v0 and v1 and
; the results are merged pairwise back into v0 and v1: SDWA forms on GFX8,
; v_perm_b32 with selector 0x5040100 on GFX9 (via s4), GFX10 and GFX11FAKE16
; (as a literal), and v_mov_b16 into the .h halves on GFX11TRUE16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v4bf16_to_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i16>
ret <4 x i16> %op
}
; Codegen test for `fptosi` from a scalar bfloat argument to i32.
; Every check prefix expects one v_cvt_i32_f32 on v0 as the only conversion
; instruction before the return.
; Input preparation differs by prefix: the GCN and GFX7 checks apply
; v_mul_f32 by 1.0 and then mask with 0xffff0000, while the GFX8, GFX9,
; GFX10 and GFX11 checks shift the input left by 16.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
; GCN-LABEL: v_fptosi_bf16_to_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_bf16_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_bf16_to_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_bf16_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_bf16_to_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_bf16_to_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi bfloat %x to i32
ret i32 %op
}
; Codegen test for `fptosi` from <2 x bfloat> to <2 x i32>.
; In the GCN and GFX7 checks the two elements are read from v0 and v1 and
; each is converted in place (v_mul_f32 by 1.0, mask with 0xffff0000,
; v_cvt_i32_f32).
; In the remaining checks both elements are read from v0: one via a left
; shift by 16, the other via a mask with 0xffff0000. Results end up in v0
; and v1. The GFX8 and GFX9 checks go through v2 and need a trailing
; v_mov_b32 into v0, whereas the GFX10 and GFX11 checks convert straight
; into v0 and v1.
; NOTE(review): the check lines look autogenerated; presumably they should be
; regenerated by the update script rather than edited by hand - confirm.
define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v2bf16_to_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v2bf16_to_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v2bf16_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v1
; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v2bf16_to_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v1
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i32>
ret <2 x i32> %op
}
; Test: fptosi of a 3-lane bfloat vector to <3 x i32>.
; Expected code, as pinned by the checks below: each lane is widened to f32 by
; placing its bf16 bits in the high half of a 32-bit register and is then
; converted with v_cvt_i32_f32.
;  * The GCN and GFX7 checks receive one lane per VGPR (v0-v2), apply
;    v_mul_f32 by 1.0 and mask with 0xffff0000.
;  * The GFX8 and newer checks receive two lanes packed per VGPR and split them
;    with a 16-bit left shift (low lane) or the 0xffff0000 mask (high lane).
;  * The GFX8 and GFX9 checks compute lanes 0 and 1 into v4/v3 and copy them to
;    v0/v1 at the end; the GFX10 and GFX11 checks write v0-v2 directly.
; NOTE(review): the check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py) - regenerate rather than hand-edit them;
; confirm against the header of this file.
define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v3bf16_to_v3i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v3bf16_to_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i32>
ret <3 x i32> %op
}
; Test: fptosi of a 4-lane bfloat vector to <4 x i32>.
; Expected code, as pinned by the checks below: each lane is widened to f32 by
; placing its bf16 bits in the high half of a 32-bit register and is then
; converted with v_cvt_i32_f32.
;  * The GCN and GFX7 checks receive one lane per VGPR (v0-v3), apply
;    v_mul_f32 by 1.0 and mask with 0xffff0000.
;  * The GFX8 and newer checks receive the four lanes packed in v0 and v1 and
;    split each register with a 16-bit left shift (low lane) and the
;    0xffff0000 mask (high lane).
;  * The GFX8 and GFX9 checks compute lanes 0 and 1 into v4/v5 and copy them to
;    v0/v1 at the end; the GFX10 and GFX11 checks write v0-v3 directly.
; NOTE(review): the check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py) - regenerate rather than hand-edit them;
; confirm against the header of this file.
define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v4bf16_to_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_i32_f32_e32 v5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2
; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2
; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4
; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v4bf16_to_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4
; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i32>
ret <4 x i32> %op
}
; Test: fptosi of a scalar bfloat to i64.
; The checks below show no single convert instruction being used; every
; variant pins the same multi-instruction expansion on the widened f32 value:
;   t    = v_trunc_f32(x)
;   hi   = v_floor_f32(|t| * 0x2f800000)   ; 0x2f800000 is 2^-32 as f32 bits
;   lo   = v_fma_f32(hi, 0xcf800000, |t|)  ; 0xcf800000 is -2^32 as f32 bits
;   hi, lo are each converted with v_cvt_u32_f32
;   sign = arithmetic shift right of t by 31
;   result = ((hi:lo) xor (sign:sign)) - (sign:sign), done as a 32-bit subtract
;            followed by a subtract-with-borrow through vcc / vcc_lo.
; Widening differs per variant: the GCN and GFX7 checks use v_mul_f32 by 1.0 plus
; the 0xffff0000 mask, while the GFX8 and newer checks use a 16-bit left shift.
; The GFX10 and GFX11 checks fold both constants in as literals instead of
; loading them into s4/s5 with s_mov_b32.
; NOTE(review): the check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py) - regenerate rather than hand-edit them;
; confirm against the header of this file.
define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GCN-LABEL: v_fptosi_bf16_to_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GCN-NEXT: v_floor_f32_e32 v1, v1
; GCN-NEXT: v_fma_f32 v0, v1, s5, |v0|
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_bf16_to_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e64 v1, |v0|, s4
; GFX7-NEXT: v_floor_f32_e32 v1, v1
; GFX7-NEXT: s_mov_b32 s4, 0xcf800000
; GFX7-NEXT: v_fma_f32 v2, v1, s4, |v0|
; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX7-NEXT: v_xor_b32_e32 v0, v2, v3
; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_bf16_to_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
; GFX8-NEXT: v_mul_f32_e64 v1, |v0|, s4
; GFX8-NEXT: v_floor_f32_e32 v1, v1
; GFX8-NEXT: s_mov_b32 s4, 0xcf800000
; GFX8-NEXT: v_fma_f32 v2, v1, s4, |v0|
; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX8-NEXT: v_xor_b32_e32 v0, v2, v3
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_bf16_to_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4
; GFX9-NEXT: v_floor_f32_e32 v1, v1
; GFX9-NEXT: s_mov_b32 s4, 0xcf800000
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1
; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0|
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_bf16_to_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX10-NEXT: v_floor_f32_e32 v1, v1
; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_bf16_to_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f32_e32 v0, v0
; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_floor_f32_e32 v1, v1
; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi bfloat %x to i64
ret i64 %op
}
; Test: fptosi of a 2-lane bfloat vector to <2 x i64>.
; The checks below pin, for each lane, the same multi-instruction expansion on
; the widened f32 value:
;   t    = v_trunc_f32(x)
;   hi   = v_floor_f32(|t| * 0x2f800000)   ; 0x2f800000 is 2^-32 as f32 bits
;   lo   = v_fma_f32(hi, 0xcf800000, |t|)  ; 0xcf800000 is -2^32 as f32 bits
;   hi, lo are each converted with v_cvt_u32_f32
;   sign = arithmetic shift right of t by 31
;   result = ((hi:lo) xor (sign:sign)) - (sign:sign), done as a 32-bit subtract
;            followed by a subtract-with-borrow through vcc / vcc_lo.
; Lane 0 is returned in v0/v1 and lane 1 in v2/v3.
; Widening differs per variant: the GCN and GFX7 checks take one lane per VGPR
; (v0, v1) and use v_mul_f32 by 1.0 plus the 0xffff0000 mask; the GFX8 and newer
; checks take both lanes packed in v0 and split them with a 16-bit left shift
; (lane 0) and the 0xffff0000 mask (lane 1).
; NOTE(review): the check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py) - regenerate rather than hand-edit them;
; confirm against the header of this file.
define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v2bf16_to_v2i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_mul_f32_e64 v2, |v0|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GCN-NEXT: v_mul_f32_e64 v4, |v1|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1
; GCN-NEXT: v_floor_f32_e32 v2, v2
; GCN-NEXT: v_floor_f32_e32 v4, v4
; GCN-NEXT: v_fma_f32 v0, v2, s5, |v0|
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
; GCN-NEXT: v_fma_f32 v1, v4, s5, |v1|
; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_xor_b32_e32 v2, v2, v3
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_xor_b32_e32 v4, v4, v5
; GCN-NEXT: v_xor_b32_e32 v0, v0, v3
; GCN-NEXT: v_xor_b32_e32 v6, v1, v5
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc
; GCN-NEXT: v_sub_i32_e32 v2, vcc, v6, v5
; GCN-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e64 v2, |v0|, s4
; GFX7-NEXT: v_floor_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v0|
; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_xor_b32_e32 v0, v3, v4
; GFX7-NEXT: v_trunc_f32_e32 v3, v1
; GFX7-NEXT: v_mul_f32_e64 v1, |v3|, s4
; GFX7-NEXT: v_floor_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX7-NEXT: v_fma_f32 v5, v1, s5, |v3|
; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v1
; GFX7-NEXT: v_xor_b32_e32 v2, v2, v4
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc
; GFX7-NEXT: v_xor_b32_e32 v2, v5, v3
; GFX7-NEXT: v_xor_b32_e32 v4, v6, v3
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
; GFX8-NEXT: v_mul_f32_e64 v2, |v1|, s4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_floor_f32_e32 v2, v2
; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
; GFX8-NEXT: v_trunc_f32_e32 v4, v0
; GFX8-NEXT: v_fma_f32 v3, v2, s5, |v1|
; GFX8-NEXT: v_mul_f32_e64 v0, |v4|, s4
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_floor_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX8-NEXT: v_fma_f32 v5, v0, s5, |v4|
; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
; GFX8-NEXT: v_xor_b32_e32 v3, v3, v1
; GFX8-NEXT: v_xor_b32_e32 v2, v2, v1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v3, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v2, v5, v3
; GFX8-NEXT: v_xor_b32_e32 v4, v6, v3
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_floor_f32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
; GFX9-NEXT: v_trunc_f32_e32 v4, v0
; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1|
; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_floor_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4|
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1
; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3
; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_trunc_f32_e32 v1, v1
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v0
; GFX10-NEXT: v_floor_f32_e32 v2, v2
; GFX10-NEXT: v_floor_f32_e32 v3, v3
; GFX10-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
; GFX10-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v4
; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v5
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v1
; GFX10-NEXT: v_xor_b32_e32 v3, v3, v6
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-NEXT: v_xor_b32_e32 v4, v4, v6
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v2bf16_to_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_trunc_f32_e32 v1, v1
; GFX11-NEXT: v_trunc_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1|
; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0|
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_floor_f32_e32 v2, v2
; GFX11-NEXT: v_floor_f32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1|
; GFX11-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0|
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v4
; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v1
; GFX11-NEXT: v_xor_b32_e32 v3, v3, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-NEXT: v_xor_b32_e32 v4, v4, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i64>
ret <2 x i64> %op
}
; Test: fptosi of a 3-lane bfloat vector to <3 x i64>.
; The checks below pin, for each lane, the same multi-instruction expansion on
; the widened f32 value:
;   t    = v_trunc_f32(x)
;   hi   = v_floor_f32(|t| * 0x2f800000)   ; 0x2f800000 is 2^-32 as f32 bits
;   lo   = v_fma_f32(hi, 0xcf800000, |t|)  ; 0xcf800000 is -2^32 as f32 bits
;   hi, lo are each converted with v_cvt_u32_f32
;   sign = arithmetic shift right of t by 31
;   result = ((hi:lo) xor (sign:sign)) - (sign:sign), done as a 32-bit subtract
;            followed by a subtract-with-borrow through vcc / vcc_lo.
; Lane 0 is returned in v0/v1, lane 1 in v2/v3 and lane 2 in v4/v5.
; Widening differs per variant: the GCN and GFX7 checks take one lane per VGPR
; (v0-v2) and use v_mul_f32 by 1.0 plus the 0xffff0000 mask; the GFX8 and newer
; checks take lanes 0 and 1 packed in v0 and lane 2 in the low half of v1, and
; split them with a 16-bit left shift or the 0xffff0000 mask.
; The GFX8 and GFX9 checks keep the high word of lane 0 in v6 while v1 is still
; in use as an input, and finish with a v_mov_b32 of v6 into v1.
; NOTE(review): the check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py) - regenerate rather than hand-edit them;
; confirm against the header of this file.
define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v3bf16_to_v3i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mul_f32_e64 v3, |v0|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCN-NEXT: v_mul_f32_e64 v5, |v1|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GCN-NEXT: v_mul_f32_e64 v7, |v2|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v2
; GCN-NEXT: v_floor_f32_e32 v3, v3
; GCN-NEXT: v_floor_f32_e32 v5, v5
; GCN-NEXT: v_floor_f32_e32 v7, v7
; GCN-NEXT: v_fma_f32 v0, v3, s5, |v0|
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
; GCN-NEXT: v_fma_f32 v1, v5, s5, |v1|
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
; GCN-NEXT: v_fma_f32 v2, v7, s5, |v2|
; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_xor_b32_e32 v3, v3, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_xor_b32_e32 v5, v5, v6
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
; GCN-NEXT: v_xor_b32_e32 v7, v7, v8
; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
; GCN-NEXT: v_xor_b32_e32 v9, v1, v6
; GCN-NEXT: v_xor_b32_e32 v10, v2, v8
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc
; GCN-NEXT: v_sub_i32_e32 v2, vcc, v9, v6
; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v6, vcc
; GCN-NEXT: v_sub_i32_e32 v4, vcc, v10, v8
; GCN-NEXT: v_subb_u32_e32 v5, vcc, v7, v8, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
; GFX7-NEXT: v_floor_f32_e32 v3, v3
; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v0|
; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_xor_b32_e32 v0, v4, v5
; GFX7-NEXT: v_trunc_f32_e32 v4, v1
; GFX7-NEXT: v_mul_f32_e64 v1, |v4|, s4
; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX7-NEXT: v_floor_f32_e32 v1, v1
; GFX7-NEXT: v_fma_f32 v6, v1, s5, |v4|
; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_xor_b32_e32 v3, v3, v5
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v1
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v5, vcc
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v4
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_xor_b32_e32 v5, v6, v3
; GFX7-NEXT: v_trunc_f32_e32 v6, v2
; GFX7-NEXT: v_mul_f32_e64 v2, |v6|, s4
; GFX7-NEXT: v_floor_f32_e32 v2, v2
; GFX7-NEXT: v_xor_b32_e32 v4, v7, v3
; GFX7-NEXT: v_fma_f32 v7, v2, s5, |v6|
; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v2
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v5, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v6
; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
; GFX7-NEXT: v_xor_b32_e32 v4, v7, v5
; GFX7-NEXT: v_xor_b32_e32 v6, v8, v5
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v5
; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v2
; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4
; GFX8-NEXT: v_floor_f32_e32 v3, v3
; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX8-NEXT: v_trunc_f32_e32 v5, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
; GFX8-NEXT: v_floor_f32_e32 v0, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v3, v2, vcc
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX8-NEXT: v_mul_f32_e64 v5, |v1|, s4
; GFX8-NEXT: v_floor_f32_e32 v5, v5
; GFX8-NEXT: v_xor_b32_e32 v2, v7, v3
; GFX8-NEXT: v_fma_f32 v7, v5, s5, |v1|
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX8-NEXT: v_xor_b32_e32 v4, v8, v3
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
; GFX8-NEXT: v_xor_b32_e32 v4, v7, v1
; GFX8-NEXT: v_xor_b32_e32 v5, v5, v1
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v1
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
; GFX9-NEXT: v_floor_f32_e32 v3, v3
; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX9-NEXT: v_trunc_f32_e32 v5, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
; GFX9-NEXT: v_floor_f32_e32 v0, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4
; GFX9-NEXT: v_floor_f32_e32 v5, v5
; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3
; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1|
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1
; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
; GFX10-NEXT: v_trunc_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
; GFX10-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
; GFX10-NEXT: v_floor_f32_e32 v3, v3
; GFX10-NEXT: v_floor_f32_e32 v4, v4
; GFX10-NEXT: v_floor_f32_e32 v6, v6
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v1
; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
; GFX10-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_xor_b32_e32 v3, v3, v5
; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
; GFX10-NEXT: v_xor_b32_e32 v9, v0, v7
; GFX10-NEXT: v_xor_b32_e32 v4, v4, v7
; GFX10-NEXT: v_xor_b32_e32 v10, v1, v8
; GFX10-NEXT: v_xor_b32_e32 v6, v6, v8
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v3bf16_to_v3i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_trunc_f32_e32 v2, v2
; GFX11-NEXT: v_trunc_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_trunc_f32_e32 v1, v1
; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0|
; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1|
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
; GFX11-NEXT: v_floor_f32_e32 v3, v3
; GFX11-NEXT: v_floor_f32_e32 v4, v4
; GFX11-NEXT: v_floor_f32_e32 v6, v6
; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2|
; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1|
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX11-NEXT: v_xor_b32_e32 v3, v3, v5
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
; GFX11-NEXT: v_xor_b32_e32 v9, v0, v7
; GFX11-NEXT: v_xor_b32_e32 v4, v4, v7
; GFX11-NEXT: v_xor_b32_e32 v10, v1, v8
; GFX11-NEXT: v_xor_b32_e32 v6, v6, v8
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v3, v5, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v4, v7, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v6, v8, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i64>
ret <3 x i64> %op
}
define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GCN-LABEL: v_fptosi_v4bf16_to_v4i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_trunc_f32_e32 v3, v3
; GCN-NEXT: v_mul_f32_e64 v4, |v0|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-NEXT: v_mul_f32_e64 v6, |v1|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-NEXT: v_mul_f32_e64 v8, |v2|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v2
; GCN-NEXT: v_mul_f32_e64 v10, |v3|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; GCN-NEXT: v_floor_f32_e32 v4, v4
; GCN-NEXT: v_floor_f32_e32 v6, v6
; GCN-NEXT: v_floor_f32_e32 v8, v8
; GCN-NEXT: v_floor_f32_e32 v10, v10
; GCN-NEXT: v_fma_f32 v0, v4, s5, |v0|
; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
; GCN-NEXT: v_fma_f32 v1, v6, s5, |v1|
; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
; GCN-NEXT: v_fma_f32 v2, v8, s5, |v2|
; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8
; GCN-NEXT: v_fma_f32 v3, v10, s5, |v3|
; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_xor_b32_e32 v4, v4, v5
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_xor_b32_e32 v6, v6, v7
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
; GCN-NEXT: v_xor_b32_e32 v8, v8, v9
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
; GCN-NEXT: v_xor_b32_e32 v10, v10, v11
; GCN-NEXT: v_xor_b32_e32 v0, v0, v5
; GCN-NEXT: v_xor_b32_e32 v12, v1, v7
; GCN-NEXT: v_xor_b32_e32 v13, v2, v9
; GCN-NEXT: v_xor_b32_e32 v14, v3, v11
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc
; GCN-NEXT: v_sub_i32_e32 v2, vcc, v12, v7
; GCN-NEXT: v_subb_u32_e32 v3, vcc, v6, v7, vcc
; GCN-NEXT: v_sub_i32_e32 v4, vcc, v13, v9
; GCN-NEXT: v_subb_u32_e32 v5, vcc, v8, v9, vcc
; GCN-NEXT: v_sub_i32_e32 v6, vcc, v14, v11
; GCN-NEXT: v_subb_u32_e32 v7, vcc, v10, v11, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v3
; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4
; GFX7-NEXT: v_floor_f32_e32 v3, v3
; GFX7-NEXT: s_mov_b32 s5, 0xcf800000
; GFX7-NEXT: v_fma_f32 v5, v3, s5, |v0|
; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_xor_b32_e32 v0, v5, v6
; GFX7-NEXT: v_trunc_f32_e32 v5, v1
; GFX7-NEXT: v_mul_f32_e64 v1, |v5|, s4
; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX7-NEXT: v_floor_f32_e32 v1, v1
; GFX7-NEXT: v_fma_f32 v7, v1, s5, |v5|
; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_xor_b32_e32 v3, v3, v6
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v1
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_xor_b32_e32 v6, v7, v3
; GFX7-NEXT: v_trunc_f32_e32 v7, v2
; GFX7-NEXT: v_mul_f32_e64 v2, |v7|, s4
; GFX7-NEXT: v_floor_f32_e32 v2, v2
; GFX7-NEXT: v_xor_b32_e32 v5, v8, v3
; GFX7-NEXT: v_fma_f32 v8, v2, s5, |v7|
; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v2
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v3
; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v7
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_xor_b32_e32 v7, v8, v5
; GFX7-NEXT: v_trunc_f32_e32 v8, v4
; GFX7-NEXT: v_mul_f32_e64 v4, |v8|, s4
; GFX7-NEXT: v_floor_f32_e32 v4, v4
; GFX7-NEXT: v_xor_b32_e32 v6, v9, v5
; GFX7-NEXT: v_fma_f32 v9, v4, s5, |v8|
; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v9
; GFX7-NEXT: v_cvt_u32_f32_e32 v10, v4
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v7, v5
; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v8
; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
; GFX7-NEXT: v_xor_b32_e32 v6, v9, v7
; GFX7-NEXT: v_xor_b32_e32 v8, v10, v7
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v2
; GFX8-NEXT: s_mov_b32 s4, 0x2f800000
; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4
; GFX8-NEXT: v_floor_f32_e32 v3, v3
; GFX8-NEXT: s_mov_b32 s5, 0xcf800000
; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2|
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX8-NEXT: v_trunc_f32_e32 v5, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4
; GFX8-NEXT: v_floor_f32_e32 v0, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5|
; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2
; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v3, v2, vcc
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_trunc_f32_e32 v5, v5
; GFX8-NEXT: v_xor_b32_e32 v2, v6, v3
; GFX8-NEXT: v_mul_f32_e64 v6, |v5|, s4
; GFX8-NEXT: v_floor_f32_e32 v6, v6
; GFX8-NEXT: v_xor_b32_e32 v4, v7, v3
; GFX8-NEXT: v_fma_f32 v7, v6, s5, |v5|
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
; GFX8-NEXT: v_xor_b32_e32 v4, v7, v5
; GFX8-NEXT: v_mul_f32_e64 v7, |v1|, s4
; GFX8-NEXT: v_floor_f32_e32 v7, v7
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX8-NEXT: v_fma_f32 v9, v7, s5, |v1|
; GFX8-NEXT: v_cvt_u32_f32_e32 v9, v9
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX8-NEXT: v_xor_b32_e32 v6, v6, v5
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
; GFX8-NEXT: v_xor_b32_e32 v6, v9, v1
; GFX8-NEXT: v_xor_b32_e32 v7, v7, v1
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v1
; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s4, 0x2f800000
; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4
; GFX9-NEXT: v_floor_f32_e32 v3, v3
; GFX9-NEXT: s_mov_b32 s5, 0xcf800000
; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2|
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX9-NEXT: v_trunc_f32_e32 v5, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4
; GFX9-NEXT: v_floor_f32_e32 v0, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5|
; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3
; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4
; GFX9-NEXT: v_floor_f32_e32 v6, v6
; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3
; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5|
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5
; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4
; GFX9-NEXT: v_floor_f32_e32 v7, v7
; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1|
; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1
; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1
; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1
; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, v8
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_trunc_f32_e32 v4, v1
; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
; GFX10-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GFX10-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
; GFX10-NEXT: v_floor_f32_e32 v1, v1
; GFX10-NEXT: v_floor_f32_e32 v6, v6
; GFX10-NEXT: v_floor_f32_e32 v8, v8
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0
; GFX10-NEXT: v_floor_f32_e32 v9, v9
; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
; GFX10-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX10-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5
; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v3
; GFX10-NEXT: v_xor_b32_e32 v3, v0, v7
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX10-NEXT: v_xor_b32_e32 v6, v6, v7
; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
; GFX10-NEXT: v_ashrrev_i32_e32 v13, 31, v4
; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v9
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v4, v12, v10
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
; GFX10-NEXT: v_xor_b32_e32 v5, v8, v10
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v6, v11, v13
; GFX10-NEXT: v_xor_b32_e32 v7, v9, v13
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fptosi_v4bf16_to_v4i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_trunc_f32_e32 v2, v2
; GFX11-NEXT: v_trunc_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_trunc_f32_e32 v3, v3
; GFX11-NEXT: v_trunc_f32_e32 v4, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
; GFX11-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GFX11-NEXT: v_floor_f32_e32 v1, v1
; GFX11-NEXT: v_floor_f32_e32 v6, v6
; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
; GFX11-NEXT: v_floor_f32_e32 v8, v8
; GFX11-NEXT: v_floor_f32_e32 v9, v9
; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
; GFX11-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3|
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX11-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4|
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v3
; GFX11-NEXT: v_xor_b32_e32 v3, v0, v7
; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX11-NEXT: v_xor_b32_e32 v6, v6, v7
; GFX11-NEXT: v_cvt_u32_f32_e32 v11, v11
; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v4
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
; GFX11-NEXT: v_cvt_u32_f32_e32 v9, v9
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
; GFX11-NEXT: v_xor_b32_e32 v4, v12, v10
; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v6, v7, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v5, v8, v10
; GFX11-NEXT: v_xor_b32_e32 v6, v11, v13
; GFX11-NEXT: v_xor_b32_e32 v7, v9, v13
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v5, v10, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, null, v7, v13, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i64>
ret <4 x i64> %op
}
; Scalar case: converts a signed i16 argument to bfloat with `sitofp` and
; returns it. The comment lines inside the body are FileCheck assertions, one
; group per check prefix, pinning the exact instruction sequence expected for
; that target configuration.
; As the expectations show, the GCN and GFX7 groups sign-extend with v_bfe_i32,
; convert, and mask the f32 result with 0xffff0000. The GFX8 and later groups
; instead add bit 16 plus 0x7fff, select the 0x400000-or'd value when
; v_cmp_u_f32 reports unordered, and then take the high half (presumably
; round-to-nearest-even with NaN quieting -- confirm against the lowering).
; NOTE(review): these assertions look script-generated; regenerate them rather
; than editing by hand -- confirm against the note at the top of the file.
define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GCN-LABEL: v_sitofp_i16_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i16_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i16_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i16_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i16_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_i16_to_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_i16_to_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp i16 %x to bfloat
ret bfloat %op
}
; Two-element vector case: converts <2 x i16> to <2 x bfloat> with `sitofp`
; and returns it. The comment lines inside the body are FileCheck assertions,
; one group per check prefix, pinning the exact expected instruction sequence.
; In the GCN and GFX7 groups the two elements are handled in separate
; registers (v0 and v1) and each f32 result is masked with 0xffff0000. In the
; GFX8 and later groups both elements are read from the two 16-bit halves of
; v0, each goes through the 0x7fff / 0x400000 / v_cmp_u_f32 select sequence,
; and the two results are merged back into v0 (v_alignbit_b32, v_perm_b32 or
; v_bfi_b32 depending on the group).
; NOTE(review): these assertions look script-generated; regenerate them rather
; than editing by hand -- confirm against the note at the top of the file.
define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GCN-LABEL: v_sitofp_v2i16_to_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <2 x i16> %x to <2 x bfloat>
ret <2 x bfloat> %op
}
; Odd-width vector case: converts <3 x i16> to <3 x bfloat> with `sitofp` and
; returns it. The comment lines inside the body are FileCheck assertions, one
; group per check prefix, pinning the exact expected instruction sequence.
; In the GCN and GFX7 groups the three elements are handled in v0, v1 and v2
; and each f32 result is masked with 0xffff0000. In the GFX8 and later groups
; the first two elements are read from the halves of v0 and the third from the
; low half of v1; each goes through the 0x7fff / 0x400000 / v_cmp_u_f32 select
; sequence, the first two results are merged into v0, and the third is moved
; to the low half of v1.
; NOTE(review): in the GFX9, GFX10 and GFX11FAKE16 groups the final
; v_alignbit_b32 on v1 takes an SGPR (s4 or s0) as its high source that is not
; loaded for that purpose in the visible sequence -- presumably the upper half
; of v1 is a don't-care for the 3-element result; confirm before treating it
; as a codegen problem.
; NOTE(review): these assertions look script-generated; regenerate them rather
; than editing by hand -- confirm against the note at the top of the file.
define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GCN-LABEL: v_sitofp_v3i16_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX11FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i16> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
; Signed <4 x i16> -> <4 x bfloat> conversion.
; What the expected output below shows, per check prefix
;  * GCN and GFX7 take the four elements in v0..v3, sign-extend each with
;    v_bfe_i32, convert with v_cvt_f32_i32 and keep only the top 16 bits of
;    the f32 result (v_and_b32 with 0xffff0000).
;  * GFX8, GFX9 and GFX10 take the elements packed in v0/v1 and sign-extend
;    inside the SDWA convert (sext + WORD_0/WORD_1 select); both GFX11
;    variants use v_bfe_i32 / v_ashrrev_i32 followed by a plain convert.
;  * From GFX8 on, every f32 goes through a v_bfe_u32 / add 0x7fff /
;    v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence before the
;    halves are repacked into v0/v1 (presumably round-to-nearest-even with
;    NaN quieting -- confirm against the bf16 lowering code).
; NOTE(review) the check lines look autogenerated; regenerate them with the
; update script rather than editing by hand -- TODO confirm from file header.
define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GCN-LABEL: v_sitofp_v4i16_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16
; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v2, 16, v1
; GFX11TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; GFX11TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v3
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v4i16_to_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 16
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v10, v0, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i16> %x to <4 x bfloat>
ret <4 x bfloat> %op
}
; Signed scalar i32 -> bfloat conversion; input and result both live in v0.
; What the expected output below shows, per check prefix
;  * GCN and GFX7 convert with v_cvt_f32_i32 and keep only the top 16 bits
;    of the f32 (v_and_b32 with 0xffff0000); the result stays in the high half.
;  * GFX8 and later run the f32 through v_bfe_u32 (bit 16) / add 0x7fff /
;    v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32, then move the high half
;    down to the low 16 bits (v_lshrrev_b32 by 16, or v_mov_b16 v0.l, v0.h in
;    the true16 variant). Presumably round-to-nearest-even with NaN quieting
;    -- confirm against the bf16 lowering code.
; NOTE(review) the check lines look autogenerated; regenerate them with the
; update script rather than editing by hand -- TODO confirm from file header.
define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GCN-LABEL: v_sitofp_i32_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i32_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i32_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i32_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_i32_to_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_i32_to_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp i32 %x to bfloat
ret bfloat %op
}
; Signed <2 x i32> -> <2 x bfloat> conversion; the inputs arrive in v0 and v1.
; What the expected output below shows, per check prefix
;  * GCN and GFX7 convert each element with v_cvt_f32_i32 and mask it with
;    0xffff0000, leaving one result per register (v0, v1).
;  * GFX8 and later apply the v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
;    v_cmp_u_f32 / v_cndmask_b32 sequence to each f32 (presumably
;    round-to-nearest-even with NaN quieting -- confirm against the lowering)
;    and then pack both high halves into v0. GFX8 uses v_lshrrev_b32 +
;    v_alignbit_b32, the true16 variant uses v_mov_b16 + v_bfi_b32 0xffff,
;    and the remaining targets use v_perm_b32 with selector 0x7060302.
; NOTE(review) the check lines look autogenerated; regenerate them with the
; update script rather than editing by hand -- TODO confirm from file header.
define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GCN-LABEL: v_sitofp_v2i32_to_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <2 x i32> %x to <2 x bfloat>
ret <2 x bfloat> %op
}
; Signed <3 x i32> -> <3 x bfloat> conversion; the inputs arrive in v0..v2.
; What the expected output below shows, per check prefix
;  * GCN and GFX7 convert each element with v_cvt_f32_i32 and mask it with
;    0xffff0000, leaving one result per register (v0, v1, v2).
;  * GFX8 and later apply the v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
;    v_cmp_u_f32 / v_cndmask_b32 sequence to each f32 (presumably
;    round-to-nearest-even with NaN quieting -- confirm against the lowering),
;    pack elements 0 and 1 into v0 and place element 2 in the low half of v1.
;  * On GFX9, GFX10 and the fake16 GFX11 variant the final v_alignbit_b32
;    takes an SGPR (s4 or s0) as its high source; for GFX10 and fake16 GFX11
;    that SGPR is never written in this function, so the upper 16 bits of v1
;    appear to be don't-care padding for the odd-sized vector -- TODO confirm.
; NOTE(review) the check lines look autogenerated; regenerate them with the
; update script rather than editing by hand -- TODO confirm from file header.
define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GCN-LABEL: v_sitofp_v3i32_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v2, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i32> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
; Signed <4 x i32> -> <4 x bfloat> conversion; the inputs arrive in v0..v3.
; What the expected output below shows, per check prefix
;  * GCN and GFX7 convert each element with v_cvt_f32_i32 and mask it with
;    0xffff0000, leaving one result per register (v0..v3).
;  * GFX8 and later apply the v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
;    v_cmp_u_f32 / v_cndmask_b32 sequence to each f32 (presumably
;    round-to-nearest-even with NaN quieting -- confirm against the lowering)
;    and pack the four high halves into v0 (elements 0,1) and v1 (elements
;    2,3). GFX8 uses v_lshrrev_b32 + v_alignbit_b32, the true16 variant uses
;    v_mov_b16 + v_bfi_b32 0xffff, and the remaining targets use v_perm_b32
;    with selector 0x7060302.
; NOTE(review) the check lines look autogenerated; regenerate them with the
; update script rather than editing by hand -- TODO confirm from file header.
define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GCN-LABEL: v_sitofp_v4i32_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v4i32_to_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i32> %x to <4 x bfloat>
ret <4 x bfloat> %op
}
; Scalar case: converts a signed i64 argument to bfloat with a single sitofp
; and returns the result.
; The assertion lines that follow give the expected instruction sequence under
; each of the check prefixes GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and
; GFX11FAKE16.
; In the GCN and GFX7 sequences the f32 value is finished by masking with
; 0xffff0000. The other prefixes instead use a v_bfe_u32 / 0x7fff add /
; v_cmp_u_f32 / v_cndmask_b32 sequence and then move the upper 16 bits down
; (presumably round-to-nearest-even with NaN handling; TODO confirm).
; NOTE(review): these look like machine-generated assertions; presumably they
; should be regenerated with the update script rather than edited by hand.
define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GCN-LABEL: v_sitofp_i64_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v2, v0, v1
; GCN-NEXT: v_ffbh_i32_e32 v3, v1
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; GCN-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-NEXT: v_min_u32_e32 v2, v3, v2
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i64_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; GFX7-NEXT: v_min_u32_e32 v2, v3, v2
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i64_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX8-NEXT: v_ffbh_i32_e32 v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, -1, v3
; GFX8-NEXT: v_min_u32_e32 v2, v3, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i64_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX9-NEXT: v_ffbh_i32_e32 v3, v1
; GFX9-NEXT: v_add_u32_e32 v2, 32, v2
; GFX9-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-NEXT: v_min_u32_e32 v2, v3, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i64_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_i64_to_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX11TRUE16-NEXT: v_cls_i32_e32 v3, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_i64_to_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX11FAKE16-NEXT: v_cls_i32_e32 v3, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v2, 32, v2
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp i64 %x to bfloat
ret bfloat %op
}
; Two-element vector case: converts a <2 x i64> argument to <2 x bfloat> with
; a single vector sitofp and returns the result.
; The assertion lines that follow give the expected instruction sequence under
; each of the check prefixes GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and
; GFX11FAKE16.
; In the GCN and GFX7 sequences the two results stay in v0 and v1, each masked
; with 0xffff0000. Under the other prefixes each element goes through a
; v_bfe_u32 / 0x7fff add / v_cmp_u_f32 / v_cndmask_b32 sequence (presumably
; round-to-nearest-even with NaN handling; TODO confirm) and the two halves
; are then combined into v0 by v_alignbit_b32, v_perm_b32 or v_bfi_b32.
; NOTE(review): these look like machine-generated assertions; presumably they
; should be regenerated with the update script rather than edited by hand.
define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GCN-LABEL: v_sitofp_v2i64_to_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ffbh_i32_e32 v4, v3
; GCN-NEXT: v_xor_b32_e32 v5, v2, v3
; GCN-NEXT: v_ffbh_i32_e32 v6, v1
; GCN-NEXT: v_xor_b32_e32 v7, v0, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v4
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GCN-NEXT: v_add_i32_e32 v5, vcc, 32, v5
; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7
; GCN-NEXT: v_min_u32_e32 v4, v4, v5
; GCN-NEXT: v_min_u32_e32 v5, v6, v7
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v5, v2, v3
; GFX7-NEXT: v_ffbh_i32_e32 v4, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
; GFX7-NEXT: v_min_u32_e32 v4, v4, v5
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX8-NEXT: v_ffbh_i32_e32 v4, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
; GFX8-NEXT: v_min_u32_e32 v4, v4, v5
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v0
; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
; GFX8-NEXT: v_min_u32_e32 v6, v0, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX9-NEXT: v_ffbh_i32_e32 v4, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
; GFX9-NEXT: v_min_u32_e32 v4, v4, v5
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
; GFX9-NEXT: v_min_u32_e32 v6, v0, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v4, v0, v1
; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3
; GFX10-NEXT: v_ffbh_i32_e32 v6, v1
; GFX10-NEXT: v_ffbh_i32_e32 v7, v3
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
; GFX10-NEXT: v_add_nc_u32_e32 v7, -1, v7
; GFX10-NEXT: v_add_nc_u32_e32 v4, 32, v4
; GFX10-NEXT: v_add_nc_u32_e32 v5, 32, v5
; GFX10-NEXT: v_min_u32_e32 v4, v6, v4
; GFX10-NEXT: v_min_u32_e32 v5, v7, v5
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_xor_b32_e32 v4, v0, v1
; GFX11TRUE16-NEXT: v_xor_b32_e32 v5, v2, v3
; GFX11TRUE16-NEXT: v_cls_i32_e32 v6, v1
; GFX11TRUE16-NEXT: v_cls_i32_e32 v7, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v7, -1, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v4, 32, v4
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v5, 32, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_min_u32_e32 v4, v6, v4
; GFX11TRUE16-NEXT: v_min_u32_e32 v5, v7, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v2, 32, v4
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_xor_b32_e32 v4, v0, v1
; GFX11FAKE16-NEXT: v_xor_b32_e32 v5, v2, v3
; GFX11FAKE16-NEXT: v_cls_i32_e32 v6, v1
; GFX11FAKE16-NEXT: v_cls_i32_e32 v7, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v7, -1, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v4, 32, v4
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v5, 32, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_min_u32_e32 v4, v6, v4
; GFX11FAKE16-NEXT: v_min_u32_e32 v5, v7, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v2, 32, v4
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <2 x i64> %x to <2 x bfloat>
ret <2 x bfloat> %op
}
define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GCN-LABEL: v_sitofp_v3i64_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ffbh_i32_e32 v6, v5
; GCN-NEXT: v_xor_b32_e32 v7, v4, v5
; GCN-NEXT: v_ffbh_i32_e32 v8, v3
; GCN-NEXT: v_xor_b32_e32 v9, v2, v3
; GCN-NEXT: v_ffbh_i32_e32 v10, v1
; GCN-NEXT: v_xor_b32_e32 v11, v0, v1
; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8
; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10
; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7
; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9
; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11
; GCN-NEXT: v_min_u32_e32 v6, v6, v7
; GCN-NEXT: v_min_u32_e32 v7, v8, v9
; GCN-NEXT: v_min_u32_e32 v8, v10, v11
; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v4
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX7-NEXT: v_ffbh_i32_e32 v6, v5
; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
; GFX7-NEXT: v_xor_b32_e32 v7, v2, v3
; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
; GFX7-NEXT: v_ffbh_i32_e32 v6, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6
; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7
; GFX7-NEXT: v_min_u32_e32 v6, v6, v7
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX8-NEXT: v_ffbh_i32_e32 v6, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7
; GFX8-NEXT: v_min_u32_e32 v6, v6, v7
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GFX8-NEXT: v_xor_b32_e32 v8, v0, v1
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
; GFX8-NEXT: v_ffbh_i32_e32 v7, v1
; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v7, v7, v8
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_ffbh_i32_e32 v5, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6
; GFX8-NEXT: v_min_u32_e32 v5, v5, v6
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX9-NEXT: v_ffbh_i32_e32 v6, v5
; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
; GFX9-NEXT: v_ffbh_i32_e32 v6, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
; GFX9-NEXT: v_min_u32_e32 v7, v0, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v8, v0, v1
; GFX10-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX10-NEXT: v_xor_b32_e32 v9, v2, v3
; GFX10-NEXT: v_ffbh_i32_e32 v10, v1
; GFX10-NEXT: v_ffbh_i32_e32 v6, v5
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX10-NEXT: v_ffbh_i32_e32 v11, v3
; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7
; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11
; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX10-NEXT: v_min_u32_e32 v8, v10, v8
; GFX10-NEXT: v_min_u32_e32 v6, v6, v7
; GFX10-NEXT: v_min_u32_e32 v7, v11, v9
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7
; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
; GFX10-NEXT: v_ldexp_f32 v2, v2, v4
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_xor_b32_e32 v6, v0, v1
; GFX11TRUE16-NEXT: v_xor_b32_e32 v9, v2, v3
; GFX11TRUE16-NEXT: v_cls_i32_e32 v10, v1
; GFX11TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX11TRUE16-NEXT: v_cls_i32_e32 v11, v3
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v6, 31, v6
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
; GFX11TRUE16-NEXT: v_cls_i32_e32 v8, v5
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, -1, v8
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
; GFX11TRUE16-NEXT: v_min_u32_e32 v6, v10, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_u32_e32 v9, v11, v9
; GFX11TRUE16-NEXT: v_min_u32_e32 v7, v8, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_u32_e32 v1, 1, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v9
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_xor_b32_e32 v8, v0, v1
; GFX11FAKE16-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX11FAKE16-NEXT: v_xor_b32_e32 v9, v2, v3
; GFX11FAKE16-NEXT: v_cls_i32_e32 v10, v1
; GFX11FAKE16-NEXT: v_cls_i32_e32 v6, v5
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
; GFX11FAKE16-NEXT: v_cls_i32_e32 v11, v3
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX11FAKE16-NEXT: v_min_u32_e32 v8, v10, v8
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_u32_e32 v6, v6, v7
; GFX11FAKE16-NEXT: v_min_u32_e32 v7, v11, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i64> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
; Codegen test for a signed <4 x i64> to <4 x bfloat> conversion (sitofp);
; the vector is taken as the only argument and the converted vector returned.
; One block of expected llc output follows for each FileCheck prefix used
; here (GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and GFX11FAKE16).
;
; What the expected sequences show for each 64-bit element (lo, hi):
;   * shift = min(v_ffbh_i32(hi) - 1, 32 + ((lo ^ hi) >>s 31)); the GFX11
;     blocks use the mnemonic v_cls_i32 where the others use v_ffbh_i32.
;   * the element is shifted left by that amount as a 64-bit value, the low
;     dword of the result is reduced with an unsigned min against 1 and
;     OR-ed into the high dword.
;   * that dword goes through v_cvt_f32_i32 and is rescaled with v_ldexp_f32
;     by (32 - shift).
;
; The GCN and GFX7 blocks finish by masking every f32 with 0xffff0000 and
; leave one result per VGPR in v0-v3. All other blocks run a v_bfe_u32 /
; add 0x7fff / v_cmp_u_f32 / v_cndmask_b32 sequence on every f32 (presumably
; round-to-nearest-even with NaN quieting -- TODO confirm against the
; lowering code) and pack two results per VGPR into v0 and v1.
;
; NOTE(review): the expectation lines look machine-generated
; (update_llc_test_checks.py style); presumably they should be regenerated
; rather than edited by hand -- confirm against the header of this file.
define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GCN-LABEL: v_sitofp_v4i64_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ffbh_i32_e32 v8, v7
; GCN-NEXT: v_xor_b32_e32 v9, v6, v7
; GCN-NEXT: v_ffbh_i32_e32 v10, v5
; GCN-NEXT: v_xor_b32_e32 v11, v4, v5
; GCN-NEXT: v_ffbh_i32_e32 v12, v3
; GCN-NEXT: v_xor_b32_e32 v13, v2, v3
; GCN-NEXT: v_ffbh_i32_e32 v14, v1
; GCN-NEXT: v_xor_b32_e32 v15, v0, v1
; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8
; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10
; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
; GCN-NEXT: v_add_i32_e32 v12, vcc, -1, v12
; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v13
; GCN-NEXT: v_add_i32_e32 v14, vcc, -1, v14
; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v15
; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9
; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11
; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v13
; GCN-NEXT: v_add_i32_e32 v15, vcc, 32, v15
; GCN-NEXT: v_min_u32_e32 v8, v8, v9
; GCN-NEXT: v_min_u32_e32 v9, v10, v11
; GCN-NEXT: v_min_u32_e32 v10, v12, v13
; GCN-NEXT: v_min_u32_e32 v11, v14, v15
; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
; GCN-NEXT: v_min_u32_e32 v6, 1, v6
; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v6, v7, v6
; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v6
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v4
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v9, v6, v7
; GFX7-NEXT: v_ffbh_i32_e32 v8, v7
; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
; GFX7-NEXT: v_xor_b32_e32 v9, v4, v5
; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
; GFX7-NEXT: v_ffbh_i32_e32 v8, v5
; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9
; GFX7-NEXT: v_min_u32_e32 v8, v8, v9
; GFX7-NEXT: v_cvt_f32_i32_e32 v6, v6
; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
; GFX7-NEXT: v_xor_b32_e32 v8, v2, v3
; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
; GFX7-NEXT: v_ffbh_i32_e32 v7, v3
; GFX7-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX7-NEXT: v_add_i32_e32 v7, vcc, -1, v7
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v8
; GFX7-NEXT: v_min_u32_e32 v7, v7, v8
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_ffbh_i32_e32 v3, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5
; GFX7-NEXT: v_min_u32_e32 v3, v3, v5
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v9, v4, v5
; GFX8-NEXT: v_ffbh_i32_e32 v8, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_xor_b32_e32 v5, v6, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4
; GFX8-NEXT: v_ffbh_i32_e32 v4, v7
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
; GFX8-NEXT: v_min_u32_e32 v10, v4, v5
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1
; GFX8-NEXT: v_ffbh_i32_e32 v8, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0
; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
; GFX8-NEXT: v_min_u32_e32 v8, v0, v1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5
; GFX9-NEXT: v_ffbh_i32_e32 v8, v5
; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9
; GFX9-NEXT: v_add_u32_e32 v8, -1, v8
; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7
; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
; GFX9-NEXT: v_ffbh_i32_e32 v4, v7
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
; GFX9-NEXT: v_min_u32_e32 v10, v4, v5
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1
; GFX9-NEXT: v_ffbh_i32_e32 v7, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX9-NEXT: v_add_u32_e32 v7, -1, v7
; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
; GFX9-NEXT: v_ffbh_i32_e32 v0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
; GFX9-NEXT: v_min_u32_e32 v8, v0, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5
; GFX10-NEXT: v_ffbh_i32_e32 v9, v5
; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7
; GFX10-NEXT: v_xor_b32_e32 v13, v0, v1
; GFX10-NEXT: v_ffbh_i32_e32 v10, v7
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9
; GFX10-NEXT: v_ffbh_i32_e32 v12, v1
; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3
; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11
; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8
; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10
; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12
; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14
; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11
; GFX10-NEXT: v_min_u32_e32 v8, v9, v8
; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v13
; GFX10-NEXT: v_ffbh_i32_e32 v13, v3
; GFX10-NEXT: v_add_nc_u32_e32 v14, 32, v14
; GFX10-NEXT: v_min_u32_e32 v10, v10, v11
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13
; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_min_u32_e32 v9, v12, v9
; GFX10-NEXT: v_min_u32_e32 v11, v13, v14
; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
; GFX10-NEXT: v_min_u32_e32 v5, 1, v6
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v8
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX10-NEXT: v_or_b32_e32 v5, v7, v5
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
; GFX10-NEXT: v_ldexp_f32 v2, v4, v6
; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v5
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v10
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11
; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX10-NEXT: v_ldexp_f32 v3, v3, v4
; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
; GFX10-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_xor_b32_e32 v8, v6, v7
; GFX11TRUE16-NEXT: v_cls_i32_e32 v9, v7
; GFX11TRUE16-NEXT: v_xor_b32_e32 v11, v4, v5
; GFX11TRUE16-NEXT: v_cls_i32_e32 v10, v5
; GFX11TRUE16-NEXT: v_xor_b32_e32 v13, v2, v3
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, -1, v9
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v11, 31, v11
; GFX11TRUE16-NEXT: v_xor_b32_e32 v15, v0, v1
; GFX11TRUE16-NEXT: v_cls_i32_e32 v14, v1
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
; GFX11TRUE16-NEXT: v_cls_i32_e32 v12, v3
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v11, 32, v11
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_u32_e32 v8, v9, v8
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, -1, v10
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v10, 31, v13
; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v13, 31, v15
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v12, -1, v12
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7]
; GFX11TRUE16-NEXT: v_min_u32_e32 v9, v9, v11
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, 32, v10
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v13, 32, v13
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v9, v[4:5]
; GFX11TRUE16-NEXT: v_min_u32_e32 v6, 1, v6
; GFX11TRUE16-NEXT: v_min_u32_e32 v10, v12, v10
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v7, 32, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v9
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v6, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: v_ldexp_f32 v4, v4, v5
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v10
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v14, -1, v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v5
; GFX11TRUE16-NEXT: v_min_u32_e32 v11, v14, v13
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v11, v[0:1]
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v11
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v6, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v9, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v4i64_to_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_xor_b32_e32 v8, v4, v5
; GFX11FAKE16-NEXT: v_cls_i32_e32 v9, v5
; GFX11FAKE16-NEXT: v_xor_b32_e32 v11, v6, v7
; GFX11FAKE16-NEXT: v_xor_b32_e32 v13, v0, v1
; GFX11FAKE16-NEXT: v_cls_i32_e32 v10, v7
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, -1, v9
; GFX11FAKE16-NEXT: v_cls_i32_e32 v12, v1
; GFX11FAKE16-NEXT: v_xor_b32_e32 v14, v2, v3
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v11, 31, v11
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v12, -1, v12
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v14, 31, v14
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v11, 32, v11
; GFX11FAKE16-NEXT: v_min_u32_e32 v8, v9, v8
; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v9, 31, v13
; GFX11FAKE16-NEXT: v_cls_i32_e32 v13, v3
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v14, 32, v14
; GFX11FAKE16-NEXT: v_min_u32_e32 v10, v10, v11
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v13, -1, v13
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
; GFX11FAKE16-NEXT: v_min_u32_e32 v9, v12, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_u32_e32 v11, v13, v14
; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11FAKE16-NEXT: v_min_u32_e32 v5, 1, v6
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v8
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, v7, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v4, v6
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v3, v5
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v10
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v5, 32, v9
; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v11
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11FAKE16-NEXT: v_ldexp_f32 v3, v3, v4
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v5
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
; GFX11FAKE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i64> %x to <4 x bfloat>
ret <4 x bfloat> %op
}
; Test case: scalar conversion of an unsigned i16 to bfloat with `uitofp`.
; The IR body is only the conversion and the return; every other line is a
; FileCheck expectation, grouped by check prefix (GCN, GFX7, GFX8, GFX9, GFX10,
; GFX11TRUE16, GFX11FAKE16).
; In the GCN and GFX7 groups the input is masked with 0xffff, converted with
; v_cvt_f32_u32, and the result is masked with 0x7fff0000. The other groups
; expect a longer v_bfe_u32 / add 0x7fff / v_cmp_u_f32 / v_cndmask sequence
; (presumably round-to-nearest-even with NaN handling -- TODO confirm).
; NOTE(review): the RUN lines that bind these prefixes to targets are outside
; this chunk, and the expectations look machine-generated; regenerate them
; rather than hand-editing -- confirm against the file header.
define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GCN-LABEL: v_uitofp_i16_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i16_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i16_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i16_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i16_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_i16_to_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_i16_to_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp i16 %x to bfloat
ret bfloat %op
}
; Test case: vector conversion of <2 x i16> (unsigned) to <2 x bfloat> with
; `uitofp`. The IR body is only the conversion and the return; every other
; line is a FileCheck expectation, grouped by check prefix (GCN, GFX7, GFX8,
; GFX9, GFX10, GFX11TRUE16, GFX11FAKE16).
; The GCN and GFX7 groups keep the two results in v0 and v1. The GFX8 and later
; groups end by combining both results into v0 (v_alignbit_b32, v_perm_b32 or
; v_bfi_b32, depending on the group).
; NOTE(review): the RUN lines that bind these prefixes to targets are outside
; this chunk, and the expectations look machine-generated; regenerate them
; rather than hand-editing -- confirm against the file header.
define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GCN-LABEL: v_uitofp_v2i16_to_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v1
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <2 x i16> %x to <2 x bfloat>
ret <2 x bfloat> %op
}
; Test case: vector conversion of <3 x i16> (unsigned) to <3 x bfloat> with
; `uitofp`. The IR body is only the conversion and the return; every other
; line is a FileCheck expectation, grouped by check prefix (GCN, GFX7, GFX8,
; GFX9, GFX10, GFX11TRUE16, GFX11FAKE16).
; The GCN and GFX7 groups keep the three results in v0, v1 and v2. The GFX8 and
; later groups return two registers, v0 (first two elements combined) and v1.
; NOTE(review): in the GFX10 and GFX11FAKE16 groups the final v_alignbit_b32
; reads an SGPR (s4 / s0) that the listed sequence never writes; presumably the
; upper half of v1 is an undefined padding lane -- TODO confirm.
; NOTE(review): the RUN lines that bind these prefixes to targets are outside
; this chunk, and the expectations look machine-generated; regenerate them
; rather than hand-editing -- confirm against the file header.
define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GCN-LABEL: v_uitofp_v3i16_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v2
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i16> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
; Test case: vector conversion of <4 x i16> (unsigned) to <4 x bfloat> with
; `uitofp`. The IR body is only the conversion and the return; every other
; line is a FileCheck expectation, grouped by check prefix (GCN, GFX7, GFX8,
; GFX9, GFX10, GFX11TRUE16, GFX11FAKE16).
; The GCN and GFX7 groups keep the four results in v0 through v3. The GFX8 and
; later groups end by combining the results pairwise into v0 and v1
; (v_alignbit_b32, v_perm_b32 or v_bfi_b32, depending on the group).
; NOTE(review): the RUN lines that bind these prefixes to targets are outside
; this chunk, and the expectations look machine-generated; regenerate them
; rather than hand-editing -- confirm against the file header.
define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GCN-LABEL: v_uitofp_v4i16_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v4, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v6, v10, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v11, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v4i16_to_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v10, v0, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i16> %x to <4 x bfloat>
ret <4 x bfloat> %op
}
; Test case: scalar conversion of an unsigned i32 to bfloat with `uitofp`.
; The IR body is only the conversion and the return; every other line is a
; FileCheck expectation, grouped by check prefix (GCN, GFX7, GFX8, GFX9, GFX10,
; GFX11TRUE16, GFX11FAKE16).
; Unlike the i16 variants, no group masks the input with 0xffff before
; v_cvt_f32_u32. The GCN and GFX7 groups only mask the result with 0x7fff0000;
; the other groups expect the longer v_bfe_u32 / add 0x7fff / v_cmp_u_f32 /
; v_cndmask sequence (presumably round-to-nearest-even with NaN handling --
; TODO confirm).
; NOTE(review): the RUN lines that bind these prefixes to targets are outside
; this chunk, and the expectations look machine-generated; regenerate them
; rather than hand-editing -- confirm against the file header.
define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GCN-LABEL: v_uitofp_i32_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i32_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i32_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i32_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_i32_to_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_i32_to_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp i32 %x to bfloat
ret bfloat %op
}
; Codegen test: convert an unsigned <2 x i32> argument to <2 x bfloat> with a
; single uitofp and return the result.
;
; What the expected output below shows:
;  * GCN and GFX7 prefixes: one v_cvt_f32_u32 per element, then an AND of each
;    float with 0x7fff0000; the two results stay in v0 and v1.
;  * GFX8 and newer prefixes: after the conversion, each element goes through
;    a v_bfe_u32 (bit 16) / add 0x7fff / v_or 0x400000 / v_cmp_u / v_cndmask
;    sequence, and both elements are then combined into v0 (v_alignbit_b32,
;    v_perm_b32 or v_bfi_b32 depending on the target).
;    NOTE(review): that sequence is presumably round-to-nearest-even with NaN
;    quieting -- confirm against the backend's bf16 rounding lowering.
;
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated with the update script rather than edited by hand -- confirm.
define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GCN-LABEL: v_uitofp_v2i32_to_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <2 x i32> %x to <2 x bfloat>
ret <2 x bfloat> %op
}
; Codegen test: convert an unsigned <3 x i32> argument to <3 x bfloat> with a
; single uitofp and return the result.
;
; What the expected output below shows:
;  * GCN and GFX7 prefixes: one v_cvt_f32_u32 per element, then an AND of each
;    float with 0x7fff0000; the three results stay in v0, v1 and v2.
;  * GFX8 and newer prefixes: each element goes through a v_bfe_u32 (bit 16) /
;    add 0x7fff / v_or 0x400000 / v_cmp_u / v_cndmask sequence. Elements 0 and
;    1 are combined into v0 and element 2 ends up in v1.
;    NOTE(review): that sequence is presumably round-to-nearest-even with NaN
;    quieting -- confirm against the backend's bf16 rounding lowering.
;  * GFX9, GFX10 and GFX11FAKE16 build v1 with a v_alignbit_b32 whose first
;    source is an SGPR (s4 or s0) that the function does not set for that
;    purpose. NOTE(review): presumably this is the don't-care upper half that
;    pads the odd-sized vector -- confirm.
;
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated with the update script rather than edited by hand -- confirm.
define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GCN-LABEL: v_uitofp_v3i32_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v2, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i32> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
; Codegen test: convert an unsigned <4 x i32> argument to <4 x bfloat> with a
; single uitofp and return the result.
;
; What the expected output below shows:
;  * GCN and GFX7 prefixes: one v_cvt_f32_u32 per element, then an AND of each
;    float with 0x7fff0000; the four results stay in v0..v3.
;  * GFX8 and newer prefixes: each element goes through a v_bfe_u32 (bit 16) /
;    add 0x7fff / v_or 0x400000 / v_cmp_u / v_cndmask sequence. Elements 0 and
;    1 are combined into v0, elements 2 and 3 into v1 (v_alignbit_b32,
;    v_perm_b32 or v_bfi_b32 depending on the target).
;    NOTE(review): that sequence is presumably round-to-nearest-even with NaN
;    quieting -- confirm against the backend's bf16 rounding lowering.
;  * The GFX8 output adds 0x7fff both as a literal and via s4 (loaded with
;    s_movk_i32); this is the recorded compiler output, not a typo.
;
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated with the update script rather than edited by hand -- confirm.
define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GCN-LABEL: v_uitofp_v4i32_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v4i32_to_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i32> %x to <4 x bfloat>
ret <4 x bfloat> %op
}
; Codegen test: convert an unsigned i64 argument (passed in v[0:1]) to bfloat
; with a single uitofp and return the result.
;
; What the expected output below shows, for every prefix:
;  * Count the leading zeros of the high dword (v_ffbh_u32, or v_clz_i32_u32
;    on GFX11) and clamp the count to 32 with v_min_u32.
;  * Shift the 64-bit value left by that count, clamp the resulting low dword
;    to at most 1 and OR it into the high dword.
;  * Convert that 32-bit value with v_cvt_f32_u32 and scale it with
;    v_ldexp_f32 by (32 - shift count).
;  * GCN and GFX7 then AND the float with 0xffff0000. GFX8 and newer instead
;    emit a v_bfe_u32 (bit 16) / add 0x7fff / v_or 0x400000 / v_cmp_u /
;    v_cndmask sequence and move the upper 16 bits into the low half of v0.
;    NOTE(review): that sequence is presumably round-to-nearest-even with NaN
;    quieting -- confirm against the backend's bf16 rounding lowering.
;
; NOTE(review): the check lines look tool-generated; presumably they should be
; regenerated with the update script rather than edited by hand -- confirm.
define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GCN-LABEL: v_uitofp_i64_to_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ffbh_u32_e32 v2, v1
; GCN-NEXT: v_min_u32_e32 v2, 32, v2
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i64_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_ffbh_u32_e32 v2, v1
; GFX7-NEXT: v_min_u32_e32 v2, 32, v2
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i64_to_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v2, v1
; GFX8-NEXT: v_min_u32_e32 v2, 32, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i64_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ffbh_u32_e32 v2, v1
; GFX9-NEXT: v_min_u32_e32 v2, 32, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i64_to_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_i64_to_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v2, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 32, v2
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_i64_to_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v2, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 32, v2
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp i64 %x to bfloat
ret bfloat %op
}
define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GCN-LABEL: v_uitofp_v2i64_to_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ffbh_u32_e32 v4, v3
; GCN-NEXT: v_ffbh_u32_e32 v5, v1
; GCN-NEXT: v_min_u32_e32 v4, 32, v4
; GCN-NEXT: v_min_u32_e32 v5, 32, v5
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_ffbh_u32_e32 v4, v3
; GFX7-NEXT: v_min_u32_e32 v4, 32, v4
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v4, v1
; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0
; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
; GFX8-NEXT: v_min_u32_e32 v6, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ffbh_u32_e32 v4, v1
; GFX9-NEXT: v_min_u32_e32 v4, 32, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4
; GFX9-NEXT: v_ldexp_f32 v4, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
; GFX9-NEXT: v_min_u32_e32 v6, 32, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v4, v1
; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
; GFX10-NEXT: v_min_u32_e32 v4, 32, v4
; GFX10-NEXT: v_min_u32_e32 v5, 32, v5
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v4, v1
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v5, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 32, v4
; GFX11TRUE16-NEXT: v_min_u32_e32 v5, 32, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v2, 32, v4
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v4, v1
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v5, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 32, v4
; GFX11FAKE16-NEXT: v_min_u32_e32 v5, 32, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v2, 32, v4
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <2 x i64> %x to <2 x bfloat>
ret <2 x bfloat> %op
}
; Test: lane-wise `uitofp` of a <3 x i64> argument to <3 x bfloat>.
;
; The prefixed comment lines inside the body are FileCheck directives, one
; group per check prefix (GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16 and
; GFX11FAKE16). Each group pins the exact instruction sequence expected for
; that configuration, so they are part of the test and not free-form notes.
; NOTE(review): they look script-generated -- presumably they should be
; regenerated rather than edited by hand; confirm against the file header.
;
; What the expected code below shows:
;  - the GCN and GFX7 groups return each element in its own VGPR (v0..v2) and
;    keep only the upper 16 bits of the f32 result via an AND with 0xffff0000;
;  - the GFX8 and later groups add a v_bfe_u32 / +0x7fff / v_cmp_u_f32 /
;    v_cndmask_b32 sequence per element (presumably round-to-nearest-even with
;    NaN handling -- confirm) and return the packed result in v0 and v1;
;  - in the GFX10 and GFX11FAKE16 groups the final v_alignbit_b32 that builds
;    v1 reads a scalar register (s4 resp. s0) that is not written anywhere in
;    the shown sequence; presumably the upper half of v1 is a don't-care since
;    the vector has only three elements -- confirm before treating it as a bug.
define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GCN-LABEL: v_uitofp_v3i64_to_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ffbh_u32_e32 v6, v5
; GCN-NEXT: v_ffbh_u32_e32 v7, v3
; GCN-NEXT: v_ffbh_u32_e32 v8, v1
; GCN-NEXT: v_min_u32_e32 v6, 32, v6
; GCN-NEXT: v_min_u32_e32 v7, 32, v7
; GCN-NEXT: v_min_u32_e32 v8, 32, v8
; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6
; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_ffbh_u32_e32 v6, v5
; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6
; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
; GFX7-NEXT: v_ffbh_u32_e32 v6, v3
; GFX7-NEXT: v_min_u32_e32 v6, 32, v6
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6
; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v6, v5
; GFX8-NEXT: v_min_u32_e32 v6, 32, v6
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GFX8-NEXT: v_ffbh_u32_e32 v7, v1
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6
; GFX8-NEXT: v_min_u32_e32 v7, 32, v7
; GFX8-NEXT: v_ldexp_f32 v4, v4, v5
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_ffbh_u32_e32 v5, v3
; GFX8-NEXT: v_min_u32_e32 v5, 32, v5
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ffbh_u32_e32 v6, v5
; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
; GFX9-NEXT: v_ffbh_u32_e32 v6, v1
; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
; GFX9-NEXT: v_min_u32_e32 v7, 32, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v6, v1
; GFX10-NEXT: v_ffbh_u32_e32 v8, v3
; GFX10-NEXT: v_ffbh_u32_e32 v7, v5
; GFX10-NEXT: v_min_u32_e32 v6, 32, v6
; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
; GFX10-NEXT: v_min_u32_e32 v7, 32, v7
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v7
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6
; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
; GFX10-NEXT: v_ldexp_f32 v2, v2, v4
; GFX10-NEXT: v_ldexp_f32 v1, v1, v7
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v6, v1
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v7, v3
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v8, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_u32_e32 v6, 32, v6
; GFX11TRUE16-NEXT: v_min_u32_e32 v7, 32, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_u32_e32 v8, 32, v8
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_min_u32_e32 v1, 1, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v6, v1
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v7, v5
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v8, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_u32_e32 v6, 32, v6
; GFX11FAKE16-NEXT: v_min_u32_e32 v7, 32, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_u32_e32 v8, 32, v8
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; The whole IR body: one vector uitofp of the argument, returned directly.
%op = uitofp <3 x i64> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GCN-LABEL: v_uitofp_v4i64_to_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ffbh_u32_e32 v8, v7
; GCN-NEXT: v_ffbh_u32_e32 v9, v5
; GCN-NEXT: v_ffbh_u32_e32 v10, v3
; GCN-NEXT: v_ffbh_u32_e32 v11, v1
; GCN-NEXT: v_min_u32_e32 v8, 32, v8
; GCN-NEXT: v_min_u32_e32 v9, 32, v9
; GCN-NEXT: v_min_u32_e32 v10, 32, v10
; GCN-NEXT: v_min_u32_e32 v11, 32, v11
; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8
; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9
; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11
; GCN-NEXT: v_min_u32_e32 v6, 1, v6
; GCN-NEXT: v_min_u32_e32 v4, 1, v4
; GCN-NEXT: v_min_u32_e32 v2, 1, v2
; GCN-NEXT: v_min_u32_e32 v0, 1, v0
; GCN-NEXT: v_or_b32_e32 v6, v7, v6
; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v6
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v4
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8
; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9
; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_ffbh_u32_e32 v8, v7
; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8
; GFX7-NEXT: v_min_u32_e32 v6, 1, v6
; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
; GFX7-NEXT: v_cvt_f32_u32_e32 v6, v6
; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8
; GFX7-NEXT: v_ffbh_u32_e32 v8, v5
; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7
; GFX7-NEXT: v_ffbh_u32_e32 v7, v3
; GFX7-NEXT: v_min_u32_e32 v7, 32, v7
; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
; GFX7-NEXT: v_min_u32_e32 v8, 32, v8
; GFX7-NEXT: v_min_u32_e32 v2, 1, v2
; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_ffbh_u32_e32 v3, v1
; GFX7-NEXT: v_min_u32_e32 v3, 32, v3
; GFX7-NEXT: v_min_u32_e32 v4, 1, v4
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3
; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX7-NEXT: v_min_u32_e32 v0, 1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8
; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5
; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7
; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v8, v5
; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v8, v4, v5
; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4
; GFX8-NEXT: v_ffbh_u32_e32 v4, v7
; GFX8-NEXT: v_min_u32_e32 v10, 32, v4
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_ffbh_u32_e32 v8, v1
; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0
; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
; GFX8-NEXT: v_min_u32_e32 v8, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ffbh_u32_e32 v8, v5
; GFX9-NEXT: v_min_u32_e32 v8, 32, v8
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8
; GFX9-NEXT: v_ldexp_f32 v8, v4, v5
; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
; GFX9-NEXT: v_ffbh_u32_e32 v4, v7
; GFX9-NEXT: v_min_u32_e32 v10, 32, v4
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
; GFX9-NEXT: v_ffbh_u32_e32 v7, v1
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
; GFX9-NEXT: v_min_u32_e32 v7, 32, v7
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
; GFX9-NEXT: v_min_u32_e32 v8, 32, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v8, v5
; GFX10-NEXT: v_ffbh_u32_e32 v10, v1
; GFX10-NEXT: v_ffbh_u32_e32 v11, v3
; GFX10-NEXT: v_ffbh_u32_e32 v9, v7
; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
; GFX10-NEXT: v_min_u32_e32 v10, 32, v10
; GFX10-NEXT: v_min_u32_e32 v11, 32, v11
; GFX10-NEXT: v_min_u32_e32 v9, 32, v9
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8
; GFX10-NEXT: v_sub_nc_u32_e32 v9, 32, v9
; GFX10-NEXT: v_min_u32_e32 v4, 1, v4
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
; GFX10-NEXT: v_min_u32_e32 v6, 1, v6
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10
; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v11
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v4
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_or_b32_e32 v6, v7, v6
; GFX10-NEXT: v_ldexp_f32 v2, v2, v8
; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v6
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_ldexp_f32 v4, v4, v9
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v9, v5
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v10, v3
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v11, v1
; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v8, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_min_u32_e32 v9, 32, v9
; GFX11TRUE16-NEXT: v_min_u32_e32 v10, 32, v10
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_min_u32_e32 v11, 32, v11
; GFX11TRUE16-NEXT: v_min_u32_e32 v8, 32, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v9, v[4:5]
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v11, v[0:1]
; GFX11TRUE16-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7]
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v9, 32, v9
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v8, 32, v8
; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_min_u32_e32 v6, 1, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v10
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v3, v4
; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v11
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v6
; GFX11TRUE16-NEXT: v_ldexp_f32 v3, v3, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v5
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v8
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v5, v9, v2, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v4i64_to_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v8, v5
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v10, v1
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v11, v3
; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v9, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_u32_e32 v8, 32, v8
; GFX11FAKE16-NEXT: v_min_u32_e32 v10, 32, v10
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_min_u32_e32 v11, 32, v11
; GFX11FAKE16-NEXT: v_min_u32_e32 v9, 32, v9
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX11FAKE16-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7]
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v8, 32, v8
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v9, 32, v9
; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: v_min_u32_e32 v6, 1, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v5, 32, v10
; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v11
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v4
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v8
; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v4, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11FAKE16-NEXT: v_ldexp_f32 v4, v4, v9
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i64> %x to <4 x bfloat>
ret <4 x bfloat> %op
}
; Select between two bfloat operands (%a, %b) on a single i1 condition.
; Every check block below masks the condition register with v_and_b32 1 and
; compares it against 1 before a v_cndmask picks between v1 and v2. The GCN and
; GFX7 blocks additionally run both operands through v_mul_f32 by 1.0 and mask
; the selected value with 0xffff0000, while the GFX11TRUE16 block performs the
; select with v_cndmask_b16 on the low register halves.
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_select_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, bfloat %a, bfloat %b
ret bfloat %op
}
; Select between (fneg %a) and %b on a single i1 condition, i.e. the negation
; feeds the true-side operand of the select.
; In the checks the negation shows up on v1 only: the GCN and GFX7 blocks
; multiply v1 by -1.0 (and v2 by 1.0), the GFX8, GFX9, GFX10 and GFX11FAKE16
; blocks xor v1 with 0x8000, and the GFX11TRUE16 block uses v_xor_b16 on v1.l
; and feeds that result into v_cndmask_b16.
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_lhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_lhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_lhs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_fneg_lhs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_fneg_lhs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg bfloat %a
%op = select i1 %cond, bfloat %neg.a, bfloat %b
ret bfloat %op
}
; Select between %a and (fneg %b) on a single i1 condition, i.e. the negation
; feeds the false-side operand of the select.
; In the checks the negation shows up on v2 only: the GCN and GFX7 blocks
; multiply v2 by -1.0 (and v1 by 1.0), the GFX8, GFX9, GFX10 and GFX11FAKE16
; blocks xor v2 with 0x8000, and the GFX11TRUE16 block uses v_xor_b16 on v2.l
; and feeds that result into v_cndmask_b16.
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_rhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_rhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_rhs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_fneg_rhs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_fneg_rhs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.b = fneg bfloat %b
%op = select i1 %cond, bfloat %a, bfloat %neg.b
ret bfloat %op
}
; Select between two whole <2 x bfloat> vectors on one scalar i1 condition, so
; both lanes follow the same predicate.
; The checks use a single v_cmp_eq_u32 per block and reuse its result for both
; halves. The GCN and GFX7 blocks handle four separate registers (v1..v4) with
; two v_cndmask_b32; GFX8 combines an SDWA v_cndmask with v_or_b32_sdwa; GFX9,
; GFX10 and GFX11FAKE16 repack the two selected halves with v_perm_b32 using
; selector 0x5040100; GFX11TRUE16 issues one v_cndmask_b16 per register half.
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-LABEL: v_select_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v1.h, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
ret <2 x bfloat> %op
}
; Per-lane select between two <2 x bfloat> vectors using a <2 x i1> condition,
; so each element has its own predicate (contrast with a single i1 condition).
; The checks therefore mask and compare both v0 and v1 against 1 and issue two
; separate compares per block. The GCN, GFX7 and GFX11FAKE16 blocks reuse vcc
; for the second compare, while the GFX8, GFX9, GFX10 and GFX11TRUE16 blocks
; keep the second compare result in an SGPR (s[4:5], s4 or s0).
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-LABEL: v_vselect_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_vselect_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s4
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v2.h, s0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
ret <2 x bfloat> %op
}
; amdgpu_ps variant of the scalar bfloat select: %a and %b are marked inreg and
; the condition is computed in the body as (icmp eq i32 %c, 0).
; The selected bfloat is bitcast to i16, zero-extended to i32 and passed through
; llvm.amdgcn.readfirstlane before being returned as i32.
; In the checks the two bfloat inputs are read from s0 and s1 and the compare
; reads v0; each block ends with v_readfirstlane_b32 writing s0. The GFX8 and
; later blocks clear the upper half with v_and_b32 0xffff, while the GCN and
; GFX7 blocks shift the selected value right by 16 instead.
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
; GCN-LABEL: s_select_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_select_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_select_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_select_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11TRUE16-LABEL: s_select_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_select_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: v_mov_b32_e32 v1, s0
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11FAKE16-NEXT: ; return to shader part epilog
%cond = icmp eq i32 %c, 0
%op = select i1 %cond, bfloat %a, bfloat %b
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
%readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
ret i32 %readlane
}
; amdgpu_ps variant of the whole-vector <2 x bfloat> select: %a and %b are
; marked inreg and one condition, (icmp eq i32 %c, 0), drives both lanes.
; The selected vector is bitcast to i32 and passed through
; llvm.amdgcn.readfirstlane before being returned as i32.
; In the checks every block performs a single compare of v0 against 0 and two
; v_cndmask instructions that share its result. The GCN and GFX7 blocks read
; four SGPRs (s0..s3); the GFX8 and later blocks read s0 and s1 and extract the
; high halves with s_lshr_b32 by 16. Each block ends with v_readfirstlane_b32
; writing s0.
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
; GCN-LABEL: s_select_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s2
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_select_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_select_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_select_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: s_mov_b32 s0, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11TRUE16-LABEL: s_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_select_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
; GFX11FAKE16-NEXT: s_lshr_b32 s3, s1, 16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11FAKE16-NEXT: ; return to shader part epilog
%cond = icmp eq i32 %c, 0
%op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
%cast = bitcast <2 x bfloat> %op to i32
%readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
ret i32 %readlane
}
; amdgpu_ps variant of the per-lane <2 x bfloat> select: %a and %b are marked
; inreg and the <2 x i1> condition is (icmp eq <2 x i32> %c, zeroinitializer),
; so each lane is selected by its own comparison.
; The selected vector is bitcast to i32 and passed through
; llvm.amdgcn.readfirstlane before being returned as i32.
; In the checks every block compares both v0 and v1 against 0. Most blocks
; reuse vcc for the second compare, whereas the GFX11TRUE16 block keeps the v1
; compare result in s2. Each block ends with v_readfirstlane_b32 writing s0.
; NOTE(review): the check lines look autogenerated - regenerate them with the
; update script rather than editing by hand; confirm against the file header.
define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
; GCN-LABEL: s_vselect_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_vselect_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1
; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_vselect_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_vselect_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s0, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_vselect_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: s_lshr_b32 s0, s1, 16
; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11FAKE16-NEXT: ; return to shader part epilog
%cond = icmp eq <2 x i32> %c, zeroinitializer
%op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
%cast = bitcast <2 x bfloat> %op to i32
%readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
ret i32 %readlane
}
; Select between two <3 x bfloat> vectors on a single scalar i1 condition and
; return the chosen vector.
; In the expectations below, the GCN and GFX7 prefixes repack the incoming
; values (v_mul_f32 by 1.0, v_lshrrev_b32, v_alignbit_b32) before two
; v_cndmask_b32 and then split the results again, while the GFX8, GFX9 and
; GFX10 prefixes need only v_and_b32, v_cmp_eq_u32 and two v_cndmask_b32
; (a single v_dual_cndmask_b32 pair under the GFX11 prefix).
; NOTE(review): the assertions look tool-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the file header.
define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-LABEL: v_select_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v3bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
ret <3 x bfloat> %op
}
; Select between two <4 x bfloat> vectors on a single scalar i1 condition and
; return the chosen vector.
; In the expectations below, the GCN and GFX7 prefixes first pack each pair of
; incoming values into one register (v_mul_f32 by 1.0, v_lshrrev_b32,
; v_alignbit_b32), select with two v_cndmask_b32, then unpack with
; v_lshlrev_b32 / v_and_b32 0xffff0000. The GFX8, GFX9 and GFX10 prefixes need
; only two v_cndmask_b32 (one v_dual_cndmask_b32 pair under the GFX11 prefix).
; NOTE(review): the assertions look tool-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the file header.
define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_select_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16
; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
ret <4 x bfloat> %op
}
; Select between two <6 x bfloat> vectors on a single scalar i1 condition and
; return the chosen vector.
; In the expectations below, the GCN and GFX7 prefixes pack each pair of
; incoming values into one register (v_mul_f32 by 1.0, v_lshrrev_b32,
; v_alignbit_b32), select with three v_cndmask_b32, then unpack with
; v_lshlrev_b32 / v_and_b32 0xffff0000. The GFX8, GFX9 and GFX10 prefixes need
; only three v_cndmask_b32; under the GFX11 prefix two of them are fused into
; a v_dual_cndmask_b32 pair and the third stays a plain v_cndmask_b32.
; NOTE(review): the assertions look tool-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the file header.
define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
; GCN-LABEL: v_select_v6bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16
; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16
; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v6bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9
; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12
; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v6bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v6bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v6bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v6bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
ret <6 x bfloat> %op
}
; Select between two <8 x bfloat> vectors on a single scalar i1 condition and
; return the chosen vector.
; In the expectations below, the GCN and GFX7 prefixes pack each pair of
; incoming values into one register (v_mul_f32 by 1.0, v_lshrrev_b32,
; v_alignbit_b32), select with four v_cndmask_b32, then unpack with
; v_lshlrev_b32 / v_and_b32 0xffff0000. The GFX8, GFX9 and GFX10 prefixes need
; only four v_cndmask_b32 (two v_dual_cndmask_b32 pairs under the GFX11
; prefix).
; NOTE(review): the assertions look tool-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the file header.
define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-LABEL: v_select_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16
; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16
; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16
; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11
; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13
; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16
; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
ret <8 x bfloat> %op
}
; Select between two <16 x bfloat> vectors on a single scalar i1 condition and
; return the chosen vector.
; In the expectations below, the GCN and GFX7 prefixes pack each pair of
; incoming values into one register (v_mul_f32 by 1.0, v_lshrrev_b32,
; v_alignbit_b32), select with eight v_cndmask_b32, then unpack with
; v_lshlrev_b32 / v_and_b32 0xffff0000. Those two prefixes also expect two
; buffer_load_dword from s32 and s32 offset:4, each guarded by an
; s_waitcnt vmcnt before its value is used (presumably the tail of %b that
; did not fit in register arguments - confirm against the calling convention).
; The GFX8, GFX9 and GFX10 prefixes need only eight v_cndmask_b32 (four
; v_dual_cndmask_b32 pairs under the GFX11 prefix) and no loads.
; NOTE(review): the assertions look tool-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the file header.
define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_select_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16
; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16
; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16
; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23
; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25
; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27
; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16
; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v16bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
ret <16 x bfloat> %op
}
; v_select_v32bf16: select between two <32 x bfloat> vectors on a single i1
; condition and return the chosen vector (no explicit calling convention).
; What the expected code below shows for each group of check prefixes
;  * GCN and GFX7 multiply every input by 1.0, pack two values into one
;    register (v_lshrrev_b32 + v_alignbit_b32), fetch the remaining inputs
;    with buffer_load_dword relative to s32, perform 16 v_cndmask_b32, then
;    split each result into two registers again (shift left by 16 and mask
;    with 0xffff0000).
;  * GFX8, GFX9 and GFX10 need only two buffer_load_dword and 16 v_cndmask_b32.
;  * GFX11 uses two scratch_load_b32 and eight v_dual_cndmask_b32 pairs.
; The assertion lines look tool-generated (presumably by
; update_llc_test_checks.py, to be confirmed against the file header), so
; regenerate them rather than editing them by hand.
define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_select_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16
; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc
; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc
; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc
; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc
; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc
; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc
; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc
; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc
; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc
; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v24
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v25
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v25
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v26
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v27
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v27
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v28
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(14)
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(13)
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: s_waitcnt vmcnt(12)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(12)
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: s_waitcnt vmcnt(11)
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: s_waitcnt vmcnt(9)
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16
; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16
; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_select_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_select_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_select_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
; GFX11-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
; GFX11-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
; GFX11-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
; GFX11-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
; GFX11-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
ret <32 x bfloat> %op
}
; s_select_v3bf16: amdgpu_ps function that picks one of two inreg
; <3 x bfloat> vectors depending on whether %c equals zero. The 48-bit result
; is split into two i32 values, each is passed through
; llvm.amdgcn.readfirstlane, and the pair is returned as <2 x i32>. In every
; expected listing below the results end up in s0 and s1 via
; v_readfirstlane_b32.
define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
; GCN-LABEL: s_select_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_readfirstlane_b32 s0, v1
; GCN-NEXT: v_readfirstlane_b32 s1, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_select_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_readfirstlane_b32 s0, v1
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_select_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_select_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_select_v3bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
; %a is chosen when %c is zero, %b otherwise.
%cond = icmp eq i32 %c, 0
%op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
; Reinterpret the three 16-bit elements as one 48-bit integer.
%cast = bitcast <3 x bfloat> %op to i48
; Low 32 bits of the 48-bit value.
%elt0 = trunc i48 %cast to i32
; Remaining upper 16 bits, shifted down and truncated to i32.
%elt1.hi = lshr i48 %cast, 32
%elt1 = trunc i48 %elt1.hi to i32
%readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
%readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
; Assemble the two readfirstlane results into the <2 x i32> return value.
%bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
%bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
ret <2 x i32> %bv.1
}
; s_select_v4bf16: amdgpu_ps function that picks one of two inreg
; <4 x bfloat> vectors depending on whether %c equals zero. The selected
; vector is bitcast to <2 x i32>, each half is passed through
; llvm.amdgcn.readfirstlane, and the pair is returned as <2 x i32>. In every
; expected listing below the results end up in s0 and s1 via
; v_readfirstlane_b32.
define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
; GCN-LABEL: s_select_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2
; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7
; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: v_readfirstlane_b32 s0, v1
; GCN-NEXT: v_readfirstlane_b32 s1, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_select_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4
; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6
; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_readfirstlane_b32 s0, v1
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_select_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_select_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_select_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s1, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_select_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: ; return to shader part epilog
; %a is chosen when %c is zero, %b otherwise.
%cond = icmp eq i32 %c, 0
%op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
; View the four 16-bit elements as two 32-bit integers.
%cast = bitcast <4 x bfloat> %op to <2 x i32>
%elt0 = extractelement <2 x i32> %cast, i32 0
%elt1 = extractelement <2 x i32> %cast, i32 1
%readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
%readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
; Assemble the two readfirstlane results into the <2 x i32> return value.
%bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
%bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
ret <2 x i32> %bv.1
}
; s_vselect_v4bf16
; Per-element select between two inreg <4 x bfloat> vectors in an amdgpu_ps
; function. The mask is built lane-by-lane by comparing the (non-inreg)
; <4 x i32> argument %c against zero. The selected vector is reinterpreted as
; <2 x i32>, each dword is passed through llvm.amdgcn.readfirstlane, and the
; two results are returned as a <2 x i32>.
; In the expected code %a and %b are read from scalar registers and the two
; result dwords are moved to s0/s1 with v_readfirstlane_b32.
; Expectation blocks below exist for the prefixes GCN, GFX7, GFX8, GFX9, GFX10,
; GFX11TRUE16 and GFX11FAKE16; the RUN lines that bind those prefixes to
; targets are outside this part of the file.
; NOTE(review): the expectation lines look tool-generated (presumably by
; utils/update_llc_test_checks.py) - regenerate instead of hand-editing, and
; confirm against the header of this file.
define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
; GCN-LABEL: s_vselect_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s1
; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5
; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s2
; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s6
; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3
; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s7
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_or_b32_e32 v2, v2, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: v_readfirstlane_b32 s1, v2
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_vselect_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3
; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s2
; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6
; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s1
; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s5
; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0
; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4
; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_vselect_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_lshr_b32 s5, s3, 16
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_lshr_b32 s3, s2, 16
; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_vselect_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: v_mov_b32_e32 v5, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s1, 0x5040100
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_vselect_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: s_lshr_b32 s4, s3, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 16
; GFX10-NEXT: v_mov_b32_e32 v6, s0
; GFX10-NEXT: s_lshr_b32 s0, s2, 16
; GFX10-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v4, s5
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX11FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
; GFX11FAKE16-NEXT: s_lshr_b32 s4, s3, 16
; GFX11FAKE16-NEXT: s_lshr_b32 s5, s0, 16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo
; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, s5
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX11FAKE16-NEXT: v_mov_b32_e32 v6, s0
; GFX11FAKE16-NEXT: s_lshr_b32 s0, s2, 16
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1
; GFX11FAKE16-NEXT: ; return to shader part epilog
; Lane-wise mask: element i of %c equal to zero picks %a[i], otherwise %b[i].
%cond = icmp eq <4 x i32> %c, zeroinitializer
%op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
; View the four bfloat results as two dwords so each can go through
; readfirstlane individually.
%cast = bitcast <4 x bfloat> %op to <2 x i32>
%elt0 = extractelement <2 x i32> %cast, i32 0
%elt1 = extractelement <2 x i32> %cast, i32 1
%readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
%readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
; Reassemble the two readfirstlane results into the <2 x i32> return value.
%bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
%bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
ret <2 x i32> %bv.1
}
; v_vselect_v4bf16
; Per-element select on <4 x bfloat> driven by a <4 x i1> mask; mask and both
; operands are ordinary (non-inreg) arguments and the selected vector is
; returned directly.
; In every expectation block below each mask element is first AND-ed with 1
; and then compared for equality with 1 before the conditional move.
; Expectation blocks exist for the prefixes GCN, GFX7, GFX8, GFX9, GFX10,
; GFX11TRUE16 and GFX11FAKE16; the RUN lines that bind those prefixes to
; targets are outside this part of the file.
; NOTE(review): the expectation lines look tool-generated (presumably by
; utils/update_llc_test_checks.py) - regenerate instead of hand-editing, and
; confirm against the header of this file.
define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_vselect_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_vselect_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7]
; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0
; GFX10-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s5
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v1
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v3
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v5.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v4.l, s0
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.h, s1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v5.h, s2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; A true mask element picks the matching element of %a, a false one picks %b.
%op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
ret <4 x bfloat> %op
}
; v_vselect_v8bf16
; Eight-element variant of the per-element bfloat select: an <8 x i1> mask
; chooses between <8 x bfloat> %a and %b, all passed as ordinary (non-inreg)
; arguments, and the selected vector is returned directly.
; In every expectation block below each mask element is first AND-ed with 1
; and then compared for equality with 1 before the conditional move.
; Expectation blocks exist for the prefixes GCN, GFX7, GFX8, GFX9, GFX10,
; GFX11TRUE16 and GFX11FAKE16; the RUN lines that bind those prefixes to
; targets are outside this part of the file.
; NOTE(review): the expectation lines look tool-generated (presumably by
; utils/update_llc_test_checks.py) - regenerate instead of hand-editing, and
; confirm against the header of this file.
define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-LABEL: v_vselect_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v7, 1, v7
; GCN-NEXT: v_and_b32_e32 v6, 1, v6
; GCN-NEXT: v_and_b32_e32 v5, 1, v5
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GCN-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17
; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16
; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v11, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v13
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v12
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_vselect_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v5
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v6
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v4
; GFX10-NEXT: v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc_lo
; GFX10-NEXT: s_mov_b32 vcc_lo, s6
; GFX10-NEXT: v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_mov_b32 vcc_lo, s5
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc_lo
; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x5040100
; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v7
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v4
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v15.l, v11.l, s2
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v14.l, v10.l, s3
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v8.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v9.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v14.h, v10.h, s5
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v15.h, v11.h, s6
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v8bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v10
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; A true mask element picks the matching element of %a, a false one picks %b.
%op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
ret <8 x bfloat> %op
}
; v_vselect_v16bf16 - per-lane select over 16 bfloat lanes.
;   %cond - <16 x i1> lane mask
;   %a    - <16 x bfloat> value taken for lanes whose mask bit is set
;   %b    - <16 x bfloat> value taken for lanes whose mask bit is clear
; Returns the blended <16 x bfloat>.
;
; The check groups below record the expected llc output under seven prefixes
; (GCN, GFX7, GFX8, GFX9, GFX10, GFX11TRUE16, GFX11FAKE16). In the GCN and
; GFX7 groups every result register is masked with 0xffff0000 after the
; select; the GFX8 group combines selected halves with v_or_b32_sdwa, the
; GFX9, GFX10 and GFX11FAKE16 groups with v_perm_b32, and the GFX11TRUE16
; group selects 16-bit halves directly with v_cndmask_b16.
; NOTE(review) - these check lines look autogenerated (presumably by
; update_llc_test_checks.py; confirm against the file header), so regenerate
; them rather than editing them by hand.
define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_vselect_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v2
; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v3
; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v4
; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v5
; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v6
; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v7
; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v8
; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
; GCN-NEXT: v_and_b32_e32 v0, 1, v9
; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v1, 1, v10
; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v2, 1, v11
; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v3, 1, v12
; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v7, 1, v13
; GCN-NEXT: v_and_b32_e32 v8, 1, v14
; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v8
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
; GCN-NEXT: v_and_b32_e32 v9, 1, v15
; GCN-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v9
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[42:43]
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[40:41]
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29]
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[26:27]
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25]
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23]
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21]
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19]
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17]
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15]
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13]
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11]
; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9]
; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_vselect_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v9
; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 1, v11
; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v2, 1, v12
; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v3, 1, v13
; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v4, 1, v14
; GFX7-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v4
; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v4, 1, v15
; GFX7-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v4
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:40
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[42:43]
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[40:41]
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v29
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
; GFX7-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[28:29]
; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v28
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5
; GFX7-NEXT: v_cndmask_b32_e64 v12, v2, v1, s[26:27]
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v27
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v1, s[24:25]
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23]
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25
; GFX7-NEXT: v_cndmask_b32_e64 v9, v4, v5, s[20:21]
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_cndmask_b32_e64 v8, v3, v5, s[18:19]
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23
; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[14:15]
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13]
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[6:7]
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v19, v1, v19, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v17, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11]
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v16, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v16bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v3
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v7
; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v8
; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v9
; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30
; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29
; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28
; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21]
; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24
; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27]
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23]
; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19]
; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11]
; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7]
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e64 v9, v0, v23, s[40:41]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v1, s[42:43]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27
; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26
; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16
; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v10
; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_vselect_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX9-NEXT: v_and_b32_e32 v6, 1, v8
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6
; GFX9-NEXT: v_and_b32_e32 v6, 1, v10
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6
; GFX9-NEXT: v_and_b32_e32 v6, 1, v12
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6
; GFX9-NEXT: v_and_b32_e32 v8, 1, v13
; GFX9-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v30
; GFX9-NEXT: v_and_b32_e32 v11, 1, v11
; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
; GFX9-NEXT: v_and_b32_e32 v13, 1, v14
; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9]
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21
; GFX9-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7]
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v29
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20
; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v19
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v27
; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX9-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7]
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v28
; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX9-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18
; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v6, v10, v6, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4
; GFX9-NEXT: v_perm_b32 v4, v11, v20, s4
; GFX9-NEXT: v_perm_b32 v5, v12, v14, s4
; GFX9-NEXT: v_perm_b32 v7, v7, v13, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v16bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v30
; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21
; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20
; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v25
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v12
; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v16
; GFX10-NEXT: v_cndmask_b32_e64 v22, v30, v22, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v24
; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v26
; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v19
; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v27
; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v23
; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v51, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v30, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31
; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100
; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v16bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v30.l, v22.l, s10
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v30.h, v22.h, s11
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v29.l, v21.l, s12
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v29.h, v21.h, s9
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v28.l, v20.l, s8
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v28.h, v20.h, s7
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v27.l, v19.l, s6
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v27.h, v19.h, s5
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v26.l, v18.l, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v25.l, v17.l, s2
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v24.l, v16.l, s0
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v24.h, v16.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v25.h, v17.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v26.h, v18.h, s3
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v31.l, v23.l, s14
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v31.h, v23.h, s13
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v16bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26
; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v31
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
; Lane i of %op is %a[i] when %cond[i] is true, otherwise %b[i].
%op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
ret <16 x bfloat> %op
}
define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-LABEL: v_vselect_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_and_b32_e32 v36, 1, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192
; GCN-NEXT: v_and_b32_e32 v53, 1, v26
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84
; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88
; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92
; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100
; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112
; GCN-NEXT: v_and_b32_e32 v27, 1, v27
; GCN-NEXT: v_and_b32_e32 v28, 1, v28
; GCN-NEXT: v_and_b32_e32 v29, 1, v29
; GCN-NEXT: v_and_b32_e32 v30, 1, v30
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244
; GCN-NEXT: s_waitcnt expcnt(6)
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30
; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5]
; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236
; GCN-NEXT: s_waitcnt expcnt(5)
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232
; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228
; GCN-NEXT: s_waitcnt expcnt(3)
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128
; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(10)
; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
; GCN-NEXT: s_waitcnt vmcnt(9)
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29
; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27
; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53
; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5]
; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132
; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144
; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
; GCN-NEXT: v_and_b32_e32 v5, 1, v5
; GCN-NEXT: v_and_b32_e32 v6, 1, v6
; GCN-NEXT: v_and_b32_e32 v18, 1, v18
; GCN-NEXT: v_and_b32_e32 v22, 1, v22
; GCN-NEXT: v_and_b32_e32 v23, 1, v23
; GCN-NEXT: v_and_b32_e32 v24, 1, v24
; GCN-NEXT: v_and_b32_e32 v25, 1, v25
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
; GCN-NEXT: s_waitcnt vmcnt(13)
; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
; GCN-NEXT: s_waitcnt vmcnt(12)
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
; GCN-NEXT: s_waitcnt vmcnt(11)
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25
; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
; GCN-NEXT: v_cndmask_b32_e64 v24, v47, v51, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23
; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5]
; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208
; GCN-NEXT: v_and_b32_e32 v19, 1, v19
; GCN-NEXT: v_and_b32_e32 v20, 1, v20
; GCN-NEXT: v_and_b32_e32 v21, 1, v21
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21
; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19
; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5]
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160
; GCN-NEXT: v_and_b32_e32 v7, 1, v7
; GCN-NEXT: v_and_b32_e32 v8, 1, v8
; GCN-NEXT: v_and_b32_e32 v9, 1, v9
; GCN-NEXT: v_and_b32_e32 v10, 1, v10
; GCN-NEXT: v_and_b32_e32 v14, 1, v14
; GCN-NEXT: v_and_b32_e32 v15, 1, v15
; GCN-NEXT: v_and_b32_e32 v16, 1, v16
; GCN-NEXT: v_and_b32_e32 v17, 1, v17
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17
; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15
; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5]
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168
; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176
; GCN-NEXT: v_and_b32_e32 v11, 1, v11
; GCN-NEXT: v_and_b32_e32 v12, 1, v12
; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256
; GCN-NEXT: v_and_b32_e32 v26, 1, v26
; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
; GCN-NEXT: s_waitcnt vmcnt(13)
; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
; GCN-NEXT: s_waitcnt vmcnt(12)
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
; GCN-NEXT: s_waitcnt vmcnt(11)
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
; GCN-NEXT: s_waitcnt vmcnt(10)
; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58
; GCN-NEXT: s_waitcnt vmcnt(9)
; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: s_waitcnt vmcnt(6)
; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
; GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; Expected GFX7 output for v_vselect_v32bf16, written as FileCheck directives.
; Every directive after the label is a NEXT directive, so the matched
; instructions must appear on consecutive output lines in exactly this order.
; Review note - these lines are presumably produced by an update script
; (confirm against the header at the top of the file, not visible here);
; if so, regenerate them rather than editing by hand.
;
; Shape of the expected sequence, as read from the directives below
;  * v0..v30 are each masked with 1 and compared against 1; one more value is
;    loaded from s32 with no offset and treated the same way.
;  * For result register vK the two data operands are loaded from stack
;    offsets 4*(K+1) and 4*(K+1)+128, each is multiplied by 1.0, v_cndmask
;    picks one of them, and the result is masked with 0xffff0000.
;  * Results are produced in the order v30..v25, v31, v24, then v23 down to v0.
; GFX7-LABEL: v_vselect_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; Compare results for v24..v30 are parked in vcc and SGPR pairs up front; the
; remaining registers are only masked here and compared later, one at a time.
; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228
; GFX7-NEXT: v_and_b32_e32 v25, 1, v25
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25
; GFX7-NEXT: v_and_b32_e32 v30, 1, v30
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30
; GFX7-NEXT: v_and_b32_e32 v29, 1, v29
; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29
; GFX7-NEXT: v_and_b32_e32 v28, 1, v28
; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28
; GFX7-NEXT: v_and_b32_e32 v27, 1, v27
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27
; GFX7-NEXT: v_and_b32_e32 v26, 1, v26
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26
; GFX7-NEXT: v_and_b32_e32 v23, 1, v23
; GFX7-NEXT: v_and_b32_e32 v22, 1, v22
; GFX7-NEXT: v_and_b32_e32 v21, 1, v21
; GFX7-NEXT: v_and_b32_e32 v20, 1, v20
; GFX7-NEXT: v_and_b32_e32 v19, 1, v19
; GFX7-NEXT: v_and_b32_e32 v18, 1, v18
; GFX7-NEXT: v_and_b32_e32 v17, 1, v17
; GFX7-NEXT: v_and_b32_e32 v16, 1, v16
; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
; The dword loaded from s32 (no offset) is masked and compared here; its
; result lives in s[4:5] and is consumed by the v_cndmask that writes v31.
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13]
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120
; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15]
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17]
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112
; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11]
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108
; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9]
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104
; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7]
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5]
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; vcc still holds the compare issued for v24 at the top of the block; it is
; consumed by the next v_cndmask and only then reused for v23 and below.
; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc
; From here down to v0 every element repeats the same steps - compare into
; vcc, load the two operands (second one into v32), multiply both by 1.0,
; v_cndmask, then mask the previous result with 0xffff0000.
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23
; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v32bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v3
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v7
; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v8
; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v9
; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v16
; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v17
; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v18
; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v19
; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v20
; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v21
; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v22
; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v23
; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v24
; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
; GFX8-NEXT: v_writelane_b32 v34, s30, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v26
; GFX8-NEXT: v_writelane_b32 v34, s31, 1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v27
; GFX8-NEXT: v_writelane_b32 v34, s34, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v28
; GFX8-NEXT: v_writelane_b32 v34, s35, 3
; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v29
; GFX8-NEXT: v_writelane_b32 v34, s36, 4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v30
; GFX8-NEXT: v_writelane_b32 v34, s37, 5
; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32
; GFX8-NEXT: v_writelane_b32 v34, s38, 6
; GFX8-NEXT: v_writelane_b32 v34, s39, 7
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120
; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56
; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124
; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:60
; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128
; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v25
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v26
; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[38:39]
; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[36:37]
; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v28
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v27
; GFX8-NEXT: v_cndmask_b32_e64 v26, v33, v26, s[34:35]
; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v28, s[30:31]
; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29
; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[90:91]
; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v30, s[88:89]
; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v32
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; GFX8-NEXT: v_cndmask_b32_e64 v30, v33, v30, s[78:79]
; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[76:77]
; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v23
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[74:75]
; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[62:63]
; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[58:59]
; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16
; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[46:47]
; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[42:43]
; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12
; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29]
; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10
; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25]
; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21]
; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17]
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2
; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0
; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17
; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v32
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v30
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v28
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v26
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v24
; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v12, v31, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readlane_b32 s39, v34, 7
; GFX8-NEXT: v_readlane_b32 s38, v34, 6
; GFX8-NEXT: v_readlane_b32 s37, v34, 5
; GFX8-NEXT: v_readlane_b32 s36, v34, 4
; GFX8-NEXT: v_readlane_b32 s35, v34, 3
; GFX8-NEXT: v_readlane_b32 s34, v34, 2
; GFX8-NEXT: v_readlane_b32 s31, v34, 1
; GFX8-NEXT: v_readlane_b32 s30, v34, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
;; Expected instruction sequence for v_vselect_v32bf16 under this check prefix.
;; (review) These directives look machine-generated; prefer regenerating them
;; over hand edits -- confirm against the header of the test file.
; GFX9-LABEL: v_vselect_v32bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
;; Store v33 to the stack slot at s32+132 while exec is changed by
;; s_xor_saveexec, then put exec back from s[4:5]. v33 is used further down to
;; hold s30, s31, s34 and s35.
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
;; Turn each condition VGPR (v0 and v2..v29 here; v1 and v30 are handled
;; further down) into a lane mask: keep bit 0, compare it with 1, and write the
;; result to its own SGPR pair. v0 is reused as the scratch register throughout.
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v3
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v5
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v4
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v7
; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v6
; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v9
; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v8
; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v11
; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v10
; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v13
; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v12
; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v15
; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v14
; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v17
; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v19
; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v18
; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v21
; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v23
; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v22
; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v25
; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v24
; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v27
; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v26
; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v29
; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v28
; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
;; One more condition value is loaded as a ushort from s32+0. While that load
;; is in flight, s30, s31, s34 and s35 are copied into lanes 0-3 of v33, because
;; s[30:31] and s[34:35] are overwritten by the last two compares below. The
;; v1 condition is converted into vcc in between.
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
; GFX9-NEXT: v_writelane_b32 v33, s30, 0
; GFX9-NEXT: v_writelane_b32 v33, s31, 1
; GFX9-NEXT: v_writelane_b32 v33, s34, 2
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_writelane_b32 v33, s35, 3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v30
; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
;; 32 dword loads from s32+4 through s32+128 into v0..v29, v31 and v32. They
;; come in pairs whose offsets differ by 64 (for example v0 from +68 and v1
;; from +4); each pair feeds one group of selects below.
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
;; A single wait covers all of the loads. Each pair is then consumed by two
;; v_cndmask ops: one on the dwords as loaded, and one on both dwords shifted
;; right by 16, each with its own lane mask. Pairs are processed from the
;; highest stack offsets down to the lowest; the final select uses vcc.
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35]
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31]
; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95]
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93]
; GFX9-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91]
; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89]
; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79]
; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77]
; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75]
; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63]
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59]
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47]
; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43]
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
;; Merge the two select results of every pair into one of v0..v15 with
;; v_perm_b32, using the selector 0x5040100 held in s4.
;; (review) That selector presumably takes the low 16 bits of each source --
;; confirm against the ISA documentation.
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4
; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4
; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4
; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4
; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4
; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4
; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4
; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4
; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4
; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4
; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4
; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4
; GFX9-NEXT: v_perm_b32 v13, v26, v29, s4
; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4
; GFX9-NEXT: v_perm_b32 v15, v31, v30, s4
;; Copy lanes 3..0 of v33 back into s35, s34, s31 and s30, reload v33 from
;; s32+132 (again bracketed by s_xor_saveexec / exec restore), wait for that
;; load, and jump to the address in s[30:31].
; GFX9-NEXT: v_readlane_b32 s35, v33, 3
; GFX9-NEXT: v_readlane_b32 s34, v33, 2
; GFX9-NEXT: v_readlane_b32 s31, v33, 1
; GFX9-NEXT: v_readlane_b32 s30, v33, 0
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
;; Expected instruction sequence for v_vselect_v32bf16 under this check prefix.
;; (review) These directives look machine-generated; prefer regenerating them
;; over hand edits -- confirm against the header of the test file.
;; No VGPR spill and no v_writelane / v_readlane appear in this variant.
; GFX10-LABEL: v_vselect_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
;; Turn each condition VGPR (v0..v29) into a lane mask: keep bit 0, compare it
;; with 1, and write the result to a single SGPR (s4..s29, s40..s42) or, for
;; v1, to vcc_lo. v0 is reused as the scratch register for all but v1.
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v5
; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v4
; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v7
; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v6
; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v9
; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v8
; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v13
; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v12
; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v15
; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v14
; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v17
; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v16
; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v19
; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v18
; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v21
; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v20
; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v23
; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v22
; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v25
; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v24
; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v27
; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v26
; GFX10-NEXT: v_cmp_eq_u32_e64 s40, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v29
; GFX10-NEXT: v_cmp_eq_u32_e64 s41, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v28
; GFX10-NEXT: v_cmp_eq_u32_e64 s42, 1, v0
;; One more condition value is loaded as a ushort from s32+0 and converted the
;; same way into s43 once the load has completed; v30 then goes into s44.
; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s43, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v30
; GFX10-NEXT: v_cmp_eq_u32_e64 s44, 1, v0
;; 32 dword loads from s32+4 through s32+128 into v0..v31, preceded by
;; s_clause 0x1f. They come in pairs whose offsets differ by 64 (for example
;; v0 from +68 and v1 from +4); each pair feeds one group of selects below.
; GFX10-NEXT: s_clause 0x1f
; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:128
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
;; A single wait covers all of the loads. Each pair is then consumed by two
;; v_cndmask ops: one on the dwords as loaded, and one on both dwords shifted
;; right by 16, each with its own lane mask. Pairs are processed from the
;; highest stack offsets down to the lowest; the final select uses vcc_lo.
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e64 v32, v30, v31, s44
; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX10-NEXT: v_cndmask_b32_e64 v30, v30, v31, s43
; GFX10-NEXT: v_cndmask_b32_e64 v31, v28, v29, s42
; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v29, s41
; GFX10-NEXT: v_cndmask_b32_e64 v29, v26, v27, s40
; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GFX10-NEXT: v_cndmask_b32_e64 v26, v26, v27, s29
; GFX10-NEXT: v_cndmask_b32_e64 v27, v24, v25, s28
; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v25, s27
; GFX10-NEXT: v_cndmask_b32_e64 v25, v22, v23, s26
; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v23, s25
; GFX10-NEXT: v_cndmask_b32_e64 v23, v20, v21, s24
; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v21, s23
; GFX10-NEXT: v_cndmask_b32_e64 v21, v18, v19, s22
; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v19, s21
; GFX10-NEXT: v_cndmask_b32_e64 v19, v16, v17, s20
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v17, s19
; GFX10-NEXT: v_cndmask_b32_e64 v17, v14, v15, s18
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v15, s17
; GFX10-NEXT: v_cndmask_b32_e64 v15, v12, v13, s16
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v13, s15
; GFX10-NEXT: v_cndmask_b32_e64 v13, v10, v11, s14
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v11, s13
; GFX10-NEXT: v_cndmask_b32_e64 v11, v8, v9, s12
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s11
; GFX10-NEXT: v_cndmask_b32_e64 v9, v6, v7, s10
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v7, s9
; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v5, s8
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v5, s7
; GFX10-NEXT: v_cndmask_b32_e64 v5, v2, v3, s6
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v3, s5
; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, v1, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
;; Merge the two select results of every pair into one of v0..v15 with
;; v_perm_b32, giving the selector 0x5040100 as a literal operand. The first
;; few destinations are not written in ascending order (v1, v2, v4, v5, v0, v3).
;; (review) That selector presumably takes the low 16 bits of each source --
;; confirm against the ISA documentation.
; GFX10-NEXT: v_perm_b32 v1, v2, v5, 0x5040100
; GFX10-NEXT: v_perm_b32 v2, v4, v7, 0x5040100
; GFX10-NEXT: v_perm_b32 v4, v8, v11, 0x5040100
; GFX10-NEXT: v_perm_b32 v5, v10, v13, 0x5040100
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
; GFX10-NEXT: v_perm_b32 v3, v6, v9, 0x5040100
; GFX10-NEXT: v_perm_b32 v6, v12, v15, 0x5040100
; GFX10-NEXT: v_perm_b32 v7, v14, v17, 0x5040100
; GFX10-NEXT: v_perm_b32 v8, v16, v19, 0x5040100
; GFX10-NEXT: v_perm_b32 v9, v18, v21, 0x5040100
; GFX10-NEXT: v_perm_b32 v10, v20, v23, 0x5040100
; GFX10-NEXT: v_perm_b32 v11, v22, v25, 0x5040100
; GFX10-NEXT: v_perm_b32 v12, v24, v27, 0x5040100
; GFX10-NEXT: v_perm_b32 v13, v26, v29, 0x5040100
; GFX10-NEXT: v_perm_b32 v14, v28, v31, 0x5040100
; GFX10-NEXT: v_perm_b32 v15, v30, v32, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
;; Expected instruction sequence for v_vselect_v32bf16 under this check prefix.
;; (review) These directives look machine-generated; prefer regenerating them
;; over hand edits -- confirm against the header of the test file.
;; This variant selects 16-bit register halves (.l / .h) directly; no 16-bit
;; shifts and no v_perm_b32 appear in it.
; GFX11TRUE16-LABEL: v_vselect_v32bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
;; All stack loads are issued up front, preceded by s_clause 0x1f: one 16-bit
;; load from s32+0 into v31, then 32 dword loads from s32+4 through s32+128
;; into v32..v39, v48..v55, v64..v71 and v80..v87.
; GFX11TRUE16-NEXT: s_clause 0x1f
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128
; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64
; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60
; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120
; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56
; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116
; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52
; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112
; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48
; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108
; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44
; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104
; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40
; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100
; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36
; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96
; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32
; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92
; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28
; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88
; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24
; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:84
; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20
; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:8
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
;; Mask every condition VGPR (v0..v30) down to bit 0, each in its own register.
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
;; Compare each masked value with 1, writing the lane masks to s0..s29 and
;; (for v1) vcc_lo.
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v28
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26
;; v16 was already consumed by the compare into s16 above, so it is reused
;; here for the masked value of the 16-bit stack load (v31). Its compare is
;; deferred until s0 is free again, near the end of the block.
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31
;; Selects on register halves, writing v15 down to v0. The vmcnt value in the
;; interleaved waits steps down from 27 to 0.
;; (review) Presumably each wait is just enough for the loads read by the
;; selects that follow it -- confirm against the load order above.
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v35.l, v36.l, s26
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v34.l, v37.l, s27
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v34.h, v37.h, s28
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v38.l, v39.l, s29
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v38.h, v39.h, s25
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v48.l, v49.l, s24
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v48.h, v49.h, s23
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v50.l, v51.l, s22
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v50.h, v51.h, s21
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v52.l, v53.l, s20
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v52.h, v53.h, s19
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v54.l, v55.l, s18
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v54.h, v55.h, s17
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v64.l, v65.l, s16
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v64.h, v65.h, s15
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v66.l, v67.l, s14
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v66.h, v67.h, s13
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v68.l, v69.l, s12
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v68.h, v69.h, s11
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v70.l, v71.l, s10
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.h, v71.h, s9
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v80.l, v81.l, s8
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v80.h, v81.h, s7
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v83.l, v84.l, s6
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v82.l, v85.l, s4
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v33.l, v86.l, s2
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v87.l, s0
;; s0 has just been read for v0.l, so it is overwritten here with the mask
;; for the stack-passed condition (held in v16) and used for v15.h below.
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v33.h, v86.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v82.h, v85.h, s3
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v83.h, v84.h, s5
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v35.h, v36.h, s0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: s_clause 0x1f
; GFX11FAKE16-NEXT: scratch_load_u16 v31, off, s32
; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:128
; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:64
; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:60
; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
; GFX11FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
; GFX11FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
; GFX11FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:92
; GFX11FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:28
; GFX11FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:88
; GFX11FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:24
; GFX11FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:84
; GFX11FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:20
; GFX11FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
; GFX11FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:16
; GFX11FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
; GFX11FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:12
; GFX11FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:72
; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:8
; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:68
; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30
; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28
; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26
; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24
; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20
; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18
; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50
; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52
; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(16)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54
; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(8)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(6)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
ret <32 x bfloat> %op
}
; Declarations of the llvm.fma intrinsic overloads for a scalar bfloat and for
; 2-, 3- and 4-element bfloat vectors. Each takes three operands of the same
; type and returns that type.
declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
; Scalar bfloat fused multiply-add test. The IR body is a single call to
; llvm.fma.bf16 on the three arguments, and the result is returned.
;
; What the expected code below shows for each group of targets
;   * GCN and GFX7 checks multiply each input by 1.0, mask each with
;     0xffff0000, issue one v_fma_f32, and mask the result with 0xffff0000.
;   * GFX8 and later checks shift each input left by 16, do the FMA in f32
;     (v_fma_f32 on GFX8 and GFX9, v_fmac_f32 on GFX10 and GFX11), then run a
;     v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask
;     sequence before taking the upper 16 bits of the result.
;     (This looks like round-to-nearest-even narrowing with NaN quieting;
;     confirm against the backend lowering before relying on that.)
;   * The two GFX11 variants differ only in the last instruction, where the
;     TRUE16 checks use v_mov_b16 v0.l, v0.h and the FAKE16 checks use
;     v_lshrrev_b32 by 16.
;
; The check lines appear to be machine generated (presumably by
; update_llc_test_checks.py; confirm from the file header), so regenerate
; them rather than editing them by hand.
define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GCN-LABEL: v_fma_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fma_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
ret bfloat %op
}
; Two-element bfloat vector fused multiply-add test. The IR body is a single
; call to llvm.fma.v2bf16 on the three arguments, and the result is returned.
;
; What the expected code below shows for each group of targets
;   * GCN and GFX7 checks keep every element in its own VGPR (v0 to v5),
;     multiply each by 1.0, mask each with 0xffff0000, and issue two
;     independent v_fma_f32 instructions whose results are masked again.
;   * GFX8 and later checks read both elements of each operand from a single
;     VGPR (v0, v1, v2). One element is obtained with a left shift by 16 and
;     the other with an and-mask of 0xffff0000. Two f32 FMAs follow, each with
;     the same v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 /
;     v_cndmask sequence used by the scalar test.
;     (That sequence looks like round-to-nearest-even narrowing with NaN
;     quieting; confirm against the backend lowering.)
;   * The two results are merged back into v0 with v_alignbit_b32 on GFX8,
;     v_perm_b32 with selector 0x7060302 on GFX9, GFX10 and GFX11 FAKE16, and
;     v_mov_b16 plus v_bfi_b32 on GFX11 TRUE16.
;   * Both GFX11 variants issue the two FMAs as one v_dual_fmac_f32 pair.
;
; The check lines appear to be machine generated (presumably by
; update_llc_test_checks.py; confirm from the file header), so regenerate
; them rather than editing them by hand.
define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_fma_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_fma_f32 v1, v1, v3, v5
; GCN-NEXT: v_fma_f32 v0, v0, v2, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_fma_f32 v0, v0, v2, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fma_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v0, v3, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
}
define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
; GCN-LABEL: v_fma_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
; GCN-NEXT: v_fma_f32 v1, v1, v4, v7
; GCN-NEXT: v_fma_f32 v0, v0, v3, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_fma_f32 v1, v1, v4, v5
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_fma_f32 v0, v0, v3, v4
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fma_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_fmac_f32 v4, v0, v2 :: v_dual_fmac_f32 v5, v1, v3
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v6, v8, v7
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v6, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_add3_u32 v2, v3, v5, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
}
; v_fma_v4bf16: codegen test for the llvm.fma intrinsic on <4 x bfloat>.
; The IR body is a single call to @llvm.fma.v4bf16 on the three arguments,
; whose result is returned unchanged.
;
; Summary of the expectations recorded below, per check prefix:
;  * GCN and GFX7 expectations: each of the twelve inputs is multiplied by 1.0
;    and masked with 0xffff0000, four v_fma_f32 are issued, and each result is
;    masked with 0xffff0000 again.
;  * GFX8 and GFX9 expectations: elements are extracted with a 16-bit left
;    shift or a 0xffff0000 mask, combined with v_fma_f32, then passed through a
;    v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32
;    sequence (presumably the f32-to-bf16 rounding with NaN handling -- confirm
;    against the backend lowering). Results are packed with v_alignbit_b32 on
;    GFX8 and v_perm_b32 on GFX9.
;  * GFX10 and both GFX11 variants: v_fmac_f32 is expected instead of
;    v_fma_f32; the GFX11 variants additionally expect s_delay_alu hints, and
;    GFX11TRUE16 packs with v_mov_b16 and v_bfi_b32 rather than v_perm_b32.
;
; The check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py -- confirm against the file header), so
; regenerate them rather than editing by hand.
define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
; GCN-LABEL: v_fma_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_fma_f32 v3, v3, v7, v11
; GCN-NEXT: v_fma_f32 v2, v2, v6, v10
; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
; GCN-NEXT: v_fma_f32 v0, v0, v4, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_fma_f32 v1, v1, v5, v6
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_fma_f32 v0, v0, v4, v5
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1
; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6
; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fma_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v7, v10, v8
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v1, v0, v2
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v5, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v0, v7, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v7, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v4, v9, v6, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_add3_u32 v5, v9, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
; GFX11FAKE16-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v9, v8
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11FAKE16-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v7
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
}
; Declarations of the llvm.fmuladd intrinsic overloads for scalar bfloat and
; for 2-, 3- and 4-element bfloat vectors. The scalar and v2 overloads are
; called by the test functions that follow.
declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
; v_fmuladd_bf16: codegen test for the llvm.fmuladd intrinsic on scalar bfloat.
; The IR body is a single call to @llvm.fmuladd.bf16 on the three arguments,
; whose result is returned unchanged.
;
; Summary of the expectations recorded below:
;  * Every check prefix expects an unfused lowering: a v_mul_f32 of %a and %b
;    followed by a separate v_add_f32 with %c, with no v_fma_f32 or
;    v_fmac_f32.
;  * GCN and GFX7 expectations: inputs are multiplied by 1.0 and masked with
;    0xffff0000; the product and the sum are each masked with 0xffff0000.
;  * GFX8 and newer: inputs are shifted left by 16; the product and the sum
;    each go through a v_bfe_u32 / add 0x7fff / v_or_b32 0x400000 /
;    v_cmp_u_f32 / v_cndmask_b32 sequence (presumably the f32-to-bf16 rounding
;    with NaN handling -- confirm against the backend lowering). The final
;    value is moved to the low 16 bits with v_lshrrev_b32, or with v_mov_b16
;    on GFX11TRUE16.
;
; The check lines look autogenerated (presumably by
; utils/update_llc_test_checks.py -- confirm against the file header), so
; regenerate them rather than editing by hand.
define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GCN-LABEL: v_fmuladd_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
ret bfloat %op
}
define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_fmuladd_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v1, v1, v5
; GCN-NEXT: v_add_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
}
; v_fmuladd_v3bf16: passes three <3 x bfloat> arguments to
; @llvm.fmuladd.v3bf16 and returns the result. The assertions below pin the
; expected assembly for seven check prefixes (GCN, GFX7, GFX8, GFX9, GFX10,
; GFX11TRUE16, GFX11FAKE16); which llc invocation each prefix belongs to is set
; by RUN lines that are outside this section of the file.
;
; What the expectations show
;  * No fused multiply-add instruction appears for any prefix; every lane's
;    product comes from an f32 multiply and is combined by a separate f32 add
;    (some of them in v_dual_* form on the GFX11 prefixes).
;  * GCN and GFX7 mask the product with 0xffff0000 before the add, and mask
;    the sum again before returning.
;  * GFX8 and newer run the product, and later the sum, through a v_bfe_u32 /
;    add of 0x7fff / v_or_b32 0x400000 / v_cmp_u_f32 / v_cndmask_b32 sequence
;    (presumably the f32-to-bf16 rounding step; confirm against the lowering).
;  * NOTE(review) - the GFX10 and GFX11FAKE16 sequences end in a
;    v_alignbit_b32 that reads an SGPR (s4 / s0) which is never written in the
;    listed sequence; presumably it stands for the undefined fourth half-lane
;    of the result. Confirm before relying on it.
;
; NOTE(review) - these check lines look tool-generated (presumably by
; utils/update_llc_test_checks.py; confirm against the file header), so
; regenerate them rather than editing them by hand.
define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
; GCN-LABEL: v_fmuladd_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v2, v2, v8
; GCN-NEXT: v_add_f32_e32 v1, v1, v7
; GCN-NEXT: v_add_f32_e32 v0, v0, v6
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
; GFX7-NEXT: v_add_f32_e32 v1, v1, v4
; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11TRUE16-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX11TRUE16-NEXT: v_add3_u32 v8, v9, v1, 0x7fff
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v3
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v1, v8, v6 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v5
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v7, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
}
define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
; GCN-LABEL: v_fmuladd_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_add_f32_e32 v3, v3, v11
; GCN-NEXT: v_add_f32_e32 v2, v2, v10
; GCN-NEXT: v_add_f32_e32 v1, v1, v9
; GCN-NEXT: v_add_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
; GFX7-NEXT: v_add_f32_e32 v3, v3, v7
; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6
; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GFX8-NEXT: v_add_f32_e32 v6, v6, v7
; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v3, 16, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX11TRUE16-NEXT: v_dual_mul_f32 v3, v9, v7 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11TRUE16-NEXT: v_add3_u32 v9, v10, v6, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v9, v11 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v6, v8
; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v7, v6 :: v_dual_lshlrev_b32 v6, 16, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v7, v2, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
; GFX11TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v10, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v5, v9, v3, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v3
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX11FAKE16-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11FAKE16-NEXT: v_mul_f32_e32 v7, v9, v7
; GFX11FAKE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX11FAKE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v4
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add_f32_e32 v3, v3, v8
; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11FAKE16-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
; GFX11FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
}