Files
clang-p2996/llvm/test/CodeGen/AMDGPU/load-hi16.ll
Ruiling Song 9119d9bfce AMDGPU/SIInsertWait: Skip dummy tied source
For D16 memory load instructions, the hardware usually only writes to half
of the 32-bit register, but we define the destination register as a
32-bit register for the MachineIR instruction. Without the extra tied
source register, the LLVM framework would consider a previous write to the
other half of the register to be dead. This is because, with a 32-bit register
as the destination register, LLVM assumes the instruction always
overwrites the whole 32-bit register. By adding the extra tied source,
LLVM sees the register as being read, so the previous write to the
register is not dead. However, this dummy tied source introduces an
unnecessary read-after-write dependency. The change here is to bypass the
tied source when it can be skipped, thus avoiding an unnecessary s_waitcnt.

Reviewed by: foad

Differential Revision: https://reviews.llvm.org/D140537
2023-01-11 09:59:35 +08:00

2709 lines
116 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s
; Loads two i16 values through an addrspace(3) pointer (%in and %in + 8
; elements, which shows up as "offset:16" in the ds_read checks) and packs them
; into a <2 x i16> with the first load in element 0 and the second in element 1.
; The low value has a second use: it is also stored to the null addrspace(3)
; pointer. Expected output: both gfx900 runs (with and without
; +enable-flat-scratch) copy the low value into a fresh register and load the
; high half on top of it with ds_read_u16_d16_hi; the gfx906 run uses two plain
; ds_read_u16 plus v_perm_b32, and the fiji run uses two plain ds_read_u16 plus
; a shift and an SDWA v_or_b32.
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %in) #0 {
; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v2, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, v2
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: ds_write_b16 v0, v2
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v1, v0
; GFX906-NEXT: ds_read_u16 v0, v0 offset:16
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_mov_b32_e32 v2, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: ds_write_b16 v2, v1
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v2, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, v2
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; GFX900-FLATSCR-NEXT: ds_write_b16 v0, v2
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
%load.lo = load i16, ptr addrspace(3) %in
%load.hi = load i16, ptr addrspace(3) %gep
store i16 %load.lo, ptr addrspace(3) null
%build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
ret <2 x i16> %build1
}
; Loads two i16 values through an addrspace(3) pointer (%in and %in + 8
; elements) and packs them into a <2 x i16> (first load in element 0, second in
; element 1). Here the high value has the extra use: it is also stored to the
; null addrspace(3) pointer. Expected output: none of the four runs uses a
; d16_hi load; each issues two plain ds_read_u16 and packs afterwards
; (v_perm_b32 on the gfx900 and gfx906 runs, shift + v_or_b32 on the fiji run).
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias %in) #0 {
; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v1, v0 offset:16
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v2, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v1, v0 offset:16
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_mov_b32_e32 v2, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: ds_write_b16 v2, v1
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0 offset:16
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v1, v0 offset:16
; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0
; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v1
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
%load.lo = load i16, ptr addrspace(3) %in
%load.hi = load i16, ptr addrspace(3) %gep
store i16 %load.hi, ptr addrspace(3) null
%build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
ret <2 x i16> %build1
}
; Loads two i16 values through an addrspace(3) pointer (%in and %in + 8
; elements) and packs them into a <2 x i16>. Both loaded values have an extra
; use: the low one is stored to %out0 and the high one to %out1. Expected
; output: every run issues two plain ds_read_u16, two ds_write_b16, and then
; packs the halves (v_perm_b32 on the gfx900 and gfx906 runs, shift + SDWA
; v_or_b32 on the fiji run); no d16_hi load is expected.
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out0, ptr addrspace(3) noalias %out1) #0 {
; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v3, v0
; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v1, v3
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v3, v0
; GFX906-NEXT: ds_read_u16 v0, v0 offset:16
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: ds_write_b16 v1, v3
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: ds_write_b16 v2, v0
; GFX906-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v3, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v1, v3
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v3, v0
; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 offset:16
; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-FLATSCR-NEXT: ds_write_b16 v1, v3
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0
; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
%load.lo = load i16, ptr addrspace(3) %in
%load.hi = load i16, ptr addrspace(3) %gep
store i16 %load.lo, ptr addrspace(3) %out0
store i16 %load.hi, ptr addrspace(3) %out1
%build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
ret <2 x i16> %build1
}
; Loads one i16 through an addrspace(3) pointer and inserts it into element 1
; of an otherwise undef <2 x i16>, which is returned. Expected output: both
; gfx900 runs load straight into the high half of v0 with ds_read_u16_d16_hi;
; the gfx906 and fiji runs use a plain ds_read_u16 followed by a 16-bit left
; shift.
define <2 x i16> @load_local_hi_v2i16_undeflo(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_hi_v2i16_undeflo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_undeflo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_undeflo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_undeflo:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i16, ptr addrspace(3) %in
%build = insertelement <2 x i16> undef, i16 %load, i32 1
ret <2 x i16> %build
}
; Builds a <2 x i16> whose element 0 is the i16 argument %reg and whose
; element 1 is an i16 loaded through the addrspace(3) pointer %in, then returns
; it. Expected output: both gfx900 runs load into the high half of the register
; holding %reg (v1) with ds_read_u16_d16_hi and copy the result to v0; the
; gfx906 run uses ds_read_u16 + v_perm_b32, and the fiji run uses ds_read_u16 +
; shift + SDWA v_or_b32.
define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_hi_v2i16_reglo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_reglo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_reglo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i16, ptr addrspace(3) %in
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
ret <2 x i16> %build1
}
; Builds a <2 x i16> with the i16 argument %reg in element 0 and an i16 loaded
; through the addrspace(3) pointer %in in element 1, and stores the vector to an
; undef addrspace(1) pointer instead of returning it. Expected output: both
; gfx900 runs use ds_read_u16_d16_hi on the register holding %reg and store that
; register directly; the gfx906 run uses ds_read_u16 + v_perm_b32, and the fiji
; run uses ds_read_u16 + shift + SDWA v_or_b32 and a flat_store_dword.
define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i16, ptr addrspace(3) %in
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Loads one i16 through an addrspace(3) pointer and inserts it into element 1
; of a zeroinitializer <2 x i16>, so the low element is a constant zero.
; Expected output: both gfx900 runs zero a register with v_mov_b32 and then load
; into its high half with ds_read_u16_d16_hi; the gfx906 and fiji runs use a
; plain ds_read_u16 followed by a 16-bit left shift.
define <2 x i16> @load_local_hi_v2i16_zerolo(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_hi_v2i16_zerolo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_zerolo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_zerolo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_zerolo:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i16, ptr addrspace(3) %in
%build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
ret <2 x i16> %build
}
; FIXME: Remove m0 initialization
; Scalar form of "loaded i16 in the high half, zero in the low half": the i16
; loaded through the addrspace(3) pointer is zero-extended to i32 and shifted
; left by 16, and the i32 is returned. Expected output: all four runs use a
; plain ds_read_u16 followed by v_lshlrev_b32 by 16; no d16_hi load is expected
; here. The fiji run additionally initializes m0 before the load.
define i32 @load_local_hi_v2i16_zerolo_shift(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_hi_v2i16_zerolo_shift:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_zerolo_shift:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_zerolo_shift:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_zerolo_shift:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i16, ptr addrspace(3) %in
%zext = zext i16 %load to i32
%shift = shl i32 %zext, 16
ret i32 %shift
}
; Half-typed variant: builds a <2 x half> with the half argument %reg in
; element 0 and a half loaded through the addrspace(3) pointer %in in element 1,
; then stores the vector to an undef addrspace(1) pointer. Expected output:
; both gfx900 runs use ds_read_u16_d16_hi on the register holding %reg; the
; gfx906 run uses ds_read_u16 + v_perm_b32, and the fiji run uses ds_read_u16 +
; shift + SDWA v_or_b32 and a flat_store_dword.
define void @load_local_hi_v2f16_reglo_vreg(ptr addrspace(3) %in, half %reg) #0 {
; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load half, ptr addrspace(3) %in
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %load, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Loads an i8 through the addrspace(3) pointer %in, zero-extends it to i16 and
; places it in element 1 of a <2 x i16> whose element 0 is the argument %reg;
; the vector is stored to an undef addrspace(1) pointer. Expected output: both
; gfx900 runs fold the load and zero-extension into ds_read_u8_d16_hi; the
; gfx906 run uses ds_read_u8 + v_perm_b32, and the fiji run uses ds_read_u8 +
; shift + SDWA v_or_b32.
define void @load_local_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u8 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i8, ptr addrspace(3) %in
%ext = zext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Loads an i8 through the addrspace(3) pointer %in, sign-extends it to i16 and
; places it in element 1 of a <2 x i16> whose element 0 is the argument %reg;
; the vector is stored to an undef addrspace(1) pointer. Expected output: both
; gfx900 runs fold the load and sign-extension into ds_read_i8_d16_hi; the
; gfx906 run uses ds_read_i8 + v_perm_b32, and the fiji run uses ds_read_i8 +
; shift + SDWA v_or_b32.
define void @load_local_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(3) %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_i8 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_i8 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i8, ptr addrspace(3) %in
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Loads an i8 through the addrspace(3) pointer %in, zero-extends it to i16,
; bitcasts that to half and places it in element 1 of a <2 x half> whose
; element 0 is the argument %reg; the vector is stored to an undef
; addrspace(1) pointer. Expected output: both gfx900 runs use
; ds_read_u8_d16_hi; the gfx906 run uses ds_read_u8 + v_perm_b32, and the fiji
; run uses ds_read_u8 + shift + SDWA v_or_b32.
define void @load_local_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(3) %in, half %reg) #0 {
; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u8 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i8, ptr addrspace(3) %in
%ext = zext i8 %load to i16
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Loads an i8 through the addrspace(3) pointer %in, sign-extends it to i16,
; bitcasts that to half and places it in element 1 of a <2 x half> whose
; element 0 is the argument %reg; the vector is stored to an undef
; addrspace(1) pointer. Expected output: both gfx900 runs use
; ds_read_i8_d16_hi; the gfx906 run uses ds_read_i8 + v_perm_b32, and the fiji
; run uses ds_read_i8 + shift + SDWA v_or_b32.
define void @load_local_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(3) %in, half %reg) #0 {
; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_i8 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_i8 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load i8, ptr addrspace(3) %in
%ext = sext i8 %load to i16
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Loads an i16 through an addrspace(1) pointer at %in - 2047 elements (byte
; offset -4094 in the checks) and places it in element 1 of a <2 x i16> whose
; element 0 is the argument %reg; the vector is stored to an undef
; addrspace(1) pointer. Expected output: both gfx900 runs use
; global_load_short_d16_hi with the offset folded into the instruction; the
; gfx906 run uses global_load_ushort with the same folded offset plus
; v_perm_b32; the fiji run computes the address with a v_add_u32 / v_addc_u32
; pair, then uses flat_load_ushort + shift + SDWA v_or_b32.
define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0 {
; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 -2047
%load = load i16, ptr addrspace(1) %gep
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Half-typed variant: loads a half through an addrspace(1) pointer at
; %in - 2047 elements (byte offset -4094 in the checks) and places it in
; element 1 of a <2 x half> whose element 0 is the argument %reg; the vector is
; stored to an undef addrspace(1) pointer. Expected output: both gfx900 runs
; use global_load_short_d16_hi with the offset folded in; the gfx906 run uses
; global_load_ushort with the same folded offset plus v_perm_b32; the fiji run
; computes the address with a v_add_u32 / v_addc_u32 pair, then uses
; flat_load_ushort + shift + SDWA v_or_b32.
define void @load_global_hi_v2f16_reglo_vreg(ptr addrspace(1) %in, half %reg) #0 {
; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 -2047
%load = load half, ptr addrspace(1) %gep
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %load, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Global (addrspace 1) i8 load at %in - 4095, zero-extended to i16 and placed in
; lane 1 of a <2 x i16> whose lane 0 is %reg; the vector is stored to an undef
; global address.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 (with and without flat-scratch) a single
;    global_load_ubyte_d16_hi into v2 with the -4095 folded into the immediate
;    offset, and v2 is stored unchanged;
;  - on gfx906 a plain global_load_ubyte followed by v_perm_b32 with v2;
;  - on fiji the offset is applied with a 64-bit add (0xfffff001 / -1 with
;    carry), then flat_load_ubyte, a shift left by 16 and v_or_b32_sdwa.
; NOTE(review): v2 presumably holds %reg on entry (pointer in v[0:1]) - inferred
; from the expected code, confirm against the calling convention.
define void @load_global_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i16 %reg) #0 {
; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Byte address 4095 below %in (the GEP indexes i8 elements).
%gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
%load = load i8, ptr addrspace(1) %gep
%ext = zext i8 %load to i16
; Lane 0 comes from the register argument, lane 1 from the extended load.
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Global (addrspace 1) i8 load at %in - 4095, sign-extended to i16 and placed in
; lane 1 of a <2 x i16> whose lane 0 is %reg; the vector is stored to an undef
; global address.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 (with and without flat-scratch) a single
;    global_load_sbyte_d16_hi into v2 with the -4095 folded into the immediate
;    offset, and v2 is stored unchanged;
;  - on gfx906 a plain global_load_sbyte followed by v_perm_b32 with v2;
;  - on fiji the offset is applied with a 64-bit add (0xfffff001 / -1 with
;    carry), then flat_load_sbyte, a shift left by 16 and v_or_b32_sdwa.
define void @load_global_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i16 %reg) #0 {
; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Byte address 4095 below %in (the GEP indexes i8 elements).
%gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
%load = load i8, ptr addrspace(1) %gep
; Sign extension is what selects the sbyte load forms in the assertions.
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Half-typed variant of the sign-extending global byte load: the i8 loaded from
; %in - 4095 is sign-extended to i16, bitcast to half and placed in lane 1 of a
; <2 x half> whose lane 0 is %reg; the vector is stored to an undef global
; address.
; The expected instructions are the same as for the <2 x i16> test
; load_global_hi_v2i16_reglo_vreg_sexti8 (global_load_sbyte_d16_hi on gfx900,
; global_load_sbyte + v_perm_b32 on gfx906, add + flat_load_sbyte + shift +
; v_or_b32_sdwa on fiji), i.e. the bitcast costs no extra instruction.
define void @load_global_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, half %reg) #0 {
; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Byte address 4095 below %in (the GEP indexes i8 elements).
%gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
%load = load i8, ptr addrspace(1) %gep
%ext = sext i8 %load to i16
; Reinterpret the extended integer bits as a half; no value conversion.
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Half-typed variant of the zero-extending global byte load: the i8 loaded from
; %in - 4095 is zero-extended to i16, bitcast to half and placed in lane 1 of a
; <2 x half> whose lane 0 is %reg; the vector is stored to an undef global
; address.
; The expected instructions are the same as for the <2 x i16> test
; load_global_hi_v2i16_reglo_vreg_zexti8 (global_load_ubyte_d16_hi on gfx900,
; global_load_ubyte + v_perm_b32 on gfx906, add + flat_load_ubyte + shift +
; v_or_b32_sdwa on fiji), i.e. the bitcast costs no extra instruction.
define void @load_global_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, half %reg) #0 {
; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Byte address 4095 below %in (the GEP indexes i8 elements).
%gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
%load = load i8, ptr addrspace(1) %gep
%ext = zext i8 %load to i16
; Reinterpret the extended integer bits as a half; no value conversion.
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; i16 load through a generic (unqualified address space) pointer, with no
; offset, placed in lane 1 of a <2 x i16> whose lane 0 is %reg; the vector is
; stored to an undef global address.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 (with and without flat-scratch) a single flat_load_short_d16_hi
;    into v2, and v2 is stored unchanged;
;  - on gfx906 flat_load_ushort followed by v_perm_b32 with v2;
;  - on fiji flat_load_ushort, a shift left by 16 and v_or_b32_sdwa.
; In every configuration the wait after the flat load covers both vmcnt and
; lgkmcnt, unlike the global-pointer tests in this file which wait on vmcnt only.
define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ushort v0, v[0:1]
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Direct load from %in; there is no GEP, so no offset to fold.
%load = load i16, ptr %in
; Lane 0 comes from the register argument, lane 1 from the load.
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Half load through a generic (unqualified address space) pointer, with no
; offset, placed in lane 1 of a <2 x half> whose lane 0 is %reg; the vector is
; stored to an undef global address.
; The expected instructions are the same as for the integer test
; load_flat_hi_v2i16_reglo_vreg: flat_load_short_d16_hi into v2 on gfx900,
; flat_load_ushort + v_perm_b32 on gfx906, and flat_load_ushort + shift +
; v_or_b32_sdwa on fiji. The wait after the flat load covers both vmcnt and
; lgkmcnt.
define void @load_flat_hi_v2f16_reglo_vreg(ptr %in, half %reg) #0 {
; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ushort v0, v[0:1]
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Direct load from %in; there is no GEP, so no offset to fold.
%load = load half, ptr %in
; Lane 0 comes from the register argument, lane 1 from the load.
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %load, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; i8 load through a generic pointer, zero-extended to i16 and placed in lane 1
; of a <2 x i16> whose lane 0 is %reg; the vector is stored to an undef global
; address.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 (with and without flat-scratch) a single flat_load_ubyte_d16_hi
;    into v2, and v2 is stored unchanged;
;  - on gfx906 flat_load_ubyte followed by v_perm_b32 with v2;
;  - on fiji flat_load_ubyte, a shift left by 16 and v_or_b32_sdwa.
; The wait after the flat load covers both vmcnt and lgkmcnt.
define void @load_flat_hi_v2i16_reglo_vreg_zexti8(ptr %in, i16 %reg) #0 {
; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ubyte v0, v[0:1]
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Direct byte load from %in; there is no GEP, so no offset to fold.
%load = load i8, ptr %in
%ext = zext i8 %load to i16
; Lane 0 comes from the register argument, lane 1 from the extended load.
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; i8 load through a generic pointer, sign-extended to i16 and placed in lane 1
; of a <2 x i16> whose lane 0 is %reg; the vector is stored to an undef global
; address.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 (with and without flat-scratch) a single flat_load_sbyte_d16_hi
;    into v2, and v2 is stored unchanged;
;  - on gfx906 flat_load_sbyte followed by v_perm_b32 with v2;
;  - on fiji flat_load_sbyte, a shift left by 16 and v_or_b32_sdwa.
; The wait after the flat load covers both vmcnt and lgkmcnt.
define void @load_flat_hi_v2i16_reglo_vreg_sexti8(ptr %in, i16 %reg) #0 {
; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_sbyte v0, v[0:1]
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Direct byte load from %in; there is no GEP, so no offset to fold.
%load = load i8, ptr %in
; Sign extension is what selects the sbyte load forms in the assertions.
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Half-typed variant of the zero-extending generic-pointer byte load: the i8
; loaded from %in is zero-extended to i16, bitcast to half and placed in lane 1
; of a <2 x half> whose lane 0 is %reg; the vector is stored to an undef global
; address.
; The expected instructions are the same as for the <2 x i16> test
; load_flat_hi_v2i16_reglo_vreg_zexti8 (flat_load_ubyte_d16_hi on gfx900,
; flat_load_ubyte + v_perm_b32 on gfx906, flat_load_ubyte + shift +
; v_or_b32_sdwa on fiji), i.e. the bitcast costs no extra instruction.
define void @load_flat_hi_v2f16_reglo_vreg_zexti8(ptr %in, half %reg) #0 {
; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ubyte v0, v[0:1]
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Direct byte load from %in; there is no GEP, so no offset to fold.
%load = load i8, ptr %in
%ext = zext i8 %load to i16
; Reinterpret the extended integer bits as a half; no value conversion.
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Half-typed variant of the sign-extending generic-pointer byte load: the i8
; loaded from %in is sign-extended to i16, bitcast to half and placed in lane 1
; of a <2 x half> whose lane 0 is %reg; the vector is stored to an undef global
; address.
; The expected instructions are the same as for the <2 x i16> test
; load_flat_hi_v2i16_reglo_vreg_sexti8 (flat_load_sbyte_d16_hi on gfx900,
; flat_load_sbyte + v_perm_b32 on gfx906, flat_load_sbyte + shift +
; v_or_b32_sdwa on fiji), i.e. the bitcast costs no extra instruction.
define void @load_flat_hi_v2f16_reglo_vreg_sexti8(ptr %in, half %reg) #0 {
; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_sbyte v0, v[0:1]
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Direct byte load from %in; there is no GEP, so no offset to fold.
%load = load i8, ptr %in
%ext = sext i8 %load to i16
; Reinterpret the extended integer bits as a half; no value conversion.
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Private (addrspace 5) i16 load at element index 2047 of the byval argument
; %in, i.e. byte offset 4094, placed in lane 1 of a <2 x i16> whose lane 0 is
; %reg; the vector is stored to an undef global address.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 a single buffer_load_short_d16_hi into v0 addressed through
;    s[0:3] and s32 with immediate offset 4094, and v0 is stored unchanged;
;  - on gfx900 with flat-scratch the same shape using scratch_load_short_d16_hi
;    relative to s32;
;  - on gfx906 buffer_load_ushort into v1 followed by v_perm_b32 with v0;
;  - on fiji buffer_load_ushort, a shift left by 16 and v_or_b32_sdwa.
; NOTE(review): s32 is presumably the stack pointer and v0 presumably holds
; %reg because the byval pointer occupies no VGPR - inferred from the expected
; code, confirm against the calling convention.
define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:4094
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; 2047 i16 elements past %in = 4094 bytes, the offset seen in the assertions.
%gep = getelementptr inbounds i16, ptr addrspace(5) %in, i64 2047
%load = load i16, ptr addrspace(5) %gep
; Lane 0 comes from the register argument, lane 1 from the load.
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Private (addrspace 5) half load at element index 2047 of the byval argument
; %in, i.e. byte offset 4094, placed in lane 1 of a <2 x half> whose lane 0 is
; %reg; the vector is stored to an undef global address.
; The expected instructions are the same as for the integer test
; load_private_hi_v2i16_reglo_vreg: buffer_load_short_d16_hi into v0 on gfx900,
; scratch_load_short_d16_hi into v0 with flat-scratch, buffer_load_ushort +
; v_perm_b32 on gfx906, and buffer_load_ushort + shift + v_or_b32_sdwa on fiji,
; all with immediate offset 4094.
define void @load_private_hi_v2f16_reglo_vreg(ptr addrspace(5) byval(half) %in, half %reg) #0 {
; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:4094
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; 2047 half elements past %in = 4094 bytes, the offset seen in the assertions.
%gep = getelementptr inbounds half, ptr addrspace(5) %in, i64 2047
%load = load half, ptr addrspace(5) %gep
; Lane 0 comes from the register argument, lane 1 from the load.
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %load, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Volatile private (addrspace 5) i16 load from the constant address 4094
; (inttoptr), placed in lane 1 of a <2 x i16> whose lane 0 is %reg; the vector
; is stored to an undef global address. The byval argument %in is not
; referenced by the body.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 buffer_load_short_d16_hi into v0 with a zero soffset (not s32),
;    immediate offset 4094 and the glc bit, and v0 is stored unchanged;
;  - on gfx900 with flat-scratch the address is materialized with
;    s_movk_i32 s0, 0xffe (= 4094) and used by scratch_load_short_d16_hi ... glc;
;  - on gfx906 buffer_load_ushort ... glc followed by v_perm_b32 with v0; here
;    the wait precedes the s_mov_b32 of the permute selector;
;  - on fiji buffer_load_ushort ... glc, a shift left by 16 and v_or_b32_sdwa.
; NOTE(review): the glc bit presumably comes from the load being volatile -
; confirm.
define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Absolute private address 4094; no base pointer is involved.
%load = load volatile i16, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
; Lane 0 comes from the register argument, lane 1 from the load.
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Volatile private (addrspace 5) half load from the constant address 4094
; (inttoptr), placed in lane 1 of a <2 x half> whose lane 0 is %reg; the vector
; is stored to an undef global address. The argument %in is not referenced by
; the body.
; Unlike load_private_hi_v2i16_reglo_vreg_nooff, %in carries no byval attribute
; here, and the expected code uses v1 (not v0) as the register that is merged
; with the loaded value.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 buffer_load_short_d16_hi into v1 with a zero soffset, immediate
;    offset 4094 and the glc bit, and v1 is stored unchanged;
;  - on gfx900 with flat-scratch the address is materialized with
;    s_movk_i32 s0, 0xffe (= 4094) and used by scratch_load_short_d16_hi ... glc;
;  - on gfx906 buffer_load_ushort ... glc into v0 followed by v_perm_b32 with v1;
;  - on fiji buffer_load_ushort ... glc, a shift left by 16 and v_or_b32_sdwa.
; NOTE(review): %reg presumably arrives in v1 because the non-byval pointer
; takes v0 - inferred from the expected code, confirm against the calling
; convention.
define void @load_private_hi_v2f16_reglo_vreg_nooff(ptr addrspace(5) %in, half %reg) #0 {
; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Absolute private address 4094; no base pointer is involved.
%load = load volatile half, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
; Lane 0 comes from the register argument, lane 1 from the load.
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %load, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Private (addrspace 5) i8 load at byte offset 4095 of the byval argument %in,
; zero-extended to i16 and placed in lane 1 of a <2 x i16> whose lane 0 is
; %reg; the vector is stored to an undef global address.
; Expected lowering, as pinned by the assertions below:
;  - on gfx900 a single buffer_load_ubyte_d16_hi into v0 addressed through
;    s[0:3] and s32 with immediate offset 4095, and v0 is stored unchanged;
;  - on gfx900 with flat-scratch the same shape using scratch_load_ubyte_d16_hi
;    relative to s32;
;  - on gfx906 buffer_load_ubyte into v1 followed by v_perm_b32 with v0;
;  - on fiji buffer_load_ubyte, a shift left by 16 and v_or_b32_sdwa.
define void @load_private_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, i16 %reg) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; 4095 bytes past %in (the GEP indexes i8 elements).
%gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
%load = load i8, ptr addrspace(5) %gep
%ext = zext i8 %load to i16
; Lane 0 comes from the register argument, lane 1 from the extended load.
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Half-typed variant of the zero-extending private byte load: the i8 loaded
; from byte offset 4095 of the byval argument %in is zero-extended to i16,
; bitcast to half and placed in lane 1 of a <2 x half> whose lane 0 is %reg;
; the vector is stored to an undef global address.
; The expected instructions are the same as for the <2 x i16> test
; load_private_hi_v2i16_reglo_vreg_zexti8 (buffer_load_ubyte_d16_hi on gfx900,
; scratch_load_ubyte_d16_hi with flat-scratch, buffer_load_ubyte + v_perm_b32
; on gfx906, buffer_load_ubyte + shift + v_or_b32_sdwa on fiji), i.e. the
; bitcast costs no extra instruction.
define void @load_private_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, half %reg) #0 {
; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; 4095 bytes past %in (the GEP indexes i8 elements).
%gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
%load = load i8, ptr addrspace(5) %gep
%ext = zext i8 %load to i16
; Reinterpret the extended integer bits as a half; no value conversion.
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Sign-extending counterpart of the zexti8 <2 x half> private test: loads an i8
; at byte offset 4095 from the byval addrspace(5) argument %in, sign-extends it
; to i16, bitcasts that to half and inserts it as element 1 of a <2 x half>
; whose element 0 is %reg. The vector is stored to an undef addrspace(1)
; pointer.
;
; Expected selection, per the checks below:
;   - gfx900 (buffer and flat-scratch runs): a single *_load_sbyte_d16_hi that
;     writes into v0, which is then used as the store data.
;   - gfx906: buffer_load_sbyte into v1, merged with v0 by v_perm_b32.
;   - fiji: buffer_load_sbyte into v1, shift left by 16, SDWA v_or_b32 with v0.
define void @load_private_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, half %reg) #0 {
; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
%load = load i8, ptr addrspace(5) %gep
%ext = sext i8 %load to i16
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Integer-vector form of the sign-extending private byte test: loads an i8 at
; byte offset 4095 from the byval addrspace(5) argument %in, sign-extends it to
; i16 and inserts it (no bitcast) as element 1 of a <2 x i16> whose element 0
; is %reg. The vector is stored to an undef addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 (buffer and flat-scratch runs): a single *_load_sbyte_d16_hi that
;     writes into v0, which is then used as the store data.
;   - gfx906: buffer_load_sbyte into v1, merged with v0 by v_perm_b32.
;   - fiji: buffer_load_sbyte into v1, shift left by 16, SDWA v_or_b32 with v0.
define void @load_private_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, i16 %reg) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
%load = load i8, ptr addrspace(5) %gep
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Volatile i8 load from the constant addrspace(5) address 4094 (built with
; inttoptr; the %in argument is not used by the body). The byte is
; zero-extended to i16 and inserted as element 1 of a <2 x i16> whose element 0
; is %reg, and the vector is stored to an undef addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 buffer run: buffer_load_ubyte_d16_hi with no soffset register
;     (literal 0), offset:4094 and glc, writing into v1, which is then the
;     store data.
;   - gfx900 flat-scratch run: the address is materialised with
;     s_movk_i32 s0, 0xffe (= 4094) and used as the scalar base of
;     scratch_load_ubyte_d16_hi ... glc.
;   - gfx906: buffer_load_ubyte into v0, merged with v1 by v_perm_b32.
;   - fiji: buffer_load_ubyte into v0, shift left by 16, SDWA v_or_b32 with v1.
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, i16 %reg) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, s0 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
%ext = zext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Sign-extending counterpart of the nooff_zexti8 <2 x i16> test: volatile i8
; load from the constant addrspace(5) address 4094 (built with inttoptr; the
; %in argument is not used by the body), sign-extended to i16 and inserted as
; element 1 of a <2 x i16> whose element 0 is %reg. The vector is stored to an
; undef addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 buffer run: buffer_load_sbyte_d16_hi with literal 0 soffset,
;     offset:4094 and glc, writing into v1, which is then the store data.
;   - gfx900 flat-scratch run: s_movk_i32 s0, 0xffe (= 4094) feeds
;     scratch_load_sbyte_d16_hi ... glc.
;   - gfx906: buffer_load_sbyte into v0, merged with v1 by v_perm_b32.
;   - fiji: buffer_load_sbyte into v0, shift left by 16, SDWA v_or_b32 with v1.
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(ptr addrspace(5) %in, i16 %reg) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, s0 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Half-vector form of the nooff_zexti8 test: volatile i8 load from the constant
; addrspace(5) address 4094 (built with inttoptr; the %in argument is not used
; by the body), zero-extended to i16, bitcast to half and inserted as element 1
; of a <2 x half> whose element 0 is %reg. The vector is stored to an undef
; addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 buffer run: buffer_load_ubyte_d16_hi with literal 0 soffset,
;     offset:4094 and glc, writing into v1, which is then the store data.
;   - gfx900 flat-scratch run: s_movk_i32 s0, 0xffe (= 4094) feeds
;     scratch_load_ubyte_d16_hi ... glc.
;   - gfx906: buffer_load_ubyte into v0, merged with v1 by v_perm_b32.
;   - fiji: buffer_load_ubyte into v0, shift left by 16, SDWA v_or_b32 with v1.
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, half %reg) #0 {
; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, s0 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
%ext = zext i8 %load to i16
%bc.ext = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Loads an i16 from the addrspace(4) pointer %in at element index -2047 (a
; byte offset of -4094, matching the offsets in the checks) and inserts it as
; element 1 of a <2 x i16> whose element 0 is %reg. The vector is stored to an
; undef addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 (both runs): global_load_short_d16_hi with offset:-4094 writing
;     into v2, which is then the store data.
;   - gfx906: global_load_ushort with the same immediate offset, merged with
;     v2 by v_perm_b32.
;   - fiji: the offset is applied with a 64-bit add (v_add_u32 of 0xfffff002,
;     i.e. -4094, plus v_addc_u32 of -1), followed by flat_load_ushort, a shift
;     left by 16 and an SDWA v_or_b32 with v2.
define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #0 {
; GFX900-LABEL: load_constant_hi_v2i16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_hi_v2i16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_hi_v2i16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_constant_hi_v2i16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i16, ptr addrspace(4) %in, i64 -2047
%load = load i16, ptr addrspace(4) %gep
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Half-typed form of the constant-address-space test: loads a half from the
; addrspace(4) pointer %in at element index -2047 (a byte offset of -4094,
; matching the offsets in the checks) and inserts it as element 1 of a
; <2 x half> whose element 0 is %reg. The vector is stored to an undef
; addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 (both runs): global_load_short_d16_hi with offset:-4094 writing
;     into v2, which is then the store data.
;   - gfx906: global_load_ushort with the same immediate offset, merged with
;     v2 by v_perm_b32.
;   - fiji: 64-bit add of 0xfffff002 / -1 (i.e. -4094), flat_load_ushort, shift
;     left by 16, SDWA v_or_b32 with v2.
define void @load_constant_hi_v2f16_reglo_vreg(ptr addrspace(4) %in, half %reg) #0 {
; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds half, ptr addrspace(4) %in, i64 -2047
%load = load half, ptr addrspace(4) %gep
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %load, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Loads an i8 from the addrspace(4) pointer %in at byte offset -4095,
; sign-extends it to i16, bitcasts that to half and inserts it as element 1 of
; a <2 x half> whose element 0 is %reg. The vector is stored to an undef
; addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 (both runs): global_load_sbyte_d16_hi with offset:-4095 writing
;     into v2, which is then the store data.
;   - gfx906: global_load_sbyte with the same immediate offset, merged with v2
;     by v_perm_b32.
;   - fiji: 64-bit add of 0xfffff001 / -1 (i.e. -4095), flat_load_sbyte, shift
;     left by 16, SDWA v_or_b32 with v2.
define void @load_constant_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, half %reg) #0 {
; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
%load = load i8, ptr addrspace(4) %gep
%ext = sext i8 %load to i16
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Zero-extending counterpart of the constant sexti8 test: loads an i8 from the
; addrspace(4) pointer %in at byte offset -4095, zero-extends it to i16,
; bitcasts that to half and inserts it as element 1 of a <2 x half> whose
; element 0 is %reg. The vector is stored to an undef addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - gfx900 (both runs): global_load_ubyte_d16_hi with offset:-4095 writing
;     into v2, which is then the store data.
;   - gfx906: global_load_ubyte with the same immediate offset, merged with v2
;     by v_perm_b32.
;   - fiji: 64-bit add of 0xfffff001 / -1 (i.e. -4095), flat_load_ubyte, shift
;     left by 16, SDWA v_or_b32 with v2.
define void @load_constant_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, half %reg) #0 {
; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
%load = load i8, ptr addrspace(4) %gep
%ext = zext i8 %load to i16
%bitcast = bitcast i16 %ext to half
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
store <2 x half> %build1, ptr addrspace(1) undef
ret void
}
; Local object gives known offset, so requires converting from offen
; to offset variant.
;
; The body allocates a [4096 x i16] object in addrspace(5), does a volatile
; store of 123 (0x7b in the checks) through the %obj0 pointer argument, then
; loads element 2027 of the alloca (byte 4054 of the object) and inserts it as
; element 1 of a <2 x i16> whose element 0 is %reg. The vector is stored to an
; undef addrspace(1) pointer.
;
; Expected selection, per the checks below:
;   - The volatile store through %obj0 keeps a VGPR address (offen in the
;     buffer runs, a VGPR address operand in the flat-scratch run).
;   - The alloca load uses no VGPR address: it is "off" with s32 as the scalar
;     base and an immediate offset:4058.
;     NOTE(review): 4058 = 4054 + 4, so the alloca presumably sits 4 bytes
;     above s32 in the frame; confirm against the frame layout.
;   - gfx900 (buffer and flat-scratch runs) selects *_load_short_d16_hi into
;     v0; gfx906 and fiji load a ushort into v1 and merge it with v0
;     (v_perm_b32, or shift + SDWA v_or_b32 respectively).
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX900-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4058
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX900-FLATSCR-NEXT: scratch_store_dword v1, v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:4058
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%obj1 = alloca [4096 x i16], align 2, addrspace(5)
store volatile i32 123, ptr addrspace(5) %obj0
%gep = getelementptr inbounds [4096 x i16], ptr addrspace(5) %obj1, i32 0, i32 2027
%load = load i16, ptr addrspace(5) %gep
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Sign-extended byte variant of the known-frame-offset test. The body allocates
; a [4096 x i8] object in addrspace(5), does a volatile store of 123 (0x7b in
; the checks) through the %obj0 pointer argument, then loads byte 4055 of the
; alloca, sign-extends it to i16 and inserts it as element 1 of a <2 x i16>
; whose element 0 is %reg. The vector is stored to an undef addrspace(1)
; pointer.
;
; Expected selection, per the checks below:
;   - The volatile store through %obj0 keeps a VGPR address.
;   - The alloca load is "off" with s32 as the scalar base and an immediate
;     offset:4059.
;     NOTE(review): 4059 = 4055 + 4, so the alloca presumably sits 4 bytes
;     above s32 in the frame; confirm against the frame layout.
;   - gfx900 (buffer and flat-scratch runs) selects *_load_sbyte_d16_hi into
;     v0; gfx906 and fiji load an sbyte into v1 and merge it with v0
;     (v_perm_b32, or shift + SDWA v_or_b32 respectively).
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX900-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4059
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4059
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4059
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX900-FLATSCR-NEXT: scratch_store_dword v1, v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v0, off, s32 offset:4059
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%obj1 = alloca [4096 x i8], align 2, addrspace(5)
store volatile i32 123, ptr addrspace(5) %obj0
%gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
%load = load i8, ptr addrspace(5) %gep
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; Zero-extended byte variant of the known-frame-offset test. The body allocates
; a [4096 x i8] object in addrspace(5), does a volatile store of 123 (0x7b in
; the checks) through the %obj0 pointer argument, then loads byte 4055 of the
; alloca, zero-extends it to i16 and inserts it as element 1 of a <2 x i16>
; whose element 0 is %reg. The vector is stored to an undef addrspace(1)
; pointer.
;
; Expected selection, per the checks below:
;   - The volatile store through %obj0 keeps a VGPR address.
;   - The alloca load is "off" with s32 as the scalar base and an immediate
;     offset:4059.
;     NOTE(review): 4059 = 4055 + 4, so the alloca presumably sits 4 bytes
;     above s32 in the frame; confirm against the frame layout.
;   - gfx900 (buffer and flat-scratch runs) selects *_load_ubyte_d16_hi into
;     v0; gfx906 and fiji load a ubyte into v1 and merge it with v0
;     (v_perm_b32, or shift + SDWA v_or_b32 respectively).
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX900-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4059
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4059
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4059
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX900-FLATSCR-NEXT: scratch_store_dword v1, v2, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v0, off, s32 offset:4059
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%obj1 = alloca [4096 x i8], align 2, addrspace(5)
store volatile i32 123, ptr addrspace(5) %obj0
%gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
%load = load i8, ptr addrspace(5) %gep
%ext = zext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
store <2 x i16> %build1, ptr addrspace(1) undef
ret void
}
; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
;
; Two volatile i16 loads from the addrspace(3) pointer %in: one at %in itself
; and one at element index 1 (offset:2 in the checks). The first becomes
; element 0 and the second element 1 of the returned <2 x i16>.
;
; Expected selection, per the checks below:
;   - gfx900 (both runs): ds_read_u16 into v1, an s_waitcnt lgkmcnt(0), then
;     ds_read_u16_d16_hi into the same v1, a second wait, and a copy of v1 to
;     the return register v0. The wait between the two reads is the one the
;     first FIXME refers to.
;   - gfx906: two back-to-back ds_read_u16 with a single wait, merged by
;     v_perm_b32.
;   - fiji: s_mov_b32 m0, -1 (the m0 init named in the first FIXME), two
;     back-to-back ds_read_u16 with a single wait, then shift left by 16 and
;     v_or_b32.
define <2 x i16> @load_local_v2i16_split_multi_chain(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_v2i16_split_multi_chain:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_v2i16_split_multi_chain:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v1, v0
; GFX906-NEXT: ds_read_u16 v0, v0 offset:2
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_v2i16_split_multi_chain:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0
; GFX803-NEXT: ds_read_u16 v0, v0 offset:2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_v2i16_split_multi_chain:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 1
%load0 = load volatile i16, ptr addrspace(3) %in
%load1 = load volatile i16, ptr addrspace(3) %gep
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
ret <2 x i16> %build1
}
; Test: two ordinary (non-volatile) i16 loads from local memory (addrspace 3),
; at %in and at %in + 8 elements, are inserted into elements 0 and 1 of the
; returned <2 x i16>.
; The gfx900 runs (with and without flat scratch) expect a ds_read_u16 followed
; by a ds_read_u16_d16_hi into the same destination register. The gfx906 and
; fiji runs expect two plain ds_read_u16 combined afterwards (v_perm_b32 on
; gfx906, shift + or on fiji).
define <2 x i16> @load_local_lo_hi_v2i16_samechain(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_lo_hi_v2i16_samechain:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_hi_v2i16_samechain:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v1, v0
; GFX906-NEXT: ds_read_u16 v0, v0 offset:16
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_hi_v2i16_samechain:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v1, v0 offset:16
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_samechain:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Address 8 i16 elements past %in (shows up as offset:16 in the checks).
%gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
; Neither load is volatile and nothing sits between them; the fiji checks
; above issue the offset:16 read first.
%load.lo = load i16, ptr addrspace(3) %in
%load.hi = load i16, ptr addrspace(3) %gep
; Pack: element 0 = %load.lo, element 1 = %load.hi.
%build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
ret <2 x i16> %build1
}
; Test: a single i16 loaded from local memory (addrspace 3) is inserted into
; both elements of the returned <2 x i16>. Every run expects exactly one
; ds_read_u16 followed by register-only code that duplicates the low half into
; the high half (v_perm_b32 on the gfx900/gfx906 runs, shift + or on fiji).
; FIXME: Remove and
; NOTE(review): none of the check sequences below contains an "and"
; instruction, so the FIXME above may be stale -- confirm before removing it.
define <2 x i16> @load_local_v2i16_broadcast(ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_v2i16_broadcast:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_v2i16_broadcast:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_v2i16_broadcast:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_v2i16_broadcast:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0
; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v0, s0
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Only %in is read: the same loaded value feeds both vector elements, so no
; second address is computed.
%load0 = load i16, ptr addrspace(3) %in
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
ret <2 x i16> %build1
}
; Test: like a lo/hi pair of i16 loads from local memory (addrspace 3) at %in
; and %in + 8 elements, but with a store of the constant 123 (0x7b) through a
; second pointer, %may.alias, placed between the two loads.
; Every check sequence keeps the order read / ds_write_b16 / read. The gfx900
; runs (with and without flat scratch) still expect the second read to be a
; ds_read_u16_d16_hi into the register holding the low half.
define <2 x i16> @load_local_lo_hi_v2i16_side_effect(ptr addrspace(3) %in, ptr addrspace(3) %may.alias) #0 {
; GFX900-LABEL: load_local_lo_hi_v2i16_side_effect:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v2, v0
; GFX900-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX900-NEXT: ds_write_b16 v1, v3
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_read_u16_d16_hi v2, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_hi_v2i16_side_effect:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX906-NEXT: ds_read_u16 v2, v0
; GFX906-NEXT: ds_write_b16 v1, v3
; GFX906-NEXT: ds_read_u16 v0, v0 offset:16
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_hi_v2i16_side_effect:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX803-NEXT: ds_read_u16 v2, v0
; GFX803-NEXT: ds_write_b16 v1, v3
; GFX803-NEXT: ds_read_u16 v0, v0 offset:16
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: v_or_b32_e32 v0, v2, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_side_effect:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: ds_read_u16 v2, v0
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX900-FLATSCR-NEXT: ds_write_b16 v1, v3
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v2, v0 offset:16
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v2
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Address 8 i16 elements past %in (shows up as offset:16 in the checks).
%gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
%load.lo = load i16, ptr addrspace(3) %in
; The intervening store: neither pointer argument is marked noalias, so this
; write may touch the location read by %load.hi below.
store i16 123, ptr addrspace(3) %may.alias
%load.hi = load i16, ptr addrspace(3) %gep
; Pack: element 0 = %load.lo, element 1 = %load.hi.
%build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
ret <2 x i16> %build1
}
; Test: two volatile i16 loads from global memory (addrspace 1), at %in and at
; the following i16 element, are inserted into elements 0 and 1 of the returned
; <2 x i16>.
; The gfx900 runs (with and without flat scratch) expect global_load_ushort
; followed by global_load_short_d16_hi into the same register. gfx906 expects
; two global_load_ushort plus v_perm_b32. fiji expects two flat_load_ushort,
; with the second address formed by an explicit 64-bit add of 2, then
; shift + or.
; All check sequences have an s_waitcnt vmcnt(0) between the two loads, which
; is what the FIXME below refers to.
; FIXME: Remove waitcnt between reads
define <2 x i16> @load_global_v2i16_split(ptr addrspace(1) %in) #0 {
; GFX900-LABEL: load_global_v2i16_split:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_ushort v2, v[0:1], off glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_v2i16_split:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ushort v2, v[0:1], off glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_load_ushort v3, v[0:1], off offset:2 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_v2i16_split:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_global_v2i16_split:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_ushort v2, v[0:1], off glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v2
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Address of the i16 element right after %in (byte offset 2 in the checks).
%gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 1
; Both loads are volatile -- presumably so they stay two separate 16-bit
; accesses in program order; the checks show them with the glc bit set.
%load0 = load volatile i16, ptr addrspace(1) %in
%load1 = load volatile i16, ptr addrspace(1) %gep
; Pack: element 0 = value at %in, element 1 = value at %in + 1 element.
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
ret <2 x i16> %build1
}
; Test: two volatile i16 loads through a generic (flat, address space 0)
; pointer, at %in and at the following i16 element, are inserted into elements
; 0 and 1 of the returned <2 x i16>.
; The gfx900 runs (with and without flat scratch) expect flat_load_ushort
; followed by flat_load_short_d16_hi into the same register. gfx906 expects two
; flat_load_ushort plus v_perm_b32. fiji expects two flat_load_ushort, with the
; second address formed by an explicit 64-bit add of 2, then shift + or.
; All check sequences have an s_waitcnt between the two loads, which is what
; the FIXME below refers to.
; FIXME: Remove waitcnt between reads
define <2 x i16> @load_flat_v2i16_split(ptr %in) #0 {
; GFX900-LABEL: load_flat_v2i16_split:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_ushort v2, v[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] offset:2 glc
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_v2i16_split:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ushort v2, v[0:1] glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: flat_load_ushort v3, v[0:1] offset:2 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_v2i16_split:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_flat_v2i16_split:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_ushort v2, v[0:1] glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: flat_load_short_d16_hi v2, v[0:1] offset:2 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v2
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Address of the i16 element right after %in (byte offset 2 in the checks).
%gep = getelementptr inbounds i16, ptr %in, i64 1
; Both loads are volatile -- presumably so they stay two separate 16-bit
; accesses in program order; the checks show them with the glc bit set.
%load0 = load volatile i16, ptr %in
%load1 = load volatile i16, ptr %gep
; Pack: element 0 = value at %in, element 1 = value at %in + 1 element.
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
ret <2 x i16> %build1
}
; Test: two volatile i16 loads from the constant address space (addrspace 4),
; at %in and at the following i16 element, are inserted into elements 0 and 1
; of the returned <2 x i16>.
; The gfx900 runs (with and without flat scratch) expect global_load_ushort
; followed by global_load_short_d16_hi into the same register. gfx906 expects
; two global_load_ushort plus v_perm_b32. fiji expects two flat_load_ushort,
; with the second address formed by an explicit 64-bit add of 2, then
; shift + or.
; FIXME: Remove waitcnt between reads
; NOTE(review): none of the check sequences below has an s_waitcnt between the
; two loads (the gfx900 runs have an s_nop 0 there instead), so the FIXME above
; may be stale -- confirm before removing it.
define <2 x i16> @load_constant_v2i16_split(ptr addrspace(4) %in) #0 {
; GFX900-LABEL: load_constant_v2i16_split:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_ushort v2, v[0:1], off glc
; GFX900-NEXT: s_nop 0
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_v2i16_split:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_load_ushort v2, v[0:1], off glc
; GFX906-NEXT: global_load_ushort v3, v[0:1], off offset:2 glc
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_v2i16_split:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_constant_v2i16_split:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: global_load_ushort v2, v[0:1], off glc
; GFX900-FLATSCR-NEXT: s_nop 0
; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v2
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Address of the i16 element right after %in (byte offset 2 in the checks).
%gep = getelementptr inbounds i16, ptr addrspace(4) %in, i64 1
; Both loads are volatile -- presumably so they stay two separate 16-bit
; accesses; the checks show them with the glc bit set.
%load0 = load volatile i16, ptr addrspace(4) %in
%load1 = load volatile i16, ptr addrspace(4) %gep
; Pack: element 0 = value at %in, element 1 = value at %in + 1 element.
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
ret <2 x i16> %build1
}
; Test: two volatile i16 loads from private memory (addrspace 5), at the
; byval(i16) argument %in and at the following i16 element, are inserted into
; elements 0 and 1 of the returned <2 x i16>.
; The plain gfx900 run expects buffer_load_ushort followed by
; buffer_load_short_d16_hi into the same register; the flat-scratch gfx900 run
; expects the scratch_load_* equivalents. The gfx906 and fiji runs expect two
; buffer_load_ushort combined afterwards (v_perm_b32 on gfx906, shift + or on
; fiji). All sequences address the argument relative to s32.
; FIXME: Remove m0 init and waitcnt between reads
; NOTE(review): no check sequence below initialises m0; only the waitcnt part
; of the FIXME above matches the visible checks -- confirm whether the m0 part
; still applies to this function.
; FIXME: Is there a cost to using the extload over not?
define <2 x i16> @load_private_v2i16_split(ptr addrspace(5) byval(i16) %in) #0 {
; GFX900-LABEL: load_private_v2i16_split:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_v2i16_split:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_v2i16_split:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_private_v2i16_split:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_ushort v0, off, s32 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2 glc
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Address of the i16 element right after %in (shows up as offset:2 in the checks).
%gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
; Both loads are volatile -- presumably so they stay two separate 16-bit
; accesses in program order; the checks show them with the glc bit set.
%load0 = load volatile i16, ptr addrspace(5) %in
%load1 = load volatile i16, ptr addrspace(5) %gep
; Pack: element 0 = value at %in, element 1 = value at %in + 1 element.
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
ret <2 x i16> %build1
}
; Test: the i16 argument %reg becomes element 0 and an i16 loaded from local
; memory (addrspace 3) at %in becomes element 1 of the returned <2 x i16>;
; afterwards %reg is stored (volatile) back to the same address %in.
; The gfx900 runs (with and without flat scratch) expect %reg to be copied from
; v0 to v2, a ds_read_u16_d16_hi into v2, a ds_write_b16 of the original v0,
; and finally a copy of v2 back to v0. The gfx906 and fiji runs expect a plain
; ds_read_u16 that is merged with %reg afterwards (v_perm_b32 on gfx906,
; shift + v_or_b32_sdwa on fiji).
; FIXME: This test should work without copying of v0.
; ds_read_u16_d16_hi preserves low 16 bits of the destination
; and ds_write_b16 only reads low 16 bits.
define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3) %in) #0 {
; GFX900-LABEL: load_local_hi_v2i16_store_local_lo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: ds_read_u16_d16_hi v2, v1
; GFX900-NEXT: ds_write_b16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_hi_v2i16_store_local_lo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v2, v1
; GFX906-NEXT: s_mov_b32 s4, 0x5040100
; GFX906-NEXT: ds_write_b16 v1, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: v_perm_b32 v2, v2, v0, s4
; GFX906-NEXT: v_mov_b32_e32 v0, v2
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_hi_v2i16_store_local_lo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v2, v1
; GFX803-NEXT: ds_write_b16 v1, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_mov_b32_e32 v0, v2
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_store_local_lo:
; GFX900-FLATSCR: ; %bb.0: ; %entry
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, v0
; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v2, v1
; GFX900-FLATSCR-NEXT: ds_write_b16 v1, v0
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v2
; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; High half comes from memory, low half from the register argument.
%load = load i16, ptr addrspace(3) %in
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
; %reg is used a second time here, after the load, as the value written back
; to %in; the store is volatile.
store volatile i16 %reg, ptr addrspace(3) %in
ret <2 x i16> %build1
}
attributes #0 = { nounwind }