Files
clang-p2996/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
Jay Foad f2c164c815 [AMDGPU] Do not wait for vscnt on function entry and return
SIInsertWaitcnts inserts waitcnt instructions to resolve data
dependencies. The GFX10+ vscnt (VMEM store count) counter is never used
in this way. It is only used to resolve memory dependencies, and that is
handled by SIMemoryLegalizer. Hence there is no need to conservatively
wait for vscnt to be 0 on function entry and before returns.

Differential Revision: https://reviews.llvm.org/D153537
2023-07-04 12:22:38 +01:00

443 lines
19 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-scratch-access < %s | FileCheck --check-prefix=GFX7-ALIGNED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX7-UNALIGNED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-FLASTSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10-FLASTSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11-FLASTSCR %s
; Two adjacent i16 loads from private memory, each with align 2, combined into
; one i32. These must not be merged into a dword load when unaligned scratch
; access is disabled (check prefix GFX7-ALIGNED, which expects two ushort
; loads). Every other run enables unaligned-scratch-access and expects a single
; dword load.
define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align2:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_load_2xi16_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX10-FLASTSCR: ; %bb.0:
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_load_2xi16_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v0, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
; %gep.p addresses the second i16, i.e. the element right after %p.
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 2
%p.1 = load i16, ptr addrspace(5) %gep.p, align 2
; Reassemble the pair: %p.0 becomes bits 0-15, %p.1 becomes bits 16-31.
%zext.0 = zext i16 %p.0 to i32
%zext.1 = zext i16 %p.1 to i32
%shl.1 = shl i32 %zext.1, 16
%or = or i32 %zext.0, %shl.1
ret i32 %or
}
; Two adjacent i16 stores to private memory, each with align 2. These must not
; be merged into a dword store when unaligned scratch access is disabled
; (check prefix GFX7-ALIGNED, which expects two short stores). Every other run
; enables unaligned-scratch-access and expects a single dword store of 0x20001,
; i.e. (2 << 16) | 1.
define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align2:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_store_2xi16_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX10-FLASTSCR: ; %bb.0:
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_store_2xi16_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT: scratch_store_b32 v1, v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
; %p is unused; both stores go through %r. The value 1 lands in the first i16
; and the value 2 in the i16 right after it.
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 2
store i16 2, ptr addrspace(5) %gep.r, align 2
ret void
}
; Two adjacent i16 loads from private memory, each with align 1, combined into
; one i32. Should produce a single align-1 dword load when that is legal, which
; is the expectation for every run that enables unaligned-scratch-access. With
; the feature disabled (check prefix GFX7-ALIGNED) the expected code is four
; byte loads recombined with shifts and ors.
define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_load_2xi16_align1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1:
; GFX10-FLASTSCR: ; %bb.0:
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_load_2xi16_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v0, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_load_2xi16_align1:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
; %gep.p addresses the second i16, i.e. the element right after %p.
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 1
%p.1 = load i16, ptr addrspace(5) %gep.p, align 1
; Reassemble the pair: %p.0 becomes bits 0-15, %p.1 becomes bits 16-31.
%zext.0 = zext i16 %p.0 to i32
%zext.1 = zext i16 %p.1 to i32
%shl.1 = shl i32 %zext.1, 16
%or = or i32 %zext.0, %shl.1
ret i32 %or
}
; Two adjacent i16 stores to private memory, each with align 1. Should produce
; a single align-1 dword store of 0x20001 when that is legal, which is the
; expectation for every run that enables unaligned-scratch-access. With the
; feature disabled (check prefix GFX7-ALIGNED) the expected code is four byte
; stores: 1, 0, 2, 0 at byte offsets 0, 1, 2, 3.
define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align1:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_store_2xi16_align1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_store_2xi16_align1:
; GFX10-FLASTSCR: ; %bb.0:
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_store_2xi16_align1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT: scratch_store_b32 v1, v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_store_2xi16_align1:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
; %p is unused; both stores go through %r. The value 1 lands in the first i16
; and the value 2 in the i16 right after it.
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 1
store i16 2, ptr addrspace(5) %gep.r, align 1
ret void
}
; Two adjacent i16 loads from private memory where the first is align 4 (the
; second is only align 2), combined into one i32. Should be merged into a
; single dword load in every run, including the one with unaligned scratch
; access disabled (check prefix GFX7-ALIGNED).
define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_load_2xi16_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4:
; GFX10-FLASTSCR: ; %bb.0:
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_load_2xi16_align4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v0, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_load_2xi16_align4:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
; %gep.p addresses the second i16, i.e. the element right after %p.
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 4
%p.1 = load i16, ptr addrspace(5) %gep.p, align 2
; Reassemble the pair: %p.0 becomes bits 0-15, %p.1 becomes bits 16-31.
%zext.0 = zext i16 %p.0 to i32
%zext.1 = zext i16 %p.1 to i32
%shl.1 = shl i32 %zext.1, 16
%or = or i32 %zext.0, %shl.1
ret i32 %or
}
; Two adjacent i16 stores to private memory where the first is align 4 (the
; second is only align 2). Should be merged into a single dword store of
; 0x20001, i.e. (2 << 16) | 1, in every run, including the one with unaligned
; scratch access disabled (check prefix GFX7-ALIGNED).
define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align4:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX7-ALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align4:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_store_2xi16_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_store_2xi16_align4:
; GFX10-FLASTSCR: ; %bb.0:
; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off
; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_store_2xi16_align4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT: scratch_store_b32 v1, v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_store_2xi16_align4:
; GFX11-FLASTSCR: ; %bb.0:
; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
; %p is unused; both stores go through %r. The value 1 lands in the first i16
; and the value 2 in the i16 right after it.
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 4
store i16 2, ptr addrspace(5) %gep.r, align 2
ret void
}