; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
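; The paired LDS stores below are merged by the SILoadStoreOptimizer (enabled
; via -mattr=+load-store-opt) into ds_write2_b32/ds_write2_b64 whenever both
; addresses share a base and the element-scaled deltas fit the 8-bit
; offset0/offset1 fields of the write2 forms.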
define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_one_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
%val = load float, ptr addrspace(1) %in.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
%val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
%val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
%val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store volatile float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
%val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store volatile float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; 2 data subregisters from different super registers.
; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
; This should be an s_mov_b32. The v_mov_b32 gets introduced by an
; early legalization of the constant bus constraint on the v_lshl_add_u32,
; and then SIFoldOperands folds in an unlucky order.
define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ; kill: killed $vgpr4
; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr <2 x float>, ptr addrspace(1) %in.gep.0, i32 1
%val0 = load volatile <2 x float>, ptr addrspace(1) %in.gep.0, align 8
%val1 = load volatile <2 x float>, ptr addrspace(1) %in.gep.1, align 8
%val0.0 = extractelement <2 x float> %val0, i32 0
%val1.1 = extractelement <2 x float> %val1, i32 1
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0.0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1.1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
%val = load <2 x float>, ptr addrspace(1) %in.gep, align 8
%val0 = extractelement <2 x float> %val, i32 0
%val1 = extractelement <2 x float> %val, i32 1
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg4_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %x.i
%val = load <4 x float>, ptr addrspace(1) %in.gep, align 16
%val0 = extractelement <4 x float> %val, i32 0
%val1 = extractelement <4 x float> %val, i32 3
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_max_offset_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
%val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
%val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 255
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
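; A delta of 257 elements (byte offset 1028) is out of range for the 8-bit,
; dword-scaled offset1 field (max 255 * 4 = 1020 bytes), so the two stores
; below are not merged into a ds_write2_b32.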
define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_too_far_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v0, v1 offset:1028
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 257
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT: s_endpgm
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%idx.1 = add nsw i32 %tid.x, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
store float %val1, ptr addrspace(3) %arrayidx1, align 4
%idx.2 = add nsw i32 %tid.x, 11
%arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
store float %val0, ptr addrspace(3) %arrayidx2, align 4
%idx.3 = add nsw i32 %tid.x, 27
%arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
store float %val1, ptr addrspace(3) %arrayidx3, align 4
ret void
}
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT: s_endpgm
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
%idx.0 = add nsw i32 %tid.x, 3
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%idx.1 = add nsw i32 %tid.x, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
store float %val1, ptr addrspace(3) %arrayidx1, align 4
%idx.2 = add nsw i32 %tid.x, 11
%arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
store float %val0, ptr addrspace(3) %arrayidx2, align 4
%idx.3 = add nsw i32 %tid.x, 27
%arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
store float %val1, ptr addrspace(3) %arrayidx3, align 4
ret void
}
define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 {
; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2
; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x6
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[8:9], s[4:5]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: s_mov_b64 s[2:3], s[10:11]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_mov_b32_e32 v1, s12
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: v_mov_b32_e32 v3, s13
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v1, v2 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v3, v0 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1 offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v3, v2 offset:32
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
%gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
%gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
%gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
; Apply an additional offset after the vector that will be more obviously folded.
%gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8
store float %val0, ptr addrspace(3) %gep.0, align 4
%add.x = add nsw i32 %x.i, 8
store float %val1, ptr addrspace(3) %gep.1.offset, align 4
ret void
}
define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%val = load double, ptr addrspace(1) %in.gep, align 8
%arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
store double %val, ptr addrspace(3) %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
store double %val, ptr addrspace(3) %arrayidx1, align 8
ret void
}
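; With only 4-byte alignment, each f64 store is split into two dword stores,
; which are then merged into ds_write2_b32 pairs.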
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
; CI-LABEL: misaligned_simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_load_dword s4, s[2:3], 0x4
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: s_load_dword s4, s[2:3], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v2, s4, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%val = load double, ptr addrspace(1) %in.gep, align 8
%arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
store double %val, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 7
%arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
store double %val, ptr addrspace(3) %arrayidx1, align 4
ret void
}
define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_load_dword s4, s[2:3], 0x4
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:5
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:9
; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: ds_write_b8 v0, v3 offset:8
; CI-NEXT: ds_write_b8 v0, v4 offset:7
; CI-NEXT: ds_write_b8 v0, v5 offset:6
; CI-NEXT: ds_write_b8 v0, v3 offset:12
; CI-NEXT: ds_write_b8 v0, v4 offset:11
; CI-NEXT: ds_write_b8 v0, v5 offset:10
; CI-NEXT: ds_write_b8 v0, v1 offset:16
; CI-NEXT: ds_write_b8 v0, v6 offset:15
; CI-NEXT: ds_write_b8 v0, v2 offset:14
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5
; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%val = load double, ptr addrspace(1) %in.gep, align 8
%base = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
%addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
store double %val, ptr addrspace(3) %addr0.i8, align 1
%addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
store double %val, ptr addrspace(3) %addr1.i8, align 1
ret void
}
define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr double, ptr addrspace(1) %in.gep.0, i32 1
%val0 = load volatile double, ptr addrspace(1) %in.gep.0, align 8
%val1 = load volatile double, ptr addrspace(1) %in.gep.1, align 8
%arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
store double %val0, ptr addrspace(3) %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
store double %val1, ptr addrspace(3) %arrayidx1, align 8
ret void
}
@foo = addrspace(3) global [4 x i32] undef, align 4
define amdgpu_kernel void @store_constant_adjacent_offsets() {
; CI-LABEL: store_constant_adjacent_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_adjacent_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
store i32 123, ptr addrspace(3) @foo, align 4
store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
ret void
}
define amdgpu_kernel void @store_constant_disjoint_offsets() {
; CI-LABEL: store_constant_disjoint_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_disjoint_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; GFX9-NEXT: s_endpgm
store i32 123, ptr addrspace(3) @foo, align 4
store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
ret void
}
@bar = addrspace(3) global [4 x i64] undef, align 4
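; The two i64 stores at element offsets 0 and 1 cover bytes 0-15 with 4-byte
; alignment, so the resulting dword stores are combined into one ds_write_b128.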
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, v0
; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b128 v1, v[0:3]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-NEXT: s_endpgm
store i64 123, ptr addrspace(3) @bar, align 4
store i64 123, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
ret void
}
@bar.large = addrspace(3) global [4096 x i64] undef, align 4
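; Byte offsets 16384 and 32760 cannot be encoded in the qword-scaled write2
; offset fields, so two separate ds_write_b64 instructions with plain byte
; offsets are emitted instead.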
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b64 s[0:1], 0x7b
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; GFX9-NEXT: s_endpgm
store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
ret void
}
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 {
; CI-LABEL: write2_sgemm_sequence:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_lshl_b32 s1, s6, 2
; CI-NEXT: s_add_i32 s2, s1, 0xc20
; CI-NEXT: s_addk_i32 s1, 0xc60
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_sgemm_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10
; GFX9-NEXT: s_lshl_b32 s2, s6, 2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: s_add_i32 s1, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%val = load float, ptr addrspace(1) %in
%arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
store float %val, ptr addrspace(3) %arrayidx44, align 4
%add47 = add nsw i32 %x.i, 1
%arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
store float %val, ptr addrspace(3) %arrayidx48, align 4
%add51 = add nsw i32 %x.i, 16
%arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
store float %val, ptr addrspace(3) %arrayidx52, align 4
%add55 = add nsw i32 %x.i, 17
%arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
store float %val, ptr addrspace(3) %arrayidx56, align 4
%arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
store float %val, ptr addrspace(3) %arrayidx60, align 4
%add63 = add nsw i32 %y.i, 1
%arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
store float %val, ptr addrspace(3) %arrayidx64, align 4
%add67 = add nsw i32 %y.i, 32
%arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
store float %val, ptr addrspace(3) %arrayidx68, align 4
%add71 = add nsw i32 %y.i, 33
%arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
store float %val, ptr addrspace(3) %arrayidx72, align 4
%add75 = add nsw i32 %y.i, 64
%arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
store float %val, ptr addrspace(3) %arrayidx76, align 4
%add79 = add nsw i32 %y.i, 65
%arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
store float %val, ptr addrspace(3) %arrayidx80, align 4
ret void
}
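; A <4 x float> store with 4-byte alignment is broken into dword components
; that are recombined into two ds_write2_b32 instructions.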
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_v4f32_superreg_align4:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2
; CI-NEXT: s_load_dword s4, s[2:3], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8
; GFX9-ALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %in
%val0 = load <4 x float>, ptr addrspace(1) %in.gep, align 4
%out.gep = getelementptr inbounds <4 x float>, ptr addrspace(3) %out, i32 %x.i
store <4 x float> %val0, ptr addrspace(3) %out.gep, align 4
ret void
}
@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
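; With unaligned LDS access enabled, the byte-aligned <2 x i32> store at an odd
; offset can use a single ds_write_b64; otherwise it is expanded into byte
; writes.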
define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-LABEL: write2_v2i32_align1_odd_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: ds_write_b8 v1, v0 offset:70
; CI-NEXT: v_mov_b32_e32 v0, 0xc8
; CI-NEXT: ds_write_b8 v1, v0 offset:69
; CI-NEXT: ds_write_b8 v1, v1 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:67
; CI-NEXT: ds_write_b8 v1, v1 offset:66
; CI-NEXT: ds_write_b8 v1, v1 offset:72
; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x1c8
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:65
; GFX9-UNALIGNED-NEXT: s_endpgm
entry:
store <2 x i32> <i32 123, i32 456>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1
ret void
}
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { convergent nounwind }