clang-p2996/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll

Commit ae36c02ad0 by Mirko Brkusanin (2020-09-18 15:26:24 +02:00):

[AMDGPU] Set DS alignment requirements to be more strict

Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are
now the same as for other GCN subtargets. This way we can avoid any
unintentional use of these instructions on systems that do not support dword
alignment and instead require natural alignment.
This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default.

Differential Revision: https://reviews.llvm.org/D87821

; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED,VECT %s
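
; Check prefixes (see the RUN lines above):
;   GCN       - checks common to the first four runs.
;   SPLIT     - default (WGP mode) runs on gfx1010/gfx1011/gfx1012, where wide
;               flat accesses are expected to be split into narrower ones.
;   VECT      - +cumode runs, where flat accesses may stay vectorized.
;   UNALIGNED - the +cumode,+unaligned-access-mode run, where
;               ds_read/write_b96/b128 may be used even for dword-aligned
;               LDS accesses.

; Dword-aligned <2 x i32> LDS access: expected to be selected as
; ds_read2_b32/ds_write2_b32.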
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
%v1 = extractelement <2 x i32> %load, i32 0
%v2 = extractelement <2 x i32> %load, i32 1
%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
ret void
}
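
; Dword-aligned <4 x i32> LDS access: expected to split into two
; ds_read2_b32/ds_write2_b32 pairs, or a single b128 access when
; unaligned access mode is enabled.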
; GCN-LABEL: test_local_misaligned_v4:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read_b128
; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
%v1 = extractelement <4 x i32> %load, i32 0
%v2 = extractelement <4 x i32> %load, i32 1
%v3 = extractelement <4 x i32> %load, i32 2
%v4 = extractelement <4 x i32> %load, i32 3
%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
ret void
}
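
; Dword-aligned <3 x i32> LDS access: expected to split into
; ds_read2_b32 + ds_read_b32 (and the matching writes), or a single
; b96 access when unaligned access mode is enabled.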
; GCN-LABEL: test_local_misaligned_v3:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
%v1 = extractelement <3 x i32> %load, i32 0
%v2 = extractelement <3 x i32> %load, i32 1
%v3 = extractelement <3 x i32> %load, i32 2
%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
ret void
}
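
; Dword-aligned <2 x i32> flat access: +cumode (VECT) runs may keep
; flat_load/store_dwordx2, while the default (SPLIT) runs are expected
; to split it into dword accesses.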
; GCN-LABEL: test_flat_misaligned_v2:
; VECT-DAG: flat_load_dwordx2 v
; VECT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32* %arg, i32 %lid
%ptr = bitcast i32* %gep to <2 x i32>*
%load = load <2 x i32>, <2 x i32>* %ptr, align 4
%v1 = extractelement <2 x i32> %load, i32 0
%v2 = extractelement <2 x i32> %load, i32 1
%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
store <2 x i32> %v4, <2 x i32>* %ptr, align 4
ret void
}
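
; Dword-aligned <4 x i32> flat access: flat_load/store_dwordx4 with
; +cumode (VECT), split into four dword accesses otherwise (SPLIT).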
; GCN-LABEL: test_flat_misaligned_v4:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32* %arg, i32 %lid
%ptr = bitcast i32* %gep to <4 x i32>*
%load = load <4 x i32>, <4 x i32>* %ptr, align 4
%v1 = extractelement <4 x i32> %load, i32 0
%v2 = extractelement <4 x i32> %load, i32 1
%v3 = extractelement <4 x i32> %load, i32 2
%v4 = extractelement <4 x i32> %load, i32 3
%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
store <4 x i32> %v8, <4 x i32>* %ptr, align 4
ret void
}

; TODO: Reinstate the test below once v3i32/v3f32 is reinstated.
; GCN-LABEL: test_flat_misaligned_v3:
; xVECT-DAG: flat_load_dwordx3 v
; xVECT-DAG: flat_store_dwordx3 v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32* %arg, i32 %lid
%ptr = bitcast i32* %gep to <3 x i32>*
%load = load <3 x i32>, <3 x i32>* %ptr, align 4
%v1 = extractelement <3 x i32> %load, i32 0
%v2 = extractelement <3 x i32> %load, i32 1
%v3 = extractelement <3 x i32> %load, i32 2
%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
store <3 x i32> %v7, <3 x i32>* %ptr, align 4
ret void
}
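
; Naturally aligned <2 x i32> LDS access: a single ds_read_b64/ds_write_b64
; is expected.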
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
%v1 = extractelement <2 x i32> %load, i32 0
%v2 = extractelement <2 x i32> %load, i32 1
%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
ret void
}
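
; 16-byte-aligned <3 x i32> LDS access: a single ds_read_b96/ds_write_b96
; is expected.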
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b96
; GCN-DAG: ds_write_b96
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
%v1 = extractelement <3 x i32> %load, i32 0
%v2 = extractelement <3 x i32> %load, i32 1
%v3 = extractelement <3 x i32> %load, i32 2
%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
ret void
}
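
; 8-byte-aligned <2 x i32> flat access: flat_load/store_dwordx2 is expected.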
; GCN-LABEL: test_flat_aligned_v2:
; GCN-DAG: flat_load_dwordx2 v
; GCN-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32* %arg, i32 %lid
%ptr = bitcast i32* %gep to <2 x i32>*
%load = load <2 x i32>, <2 x i32>* %ptr, align 8
%v1 = extractelement <2 x i32> %load, i32 0
%v2 = extractelement <2 x i32> %load, i32 1
%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
store <2 x i32> %v4, <2 x i32>* %ptr, align 8
ret void
}
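
; 16-byte-aligned <4 x i32> flat access: flat_load/store_dwordx4 is expected.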
; GCN-LABEL: test_flat_aligned_v4:
; GCN-DAG: flat_load_dwordx4 v
; GCN-DAG: flat_store_dwordx4 v
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32* %arg, i32 %lid
%ptr = bitcast i32* %gep to <4 x i32>*
%load = load <4 x i32>, <4 x i32>* %ptr, align 16
%v1 = extractelement <4 x i32> %load, i32 0
%v2 = extractelement <4 x i32> %load, i32 1
%v3 = extractelement <4 x i32> %load, i32 2
%v4 = extractelement <4 x i32> %load, i32 3
%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
store <4 x i32> %v8, <4 x i32>* %ptr, align 16
ret void
}
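
; 8-byte-aligned <4 x i32> LDS access: ds_read_b128/ds_write_b128 is expected.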
; GCN-LABEL: test_local_v4_aligned8:
; GCN-DAG: ds_read_b128
; GCN-DAG: ds_write_b128
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
%v1 = extractelement <4 x i32> %load, i32 0
%v2 = extractelement <4 x i32> %load, i32 1
%v3 = extractelement <4 x i32> %load, i32 2
%v4 = extractelement <4 x i32> %load, i32 3
%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
ret void
}
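
; 8-byte-aligned <4 x i32> flat access: flat_load/store_dwordx4 with
; +cumode (VECT), split into two dwordx2 accesses otherwise (SPLIT).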
; GCN-LABEL: test_flat_v4_aligned8:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32* %arg, i32 %lid
%ptr = bitcast i32* %gep to <4 x i32>*
%load = load <4 x i32>, <4 x i32>* %ptr, align 8
%v1 = extractelement <4 x i32> %load, i32 0
%v2 = extractelement <4 x i32> %load, i32 1
%v3 = extractelement <4 x i32> %load, i32 2
%v4 = extractelement <4 x i32> %load, i32 3
%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
store <4 x i32> %v8, <4 x i32>* %ptr, align 8
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()