Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are now the same as for other GCN subtargets. This way we can avoid any unintentional use of these instructions on systems that do not support dword alignment and instead require natural alignment. This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default. Differential Revision: https://reviews.llvm.org/D87821
270 lines
11 KiB
LLVM
270 lines
11 KiB
LLVM
; Exercises <2/3/4 x i32> loads and stores through addrspace(3) ("local") and
; default-address-space ("flat") pointers at several alignments.
;
; Configurations driven by the lit command lines below:
;   * gfx1010, gfx1011, gfx1012, default attributes -> prefixes GCN and SPLIT
;   * gfx1010 with +cumode                          -> prefixes GCN and VECT
;   * gfx1010 with +cumode,+unaligned-access-mode   -> prefixes UNALIGNED and VECT
; The last configuration does not enable the GCN prefix, so neither the GCN
; label checks nor the GCN DAG checks are evaluated for it.
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED,VECT %s

; addrspace(3) <2 x i32> access with only 4-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), swaps its
; two elements and stores it back to the same address.
; Runs that enable the GCN prefix expect ds_read2_b32 / ds_write2_b32.
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; addrspace(3) <4 x i32> access with only 4-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), reverses
; the order of its four elements and stores it back to the same address.
; Runs that enable the GCN prefix expect two ds_read2_b32 and two
; ds_write2_b32; the run that enables the UNALIGNED prefix expects a single
; ds_read_b128 / ds_write_b128 pair.
; GCN-LABEL: test_local_misaligned_v4:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read_b128
; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; addrspace(3) <3 x i32> access with only 4-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), rotates
; its elements (old element 2 -> 0, 0 -> 1, 1 -> 2) and stores it back.
; Runs that enable the GCN prefix expect ds_read2_b32 + ds_read_b32 and
; ds_write2_b32 + ds_write_b32; the run that enables the UNALIGNED prefix
; expects ds_read_b96 / ds_write_b96.
; GCN-LABEL: test_local_misaligned_v3:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
  ret void
}

; Default-address-space (i32*) <2 x i32> access with only 4-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), swaps its
; two elements and stores it back to the same address.
; Runs that enable the VECT prefix expect one flat_load_dwordx2 /
; flat_store_dwordx2; runs that enable the SPLIT prefix expect two
; flat_load_dword and two flat_store_dword.
; GCN-LABEL: test_flat_misaligned_v2:
; VECT-DAG: flat_load_dwordx2 v
; VECT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 4
  ret void
}

; Default-address-space (i32*) <4 x i32> access with only 4-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), reverses
; the order of its four elements and stores it back to the same address.
; Runs that enable the VECT prefix expect one flat_load_dwordx4 /
; flat_store_dwordx4; runs that enable the SPLIT prefix expect four
; flat_load_dword and four flat_store_dword.
; GCN-LABEL: test_flat_misaligned_v4:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 4
  ret void
}

; TODO: Reinstate the test below once v3i32/v3f32 is reinstated.

; Default-address-space (i32*) <3 x i32> access with only 4-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), rotates
; its elements (old element 2 -> 0, 0 -> 1, 1 -> 2) and stores it back.
; Only the label is checked at present: the instruction checks below use the
; xVECT / xSPLIT prefixes, which none of the lit command lines in this file
; enable, so they are inert until renamed back (see the TODO above).
; GCN-LABEL: test_flat_misaligned_v3:
; xVECT-DAG: flat_load_dwordx3 v
; xVECT-DAG: flat_store_dwordx3 v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <3 x i32>*
  %load = load <3 x i32>, <3 x i32>* %ptr, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32>* %ptr, align 4
  ret void
}

; addrspace(3) <2 x i32> access with 8-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), swaps its
; two elements and stores it back to the same address.
; Runs that enable the GCN prefix expect ds_read_b64 / ds_write_b64.
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
  ret void
}

; addrspace(3) <3 x i32> access with 16-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), rotates
; its elements (old element 2 -> 0, 0 -> 1, 1 -> 2) and stores it back.
; Runs that enable the GCN prefix expect ds_read_b96 / ds_write_b96.
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b96
; GCN-DAG: ds_write_b96
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
  ret void
}

; Default-address-space (i32*) <2 x i32> access with 8-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), swaps its
; two elements and stores it back to the same address.
; Runs that enable the GCN prefix expect flat_load_dwordx2 /
; flat_store_dwordx2.
; GCN-LABEL: test_flat_aligned_v2:
; GCN-DAG: flat_load_dwordx2 v
; GCN-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 8
  ret void
}

; Default-address-space (i32*) <4 x i32> access with 16-byte alignment.
; The kernel loads the vector at %arg + workitem-id (in i32 units), reverses
; the order of its four elements and stores it back to the same address.
; Runs that enable the GCN prefix expect flat_load_dwordx4 /
; flat_store_dwordx4.
; GCN-LABEL: test_flat_aligned_v4:
; GCN-DAG: flat_load_dwordx4 v
; GCN-DAG: flat_store_dwordx4 v
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 16
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 16
  ret void
}

; addrspace(3) <4 x i32> access with 8-byte alignment (less than the 16-byte
; size of the vector).
; The kernel loads the vector at %arg + workitem-id (in i32 units), reverses
; the order of its four elements and stores it back to the same address.
; Runs that enable the GCN prefix expect ds_read_b128 / ds_write_b128.
; NOTE(review): whether an 8-byte-aligned access should still select the b128
; forms depends on the backend's alignment rules for these instructions;
; confirm the expectation against the llc revision under test.
; GCN-LABEL: test_local_v4_aligned8:
; GCN-DAG: ds_read_b128
; GCN-DAG: ds_write_b128
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
  ret void
}

; Default-address-space (i32*) <4 x i32> access with 8-byte alignment (less
; than the 16-byte size of the vector).
; The kernel loads the vector at %arg + workitem-id (in i32 units), reverses
; the order of its four elements and stores it back to the same address.
; Runs that enable the VECT prefix expect one flat_load_dwordx4 /
; flat_store_dwordx4; runs that enable the SPLIT prefix expect two
; flat_load_dwordx2 and two flat_store_dwordx2.
; GCN-LABEL: test_flat_v4_aligned8:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 8
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 8
  ret void
}

; Intrinsic declaration. Every kernel in this file calls it to obtain the i32
; index (%lid) that offsets %arg before the vector load/store.
declare i32 @llvm.amdgcn.workitem.id.x()