clang-p2996/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
Jon Chesterfield 0507448d82 [amdgpu] Implement dynamic LDS accesses from non-kernel functions
The premise here is to allow non-kernel functions to locate external (dynamic) LDS variables without spending LDS space or extra magic SGPRs on the lookup.

1/ First it crawls the call graph to work out which external LDS variables are reachable from a given kernel.
2/ Then it creates a new `extern char[0]` variable for each kernel, which will alias all the other extern LDS variables, because that is the documented behaviour of these variables.
3/ The address of that variable is written to a lookup table, and the global variable is tagged with metadata recording the address codegen allocated for it.
4/ The assembler builds the lookup table using that metadata.
5/ Non-kernel functions use the same magic intrinsic already used by table lookups of non-dynamic LDS variables to find the address to use.

This overlaps heavily with the code paths taken for the other LDS lowering; in particular, the same intrinsic is used to pass the dynamic scope information through the same SGPR as for table lookups of static LDS. A sketch of the resulting IR follows.
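
As a rough illustration, the lowered access in a non-kernel function ends up looking something like this minimal sketch. It is not the pass's literal output: the table name @dynlds.offset.table, the function name, and the two-kernel table size are all illustrative, and the real table contents come from the codegen metadata rather than being zero. @llvm.amdgcn.lds.kernel.id is the intrinsic referred to in 5/ above.

; one i32 slot per kernel, indexed by the kernel id carried in an SGPR
@dynlds.offset.table = internal addrspace(4) constant [2 x i32] zeroinitializer

define float @load_dynamic_lds(i32 %i) {
  %kid = call i32 @llvm.amdgcn.lds.kernel.id()
  %slot = getelementptr inbounds [2 x i32], ptr addrspace(4) @dynlds.offset.table, i32 0, i32 %kid
  %off = load i32, ptr addrspace(4) %slot, align 4
  ; LDS pointers are 32-bit, so the looked-up offset converts directly
  %base = inttoptr i32 %off to ptr addrspace(3)
  %gep = getelementptr inbounds float, ptr addrspace(3) %base, i32 %i
  %v = load float, ptr addrspace(3) %gep, align 4
  ret float %v
}

declare i32 @llvm.amdgcn.lds.kernel.id()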

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D144233
2023-04-04 20:06:34 +01:00

; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s

@lds0 = addrspace(3) global [512 x float] undef
@lds1 = addrspace(3) global [256 x float] undef
@lds2 = addrspace(3) global [4096 x float] undef
@lds3 = addrspace(3) global [67 x i8] undef

@dynamic_shared0 = external addrspace(3) global [0 x float]
@dynamic_shared1 = external addrspace(3) global [0 x double]
@dynamic_shared2 = external addrspace(3) global [0 x double], align 4
@dynamic_shared3 = external addrspace(3) global [0 x double], align 16
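
; Only @lds0 (512 x float = 0x800 bytes) is used by this kernel, so the
; dynamic shared array starts at offset 0x800.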
; CHECK-LABEL: {{^}}dynamic_shared_array_0:
; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
define amdgpu_kernel void @dynamic_shared_array_0(ptr addrspace(1) %out) {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i32 0, i32 %tid.x
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
  store float %val0, ptr addrspace(3) %arrayidx1, align 4
  ret void
}
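
; This kernel uses @lds0 and @lds1 (2048 + 1024 bytes of static LDS), so the
; dynamic shared array starts at offset 0xc00.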
; CHECK-LABEL: {{^}}dynamic_shared_array_1:
; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00
; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_1(ptr addrspace(1) %out, i32 %cond) {
entry:
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idx.0 = add nsw i32 %tid.x, 64
  %tmp = icmp eq i32 %cond, 0
  br i1 %tmp, label %if, label %else

if:                                               ; preds = %entry
  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  br label %endif

else:                                             ; preds = %entry
  %arrayidx1 = getelementptr inbounds [256 x float], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
  br label %endif

endif:                                            ; preds = %else, %if
  %val = phi float [ %val0, %if ], [ %val1, %else ]
  %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
  store float %val, ptr addrspace(3) %arrayidx, align 4
  ret void
}
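
; This kernel uses @lds2 (4096 x float = 0x4000 bytes), so the dynamic shared
; array starts at offset 0x4000.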
; CHECK-LABEL: {{^}}dynamic_shared_array_2:
; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000
; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %vidx = add i32 %tid.x, %idx
  %arrayidx0 = getelementptr inbounds [4096 x float], ptr addrspace(3) @lds2, i32 0, i32 %vidx
  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
  store float %val0, ptr addrspace(3) %arrayidx1, align 4
  ret void
}

; The offset to the dynamic shared memory array should be aligned to the
; type used to access it.
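; Here @lds3 occupies 67 bytes; rounding up to the 4-byte alignment of float
; places the dynamic shared array at 68 = 0x44.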
; CHECK-LABEL: {{^}}dynamic_shared_array_3:
; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %vidx = add i32 %tid.x, %idx
  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
  %val1 = uitofp i8 %val0 to float
  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
  store float %val1, ptr addrspace(3) %arrayidx1, align 4
  ret void
}

; The offset to the dynamic shared memory array should be aligned to the
; maximal alignment among the types used to access it.
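; The region is accessed as both float (align 4) and double (align 8), so the
; 67 bytes of @lds3 round up to 72 = 0x48.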
; CHECK-LABEL: {{^}}dynamic_shared_array_4:
; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %vidx = add i32 %tid.x, %idx
  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
  %val1 = uitofp i8 %val0 to float
  %val2 = uitofp i8 %val0 to double
  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
  store float %val1, ptr addrspace(3) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_shared1, i32 0, i32 %tid.x
  store double %val2, ptr addrspace(3) %arrayidx2, align 4
  ret void
}

; Honor the explicit alignment from the specified variable.
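; @dynamic_shared2 requests only align 4, so the 67 bytes of @lds3 round up
; to 68 = 0x44.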
; CHECK-LABEL: {{^}}dynamic_shared_array_5:
; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %vidx = add i32 %tid.x, %idx
  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
  %val1 = uitofp i8 %val0 to float
  %val2 = uitofp i8 %val0 to double
  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
  store float %val1, ptr addrspace(3) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_shared2, i32 0, i32 %tid.x
  store double %val2, ptr addrspace(3) %arrayidx2, align 4
  ret void
}

; Honor the explicit alignment from the specified variable.
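; @dynamic_shared3 requests align 16, so the 67 bytes of @lds3 round up to
; 80 = 0x50.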
; CHECK-LABEL: {{^}}dynamic_shared_array_6:
; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %vidx = add i32 %tid.x, %idx
  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
  %val1 = uitofp i8 %val0 to float
  %val2 = uitofp i8 %val0 to double
  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
  store float %val1, ptr addrspace(3) %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_shared3, i32 0, i32 %tid.x
  store double %val2, ptr addrspace(3) %arrayidx2, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()