Files
clang-p2996/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
Matt Arsenault c5fe075eaf AMDGPU: Use freeze poison instead of undef in alloca promotion (#131285)
Previously the value created to represent the uninitialized memory
of the alloca was undef. Use freeze poison instead. Enables some
optimization improvements (which need defeating in the limit tests),
but also a few regressions. Seems to leave behind dead code in some
cases too.
2025-03-18 17:27:02 +07:00

189 lines
6.6 KiB
LLVM

; Two opt invocations drive this test. Both run the pass pipeline
; amdgpu-promote-alloca, sroa, instcombine with
; -amdgpu-promote-alloca-to-vector-max-regs=64. The second invocation
; additionally passes -amdgpu-promote-alloca-to-vector-limit=32 and is verified
; with its own check prefix.
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=LIMIT32 %s
; "A5" makes address space 5 the address space of alloca results, which is why
; every alloca in this file is written with addrspace(5).
target datalayout = "A5"
; OPT-LABEL: @alloca_8xi64_max1024(
; OPT-NOT: alloca
; OPT: <8 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <8 x i64>
; Kernel using attribute group #0. An [8 x i64] addrspace(5) alloca has its
; first element set to 0, the element at dynamic position %index1 set to 1, and
; the element at dynamic position %index loaded and written to %out.
define amdgpu_kernel void @alloca_8xi64_max1024(ptr addrspace(1) %out, i32 %index, i32 %index1) #0 {
body:
  %stack = alloca [8 x i64], addrspace(5)
  ; Both element addresses are computed up front; the memory operations below
  ; keep their original relative order.
  %wr.slot = getelementptr [8 x i64], ptr addrspace(5) %stack, i32 0, i32 %index1
  %rd.slot = getelementptr [8 x i64], ptr addrspace(5) %stack, i32 0, i32 %index
  store i64 0, ptr addrspace(5) %stack
  store i64 1, ptr addrspace(5) %wr.slot
  %elt = load i64, ptr addrspace(5) %rd.slot
  store i64 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_9xi64_max1024(
; OPT: alloca [9 x i64]
; OPT-NOT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
; Kernel using attribute group #0. A [9 x i64] addrspace(5) alloca has its first
; element set to 0; the element at dynamic position %index is then loaded and
; written to %out.
define amdgpu_kernel void @alloca_9xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 {
body:
  %stack = alloca [9 x i64], addrspace(5)
  %rd.slot = getelementptr [9 x i64], ptr addrspace(5) %stack, i32 0, i32 %index
  store i64 0, ptr addrspace(5) %stack
  %elt = load i64, ptr addrspace(5) %rd.slot
  store i64 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_16xi64_max512(
; OPT-NOT: alloca
; OPT: <16 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <16 x i64>
; Kernel using attribute group #1. A [16 x i64] addrspace(5) alloca has its
; first element set to 0, the element at dynamic position %index1 set to 1, and
; the element at dynamic position %index loaded and written to %out.
define amdgpu_kernel void @alloca_16xi64_max512(ptr addrspace(1) %out, i32 %index, i32 %index1) #1 {
body:
  %stack = alloca [16 x i64], addrspace(5)
  ; NOTE(review): the source type here is [8 x i64] although the alloca is
  ; [16 x i64]. With a leading index of 0 only the i64 element stride enters the
  ; address computation, so the address is unaffected - presumably a copy-paste
  ; leftover; confirm before changing.
  %wr.slot = getelementptr [8 x i64], ptr addrspace(5) %stack, i32 0, i32 %index1
  %rd.slot = getelementptr [16 x i64], ptr addrspace(5) %stack, i32 0, i32 %index
  store i64 0, ptr addrspace(5) %stack
  store i64 1, ptr addrspace(5) %wr.slot
  %elt = load i64, ptr addrspace(5) %rd.slot
  store i64 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_17xi64_max512(
; OPT: alloca [17 x i64]
; OPT-NOT: <17 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <17 x i64>
; Kernel using attribute group #1. A [17 x i64] addrspace(5) alloca has its
; first element set to 0; the element at dynamic position %index is then loaded
; and written to %out.
define amdgpu_kernel void @alloca_17xi64_max512(ptr addrspace(1) %out, i32 %index) #1 {
body:
  %stack = alloca [17 x i64], addrspace(5)
  %rd.slot = getelementptr [17 x i64], ptr addrspace(5) %stack, i32 0, i32 %index
  store i64 0, ptr addrspace(5) %stack
  %elt = load i64, ptr addrspace(5) %rd.slot
  store i64 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_9xi128_max512(
; OPT: alloca [9 x i128]
; OPT-NOT: <9 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i128>
; Kernel using attribute group #1. A [9 x i128] addrspace(5) alloca has its
; first element set to 0; the element at dynamic position %index is then loaded
; and written to %out.
define amdgpu_kernel void @alloca_9xi128_max512(ptr addrspace(1) %out, i32 %index) #1 {
body:
  %stack = alloca [9 x i128], addrspace(5)
  %rd.slot = getelementptr [9 x i128], ptr addrspace(5) %stack, i32 0, i32 %index
  store i128 0, ptr addrspace(5) %stack
  %elt = load i128, ptr addrspace(5) %rd.slot
  store i128 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_9xi128_max256(
; OPT-NOT: alloca
; OPT: <9 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i128>
; Kernel using attribute group #2. A [9 x i128] addrspace(5) alloca has its
; first element set to 0, the element at dynamic position %index1 set to 1, and
; the element at dynamic position %index loaded and written to %out.
define amdgpu_kernel void @alloca_9xi128_max256(ptr addrspace(1) %out, i32 %index, i32 %index1) #2 {
body:
  %stack = alloca [9 x i128], addrspace(5)
  ; NOTE(review): the source type here is [8 x i128] although the alloca is
  ; [9 x i128]. With a leading index of 0 only the i128 element stride enters
  ; the address computation - presumably a copy-paste leftover; confirm.
  %wr.slot = getelementptr [8 x i128], ptr addrspace(5) %stack, i32 0, i32 %index1
  %rd.slot = getelementptr [9 x i128], ptr addrspace(5) %stack, i32 0, i32 %index
  store i128 0, ptr addrspace(5) %stack
  store i128 1, ptr addrspace(5) %wr.slot
  %elt = load i128, ptr addrspace(5) %rd.slot
  store i128 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_16xi128_max256(
; OPT-NOT: alloca
; OPT: <16 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <16 x i128>
; Kernel using attribute group #2. A [16 x i128] addrspace(5) alloca has its
; first element set to 0, the element at dynamic position %index1 set to 1, and
; the element at dynamic position %index loaded and written to %out.
define amdgpu_kernel void @alloca_16xi128_max256(ptr addrspace(1) %out, i32 %index, i32 %index1) #2 {
body:
  %stack = alloca [16 x i128], addrspace(5)
  ; NOTE(review): the source type here is [8 x i128] although the alloca is
  ; [16 x i128]. With a leading index of 0 only the i128 element stride enters
  ; the address computation - presumably a copy-paste leftover; confirm.
  %wr.slot = getelementptr [8 x i128], ptr addrspace(5) %stack, i32 0, i32 %index1
  %rd.slot = getelementptr [16 x i128], ptr addrspace(5) %stack, i32 0, i32 %index
  store i128 0, ptr addrspace(5) %stack
  store i128 1, ptr addrspace(5) %wr.slot
  %elt = load i128, ptr addrspace(5) %rd.slot
  store i128 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_9xi256_max256(
; OPT: alloca [9 x i256]
; OPT-NOT: <9 x i256>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i256>
; Kernel using attribute group #2. A [9 x i256] addrspace(5) alloca (9 * 256 =
; 2304 bits) has its first element set to 0, the element at dynamic position
; %index1 set to 1, and the element at dynamic position %index loaded and
; written to %out.
;
; Every access uses the i256 element type and the [9 x i256] array type. The
; dynamic store previously wrote an i128 through a GEP typed [8 x i256], which
; gave the pass a second, unrelated reason (an access narrower than the element
; type) to leave the alloca in place. With matching types, the surviving alloca
; expected by the directives above is presumably down to the array's size alone.
define amdgpu_kernel void @alloca_9xi256_max256(ptr addrspace(1) %out, i32 %index, i32 %index1) #2 {
entry:
  %tmp = alloca [9 x i256], addrspace(5)
  store i256 0, ptr addrspace(5) %tmp
  %gep0 = getelementptr [9 x i256], ptr addrspace(5) %tmp, i32 0, i32 %index1
  store i256 1, ptr addrspace(5) %gep0
  %tmp1 = getelementptr [9 x i256], ptr addrspace(5) %tmp, i32 0, i32 %index
  %tmp2 = load i256, ptr addrspace(5) %tmp1
  store i256 %tmp2, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alloca_9xi64_max256(
; OPT-NOT: alloca
; OPT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
; Kernel using attribute group #2. A [9 x i64] addrspace(5) alloca has its first
; element set to 0, the element at dynamic position %index1 set to 1, and the
; element at dynamic position %index loaded and written to %out.
define amdgpu_kernel void @alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index, i32 %index1) #2 {
body:
  %stack = alloca [9 x i64], addrspace(5)
  ; NOTE(review): the source type here is [8 x i64] although the alloca is
  ; [9 x i64]. With a leading index of 0 only the i64 element stride enters the
  ; address computation - presumably a copy-paste leftover; confirm.
  %wr.slot = getelementptr [8 x i64], ptr addrspace(5) %stack, i32 0, i32 %index1
  %rd.slot = getelementptr [9 x i64], ptr addrspace(5) %stack, i32 0, i32 %index
  store i64 0, ptr addrspace(5) %stack
  store i64 1, ptr addrspace(5) %wr.slot
  %elt = load i64, ptr addrspace(5) %rd.slot
  store i64 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @func_alloca_9xi64_max256(
; OPT: alloca
; OPT-NOT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
; Ordinary function (no amdgpu_kernel calling convention) using attribute group
; #2. A [9 x i64] addrspace(5) alloca has its first element set to 0; the
; element at dynamic position %index is then loaded and written to %out.
define void @func_alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {
body:
  %stack = alloca [9 x i64], addrspace(5)
  %rd.slot = getelementptr [9 x i64], ptr addrspace(5) %stack, i32 0, i32 %index
  store i64 0, ptr addrspace(5) %stack
  %elt = load i64, ptr addrspace(5) %rd.slot
  store i64 %elt, ptr addrspace(1) %out
  ret void
}
; OPT-LABEL: @alwaysinlined_func_alloca_9xi64_max256(
; OPT-NOT: alloca
; OPT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
; Ordinary function (no amdgpu_kernel calling convention) using attribute group
; #3, which carries alwaysinline. A [9 x i64] addrspace(5) alloca has its first
; element set to 0, the element at dynamic position %index1 set to 1, and the
; element at dynamic position %index loaded and written to %out.
define void @alwaysinlined_func_alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index, i32 %index1) #3 {
body:
  %stack = alloca [9 x i64], addrspace(5)
  ; NOTE(review): the source type here is [8 x i64] although the alloca is
  ; [9 x i64]. With a leading index of 0 only the i64 element stride enters the
  ; address computation - presumably a copy-paste leftover; confirm.
  %wr.slot = getelementptr [8 x i64], ptr addrspace(5) %stack, i32 0, i32 %index1
  %rd.slot = getelementptr [9 x i64], ptr addrspace(5) %stack, i32 0, i32 %index
  store i64 0, ptr addrspace(5) %stack
  store i64 1, ptr addrspace(5) %wr.slot
  %elt = load i64, ptr addrspace(5) %rd.slot
  store i64 %elt, ptr addrspace(1) %out
  ret void
}
; Attribute groups referenced by the functions in this file. Each sets the
; "amdgpu-flat-work-group-size" string attribute; the second number in the value
; matches the _maxNNN suffix of the functions that use the group.
; #0: value "1,1024", used by the *_max1024 kernels.
attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
; #1: value "1,512", used by the *_max512 kernels.
attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
; #2: value "1,256", used by the *_max256 kernels and by
; @func_alloca_9xi64_max256.
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
; #3: same value as #2 plus alwaysinline; used only by
; @alwaysinlined_func_alloca_9xi64_max256.
attributes #3 = { alwaysinline "amdgpu-flat-work-group-size"="1,256" }