Files
clang-p2996/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
Matt Arsenault b3d5056c79 AMDGPU: Try to perform copy to agpr from reg_sequence at the copy (#129463)
SIFoldOperands is frustratingly written in a def-folds-into-use
iteration pattern, with a few random cases starting at the uses.
We were handling this case by looking at the reg_sequence, and finding
the copy. This did not work for the most basic pattern of materializing
a vector constant that started in SGPRs. It just happens there is an
optimization bug in SelectionDAG that produced the expected pattern.

Perform an additional attempt at the fold rooted at the copy. This
mostly shows test improvements. There were some tricky updates to
perform. remaining-virtual-register-operands.ll managed to stop failing
the allocator, so it needed to be tricked into failing again. I also do
not understand what schedule-xdl-resource.ll is trying to test, so this
changes its checks to some random output that exists in the debug output.
2025-03-04 14:41:56 +07:00

44 lines
3.1 KiB
LLVM

; Test driver: llc reads this file from stdin, targets amdgcn / gfx908, and
; enables the machine-scheduler debug stream. stderr is merged into stdout so
; that FileCheck sees the debug output.
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=machine-scheduler -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope %s
; The -debug-only option is only usable with an assertions-enabled llc, hence
; the requirement on the next line.
; REQUIRES: asserts
; MFMA intrinsic called by the kernel below: two <4 x half> operands, one
; <32 x float> operand, three trailing i32 operands; returns <32 x float>.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
; Expected debug output: a "Scheduling SU(<n>)" line that names
; V_MFMA_F32_32X32X4F16, followed later by a line containing "HWXDL +16x1u".
; NOTE(review): presumably HWXDL is the XDL resource of the gfx908 scheduling
; model and +16x1u its reported usage for this instruction -- confirm against
; the scheduler's debug dump format.
; CHECK: Scheduling SU({{[0-9]+}}) {{.*}} V_MFMA_F32_32X32X4F16
; CHECK: HWXDL +16x1u
define amdgpu_kernel void @schedule-xdl-resource(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %stride) #0 {
  ; Phase 1: load three <32 x float> values through a GEP chain rooted at %in.
  ; Each link advances the previous pointer by %stride vector-sized elements.
  %src.ptr.1 = getelementptr <32 x float>, ptr addrspace(1) %in, i32 %stride
  %src.ptr.2 = getelementptr <32 x float>, ptr addrspace(1) %src.ptr.1, i32 %stride
  %src.ptr.3 = getelementptr <32 x float>, ptr addrspace(1) %src.ptr.2, i32 %stride
  %src.val.1 = load <32 x float>, ptr addrspace(1) %src.ptr.1
  %src.val.2 = load <32 x float>, ptr addrspace(1) %src.ptr.2
  %src.val.3 = load <32 x float>, ptr addrspace(1) %src.ptr.3

  ; Phase 2: load three <4 x half> values the same way from %lds
  ; (address space 3).
  %lds.ptr.1 = getelementptr <4 x half>, ptr addrspace(3) %lds, i32 %stride
  %lds.ptr.2 = getelementptr <4 x half>, ptr addrspace(3) %lds.ptr.1, i32 %stride
  %lds.ptr.3 = getelementptr <4 x half>, ptr addrspace(3) %lds.ptr.2, i32 %stride
  %lds.val.1 = load <4 x half>, ptr addrspace(3) %lds.ptr.1
  %lds.val.2 = load <4 x half>, ptr addrspace(3) %lds.ptr.2
  %lds.val.3 = load <4 x half>, ptr addrspace(3) %lds.ptr.3

  ; Phase 3: six MFMA calls. Call k and call k+3 consume the same loaded
  ; values; the <4 x half> value is passed as both of the first two operands.
  ; The first three calls pass 1 for every trailing i32 operand, the last
  ; three pass 2.
  %mfma.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.val.1, <4 x half> %lds.val.1, <32 x float> %src.val.1, i32 1, i32 1, i32 1)
  %mfma.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.val.2, <4 x half> %lds.val.2, <32 x float> %src.val.2, i32 1, i32 1, i32 1)
  %mfma.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.val.3, <4 x half> %lds.val.3, <32 x float> %src.val.3, i32 1, i32 1, i32 1)
  %mfma.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.val.1, <4 x half> %lds.val.1, <32 x float> %src.val.1, i32 2, i32 2, i32 2)
  %mfma.5 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.val.2, <4 x half> %lds.val.2, <32 x float> %src.val.2, i32 2, i32 2, i32 2)
  %mfma.6 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.val.3, <4 x half> %lds.val.3, <32 x float> %src.val.3, i32 2, i32 2, i32 2)

  ; Phase 4: store every MFMA result through its own link of a six-deep GEP
  ; chain rooted at %out, so all six results stay live until written.
  %dst.ptr.1 = getelementptr <32 x float>, ptr addrspace(1) %out, i32 %stride
  %dst.ptr.2 = getelementptr <32 x float>, ptr addrspace(1) %dst.ptr.1, i32 %stride
  %dst.ptr.3 = getelementptr <32 x float>, ptr addrspace(1) %dst.ptr.2, i32 %stride
  %dst.ptr.4 = getelementptr <32 x float>, ptr addrspace(1) %dst.ptr.3, i32 %stride
  %dst.ptr.5 = getelementptr <32 x float>, ptr addrspace(1) %dst.ptr.4, i32 %stride
  %dst.ptr.6 = getelementptr <32 x float>, ptr addrspace(1) %dst.ptr.5, i32 %stride
  store <32 x float> %mfma.1, ptr addrspace(1) %dst.ptr.1
  store <32 x float> %mfma.2, ptr addrspace(1) %dst.ptr.2
  store <32 x float> %mfma.3, ptr addrspace(1) %dst.ptr.3
  store <32 x float> %mfma.4, ptr addrspace(1) %dst.ptr.4
  store <32 x float> %mfma.5, ptr addrspace(1) %dst.ptr.5
  store <32 x float> %mfma.6, ptr addrspace(1) %dst.ptr.6
  ret void
}
; Attribute group #0, attached to the kernel definition: nounwind plus the
; target string attribute "amdgpu-waves-per-eu" with value "1,1".
; NOTE(review): presumably "1,1" pins the min,max waves per execution unit to
; one -- confirm against the AMDGPU backend documentation.
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }