Files
clang-p2996/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
pvanhout d892521076 [AMDGPU] Break-up large PHIs for DAGISel
DAGISel uses CopyToReg/CopyFromReg to lower PHI nodes. With large PHIs, this can result in poor codegen.
This is because it introduces a need to have a build_vector before copying the PHI value, and that build_vector may have many undef elements. This can cause very high register pressure and abnormal stack usage in some cases.

This scalarization/phi "break-up" can be easily tuned/disabled through CL options in case it's not beneficial for some users.
It's also only enabled for DAGISel, as GlobalISel handles PHIs much better (it works on the whole function).

This can both scalarize (break a vector into its elements) and simplify (break a vector into smaller, more manageable subvectors) PHIs.

Fixes SWDEV-321581

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D143731
2023-03-28 09:38:47 +02:00

457 lines
12 KiB
LLVM

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -amdgpu-codegenprepare-break-large-phis=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Note: breaking up large PHIs is disabled to prevent some testcases from becoming
; branchless.
; FIXME: This leaves behind a now unnecessary and with exec
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
; GCN: buffer_store_dword [[RESULT]]

; Triangle CFG: %if only doubles the loaded float, and %endif merges the
; original and the doubled value with a phi before storing to %out. The checks
; above expect a compare + v_cndmask_b32 select feeding the store.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %loaded = load float, ptr addrspace(1) %in
  %is.one = fcmp oeq float %loaded, 1.000000e+00
  br i1 %is.one, label %if, label %endif

if:
  %doubled = fadd float %loaded, %loaded
  br label %endif

endif:
  %merged = phi float [ %loaded, %entry ], [ %doubled, %if ]
  store float %merged, ptr addrspace(1) %out
  ret void
}
; Diamond CFG: %if computes v + v, %else computes v * v, and %endif merges the
; two with a phi. The select check below binds RESULT for this function; without
; it the final store check would reuse the RESULT captured by an earlier test
; and would not verify that the stored value is the select of ADD and MUL.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_diamond(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %v = load float, ptr addrspace(1) %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %else

if:
  ; Taken when %v == 1.0.
  %u0 = fadd float %v, %v
  br label %endif

else:
  ; Taken when the ordered-equal compare is false (vcc set by the inverted
  ; v_cmp_neq in the expected output), hence MUL is the vcc-selected operand.
  %u1 = fmul float %v, %v
  br label %endif

endif:
  %r = phi float [ %u0, %if ], [ %u1, %else ]
  store float %r, ptr addrspace(1) %out
  ret void
}
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
; GCN: ; clobber vcc
; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
; GCN: s_mov_b64 vcc, [[CMP]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc

; Triangle whose %if block contains an inline asm that clobbers vcc. The checks
; above expect the compare result to be produced into an SGPR pair and copied
; into vcc (s_mov_b64) before the select consumes it.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(ptr addrspace(1) %out, ptr addrspace(1) %in, float %k) #0 {
entry:
  %loaded = load i32, ptr addrspace(1) %in
  %k.is.one = fcmp oeq float %k, 1.000000e+00
  br i1 %k.is.one, label %if, label %endif

if:
  call void asm "; clobber $0", "~{vcc}"() #0
  %doubled = add i32 %loaded, %loaded
  br label %endif

endif:
  %merged = phi i32 [ %loaded, %entry ], [ %doubled, %if ]
  store i32 %merged, ptr addrspace(1) %out
  ret void
}
; Longest chain of cheap instructions to convert
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_cndmask_b32_e32

; %if holds a dependent chain of nine fmuls (x^2 through x^10); the checks above
; expect nine v_mul_f32 followed by a select.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %x = load float, ptr addrspace(1) %in
  %is.one = fcmp oeq float %x, 1.000000e+00
  br i1 %is.one, label %if, label %endif

if:
  %p2 = fmul float %x, %x
  %p3 = fmul float %x, %p2
  %p4 = fmul float %x, %p3
  %p5 = fmul float %x, %p4
  %p6 = fmul float %x, %p5
  %p7 = fmul float %x, %p6
  %p8 = fmul float %x, %p7
  %p9 = fmul float %x, %p8
  %p10 = fmul float %x, %p9
  br label %endif

endif:
  %merged = phi float [ %x, %entry ], [ %p10, %if ]
  store float %merged, ptr addrspace(1) %out
  ret void
}
; Shortest chain of cheap instructions that is too expensive to convert
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword

; %if holds a dependent chain of ten fmuls (x^2 through x^11); the checks above
; expect the branch to be kept, with the ten v_mul_f32 before the join label.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %x = load float, ptr addrspace(1) %in
  %is.one = fcmp oeq float %x, 1.000000e+00
  br i1 %is.one, label %if, label %endif

if:
  %p2 = fmul float %x, %x
  %p3 = fmul float %x, %p2
  %p4 = fmul float %x, %p3
  %p5 = fmul float %x, %p4
  %p6 = fmul float %x, %p5
  %p7 = fmul float %x, %p6
  %p8 = fmul float %x, %p7
  %p9 = fmul float %x, %p8
  %p10 = fmul float %x, %p9
  %p11 = fmul float %x, %p10
  br label %endif

endif:
  %merged = phi float [ %x, %entry ], [ %p11, %if ]
  store float %merged, ptr addrspace(1) %out
  ret void
}
; Should still branch over fdiv expansion
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
; GCN: v_cmp_neq_f32_e32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_div_scale_f32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword

; %if contains a single fdiv; the checks above expect its expansion
; (v_div_scale_f32 ...) to stay behind an s_cbranch_vccnz.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %loaded = load float, ptr addrspace(1) %in
  %is.one = fcmp oeq float %loaded, 1.000000e+00
  br i1 %is.one, label %if, label %endif

if:
  %quotient = fdiv float %loaded, %loaded
  br label %endif

endif:
  %merged = phi float [ %loaded, %entry ], [ %quotient, %if ]
  store float %merged, ptr addrspace(1) %out
  ret void
}
; vcc branch with SGPR inputs
; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
; GCN: v_cmp_neq_f32_e64
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: s_add_i32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword

; The i32 operand is loaded from addrspace(4) and the condition is a float
; compare on a kernel argument; the checks above expect the branch to be kept
; around a scalar add.
define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(4) %in, float %cnd) #0 {
entry:
  %loaded = load i32, ptr addrspace(4) %in
  %cnd.is.one = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cnd.is.one, label %if, label %endif

if:
  %doubled = add i32 %loaded, %loaded
  br label %endif

endif:
  %merged = phi i32 [ %loaded, %entry ], [ %doubled, %if ]
  store i32 %merged, ptr addrspace(1) %out
  ret void
}
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; GCN: v_cndmask_b32

; Same float triangle as the basic case, but the value is loaded from
; addrspace(4); the check above expects a v_cndmask_b32 select.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
  %loaded = load float, ptr addrspace(4) %in
  %is.one = fcmp oeq float %loaded, 1.000000e+00
  br i1 %is.one, label %if, label %endif

if:
  %doubled = fadd float %loaded, %loaded
  br label %endif

endif:
  %merged = phi float [ %loaded, %entry ], [ %doubled, %if ]
  store float %merged, ptr addrspace(1) %out
  ret void
}
; NOTE(review): an earlier version of this comment said a broken cost heuristic
; prevented if-conversion here; the check below expects v_cndmask_b32, i.e. the
; same select form as test_vccnz_ifcvt_triangle_constant_load.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; GCN: v_cndmask_b32

; Float triangle whose input is the kernel argument %v rather than a value
; loaded through a pointer; the check above expects a v_cndmask_b32 select.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(ptr addrspace(1) %out, float %v) #0 {
entry:
  %is.one = fcmp oeq float %v, 1.000000e+00
  br i1 %is.one, label %if, label %endif

if:
  %doubled = fadd float %v, %v
  br label %endif

endif:
  %merged = phi float [ %v, %entry ], [ %doubled, %if ]
  store float %merged, ptr addrspace(1) %out
  ret void
}
; Scalar branch and scalar inputs
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]

; Integer compare on a kernel argument with an i32 loaded from addrspace(4).
; The merged value is consumed by an inline asm with an "s" constraint; the
; checks above expect s_cmp_lg_u32 immediately followed by s_cselect_b32.
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %loaded = load i32, ptr addrspace(4) %in
  %cond.is.one = icmp eq i32 %cond, 1
  br i1 %cond.is.one, label %if, label %endif

if:
  %doubled = add i32 %loaded, %loaded
  br label %endif

endif:
  %merged = phi i32 [ %loaded, %entry ], [ %doubled, %if ]
  call void asm sideeffect "; reg use $0", "s"(i32 %merged) #0
  ret void
}
; FIXME: Should be able to use VALU compare and select
; Scalar branch but VGPR select operands
; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_add_f32_e32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword

; Integer compare on a kernel argument, but the merged values are floats loaded
; from addrspace(1); the checks above expect the s_cbranch_scc1 to be kept
; around the v_add_f32.
define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
entry:
  %loaded = load float, ptr addrspace(1) %in
  %cond.is.one = icmp eq i32 %cond, 1
  br i1 %cond.is.one, label %if, label %endif

if:
  %doubled = fadd float %loaded, %loaded
  br label %endif

endif:
  %merged = phi float [ %loaded, %entry ], [ %doubled, %if ]
  store float %merged, ptr addrspace(1) %out
  ret void
}
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}

; i64 variant of the scalar triangle: the checks above expect a 64-bit add
; (s_add_u32 + s_addc_u32) and a single s_cselect_b64 right after the compare.
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %loaded = load i64, ptr addrspace(4) %in
  %cond.is.one = icmp eq i32 %cond, 1
  br i1 %cond.is.one, label %if, label %endif

if:
  %doubled = add i64 %loaded, %loaded
  br label %endif

endif:
  %merged = phi i64 [ %loaded, %entry ], [ %doubled, %if ]
  call void asm sideeffect "; reg use $0", "s"(i64 %merged) #0
  ret void
}
; TODO: Can do s_cselect_b64; s_cselect_b32
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s

; <3 x i32> variant of the scalar triangle: the checks above expect three
; s_add_i32 and three consecutive s_cselect_b32 right after the compare.
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %vec = load <3 x i32>, ptr addrspace(4) %in
  %cond.is.one = icmp eq i32 %cond, 1
  br i1 %cond.is.one, label %if, label %endif

if:
  %doubled = add <3 x i32> %vec, %vec
  br label %endif

endif:
  %merged = phi <3 x i32> [ %vec, %entry ], [ %doubled, %if ]
  ; Widen to <4 x i32> for the inline asm operand; mask index 3 selects from the
  ; undef second operand.
  %widened = shufflevector <3 x i32> %merged, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %widened) #0
  ret void
}
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}

; <4 x i32> variant of the scalar triangle: the checks above expect four
; s_add_i32 and two consecutive s_cselect_b64 right after the compare.
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %vec = load <4 x i32>, ptr addrspace(4) %in
  %cond.is.one = icmp eq i32 %cond, 1
  br i1 %cond.is.one, label %if, label %endif

if:
  %doubled = add <4 x i32> %vec, %vec
  br label %endif

endif:
  %merged = phi <4 x i32> [ %vec, %entry ], [ %doubled, %if ]
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %merged) #0
  ret void
}
; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}

; Diamond with two empty arms: the true edge of the compare goes to %else and
; the phi in %done picks the constant 0 or 1. The checks above expect a scalar
; compare and an s_cselect_b32 between the two immediates.
define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, ptr addrspace(1) %out) {
entry:
  %cond.is.zero = icmp eq i32 %cond, 0
  br i1 %cond.is.zero, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %picked = phi i32 [ 0, %if ], [ 1, %else ]
  store i32 %picked, ptr addrspace(1) %out
  ret void
}
; GCN-LABEL: {{^}}ifcvt_undef_scc:
; GCN: {{^}}; %bb.0:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}

; Same empty-armed diamond, but the branch condition is undef. The checks above
; expect the s_cselect_b32 directly after the argument load in the first block,
; with no compare in between.
define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, ptr addrspace(1) %out) {
entry:
  br i1 undef, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %picked = phi i32 [ 0, %if ], [ 1, %else ]
  store i32 %picked, ptr addrspace(1) %out
  ret void
}
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword

; Triangle merging an <8 x i32> value; the checks above expect the
; s_cbranch_vccnz to be kept around the vector adds.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
entry:
  %vec = load <8 x i32>, ptr addrspace(1) %in
  %cnd.is.one = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cnd.is.one, label %if, label %endif

if:
  %doubled = add <8 x i32> %vec, %vec
  br label %endif

endif:
  %merged = phi <8 x i32> [ %vec, %entry ], [ %doubled, %if ]
  store <8 x i32> %merged, ptr addrspace(1) %out
  ret void
}
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: [[ENDIF]]:
; GCN: buffer_store_dword

; Triangle merging a <16 x i32> value; the checks above expect the
; s_cbranch_vccnz to be kept around the vector adds.
define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
entry:
  %vec = load <16 x i32>, ptr addrspace(1) %in
  %cnd.is.one = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cnd.is.one, label %if, label %endif

if:
  %doubled = add <16 x i32> %vec, %vec
  br label %endif

endif:
  %merged = phi <16 x i32> [ %vec, %entry ], [ %doubled, %if ]
  store <16 x i32> %merged, ptr addrspace(1) %out
  ret void
}
; Attribute group referenced as #0 by the kernel definitions and inline-asm
; call sites in this file; it only adds nounwind.
attributes #0 = { nounwind }