DAGISel uses CopyToReg/CopyFromReg to lower PHI nodes. With large PHIs this can result in poor codegen, because it requires a build_vector before copying the PHI value, and that build_vector may have many undef elements. This can cause very high register pressure and abnormal stack usage in some cases.

This scalarization/PHI break-up can easily be tuned or disabled through command-line options in case it is not beneficial for some users. It is also only enabled for DAGISel; GlobalISel handles PHIs much better, as it works on the whole function. The pass can both scalarize (break a vector into its elements) and simplify (break a vector into smaller, more manageable subvectors) PHIs.

Fixes SWDEV-321581

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D143731
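As a rough sketch of what the break-up does (hypothetical IR written for illustration; the function names, the <4 x float> width, and the exact instruction order are assumptions, not this pass's literal output), a single wide PHI like the one in @large_phi_before is rewritten into per-element PHIs, with extractelements in the incoming blocks and insertelements at the merge point, as in @large_phi_after:

define <4 x float> @large_phi_before(i1 %cc, <4 x float> %a, <4 x float> %b) {
entry:
  br i1 %cc, label %if, label %endif

if:
  br label %endif

endif:
  ; One wide PHI; DAGISel lowers this via build_vector + CopyToReg/CopyFromReg.
  %r = phi <4 x float> [ %a, %entry ], [ %b, %if ]
  ret <4 x float> %r
}

; Hypothetical post-break-up form (fully scalarized variant).
define <4 x float> @large_phi_after(i1 %cc, <4 x float> %a, <4 x float> %b) {
entry:
  ; Incoming vectors are split in their incoming blocks...
  %a0 = extractelement <4 x float> %a, i64 0
  %a1 = extractelement <4 x float> %a, i64 1
  %a2 = extractelement <4 x float> %a, i64 2
  %a3 = extractelement <4 x float> %a, i64 3
  br i1 %cc, label %if, label %endif

if:
  %b0 = extractelement <4 x float> %b, i64 0
  %b1 = extractelement <4 x float> %b, i64 1
  %b2 = extractelement <4 x float> %b, i64 2
  %b3 = extractelement <4 x float> %b, i64 3
  br label %endif

endif:
  ; ...and the wide PHI becomes one scalar PHI per element; the full vector
  ; is only rebuilt where it is actually needed.
  %r0 = phi float [ %a0, %entry ], [ %b0, %if ]
  %r1 = phi float [ %a1, %entry ], [ %b1, %if ]
  %r2 = phi float [ %a2, %entry ], [ %b2, %if ]
  %r3 = phi float [ %a3, %entry ], [ %b3, %if ]
  %v0 = insertelement <4 x float> poison, float %r0, i64 0
  %v1 = insertelement <4 x float> %v0, float %r1, i64 1
  %v2 = insertelement <4 x float> %v1, float %r2, i64 2
  %v3 = insertelement <4 x float> %v2, float %r3, i64 3
  ret <4 x float> %v3
}

The "simplify" mode mentioned above would instead split the value into smaller subvector PHIs rather than fully scalarizing it; the sketch only shows the scalarized form.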
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -amdgpu-codegenprepare-break-large-phis=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Note: breaking up large PHIs is disabled to prevent some testcases from becoming
; branchless.

; FIXME: This leaves behind a now-unnecessary 'and' with exec

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %v = load float, ptr addrspace(1) %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_diamond(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %v = load float, ptr addrspace(1) %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %else

if:
  %u0 = fadd float %v, %v
  br label %endif

else:
  %u1 = fmul float %v, %v
  br label %endif

endif:
  %r = phi float [ %u0, %if ], [ %u1, %else ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
; GCN: ; clobber vcc
; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
; GCN: s_mov_b64 vcc, [[CMP]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(ptr addrspace(1) %out, ptr addrspace(1) %in, float %k) #0 {
entry:
  %v = load i32, ptr addrspace(1) %in
  %cc = fcmp oeq float %k, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  call void asm "; clobber $0", "~{vcc}"() #0
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, ptr addrspace(1) %out
  ret void
}

; Longest chain of cheap instructions to convert
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %v = load float, ptr addrspace(1) %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.8, %if ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; Short chain of cheap instructions to not convert
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %v = load float, ptr addrspace(1) %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  %u.9 = fmul float %v, %u.8
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.9, %if ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; Should still branch over fdiv expansion
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
; GCN: v_cmp_neq_f32_e32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: v_div_scale_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
  %v = load float, ptr addrspace(1) %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fdiv float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; vcc branch with SGPR inputs
; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
; GCN: v_cmp_neq_f32_e64
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: s_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(4) %in, float %cnd) #0 {
entry:
  %v = load i32, ptr addrspace(4) %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
entry:
  %v = load float, ptr addrspace(4) %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; Due to a broken cost heuristic, this is not if-converted like
; test_vccnz_ifcvt_triangle_constant_load even though it should be.

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(ptr addrspace(1) %out, float %v) #0 {
entry:
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; Scalar branch and scalar inputs
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %v = load i32, ptr addrspace(4) %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
  ret void
}

; FIXME: Should be able to use VALU compare and select
; Scalar branch but VGPR select operands
; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: v_add_f32_e32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
entry:
  %v = load float, ptr addrspace(1) %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %v = load i64, ptr addrspace(4) %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i64 %v, %v
  br label %endif

endif:
  %r = phi i64 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
  ret void
}

; TODO: Can do s_cselect_b64; s_cselect_b32
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %v = load <3 x i32>, ptr addrspace(4) %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <3 x i32> %v, %v
  br label %endif

endif:
  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
  %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
  ret void
}

; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(ptr addrspace(4) %in, i32 %cond) #0 {
entry:
  %v = load <4 x i32>, ptr addrspace(4) %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <4 x i32> %v, %v
  br label %endif

endif:
  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
  ret void
}

; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, ptr addrspace(1) %out) {
entry:
  %cmp0 = icmp eq i32 %cond, 0
  br i1 %cmp0, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}ifcvt_undef_scc:
; GCN: {{^}}; %bb.0:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, ptr addrspace(1) %out) {
entry:
  br i1 undef, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
entry:
  %v = load <8 x i32>, ptr addrspace(1) %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <8 x i32> %v, %v
  br label %endif

endif:
  %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
  store <8 x i32> %r, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
entry:
  %v = load <16 x i32>, ptr addrspace(1) %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <16 x i32> %v, %v
  br label %endif

endif:
  %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
  store <16 x i32> %r, ptr addrspace(1) %out
  ret void
}

attributes #0 = { nounwind }