Files
clang-p2996/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
Hans Wennborg a45f301f7a Revert r368339 "[MBP] Disable aggressive loop rotate in plain mode"
It caused assertions to fire when building Chromium:

  lib/CodeGen/LiveDebugValues.cpp:331: bool
  {anonymous}::LiveDebugValues::OpenRangesSet::empty() const: Assertion
  `Vars.empty() == VarLocs.empty() && "open ranges are inconsistent"' failed.

See https://crbug.com/992871#c3 for how to reproduce.

> Patch https://reviews.llvm.org/D43256 introduced more aggressive loop layout optimization which depends on profile information. If profile information is not available, the statically estimated profile information(generated by BranchProbabilityInfo.cpp) is used. If user program doesn't behave as BranchProbabilityInfo.cpp expected, the layout may be worse.
>
> To be conservative this patch restores the original layout algorithm in plain mode. But user can still try the aggressive layout optimization with -force-precise-rotation-cost=true.
>
> Differential Revision: https://reviews.llvm.org/D65673

llvm-svn: 368579
2019-08-12 14:23:13 +00:00

57 lines
1.9 KiB
LLVM

; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}i1_copy_from_loop:
;
; SI: ; %Flow
; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], exec
; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
; SI: ; %for.body
; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
; SI-DAG: s_andn2_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
; SI: ; %Flow1
; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
; SI: ; %for.end
; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
entry:
br label %for.body
for.body:
%i = phi i32 [0, %entry], [%i.inc, %end.loop]
%cc = icmp ult i32 %i, 4
br i1 %cc, label %mid.loop, label %for.end
mid.loop:
%v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %tid, i32 %i, i1 false, i1 false)
%cc2 = fcmp oge float %v, 0.0
br i1 %cc2, label %end.loop, label %for.end
end.loop:
%i.inc = add i32 %i, 1
br label %for.body
for.end:
br i1 %cc, label %if, label %end
if:
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float undef, float undef, float undef, i1 true, i1 true)
br label %end
end:
ret void
}
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
attributes #0 = { nounwind readonly }
attributes #1 = { nounwind inaccessiblememonly }