Files
clang-p2996/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll
Florian Hahn 90d09eb300 [LoopPeel] Allow peeling with multiple unreachable-terminated exit blocks.
Support for peeling with multiple exit blocks was added in D63921/77bb3a486fa6.

So far it has only been enabled for loops where all non-latch exits are
'de-optimizing' exits (D63923). But peeling of multi-exit loops can be
highly beneficial in other cases too, like if all non-latch exiting
blocks are unreachable.

The motivating case are loops with runtime checks, like the C++ example
below. The main issue preventing vectorization is that the invariant
accesses to load the bounds of B is conditionally executed in the loop
and cannot be hoisted out. If we peel off the first iteration, they
become dereferenceable in the loop, because they must execute before the
loop is executed, as all non-latch exits are terminated with
unreachable. This subsequently allows hoisting the loads and runtime
checks out of the loop, allowing vectorization of the loop.

     int sum(std::vector<int> *A, std::vector<int> *B, int N) {
       int cost = 0;
       for (int i = 0; i < N; ++i)
         cost += A->at(i) + B->at(i);
       return cost;
     }

This gives a ~20-30% increase of score for Geekbench5/HDR on AArch64.

Note that this requires a follow-up improvement to the peeling cost
model to actually peel iterations off loops as above. I will share that
shortly.

Also, peeling of multi-exits might be beneficial for exit blocks with
other terminators, but I would like to keep the scope limited to known
high-reward cases for now.

I removed the option to disable peeling for multi-deopt exits because
the code is more general now. Alternatively, the option could also be
generalized, but I am not sure if there's much value in the option?

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D108108
2021-08-25 13:26:40 +01:00

91 lines
3.6 KiB
LLVM

; REQUIRES: asserts
; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll -unroll-runtime 2>&1 | FileCheck %s
; RUN: opt < %s -S -debug-only=loop-unroll -passes='require<profile-summary>,function(require<opt-remark-emit>,loop-unroll)' 2>&1 | FileCheck %s
; RUN: opt < %s -S -debug-only=loop-unroll -passes='require<profile-summary>,function(require<opt-remark-emit>,loop-unroll<no-profile-peeling>)' 2>&1 | FileCheck %s --check-prefixes=CHECK-NO-PEEL
; Make sure we use the profile information correctly to peel-off 3 iterations
; from the loop, and update the branch weights for the peeled loop properly.
; All side exits to deopt does not change weigths.
; CHECK: Loop Unroll: F[basic]
; CHECK: PEELING loop %for.body with iteration count 4!
; CHECK-NO-PEEL-NOT: PEELING loop %for.body
; CHECK-LABEL: @basic
; CHECK: br i1 %c, label %{{.*}}, label %side_exit, !prof !15
; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !16
; CHECK: [[NEXT0]]:
; CHECK: br i1 %c, label %{{.*}}, label %side_exit, !prof !15
; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !17
; CHECK: [[NEXT1]]:
; CHECK: br i1 %c, label %{{.*}}, label %side_exit, !prof !15
; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !18
; CHECK: [[NEXT2]]:
; CHECK: br i1 %c, label %{{.*}}, label %side_exit.loopexit, !prof !15
; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !19
define i32 @basic(i32* %p, i32 %k, i1 %c) #0 !prof !15 {
entry:
%cmp3 = icmp slt i32 0, %k
br i1 %cmp3, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.body
%i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %continue ]
%p.addr.04 = phi i32* [ %p, %for.body.lr.ph ], [ %incdec.ptr, %continue ]
%incdec.ptr = getelementptr inbounds i32, i32* %p.addr.04, i32 1
store i32 %i.05, i32* %p.addr.04, align 4
%inc = add nsw i32 %i.05, 1
%cmp = icmp slt i32 %inc, %k
br i1 %c, label %continue, label %side_exit, !prof !17
continue:
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !16
for.cond.for.end_crit_edge: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
%res = phi i32 [ 0, %entry ], [ %inc, %for.cond.for.end_crit_edge ]
ret i32 %res
side_exit:
%rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %inc) ]
ret i32 %rval
}
declare i32 @llvm.experimental.deoptimize.i32(...)
attributes #0 = { nounwind }
attributes #1 = { nounwind optsize }
!llvm.module.flags = !{!1}
!1 = !{i32 1, !"ProfileSummary", !2}
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
!3 = !{!"ProfileFormat", !"InstrProf"}
!4 = !{!"TotalCount", i64 10}
!5 = !{!"MaxCount", i64 3}
!6 = !{!"MaxInternalCount", i64 1}
!7 = !{!"MaxFunctionCount", i64 3}
!8 = !{!"NumCounts", i64 2}
!9 = !{!"NumFunctions", i64 2}
!10 = !{!"DetailedSummary", !11}
!11 = !{!12, !13, !14}
!12 = !{i32 10000, i64 3, i32 2}
!13 = !{i32 999000, i64 1, i32 10}
!14 = !{i32 999999, i64 1, i32 10}
!15 = !{!"function_entry_count", i64 1}
!16 = !{!"branch_weights", i32 3001, i32 1001}
!17 = !{!"branch_weights", i32 1, i32 0}
; This is a weights of deopt side-exit.
;CHECK: !15 = !{!"branch_weights", i32 1, i32 0}
; This is a weights of latch and its copies.
;CHECK: !16 = !{!"branch_weights", i32 3001, i32 1001}
;CHECK: !17 = !{!"branch_weights", i32 2000, i32 1001}
;CHECK: !18 = !{!"branch_weights", i32 999, i32 1001}
;CHECK: !19 = !{!"branch_weights", i32 1, i32 1001}