Loop rotation often has to perform code duplication from header into preheader, which introduces PHI nodes. >>! In D99204, @thopre wrote: > > With loop peeling, it is important that unnecessary PHIs be avoided or > it will leads to spurious peeling. One source of such PHIs is loop > rotation which creates PHIs for invariant loads. Those PHIs are > particularly problematic since loop peeling is now run as part of simple > loop unrolling before GVN is run, and are thus a source of spurious > peeling. > > Note that while some of the load can be hoisted and eventually > eliminated by instruction combine, this is not always possible due to > alignment issue. In particular, the motivating example [1] was a load > inside a class instance which cannot be hoisted because the `this' > pointer has an alignment of 1. > > [1] http://lists.llvm.org/pipermail/llvm-dev/attachments/20210312/4ce73c47/attachment.cpp Now, we could enhance LoopRotate to avoid duplicating code when not needed, but instead hoist loop-invariant code, but isn't that a code duplication? (*sic*) We have LICM, and in fact we already run it right after LoopRotation. We could try to move it to before LoopRotation, that is basically free from compile-time perspective: https://llvm-compile-time-tracker.com/compare.php?from=6c93eb4477d88af046b915bc955c03693b2cbb58&to=a4bee6d07732b1184c436da489040b912f0dc271&stat=instructions But, looking at stats, i think it isn't great that we would no longer do LICM after LoopRotation, in particular: | statistic name | LoopRotate-LICM | LICM-LoopRotate | Δ | % | abs(%) | | asm-printer.EmittedInsts | 9015930 | 9015799 | -131 | 0.00% | 0.00% | | indvars.NumElimCmp | 3536 | 3544 | 8 | 0.23% | 0.23% | | indvars.NumElimExt | 36725 | 36580 | -145 | -0.39% | 0.39% | | indvars.NumElimIV | 1197 | 1187 | -10 | -0.84% | 0.84% | | indvars.NumElimIdentity | 143 | 136 | -7 | -4.90% | 4.90% | | indvars.NumElimRem | 4 | 5 | 1 | 25.00% | 25.00% | | indvars.NumLFTR | 29842 | 29890 | 48 | 0.16% | 0.16% | | indvars.NumReplaced | 2293 | 2227 | -66 | -2.88% | 2.88% | | indvars.NumSimplifiedSDiv | 6 | 8 | 2 | 33.33% | 33.33% | | indvars.NumWidened | 26438 | 26329 | -109 | -0.41% | 0.41% | | instcount.TotalBlocks | 1178338 | 1173840 | -4498 | -0.38% | 0.38% | | instcount.TotalFuncs | 111825 | 111829 | 4 | 0.00% | 0.00% | | instcount.TotalInsts | 9905442 | 9896139 | -9303 | -0.09% | 0.09% | | lcssa.NumLCSSA | 425871 | 423961 | -1910 | -0.45% | 0.45% | | licm.NumHoisted | 378357 | 378753 | 396 | 0.10% | 0.10% | | licm.NumMovedCalls | 2193 | 2208 | 15 | 0.68% | 0.68% | | licm.NumMovedLoads | 35899 | 31821 | -4078 | -11.36% | 11.36% | | licm.NumPromoted | 11178 | 11154 | -24 | -0.21% | 0.21% | | licm.NumSunk | 13359 | 13587 | 228 | 1.71% | 1.71% | | loop-delete.NumDeleted | 8547 | 8402 | -145 | -1.70% | 1.70% | | loop-instsimplify.NumSimplified | 12876 | 11890 | -986 | -7.66% | 7.66% | | loop-peel.NumPeeled | 1008 | 925 | -83 | -8.23% | 8.23% | | loop-rotate.NumNotRotatedDueToHeaderSize | 368 | 365 | -3 | -0.82% | 0.82% | | loop-rotate.NumRotated | 42015 | 42003 | -12 | -0.03% | 0.03% | | loop-simplifycfg.NumLoopBlocksDeleted | 240 | 242 | 2 | 0.83% | 0.83% | | loop-simplifycfg.NumLoopExitsDeleted | 497 | 20 | -477 | -95.98% | 95.98% | | loop-simplifycfg.NumTerminatorsFolded | 618 | 336 | -282 | -45.63% | 45.63% | | loop-unroll.NumCompletelyUnrolled | 11028 | 11032 | 4 | 0.04% | 0.04% | | loop-unroll.NumUnrolled | 12608 | 12529 | -79 | -0.63% | 0.63% | | mem2reg.NumDeadAlloca | 10222 | 10221 | -1 | -0.01% | 0.01% | | mem2reg.NumPHIInsert | 192110 | 192106 | -4 | 0.00% | 0.00% | | mem2reg.NumSingleStore | 637650 | 637643 | -7 | 0.00% | 0.00% | | scalar-evolution.NumBruteForceTripCountsComputed | 814 | 812 | -2 | -0.25% | 0.25% | | scalar-evolution.NumTripCountsComputed | 283108 | 282934 | -174 | -0.06% | 0.06% | | scalar-evolution.NumTripCountsNotComputed | 106712 | 106718 | 6 | 0.01% | 0.01% | | simple-loop-unswitch.NumBranches | 5178 | 4752 | -426 | -8.23% | 8.23% | | simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 503 | -411 | -44.97% | 44.97% | | simple-loop-unswitch.NumSwitches | 20 | 18 | -2 | -10.00% | 10.00% | | simple-loop-unswitch.NumTrivial | 183 | 95 | -88 | -48.09% | 48.09% | ... but that actually regresses LICM (-12% `licm.NumMovedLoads`), loop-simplifycfg (`NumLoopExitsDeleted`, `NumTerminatorsFolded`), simple-loop-unswitch (`NumTrivial`). What if we instead have LICM both before and after LoopRotate? | statistic name | LoopRotate-LICM | LICM-LoopRotate-LICM | Δ | % | abs(%) | | asm-printer.EmittedInsts | 9015930 | 9014474 | -1456 | -0.02% | 0.02% | | indvars.NumElimCmp | 3536 | 3546 | 10 | 0.28% | 0.28% | | indvars.NumElimExt | 36725 | 36681 | -44 | -0.12% | 0.12% | | indvars.NumElimIV | 1197 | 1185 | -12 | -1.00% | 1.00% | | indvars.NumElimIdentity | 143 | 146 | 3 | 2.10% | 2.10% | | indvars.NumElimRem | 4 | 5 | 1 | 25.00% | 25.00% | | indvars.NumLFTR | 29842 | 29899 | 57 | 0.19% | 0.19% | | indvars.NumReplaced | 2293 | 2299 | 6 | 0.26% | 0.26% | | indvars.NumSimplifiedSDiv | 6 | 8 | 2 | 33.33% | 33.33% | | indvars.NumWidened | 26438 | 26404 | -34 | -0.13% | 0.13% | | instcount.TotalBlocks | 1178338 | 1173652 | -4686 | -0.40% | 0.40% | | instcount.TotalFuncs | 111825 | 111829 | 4 | 0.00% | 0.00% | | instcount.TotalInsts | 9905442 | 9895452 | -9990 | -0.10% | 0.10% | | lcssa.NumLCSSA | 425871 | 425373 | -498 | -0.12% | 0.12% | | licm.NumHoisted | 378357 | 383352 | 4995 | 1.32% | 1.32% | | licm.NumMovedCalls | 2193 | 2204 | 11 | 0.50% | 0.50% | | licm.NumMovedLoads | 35899 | 35755 | -144 | -0.40% | 0.40% | | licm.NumPromoted | 11178 | 11163 | -15 | -0.13% | 0.13% | | licm.NumSunk | 13359 | 14321 | 962 | 7.20% | 7.20% | | loop-delete.NumDeleted | 8547 | 8538 | -9 | -0.11% | 0.11% | | loop-instsimplify.NumSimplified | 12876 | 12041 | -835 | -6.48% | 6.48% | | loop-peel.NumPeeled | 1008 | 924 | -84 | -8.33% | 8.33% | | loop-rotate.NumNotRotatedDueToHeaderSize | 368 | 365 | -3 | -0.82% | 0.82% | | loop-rotate.NumRotated | 42015 | 42005 | -10 | -0.02% | 0.02% | | loop-simplifycfg.NumLoopBlocksDeleted | 240 | 241 | 1 | 0.42% | 0.42% | | loop-simplifycfg.NumTerminatorsFolded | 618 | 619 | 1 | 0.16% | 0.16% | | loop-unroll.NumCompletelyUnrolled | 11028 | 11029 | 1 | 0.01% | 0.01% | | loop-unroll.NumUnrolled | 12608 | 12525 | -83 | -0.66% | 0.66% | | mem2reg.NumPHIInsert | 192110 | 192073 | -37 | -0.02% | 0.02% | | mem2reg.NumSingleStore | 637650 | 637652 | 2 | 0.00% | 0.00% | | scalar-evolution.NumTripCountsComputed | 283108 | 282998 | -110 | -0.04% | 0.04% | | scalar-evolution.NumTripCountsNotComputed | 106712 | 106691 | -21 | -0.02% | 0.02% | | simple-loop-unswitch.NumBranches | 5178 | 5185 | 7 | 0.14% | 0.14% | | simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 925 | 11 | 1.20% | 1.20% | | simple-loop-unswitch.NumTrivial | 183 | 179 | -4 | -2.19% | 2.19% | | simple-loop-unswitch.NumBranches | 5178 | 4752 | -426 | -8.23% | 8.23% | | simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 503 | -411 | -44.97% | 44.97% | | simple-loop-unswitch.NumSwitches | 20 | 18 | -2 | -10.00% | 10.00% | | simple-loop-unswitch.NumTrivial | 183 | 95 | -88 | -48.09% | 48.09% | I.e. we end up with less instructions, less peeling, more LICM activity, also note how none of those 4 regressions are here. Namely: | statistic name | LICM-LoopRotate | LICM-LoopRotate-LICM | Δ | % | abs(%) | | asm-printer.EmittedInsts | 9015799 | 9014474 | -1325 | -0.01% | 0.01% | | indvars.NumElimCmp | 3544 | 3546 | 2 | 0.06% | 0.06% | | indvars.NumElimExt | 36580 | 36681 | 101 | 0.28% | 0.28% | | indvars.NumElimIV | 1187 | 1185 | -2 | -0.17% | 0.17% | | indvars.NumElimIdentity | 136 | 146 | 10 | 7.35% | 7.35% | | indvars.NumLFTR | 29890 | 29899 | 9 | 0.03% | 0.03% | | indvars.NumReplaced | 2227 | 2299 | 72 | 3.23% | 3.23% | | indvars.NumWidened | 26329 | 26404 | 75 | 0.28% | 0.28% | | instcount.TotalBlocks | 1173840 | 1173652 | -188 | -0.02% | 0.02% | | instcount.TotalInsts | 9896139 | 9895452 | -687 | -0.01% | 0.01% | | lcssa.NumLCSSA | 423961 | 425373 | 1412 | 0.33% | 0.33% | | licm.NumHoisted | 378753 | 383352 | 4599 | 1.21% | 1.21% | | licm.NumMovedCalls | 2208 | 2204 | -4 | -0.18% | 0.18% | | licm.NumMovedLoads | 31821 | 35755 | 3934 | 12.36% | 12.36% | | licm.NumPromoted | 11154 | 11163 | 9 | 0.08% | 0.08% | | licm.NumSunk | 13587 | 14321 | 734 | 5.40% | 5.40% | | loop-delete.NumDeleted | 8402 | 8538 | 136 | 1.62% | 1.62% | | loop-instsimplify.NumSimplified | 11890 | 12041 | 151 | 1.27% | 1.27% | | loop-peel.NumPeeled | 925 | 924 | -1 | -0.11% | 0.11% | | loop-rotate.NumRotated | 42003 | 42005 | 2 | 0.00% | 0.00% | | loop-simplifycfg.NumLoopBlocksDeleted | 242 | 241 | -1 | -0.41% | 0.41% | | loop-simplifycfg.NumLoopExitsDeleted | 20 | 497 | 477 | 2385.00% | 2385.00% | | loop-simplifycfg.NumTerminatorsFolded | 336 | 619 | 283 | 84.23% | 84.23% | | loop-unroll.NumCompletelyUnrolled | 11032 | 11029 | -3 | -0.03% | 0.03% | | loop-unroll.NumUnrolled | 12529 | 12525 | -4 | -0.03% | 0.03% | | mem2reg.NumDeadAlloca | 10221 | 10222 | 1 | 0.01% | 0.01% | | mem2reg.NumPHIInsert | 192106 | 192073 | -33 | -0.02% | 0.02% | | mem2reg.NumSingleStore | 637643 | 637652 | 9 | 0.00% | 0.00% | | scalar-evolution.NumBruteForceTripCountsComputed | 812 | 814 | 2 | 0.25% | 0.25% | | scalar-evolution.NumTripCountsComputed | 282934 | 282998 | 64 | 0.02% | 0.02% | | scalar-evolution.NumTripCountsNotComputed | 106718 | 106691 | -27 | -0.03% | 0.03% | | simple-loop-unswitch.NumBranches | 4752 | 5185 | 433 | 9.11% | 9.11% | | simple-loop-unswitch.NumCostMultiplierSkipped | 503 | 925 | 422 | 83.90% | 83.90% | | simple-loop-unswitch.NumSwitches | 18 | 20 | 2 | 11.11% | 11.11% | | simple-loop-unswitch.NumTrivial | 95 | 179 | 84 | 88.42% | 88.42% | {F15983613} {F15983615} {F15983616} (this is vanilla llvm testsuite + rawspeed + darktable) As an example of the code where early LICM only is bad, see: https://godbolt.org/z/GzEbacs4K This does have an observable compile-time regression of +~0.5% geomean https://llvm-compile-time-tracker.com/compare.php?from=7c5222e4d1a3a14f029e5f614c9aefd0fa505f1e&to=5d81826c3411982ca26e46b9d0aff34c80577664&stat=instructions but i think that's basically nothing, and there's potential that it might be avoidable in the future by fixing clang to produce alignment information on function arguments, thus making the second run unneeded. Differential Revision: https://reviews.llvm.org/D99249
140 lines
3.8 KiB
LLVM
140 lines
3.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; TODO: Run under new PM after switch. The IR is the same but basic block labels are different.
|
|
; RUN: opt -S -O2 -scev-cheap-expansion-budget=1024 %s -enable-new-pm=0 | FileCheck %s
|
|
|
|
; See https://bugs.llvm.org/show_bug.cgi?id=45360
|
|
; This is reduced from that (runnable) test.
|
|
; The remainder operation is originally guarded, it never divides by zero.
|
|
; Indvars should not make it execute unconditionally.
|
|
|
|
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
|
target triple = "x86_64-pc-linux-gnu"
|
|
|
|
@f = dso_local global i32 0, align 4
|
|
@a = dso_local global i32 0, align 4
|
|
@d = dso_local global i32 0, align 4
|
|
@c = dso_local global i32 0, align 4
|
|
@b = dso_local global i32 0, align 4
|
|
@e = dso_local global i32 0, align 4
|
|
|
|
define dso_local i32 @main() {
|
|
; CHECK-LABEL: @main(
|
|
; CHECK-NEXT: bb:
|
|
; CHECK-NEXT: [[I6:%.*]] = load i32, i32* @a, align 4
|
|
; CHECK-NEXT: [[I24:%.*]] = load i32, i32* @b, align 4
|
|
; CHECK-NEXT: [[D_PROMOTED7:%.*]] = load i32, i32* @d, align 4
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[D_PROMOTED7]], [[I6]]
|
|
; CHECK-NEXT: [[I21:%.*]] = icmp eq i32 [[TMP0]], 0
|
|
; CHECK-NEXT: br i1 [[I21]], label [[BB27_THREAD:%.*]], label [[BB27_PREHEADER:%.*]]
|
|
; CHECK: bb27.preheader:
|
|
; CHECK-NEXT: [[I26:%.*]] = urem i32 [[I24]], [[TMP0]]
|
|
; CHECK-NEXT: store i32 [[I26]], i32* @e, align 4
|
|
; CHECK-NEXT: [[I30_NOT:%.*]] = icmp eq i32 [[I26]], 0
|
|
; CHECK-NEXT: br label [[BB27:%.*]]
|
|
; CHECK: bb27.thread:
|
|
; CHECK-NEXT: store i32 0, i32* @d, align 4
|
|
; CHECK-NEXT: store i32 -1, i32* @f, align 4
|
|
; CHECK-NEXT: store i32 0, i32* @c, align 4
|
|
; CHECK-NEXT: br label [[BB32:%.*]]
|
|
; CHECK: bb27:
|
|
; CHECK-NEXT: br i1 [[I30_NOT]], label [[BB32_LOOPEXIT:%.*]], label [[BB36:%.*]]
|
|
; CHECK: bb32.loopexit:
|
|
; CHECK-NEXT: store i32 [[TMP0]], i32* @d, align 4
|
|
; CHECK-NEXT: store i32 -1, i32* @f, align 4
|
|
; CHECK-NEXT: br label [[BB32]]
|
|
; CHECK: bb32:
|
|
; CHECK-NEXT: [[C_SINK:%.*]] = phi i32* [ @c, [[BB32_LOOPEXIT]] ], [ @e, [[BB27_THREAD]] ]
|
|
; CHECK-NEXT: store i32 0, i32* [[C_SINK]], align 4
|
|
; CHECK-NEXT: ret i32 0
|
|
; CHECK: bb36:
|
|
; CHECK-NEXT: store i32 1, i32* @c, align 4
|
|
; CHECK-NEXT: br i1 [[I21]], label [[BB27_THREAD]], label [[BB27]]
|
|
;
|
|
bb:
|
|
%i = alloca i32, align 4
|
|
store i32 0, i32* %i, align 4
|
|
br label %bb1
|
|
|
|
bb1:
|
|
store i32 0, i32* @f, align 4
|
|
br label %bb2
|
|
|
|
bb2:
|
|
%i3 = load i32, i32* @f, align 4
|
|
%i4 = icmp sge i32 %i3, 0
|
|
br i1 %i4, label %bb5, label %bb12
|
|
|
|
bb5:
|
|
%i6 = load i32, i32* @a, align 4
|
|
%i7 = load i32, i32* @d, align 4
|
|
%i8 = and i32 %i7, %i6
|
|
store i32 %i8, i32* @d, align 4
|
|
br label %bb9
|
|
|
|
bb9:
|
|
%i10 = load i32, i32* @f, align 4
|
|
%i11 = add nsw i32 %i10, -1
|
|
store i32 %i11, i32* @f, align 4
|
|
br label %bb2
|
|
|
|
bb12:
|
|
store i32 0, i32* @c, align 4
|
|
br label %bb13
|
|
|
|
bb13:
|
|
%i14 = load i32, i32* @c, align 4
|
|
%i15 = icmp sle i32 %i14, 0
|
|
br i1 %i15, label %bb16, label %bb39
|
|
|
|
bb16:
|
|
%i17 = load i32, i32* @f, align 4
|
|
%i18 = icmp ne i32 %i17, 0
|
|
br i1 %i18, label %bb19, label %bb34
|
|
|
|
bb19:
|
|
%i20 = load i32, i32* @d, align 4
|
|
%i21 = icmp eq i32 %i20, 0
|
|
br i1 %i21, label %bb22, label %bb23
|
|
|
|
bb22:
|
|
br label %bb27
|
|
|
|
bb23:
|
|
%i24 = load i32, i32* @b, align 4
|
|
%i25 = load i32, i32* @d, align 4
|
|
%i26 = urem i32 %i24, %i25
|
|
br label %bb27
|
|
|
|
bb27:
|
|
%i28 = phi i32 [ 0, %bb22 ], [ %i26, %bb23 ]
|
|
store i32 %i28, i32* @e, align 4
|
|
%i29 = load i32, i32* @e, align 4
|
|
%i30 = icmp ne i32 %i29, 0
|
|
br i1 %i30, label %bb31, label %bb32
|
|
|
|
bb31:
|
|
br label %bb33
|
|
|
|
bb32:
|
|
ret i32 0
|
|
|
|
bb33:
|
|
br label %bb35
|
|
|
|
bb34:
|
|
store i32 0, i32* @d, align 4
|
|
br label %bb35
|
|
|
|
bb35:
|
|
br label %bb36
|
|
|
|
bb36:
|
|
%i37 = load i32, i32* @c, align 4
|
|
%i38 = add nsw i32 %i37, 1
|
|
store i32 %i38, i32* @c, align 4
|
|
br label %bb13
|
|
|
|
bb39:
|
|
br label %bb1
|
|
}
|