Files
clang-p2996/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
Florian Hahn 175d297102 [LoopUnroll] Add CSE to remove redundant loads after unrolling. (#83860)
This patch adds loadCSE support to simplifyLoopAfterUnroll. It is based
on EarlyCSE's implementation using ScopeHashTable and is using SCEV for
accessed pointers to check to find redundant loads after unrolling.

This applies to the late unroll pass only, for full unrolling those
redundant loads will be cleaned up by the regular pipeline.

The current approach constructs MSSA on-demand per-loop, but there is
still small but notable compile-time impact:

stage1-O3  +0.04%
stage1-ReleaseThinLTO +0.06%
stage1-ReleaseLTO-g +0.05%
stage1-O0-g +0.02%
stage2-O3 +0.09%
stage2-O0-g +0.04%
stage2-clang +0.02%


https://llvm-compile-time-tracker.com/compare.php?from=c089fa5a729e217d0c0d4647656386dac1a1b135&to=ec7c0f27cb5c12b600d9adfc8543d131765ec7be&stat=instructions:u

This benefits some workloads with runtime-unrolling disabled,
where users use pragmas to force unrolling, as well as with
runtime unrolling enabled.

On SPEC/MultiSource, this removes a number of loads after unrolling
on AArch64 with runtime unrolling enabled.

```
External/S...te/526.blender_r/526.blender_r    96
MultiSourc...rks/mediabench/gsm/toast/toast    39
SingleSource/Benchmarks/Misc/ffbench            4
External/SPEC/CINT2006/403.gcc/403.gcc         18
MultiSourc.../Applications/JM/ldecod/ldecod     4
MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg     6
MultiSourc...OE-ProxyApps-C/miniGMG/miniGMG     9
MultiSourc...e/Applications/ClamAV/clamscan     4
MultiSourc.../MallocBench/espresso/espresso     3
MultiSourc...dence-flt/LinearDependence-flt     2
MultiSourc...ch/office-ispell/office-ispell     4
MultiSourc...ch/consumer-jpeg/consumer-jpeg     6
MultiSourc...ench/security-sha/security-sha    11
MultiSourc...chmarks/McCat/04-bisect/bisect     3
SingleSour...tTests/2020-01-06-coverage-009    12
MultiSourc...ench/telecomm-gsm/telecomm-gsm    39
MultiSourc...lds-flt/CrossingThresholds-flt    24
MultiSourc...dence-dbl/LinearDependence-dbl     2
External/S...C/CINT2006/445.gobmk/445.gobmk     6
MultiSourc...enchmarks/mafft/pairlocalalign    53
External/S...31.deepsjeng_r/531.deepsjeng_r     3
External/S...rate/510.parest_r/510.parest_r    58
External/S...NT2006/464.h264ref/464.h264ref    29
External/S...NT2017rate/502.gcc_r/502.gcc_r    45
External/S...C/CINT2006/456.hmmer/456.hmmer     6
External/S...te/538.imagick_r/538.imagick_r    18
External/S.../CFP2006/447.dealII/447.dealII     4
MultiSourc...OE-ProxyApps-C++/miniFE/miniFE    12
External/S...2017rate/525.x264_r/525.x264_r    36
MultiSourc...Benchmarks/7zip/7zip-benchmark    33
MultiSourc...hmarks/ASC_Sequoia/AMGmk/AMGmk     2
MultiSourc...chmarks/VersaBench/8b10b/8b10b     1
MultiSourc.../Applications/JM/lencod/lencod   116
MultiSourc...lds-dbl/CrossingThresholds-dbl    24
MultiSource/Benchmarks/McCat/05-eks/eks        15
```

PR: https://github.com/llvm/llvm-project/pull/83860
2024-05-02 11:01:24 +01:00

162 lines
8.9 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-macosx11.0.0"
define void @partial_unroll_forced(i32 %N, ptr %src, ptr noalias %dst) {
; CHECK-LABEL: define void @partial_unroll_forced(
; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP141:%.*]] = icmp sgt i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: loop.latch.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1
; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]]
; CHECK: loop.latch.preheader.new:
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646
; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
; CHECK: loop.latch:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ]
; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ]
; CHECK-NEXT: [[SRC_IDX:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[L:%.*]] = load <8 x half>, ptr [[SRC_IDX]], align 16
; CHECK-NEXT: [[DST_IDX:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[ADD:%.*]] = fadd <8 x half> [[L]], [[L]]
; CHECK-NEXT: store <8 x half> [[ADD]], ptr [[DST_IDX]], align 16
; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[SRC_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: [[L_1:%.*]] = load <8 x half>, ptr [[SRC_IDX_1]], align 16
; CHECK-NEXT: [[DST_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: [[ADD_1:%.*]] = fadd <8 x half> [[L_1]], [[L_1]]
; CHECK-NEXT: store <8 x half> [[ADD_1]], ptr [[DST_IDX_1]], align 16
; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: exit.loopexit.unr-lcssa:
; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ]
; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]]
; CHECK: loop.latch.epil:
; CHECK-NEXT: [[SRC_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_UNR]]
; CHECK-NEXT: [[L_EPIL:%.*]] = load <8 x half>, ptr [[SRC_IDX_EPIL]], align 16
; CHECK-NEXT: [[DST_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_UNR]]
; CHECK-NEXT: [[ADD_EPIL:%.*]] = fadd <8 x half> [[L_EPIL]], [[L_EPIL]]
; CHECK-NEXT: store <8 x half> [[ADD_EPIL]], ptr [[DST_IDX_EPIL]], align 16
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
br label %loop.header
loop.header:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
%cmp14 = icmp slt i32 %iv, %N
br i1 %cmp14, label %loop.latch, label %exit
loop.latch:
%iv.ext = zext i32 %iv to i64
%src.idx = getelementptr <8 x half>, ptr %src, i64 %iv.ext
%l = load <8 x half>, ptr %src.idx, align 16
%dst.idx = getelementptr <8 x half>, ptr %dst, i64 %iv.ext
%add = fadd <8 x half> %l, %l
store <8 x half> %add, ptr %dst.idx, align 16
%iv.next = add i32 %iv, 1
br label %loop.header, !llvm.loop !0
exit:
ret void
}
define void @cse_matching_load_from_previous_unrolled_iteration(i32 %N, ptr %src, ptr noalias %dst) {
; CHECK-LABEL: define void @cse_matching_load_from_previous_unrolled_iteration(
; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
; CHECK-NEXT: [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
; CHECK-NEXT: [[CMP141:%.*]] = icmp sgt i32 [[N]], 0
; CHECK-NEXT: br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: loop.latch.preheader:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1
; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]]
; CHECK: loop.latch.preheader.new:
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646
; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
; CHECK: loop.latch:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ]
; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ]
; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[L_12:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12]], align 8
; CHECK-NEXT: [[GEP_SRC_4:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[L_4:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4]], align 8
; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[L_4]], [[L_12]]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store <2 x i32> [[MUL]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[GEP_SRC_12_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: [[L_12_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_1]], align 8
; CHECK-NEXT: [[MUL_1:%.*]] = mul <2 x i32> [[L_12]], [[L_12_1]]
; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: store <2 x i32> [[MUL_1]], ptr [[GEP_DST_1]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit.loopexit.unr-lcssa:
; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ]
; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]]
; CHECK: loop.latch.epil:
; CHECK-NEXT: [[GEP_SRC_12_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_UNR]]
; CHECK-NEXT: [[L_12_EPIL:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_EPIL]], align 8
; CHECK-NEXT: [[GEP_SRC_4_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV_UNR]]
; CHECK-NEXT: [[L_4_EPIL:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4_EPIL]], align 8
; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul <2 x i32> [[L_4_EPIL]], [[L_12_EPIL]]
; CHECK-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_UNR]]
; CHECK-NEXT: store <2 x i32> [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
%src.4 = getelementptr i8, ptr %src, i64 4
%src.12 = getelementptr i8, ptr %src, i64 12
br label %loop.header
loop.header:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
%cmp14 = icmp slt i32 %iv, %N
br i1 %cmp14, label %loop.latch, label %exit
loop.latch:
%iv.ext = zext i32 %iv to i64
%gep.src.12 = getelementptr <2 x i32>, ptr %src.12, i64 %iv.ext
%l.12 = load <2 x i32>, ptr %gep.src.12, align 8
%gep.src.4 = getelementptr <2 x i32>, ptr %src.4, i64 %iv.ext
%l.4 = load <2 x i32>, ptr %gep.src.4, align 8
%mul = mul <2 x i32> %l.12, %l.4
%gep.dst = getelementptr <2 x i32>, ptr %dst, i64 %iv.ext
store <2 x i32> %mul, ptr %gep.dst
%iv.next = add nuw nsw i32 %iv, 1
br label %loop.header, !llvm.loop !0
exit:
ret void
}
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.unroll.count", i32 2}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
;.