This patch adds load CSE support to simplifyLoopAfterUnroll. It is based on EarlyCSE's implementation using ScopedHashTable and uses SCEV for the accessed pointers to find redundant loads after unrolling. This applies to the late unroll pass only; for full unrolling, those redundant loads will be cleaned up by the regular pipeline. The current approach constructs MSSA on demand per loop, but there is still a small but notable compile-time impact: stage1-O3 +0.04% stage1-ReleaseThinLTO +0.06% stage1-ReleaseLTO-g +0.05% stage1-O0-g +0.02% stage2-O3 +0.09% stage2-O0-g +0.04% stage2-clang +0.02% https://llvm-compile-time-tracker.com/compare.php?from=c089fa5a729e217d0c0d4647656386dac1a1b135&to=ec7c0f27cb5c12b600d9adfc8543d131765ec7be&stat=instructions:u This benefits some workloads with runtime unrolling disabled, where users use pragmas to force unrolling, as well as workloads with runtime unrolling enabled. On SPEC/MultiSource, this removes a number of loads after unrolling on AArch64 with runtime unrolling enabled.
``` External/S...te/526.blender_r/526.blender_r 96 MultiSourc...rks/mediabench/gsm/toast/toast 39 SingleSource/Benchmarks/Misc/ffbench 4 External/SPEC/CINT2006/403.gcc/403.gcc 18 MultiSourc.../Applications/JM/ldecod/ldecod 4 MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg 6 MultiSourc...OE-ProxyApps-C/miniGMG/miniGMG 9 MultiSourc...e/Applications/ClamAV/clamscan 4 MultiSourc.../MallocBench/espresso/espresso 3 MultiSourc...dence-flt/LinearDependence-flt 2 MultiSourc...ch/office-ispell/office-ispell 4 MultiSourc...ch/consumer-jpeg/consumer-jpeg 6 MultiSourc...ench/security-sha/security-sha 11 MultiSourc...chmarks/McCat/04-bisect/bisect 3 SingleSour...tTests/2020-01-06-coverage-009 12 MultiSourc...ench/telecomm-gsm/telecomm-gsm 39 MultiSourc...lds-flt/CrossingThresholds-flt 24 MultiSourc...dence-dbl/LinearDependence-dbl 2 External/S...C/CINT2006/445.gobmk/445.gobmk 6 MultiSourc...enchmarks/mafft/pairlocalalign 53 External/S...31.deepsjeng_r/531.deepsjeng_r 3 External/S...rate/510.parest_r/510.parest_r 58 External/S...NT2006/464.h264ref/464.h264ref 29 External/S...NT2017rate/502.gcc_r/502.gcc_r 45 External/S...C/CINT2006/456.hmmer/456.hmmer 6 External/S...te/538.imagick_r/538.imagick_r 18 External/S.../CFP2006/447.dealII/447.dealII 4 MultiSourc...OE-ProxyApps-C++/miniFE/miniFE 12 External/S...2017rate/525.x264_r/525.x264_r 36 MultiSourc...Benchmarks/7zip/7zip-benchmark 33 MultiSourc...hmarks/ASC_Sequoia/AMGmk/AMGmk 2 MultiSourc...chmarks/VersaBench/8b10b/8b10b 1 MultiSourc.../Applications/JM/lencod/lencod 116 MultiSourc...lds-dbl/CrossingThresholds-dbl 24 MultiSource/Benchmarks/McCat/05-eks/eks 15 ``` PR: https://github.com/llvm/llvm-project/pull/83860
162 lines
8.9 KiB
LLVM
162 lines
8.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
|
|
; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
|
|
|
|
; Target configuration for this module: an arm64 macOS triple and its data
; layout string (little-endian, Mach-O mangling, native integer widths 32/64).
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
|
|
target triple = "arm64-apple-macosx11.0.0"
|
|
|
|
; Input: a counted loop (%iv from 0 while %iv < %N, signed compare) that loads
; a <8 x half> from %src[%iv], adds it to itself and stores the sum to
; %dst[%iv]. The back-edge carries !llvm.loop !0.
; The autogenerated assertions expect the loop body to appear twice in
; loop.latch, followed by a single remainder iteration in loop.latch.epil.
; Both copies keep their own load: one reads %src at the induction variable,
; the other at the induction variable plus one.
define void @partial_unroll_forced(i32 %N, ptr %src, ptr noalias %dst) {
|
|
; CHECK-LABEL: define void @partial_unroll_forced(
|
|
; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP141:%.*]] = icmp sgt i32 [[N]], 0
|
|
; CHECK-NEXT: br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]]
|
|
; CHECK: loop.latch.preheader:
|
|
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
|
|
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
|
|
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1
|
|
; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]]
|
|
; CHECK: loop.latch.preheader.new:
|
|
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646
|
|
; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
|
|
; CHECK: loop.latch:
|
|
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ]
|
|
; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ]
|
|
; CHECK-NEXT: [[SRC_IDX:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV]]
|
|
; CHECK-NEXT: [[L:%.*]] = load <8 x half>, ptr [[SRC_IDX]], align 16
|
|
; CHECK-NEXT: [[DST_IDX:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV]]
|
|
; CHECK-NEXT: [[ADD:%.*]] = fadd <8 x half> [[L]], [[L]]
|
|
; CHECK-NEXT: store <8 x half> [[ADD]], ptr [[DST_IDX]], align 16
|
|
; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
|
|
; CHECK-NEXT: [[SRC_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_NEXT]]
|
|
; CHECK-NEXT: [[L_1:%.*]] = load <8 x half>, ptr [[SRC_IDX_1]], align 16
|
|
; CHECK-NEXT: [[DST_IDX_1:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
|
|
; CHECK-NEXT: [[ADD_1:%.*]] = fadd <8 x half> [[L_1]], [[L_1]]
|
|
; CHECK-NEXT: store <8 x half> [[ADD_1]], ptr [[DST_IDX_1]], align 16
|
|
; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
|
|
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
|
|
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
|
|
; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP0:![0-9]+]]
|
|
; CHECK: exit.loopexit.unr-lcssa:
|
|
; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ]
|
|
; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
|
|
; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]]
|
|
; CHECK: loop.latch.epil:
|
|
; CHECK-NEXT: [[SRC_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[SRC]], i64 [[INDVARS_IV_UNR]]
|
|
; CHECK-NEXT: [[L_EPIL:%.*]] = load <8 x half>, ptr [[SRC_IDX_EPIL]], align 16
|
|
; CHECK-NEXT: [[DST_IDX_EPIL:%.*]] = getelementptr <8 x half>, ptr [[DST]], i64 [[INDVARS_IV_UNR]]
|
|
; CHECK-NEXT: [[ADD_EPIL:%.*]] = fadd <8 x half> [[L_EPIL]], [[L_EPIL]]
|
|
; CHECK-NEXT: store <8 x half> [[ADD_EPIL]], ptr [[DST_IDX_EPIL]], align 16
|
|
; CHECK-NEXT: br label [[EXIT]]
|
|
; CHECK: exit:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %loop.header
|
|
|
|
loop.header:
|
|
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
|
|
; Signed bound check: the loop body runs only while %iv < %N.
%cmp14 = icmp slt i32 %iv, %N
|
|
br i1 %cmp14, label %loop.latch, label %exit
|
|
|
|
loop.latch:
|
|
%iv.ext = zext i32 %iv to i64
|
|
%src.idx = getelementptr <8 x half>, ptr %src, i64 %iv.ext
|
|
%l = load <8 x half>, ptr %src.idx, align 16
|
|
%dst.idx = getelementptr <8 x half>, ptr %dst, i64 %iv.ext
|
|
; Double the loaded vector (%l + %l).
%add = fadd <8 x half> %l, %l
|
|
store <8 x half> %add, ptr %dst.idx, align 16
|
|
%iv.next = add i32 %iv, 1
|
|
; Back-edge; !0 is the loop metadata node defined near the end of the file.
br label %loop.header, !llvm.loop !0
|
|
|
|
exit:
|
|
ret void
|
|
}
|
|
|
|
; Input: each iteration loads a <2 x i32> from %src.12[%iv] and another from
; %src.4[%iv], multiplies them and stores the product to %dst[%iv].
; %src.4 and %src.12 are %src + 4 and %src + 12 bytes, and consecutive
; <2 x i32> elements are 8 bytes apart, so %src.4[%iv + 1] is the same address
; as %src.12[%iv].
; The autogenerated assertions reflect this: in the loop body that appears
; twice in loop.latch, the second copy has no load from %src.4 and its multiply
; reuses the first copy's %src.12 load instead. The remainder block
; loop.latch.epil still performs both loads.
define void @cse_matching_load_from_previous_unrolled_iteration(i32 %N, ptr %src, ptr noalias %dst) {
|
|
; CHECK-LABEL: define void @cse_matching_load_from_previous_unrolled_iteration(
|
|
; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0]] {
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
|
|
; CHECK-NEXT: [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
|
|
; CHECK-NEXT: [[CMP141:%.*]] = icmp sgt i32 [[N]], 0
|
|
; CHECK-NEXT: br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]]
|
|
; CHECK: loop.latch.preheader:
|
|
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
|
|
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
|
|
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[N]], 1
|
|
; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]]
|
|
; CHECK: loop.latch.preheader.new:
|
|
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646
|
|
; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
|
|
; CHECK: loop.latch:
|
|
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ]
|
|
; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ]
|
|
; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV]]
|
|
; CHECK-NEXT: [[L_12:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12]], align 8
|
|
; CHECK-NEXT: [[GEP_SRC_4:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV]]
|
|
; CHECK-NEXT: [[L_4:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4]], align 8
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[L_4]], [[L_12]]
|
|
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV]]
|
|
; CHECK-NEXT: store <2 x i32> [[MUL]], ptr [[GEP_DST]], align 8
|
|
; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
|
|
; CHECK-NEXT: [[GEP_SRC_12_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_NEXT]]
|
|
; CHECK-NEXT: [[L_12_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_1]], align 8
|
|
; CHECK-NEXT: [[MUL_1:%.*]] = mul <2 x i32> [[L_12]], [[L_12_1]]
|
|
; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
|
|
; CHECK-NEXT: store <2 x i32> [[MUL_1]], ptr [[GEP_DST_1]], align 8
|
|
; CHECK-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
|
|
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
|
|
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
|
|
; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP3:![0-9]+]]
|
|
; CHECK: exit.loopexit.unr-lcssa:
|
|
; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ]
|
|
; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
|
|
; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]]
|
|
; CHECK: loop.latch.epil:
|
|
; CHECK-NEXT: [[GEP_SRC_12_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_UNR]]
|
|
; CHECK-NEXT: [[L_12_EPIL:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_EPIL]], align 8
|
|
; CHECK-NEXT: [[GEP_SRC_4_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV_UNR]]
|
|
; CHECK-NEXT: [[L_4_EPIL:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4_EPIL]], align 8
|
|
; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul <2 x i32> [[L_4_EPIL]], [[L_12_EPIL]]
|
|
; CHECK-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_UNR]]
|
|
; CHECK-NEXT: store <2 x i32> [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
|
|
; CHECK-NEXT: br label [[EXIT]]
|
|
; CHECK: exit:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
; Two base pointers into %src that are 8 bytes apart (offsets 4 and 12).
%src.4 = getelementptr i8, ptr %src, i64 4
|
|
%src.12 = getelementptr i8, ptr %src, i64 12
|
|
br label %loop.header
|
|
|
|
loop.header:
|
|
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
|
|
; Signed bound check: the loop body runs only while %iv < %N.
%cmp14 = icmp slt i32 %iv, %N
|
|
br i1 %cmp14, label %loop.latch, label %exit
|
|
|
|
loop.latch:
|
|
%iv.ext = zext i32 %iv to i64
|
|
%gep.src.12 = getelementptr <2 x i32>, ptr %src.12, i64 %iv.ext
|
|
%l.12 = load <2 x i32>, ptr %gep.src.12, align 8
|
|
; For iteration %iv + 1 this address equals %gep.src.12 of iteration %iv.
%gep.src.4 = getelementptr <2 x i32>, ptr %src.4, i64 %iv.ext
|
|
%l.4 = load <2 x i32>, ptr %gep.src.4, align 8
|
|
%mul = mul <2 x i32> %l.12, %l.4
|
|
%gep.dst = getelementptr <2 x i32>, ptr %dst, i64 %iv.ext
|
|
; No explicit alignment here; the expected output shows this store as align 8.
store <2 x i32> %mul, ptr %gep.dst
|
|
%iv.next = add nuw nsw i32 %iv, 1
|
|
; Back-edge; !0 is the loop metadata node defined near the end of the file.
br label %loop.header, !llvm.loop !0
|
|
|
|
exit:
|
|
ret void
|
|
}
|
|
|
|
; Loop metadata attached (as !llvm.loop !0) to the back-edge branch of both
; functions above: mustprogress plus a requested unroll count of 2.
!0 = distinct !{!0, !1, !2}
|
|
!1 = !{!"llvm.loop.mustprogress"}
|
|
!2 = !{!"llvm.loop.unroll.count", i32 2}
|
|
;.
|
|
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
|
|
; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
|
|
; CHECK: [[META2]] = !{!"llvm.loop.unroll.disable"}
|
|
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
|
|
;.
|