This patch adds loadCSE support to simplifyLoopAfterUnroll. It is based on
EarlyCSE's implementation using ScopedHashTable and uses SCEV for the accessed
pointers to find redundant loads after unrolling.

This applies to the late unroll pass only; for full unrolling, those redundant
loads will be cleaned up by the regular pipeline. The current approach
constructs MSSA on demand per loop, but there is still a small but notable
compile-time impact:

stage1-O3             +0.04%
stage1-ReleaseThinLTO +0.06%
stage1-ReleaseLTO-g   +0.05%
stage1-O0-g           +0.02%
stage2-O3             +0.09%
stage2-O0-g           +0.04%
stage2-clang          +0.02%

https://llvm-compile-time-tracker.com/compare.php?from=c089fa5a729e217d0c0d4647656386dac1a1b135&to=ec7c0f27cb5c12b600d9adfc8543d131765ec7be&stat=instructions:u

This benefits some workloads where runtime unrolling is disabled and users use
pragmas to force unrolling, as well as workloads with runtime unrolling
enabled. On SPEC/MultiSource, this removes a number of loads after unrolling
on AArch64 with runtime unrolling enabled:

```
External/S...te/526.blender_r/526.blender_r    96
MultiSourc...rks/mediabench/gsm/toast/toast    39
SingleSource/Benchmarks/Misc/ffbench            4
External/SPEC/CINT2006/403.gcc/403.gcc         18
MultiSourc.../Applications/JM/ldecod/ldecod     4
MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg     6
MultiSourc...OE-ProxyApps-C/miniGMG/miniGMG     9
MultiSourc...e/Applications/ClamAV/clamscan     4
MultiSourc.../MallocBench/espresso/espresso     3
MultiSourc...dence-flt/LinearDependence-flt     2
MultiSourc...ch/office-ispell/office-ispell     4
MultiSourc...ch/consumer-jpeg/consumer-jpeg     6
MultiSourc...ench/security-sha/security-sha    11
MultiSourc...chmarks/McCat/04-bisect/bisect     3
SingleSour...tTests/2020-01-06-coverage-009    12
MultiSourc...ench/telecomm-gsm/telecomm-gsm    39
MultiSourc...lds-flt/CrossingThresholds-flt    24
MultiSourc...dence-dbl/LinearDependence-dbl     2
External/S...C/CINT2006/445.gobmk/445.gobmk     6
MultiSourc...enchmarks/mafft/pairlocalalign    53
External/S...31.deepsjeng_r/531.deepsjeng_r     3
External/S...rate/510.parest_r/510.parest_r    58
External/S...NT2006/464.h264ref/464.h264ref    29
External/S...NT2017rate/502.gcc_r/502.gcc_r    45
External/S...C/CINT2006/456.hmmer/456.hmmer     6
External/S...te/538.imagick_r/538.imagick_r    18
External/S.../CFP2006/447.dealII/447.dealII     4
MultiSourc...OE-ProxyApps-C++/miniFE/miniFE    12
External/S...2017rate/525.x264_r/525.x264_r    36
MultiSourc...Benchmarks/7zip/7zip-benchmark    33
MultiSourc...hmarks/ASC_Sequoia/AMGmk/AMGmk     2
MultiSourc...chmarks/VersaBench/8b10b/8b10b     1
MultiSourc.../Applications/JM/lencod/lencod   116
MultiSourc...lds-dbl/CrossingThresholds-dbl    24
MultiSource/Benchmarks/McCat/05-eks/eks        15
```

PR: https://github.com/llvm/llvm-project/pull/83860
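A minimal sketch of the idea (not the code from this PR; the function name
`foldRedundantLoadsSketch` and the restriction to a single block are
illustrative assumptions): loads are keyed by the SCEV of their pointer
operand plus the MemorySSA access that clobbers them, so a later load with the
same key reads the same value as an earlier one and can be replaced.

```
// Minimal sketch (assumed names; not the actual patch): CSE loads in a single
// unrolled block by keying them on (SCEV of the address, clobbering MemorySSA
// access), in the spirit of EarlyCSE's ScopedHashTable-based availability set.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static unsigned foldRedundantLoadsSketch(Loop &L, ScalarEvolution &SE,
                                         MemorySSA &MSSA) {
  // Key: address (as a SCEV, so syntactically different GEPs computing the
  // same address unify) plus the memory state the load reads from.
  using KeyT = std::pair<const SCEV *, const MemoryAccess *>;
  ScopedHashTable<KeyT, LoadInst *> AvailableLoads;
  ScopedHashTable<KeyT, LoadInst *>::ScopeTy Scope(AvailableLoads);
  MemorySSAUpdater MSSAU(&MSSA);

  unsigned NumRemoved = 0;
  // For brevity only the header is scanned here; the real implementation
  // would visit the unrolled loop body more generally.
  for (Instruction &I : make_early_inc_range(*L.getHeader())) {
    auto *Load = dyn_cast<LoadInst>(&I);
    if (!Load || !Load->isSimple())
      continue;
    const SCEV *Addr = SE.getSCEV(Load->getPointerOperand());
    const MemoryAccess *Clobber =
        MSSA.getWalker()->getClobberingMemoryAccess(Load);
    KeyT Key = {Addr, Clobber};
    if (LoadInst *Earlier = AvailableLoads.lookup(Key)) {
      if (Earlier->getType() == Load->getType()) {
        // Same address and same defining memory state: the later load is
        // redundant and can reuse the earlier loaded value.
        Load->replaceAllUsesWith(Earlier);
        MSSAU.removeMemoryAccess(Load);
        Load->eraseFromParent();
        ++NumRemoved;
        continue;
      }
    }
    AvailableLoads.insert(Key, Load);
  }
  return NumRemoved;
}
```

In the test below, this is what lets the second unrolled iteration of
@latch_exit reuse [[TMP4]] instead of reloading from the address that
[[ARRAYIDX7_1]] recomputes.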
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-unroll -unroll-count=2 -S | FileCheck %s

; LoopUnroll should unroll this loop into one big basic block.
define void @latch_exit(ptr nocapture %p, i64 %n) nounwind {
; CHECK-LABEL: @latch_exit(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL10:%.*]] = shl i64 [[N:%.*]], 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP16_1:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr double, ptr [[P:%.*]], i64 [[I_013]]
; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[I_013]], 1
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX7]], align 8
; CHECK-NEXT: [[MUL9:%.*]] = fmul double [[TMP8]], [[TMP4]]
; CHECK-NEXT: store double [[MUL9]], ptr [[ARRAYIDX7]], align 8
; CHECK-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP16_1]] = add i64 [[I_013]], 2
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16_1]]
; CHECK-NEXT: [[TMP4_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8
; CHECK-NEXT: [[MUL9_1:%.*]] = fmul double [[TMP4]], [[TMP4_1]]
; CHECK-NEXT: store double [[MUL9_1]], ptr [[ARRAYIDX7_1]], align 8
; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[TMP16_1]], [[MUL10]]
; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  %mul10 = shl i64 %n, 1
  br label %for.body

for.body:
  %i.013 = phi i64 [ %tmp16, %for.body ], [ 0, %entry ]
  %arrayidx7 = getelementptr double, ptr %p, i64 %i.013
  %tmp16 = add i64 %i.013, 1
  %arrayidx = getelementptr double, ptr %p, i64 %tmp16
  %tmp4 = load double, ptr %arrayidx
  %tmp8 = load double, ptr %arrayidx7
  %mul9 = fmul double %tmp8, %tmp4
  store double %mul9, ptr %arrayidx7
  %exitcond = icmp eq i64 %tmp16, %mul10
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Same as previous test case, but with a non-latch exit. There shouldn't
; be a conditional branch after the first block.
define void @non_latch_exit(ptr nocapture %p, i64 %n) nounwind {
; CHECK-LABEL: @non_latch_exit(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL10:%.*]] = shl i64 [[N:%.*]], 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP16_1:%.*]], [[LATCH_1:%.*]] ]
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr double, ptr [[P:%.*]], i64 [[I_013]]
; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[I_013]], 1
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARRAYIDX7]], align 8
; CHECK-NEXT: [[MUL9:%.*]] = fmul double [[TMP8]], [[TMP4]]
; CHECK-NEXT: store double [[MUL9]], ptr [[ARRAYIDX7]], align 8
; CHECK-NEXT: br label [[LATCH:%.*]]
; CHECK: latch:
; CHECK-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP16_1]] = add i64 [[I_013]], 2
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16_1]]
; CHECK-NEXT: [[TMP4_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8
; CHECK-NEXT: [[MUL9_1:%.*]] = fmul double [[TMP4]], [[TMP4_1]]
; CHECK-NEXT: store double [[MUL9_1]], ptr [[ARRAYIDX7_1]], align 8
; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[TMP16_1]], [[MUL10]]
; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[LATCH_1]]
; CHECK: latch.1:
; CHECK-NEXT: br label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  %mul10 = shl i64 %n, 1
  br label %for.body

for.body:
  %i.013 = phi i64 [ %tmp16, %latch ], [ 0, %entry ]
  %arrayidx7 = getelementptr double, ptr %p, i64 %i.013
  %tmp16 = add i64 %i.013, 1
  %arrayidx = getelementptr double, ptr %p, i64 %tmp16
  %tmp4 = load double, ptr %arrayidx
  %tmp8 = load double, ptr %arrayidx7
  %mul9 = fmul double %tmp8, %tmp4
  store double %mul9, ptr %arrayidx7
  %exitcond = icmp eq i64 %tmp16, %mul10
  br i1 %exitcond, label %for.end, label %latch

latch:
  br label %for.body

for.end:
  ret void
}