This patch adds metadata to the vectorized loop to disable runtime unrolling. If runtime unrolling/interleaving is considered profitable, LV will interleave the loop directly, so there should be no need to perform runtime unrolling at a later stage. Note that we already add metadata to disable runtime unrolling to the scalar loop after vectorization. The additional unrolling unnecessarily increases code size and compile time. In addition to that, we have several bug reports of unnecessary runtime unrolling for vectorized loops, e.g. PR40961. Compile-time improvements: NewPM-O3: -1.04%, NewPM-ReleaseThinLTO: -0.59%, NewPM-ReleaseLTO-g: -0.97%. https://llvm-compile-time-tracker.com/compare.php?from=ce1be13a868d0f8afa367975558c1a6175cce33a&to=78bc2e67f22e9e10e61cdb6cdac4bb857d95eb1b&stat=instructions:u Fixes #40306. Reviewed By: lebedev.ri, nikic. Differential Revision: https://reviews.llvm.org/D115261
109 lines
5.3 KiB
LLVM
109 lines
5.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s

; Little-endian layout with 64-bit pointers (the "e" and "p:64:64:64" fields).
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Vectorization with dependence checks.

; Check that a non-power-of-2 MaxVF, calculated based on maximum safe distance,
; does not lead fold-tail to think that no tail will be generated for any chosen
; (power of 2) VF.
; Dependence distance here is 3 iterations.
; The tiny trip count of 15 is a multiple of 3, but any (even) VF will have a tail.

;unsigned char a [15+3];
;void maxvf3(){
;  for (int j = 0; j < 15; ++j) {
;    a[j] = 69;
;    a[j+3] = 7;
;  }
;}

; Backing store for the C array sketched above: 15 iterations plus the +3
; offset of the second store give 18 one-byte elements.
@a = common local_unnamed_addr global [18 x i8] zeroinitializer, align 16

; void maxvf3(void): for j in [0, 15), store 69 to a[j] and 7 to a[j+3].
; The loop's back edge carries !llvm.loop !0.
;
; The assertions below expect a vectorization factor of 2 (<2 x i32> induction
; vector) with the tail folded into the vector loop: each store is predicated
; on a lane of the mask "VEC_IND ule 14", the vector loop exits once its index
; reaches 16, and the middle block's branch condition is the constant true, so
; control always continues to for.end.
;
; NOTE(review): both stores are marked "align 8" although they address single
; i8 elements at a variable index. The assertions mirror that alignment, so it
; is left as is -- confirm the over-alignment is intentional.
define void @maxvf3() {
; CHECK-LABEL: @maxvf3(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 14, i32 14>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP2]]
; CHECK-NEXT: store i8 69, ptr [[TMP3]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; CHECK: pred.store.if1:
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP5]]
; CHECK-NEXT: store i8 69, ptr [[TMP6]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
; CHECK: pred.store.continue2:
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <2 x i32> <i32 3, i32 3>, [[VEC_IND]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
; CHECK: pred.store.if3:
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP9]]
; CHECK-NEXT: store i8 7, ptr [[TMP10]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
; CHECK: pred.store.continue4:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.if5:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP12]]
; CHECK-NEXT: store i8 7, ptr [[TMP13]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[J]]
; CHECK-NEXT: store i8 69, ptr [[AJ]], align 8
; CHECK-NEXT: [[JP3:%.*]] = add nuw nsw i32 3, [[J]]
; CHECK-NEXT: [[AJP3:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[JP3]]
; CHECK-NEXT: store i8 7, ptr [[AJP3]], align 8
; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
  ; a[j] = 69
  %aj = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 %j
  store i8 69, ptr %aj, align 8
  ; a[j+3] = 7, three elements ahead of the store above
  %jp3 = add nuw nsw i32 3, %j
  %ajp3 = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 %jp3
  store i8 7, ptr %ajp3, align 8
  ; Leave the loop once the incremented counter equals 15.
  %j.next = add nuw nsw i32 %j, 1
  %exitcond = icmp eq i32 %j.next, 15
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret void
}

; Loop metadata referenced by the for.body back edge in @maxvf3: a
; self-referential loop ID whose only property sets
; llvm.loop.vectorize.enable to true.
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}