[LV] Vectorize (some) early and multiple exit loops

This patch is a major step towards supporting multiple exit loops in the vectorizer. On its own, it extends the allowed loop forms in two ways (both sketched below):

    single exit loops which are not bottom tested
    multiple exit loops with a single exit block reached from all exits, and no phis in the exit block (because of LCSSA, this implies no values defined in the loop are used after it)
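
As a rough illustration, the two newly allowed shapes in C++ source form (a hedged sketch: the function names are invented, and whether a loop actually reaches the vectorizer in one of these shapes depends on earlier passes such as loop rotation):

    // (1) A single-exit loop that is not bottom tested: the only exit test
    // sits at the top of the loop, not in the latch (cf. the early_exit
    // test below).
    void early_exit_like(short *p, int n) {
      for (int i = 0; i < n; i++)
        p[i] = 0;
    }

    // (2) Two exiting blocks, both branching to the same exit block, with no
    // value defined in the loop used afterwards (cf. the multiple_unique_exit
    // test below).
    void multiple_unique_exit_like(short *p, int n) {
      for (int i = 0; i < n; i++) {
        p[i] = 0;
        if (i >= 2096) // early exit to the same destination block
          break;
      }
    }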

The restrictions on multiple exit loop structures will be removed in follow-up patches; disallowing these cases for now keeps the code changes smaller and more obvious. As before, we can only handle loops with entirely analyzable exits. Removing that restriction is much harder, and is not part of currently planned efforts.

The basic idea here is that we can force the last iteration to run in the scalar epilogue loop (if we have one). From the definition of SCEV's backedge-taken count, we know that no earlier iteration can exit the vector body. As such, we can leave the decision of which exit to take to the scalar code, and generate a bottom-tested vector loop which runs all but the last iteration.
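
As a hedged structural sketch of that idea in C++ (illustrative only; computeVectorTripCount is a made-up stand-in for the logic in getOrCreateVectorTripCount, modeled further down):

    // Original loop, two analyzable exits, exact trip count TC known to SCEV:
    //   for (int i = 0; i < n; i++) { p[i] = 0; if (i >= 2096) break; }
    int computeVectorTripCount(int TC, int Step,
                               bool RequiresScalarEpilogue); // hypothetical,
                                                             // sketched below
    void transformed(short *p, int n, int TC) {
      int VecTC = computeVectorTripCount(TC, 2, true); // strictly less than TC
      for (int i = 0; i < VecTC; i += 2) { // bottom-tested vector body (VF=2)
        p[i] = 0;                          // stands in for one <2 x i16> store;
        p[i + 1] = 0;                      // safe, since no iteration before
      }                                    // TC - 1 can exit
      for (int i = VecTC;; i++) {          // scalar epilogue makes the actual
        if (i >= n) return;                // exit decisions
        p[i] = 0;
        if (i >= 2096) return;
      }
    }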

The existing code already had the notion of requiring one iteration in the scalar epilogue; this patch mainly generalizes that support slightly, makes sure we don't try to use this mechanism when tail folding, and updates the code to reflect the difference between a single exit block and a unique exit block (a very mechanical change).

Differential Revision: https://reviews.llvm.org/D93317
Philip Reames
2020-12-28 09:39:09 -08:00
parent dcd21572f9
commit e4df6a40da
5 changed files with 514 additions and 50 deletions


@@ -1095,9 +1095,15 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
     return false;
   }
-  // We must have a single exiting block.
-  if (!Lp->getExitingBlock()) {
-    reportVectorizationFailure("The loop must have an exiting block",
+  // We currently must have a single "exit block" after the loop. Note that
+  // multiple "exiting blocks" inside the loop are allowed, provided they all
+  // reach the single exit block.
+  // TODO: This restriction can be relaxed in the near future, it's here solely
+  // to allow separation of changes for review. We need to generalize the phi
+  // update logic in a number of places.
+  BasicBlock *ExitBB = Lp->getUniqueExitBlock();
+  if (!ExitBB) {
+    reportVectorizationFailure("The loop must have a unique exit block",
         "loop control flow is not understood by vectorizer",
         "CFGNotUnderstood", ORE, TheLoop);
     if (DoExtraAnalysis)
@@ -1106,11 +1112,14 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
     return false;
   }
-  // We only handle bottom-tested loops, i.e. loop in which the condition is
-  // checked at the end of each iteration. With that we can assume that all
-  // instructions in the loop are executed the same number of times.
-  if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
-    reportVectorizationFailure("The exiting block is not the loop latch",
+  // The existing code assumes that LCSSA implies that phis are single entry
+  // (which was true when we had at most a single exiting edge from the latch).
+  // In general, there's nothing which prevents an LCSSA phi in exit block from
+  // having two or more values if there are multiple exiting edges leading to
+  // the exit block. (TODO: implement general case)
+  if (!empty(ExitBB->phis()) && !ExitBB->getSinglePredecessor()) {
+    reportVectorizationFailure("The loop must have no live-out values if "
+                               "it has more than one exiting block",
         "loop control flow is not understood by vectorizer",
         "CFGNotUnderstood", ORE, TheLoop);
     if (DoExtraAnalysis)
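
For example, a loop that is still rejected by this check (a hedged C++ rendering of the multiple_unique_exit2 test below): the final value of i is used after the loop, so the unique exit block carries an LCSSA phi with one incoming value per exiting edge.

    int live_out_example(short *p, int n) {
      int i = 0;
      for (; i < n; i++) {   // first exiting edge, from the header
        p[i] = 0;
        if (i >= 2096)
          break;             // second exiting edge, from the latch
      }
      return i; // live-out value: a two-entry LCSSA phi in the exit block
    }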


@@ -837,7 +837,8 @@ protected:
   /// Middle Block between the vector and the scalar.
   BasicBlock *LoopMiddleBlock;
-  /// The ExitBlock of the scalar loop.
+  /// The (unique) ExitBlock of the scalar loop. Note that
+  /// there can be multiple exiting edges reaching this block.
   BasicBlock *LoopExitBlock;
   /// The vector loop body.
@@ -1548,11 +1549,16 @@ public:
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
-  /// Returns true if an interleaved group requires a scalar iteration
-  /// to handle accesses with gaps, and there is nothing preventing us from
-  /// creating a scalar epilogue.
+  /// Returns true if we're required to use a scalar epilogue for at least
+  /// the final iteration of the original loop.
   bool requiresScalarEpilogue() const {
-    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
+    if (!isScalarEpilogueAllowed())
+      return false;
+    // If we might exit from anywhere but the latch, must run the exiting
+    // iteration in scalar form.
+    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+      return true;
+    return InterleaveInfo.requiresScalarEpilogue();
   }
   /// Returns true if a scalar epilogue is not allowed due to optsize or a
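
(A usage note, hedged against the tests below: for the early_exit test the epilogue is allowed and the sole exit is not in the latch, so this now returns true; for the optsize test, isScalarEpilogueAllowed() is false, this returns false, and the loop is rejected later in computeMaxVF because it cannot be tail folded either.)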
@@ -2912,7 +2918,7 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
   Induction->addIncoming(Next, Latch);
   // Create the compare.
   Value *ICmp = Builder.CreateICmpEQ(Next, End);
-  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
+  Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
   // Now we have two terminators. Remove the old one from the block.
   Latch->getTerminator()->eraseFromParent();
@@ -3000,13 +3006,17 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
   // unroll factor (number of SIMD instructions).
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-  // If there is a non-reversed interleaved group that may speculatively access
-  // memory out-of-bounds, we need to ensure that there will be at least one
-  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+  // There are two cases where we need to ensure (at least) the last iteration
+  // runs in the scalar remainder loop. Thus, if the step evenly divides
   // the trip count, we set the remainder to be equal to the step. If the step
   // does not evenly divide the trip count, no adjustment is necessary since
   // there will already be scalar iterations. Note that the minimum iterations
-  // check ensures that N >= Step.
+  // check ensures that N >= Step. The cases are:
+  // 1) If there is a non-reversed interleaved group that may speculatively
+  //    access memory out-of-bounds.
+  // 2) If any instruction may follow a conditionally taken exit. That is, if
+  //    the loop contains multiple exiting blocks, or a single exiting block
+  //    which is not the latch.
   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
     R = Builder.CreateSelect(IsZero, Step, R);
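
A minimal standalone model of this remainder adjustment (an assumption-laden C++ sketch mirroring the IR the function emits; it also serves as the hypothetical computeVectorTripCount used in the commit-message sketch above):

    // Models n.mod.vf / n.vec; TC is the trip count, Step is VF * UF.
    int computeVectorTripCount(int TC, int Step,
                               bool RequiresScalarEpilogue) {
      int R = TC % Step;                    // n.mod.vf
      if (RequiresScalarEpilogue && R == 0)
        R = Step;                           // force a scalar tail so the last,
                                            // possibly exiting, iteration runs
                                            // in the scalar loop
      return TC - R;                        // n.vec; the minimum-iterations
                                            // check guarantees TC >= Step
    }

For example, with TC = 10 and Step = 4, R = 2 and the vector loop covers 8 iterations; with TC = 8 the remainder would be 0, so R is forced to 4 and the vector loop covers only 4, leaving the final 4 iterations, including the exiting one, to the scalar loop.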
@@ -3301,7 +3311,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopScalarBody = OrigLoop->getHeader();
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
-  LoopExitBlock = OrigLoop->getExitBlock();
+  LoopExitBlock = OrigLoop->getUniqueExitBlock();
   assert(LoopExitBlock && "Must have an exit block");
   assert(LoopVectorPreHeader && "Invalid loop structure");
@@ -3572,7 +3582,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   // value (the value that feeds into the phi from the loop latch).
   // We allow both, but they, obviously, have different values.
-  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
+  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
   DenseMap<Value *, Value *> MissingVals;
@@ -5490,11 +5500,28 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     // for size.
     if (runtimeChecksRequired())
       return None;
     break;
   }
-  // Now try the tail folding
+  // The only loops we can vectorize without a scalar epilogue, are loops with
+  // a bottom-test and a single exiting block. We'd have to handle the fact
+  // that not every instruction executes on the last iteration. This will
+  // require a lane mask which varies through the vector loop body. (TODO)
+  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+    // If there was a tail-folding hint/switch, but we can't fold the tail by
+    // masking, fallback to a vectorization with a scalar epilogue.
+    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
+                           "scalar epilogue instead.\n");
+      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+      return MaxVF;
+    }
+    return None;
+  }
+
+  // Now try the tail folding
   // Invalidate interleave groups that require an epilogue if we can't mask
   // the interleave-group.
   if (!useMaskedInterleavedAccesses(TTI)) {
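
A worked illustration of the lane-mask problem noted in the TODO above (hedged, using the early_exit shape with VF = 2): with n = 3, the header's exit compare executes on iterations i = 0..3 while the store executes only on i = 0..2. A folded vector iteration covering i = {2, 3} would need the mask <1, 1> for the compare but <1, 0> for the store, i.e. a predicate that changes between instructions within one vector iteration, which the current folding scheme, with one mask per lane per iteration, cannot express.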
@@ -7921,6 +7948,12 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
     return EdgeMaskCache[Edge] = SrcMask;
+  // If source is an exiting block, we know the exit edge is dynamically dead
+  // in the vector loop, and thus we don't need to restrict the mask. Avoid
+  // adding uses of an otherwise potentially dead instruction.
+  if (OrigLoop->isLoopExiting(Src))
+    return EdgeMaskCache[Edge] = SrcMask;
+
   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
   assert(EdgeMask && "No Edge Mask found for condition");
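
A toy two-lane model of why this is safe (purely illustrative C++, not VPlan code): inside the vector body the exit edge is never taken, so masking the fall-through edge with the negated exit condition would only add a use of an otherwise dead value; the successor's block-in mask is simply the source block's mask, as in the scalar_predication test below.

    #include <cstdio>

    int main() {
      const int VF = 2;
      float lane[VF] = {0.0f, 3.0f};        // values the two lanes loaded
      bool blockIn[VF] = {true, true};      // body mask: all lanes active,
                                            // because the exit edge is dead
                                            // inside the vector body
      for (int l = 0; l < VF; ++l) {
        bool pred = lane[l] == 0.0f;        // the in-loop fcmp
        bool storeMask = blockIn[l] && !pred; // mask for the predicated store
        std::printf("lane %d: store %s\n", l, storeMask ? "taken" : "skipped");
      }
      return 0;
    }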


@@ -10,7 +10,7 @@
 ;  return 0;
 ; }
-; CHECK: remark: source.cpp:5:9: loop not vectorized: loop control flow is not understood by vectorizer
+; CHECK: remark: source.cpp:5:9: loop not vectorized: could not determine number of loop iterations
 ; CHECK: remark: source.cpp:5:9: loop not vectorized
 ; CHECK: _Z4testPii


@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -force-vector-width=2 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck --check-prefix TAILFOLD %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @bottom_tested(i16* %p, i32 %n) {
@@ -42,6 +44,63 @@ define void @bottom_tested(i16* %p, i32 %n) {
; CHECK: if.end:
; CHECK-NEXT: ret void
;
; TAILFOLD-LABEL: @bottom_tested(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0
; TAILFOLD-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0
; TAILFOLD-NEXT: [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1
; TAILFOLD-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; TAILFOLD: vector.ph:
; TAILFOLD-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
; TAILFOLD-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
; TAILFOLD-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
; TAILFOLD-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
; TAILFOLD-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; TAILFOLD-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
; TAILFOLD-NEXT: br label [[VECTOR_BODY:%.*]]
; TAILFOLD: vector.body:
; TAILFOLD-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; TAILFOLD-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
; TAILFOLD-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
; TAILFOLD-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 1
; TAILFOLD-NEXT: [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; TAILFOLD-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64>
; TAILFOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
; TAILFOLD-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; TAILFOLD: pred.store.if:
; TAILFOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
; TAILFOLD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP7]]
; TAILFOLD-NEXT: store i16 0, i16* [[TMP8]], align 4
; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE]]
; TAILFOLD: pred.store.continue:
; TAILFOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
; TAILFOLD-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
; TAILFOLD: pred.store.if1:
; TAILFOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
; TAILFOLD-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP10]]
; TAILFOLD-NEXT: store i16 0, i16* [[TMP11]], align 4
; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE2]]
; TAILFOLD: pred.store.continue2:
; TAILFOLD-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; TAILFOLD-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; TAILFOLD-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; TAILFOLD-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; TAILFOLD: middle.block:
; TAILFOLD-NEXT: br i1 true, label [[IF_END:%.*]], label [[SCALAR_PH]]
; TAILFOLD: scalar.ph:
; TAILFOLD-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ]
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], [[LOOP2:!llvm.loop !.*]]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: ret void
;
entry:
br label %for.cond
@@ -61,6 +120,90 @@ if.end:
define void @early_exit(i16* %p, i32 %n) {
; CHECK-LABEL: @early_exit(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[SMAX]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP1]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 2, i32 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <2 x i16>*
; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
; CHECK: for.body:
; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
; CHECK-NEXT: store i16 0, i16* [[B]], align 4
; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
; CHECK-NEXT: br label [[FOR_COND]], [[LOOP5:!llvm.loop !.*]]
; CHECK: if.end:
; CHECK-NEXT: ret void
;
; TAILFOLD-LABEL: @early_exit(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]]
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: br label [[FOR_COND]]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: ret void
;
entry:
br label %for.cond
for.cond:
%i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp slt i32 %i, %n
br i1 %cmp, label %for.body, label %if.end
for.body:
%iprom = sext i32 %i to i64
%b = getelementptr inbounds i16, i16* %p, i64 %iprom
store i16 0, i16* %b, align 4
%inc = add nsw i32 %i, 1
br label %for.cond
if.end:
ret void
}
; Same as early_exit, but with optsize to prevent the use of
; a scalar epilogue. -- Can't vectorize this in either case.
define void @optsize(i16* %p, i32 %n) optsize {
; CHECK-LABEL: @optsize(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
@@ -75,6 +218,22 @@ define void @early_exit(i16* %p, i32 %n) {
; CHECK: if.end:
; CHECK-NEXT: ret void
;
; TAILFOLD-LABEL: @optsize(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]]
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: br label [[FOR_COND]]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: ret void
;
entry:
br label %for.cond
@@ -99,21 +258,70 @@ if.end:
define void @multiple_unique_exit(i16* %p, i32 %n) {
; CHECK-LABEL: @multiple_unique_exit(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[SMAX]], 2096
; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP1]], i32 [[SMAX]], i32 2096
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[UMIN]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 2, i32 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[TMP4]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <2 x i16>*
; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_COND:%.*]]
 ; CHECK: for.cond:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
-; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
+; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
 ; CHECK-NEXT: store i16 0, i16* [[B]], align 4
 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]]
+; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP7:!llvm.loop !.*]]
; CHECK: if.end:
; CHECK-NEXT: ret void
;
; TAILFOLD-LABEL: @multiple_unique_exit(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]]
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: ret void
;
entry:
br label %for.cond
@@ -154,6 +362,24 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
; CHECK-NEXT: [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[FOR_BODY]] ], [ [[I]], [[FOR_COND]] ]
; CHECK-NEXT: ret i32 [[I_LCSSA]]
;
; TAILFOLD-LABEL: @multiple_unique_exit2(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]]
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[FOR_BODY]] ], [ [[I]], [[FOR_COND]] ]
; TAILFOLD-NEXT: ret i32 [[I_LCSSA]]
;
entry:
br label %for.cond
@@ -194,6 +420,24 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
; CHECK-NEXT: [[EXIT:%.*]] = phi i32 [ 0, [[FOR_COND]] ], [ 1, [[FOR_BODY]] ]
; CHECK-NEXT: ret i32 [[EXIT]]
;
; TAILFOLD-LABEL: @multiple_unique_exit3(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]]
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: [[EXIT:%.*]] = phi i32 [ 0, [[FOR_COND]] ], [ 1, [[FOR_BODY]] ]
; TAILFOLD-NEXT: ret i32 [[EXIT]]
;
entry:
br label %for.cond
@@ -236,6 +480,25 @@ define i32 @multiple_exit_blocks(i16* %p, i32 %n) {
; CHECK: if.end2:
; CHECK-NEXT: ret i32 1
;
; TAILFOLD-LABEL: @multiple_exit_blocks(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N:%.*]]
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
; TAILFOLD-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END2:%.*]]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: ret i32 0
; TAILFOLD: if.end2:
; TAILFOLD-NEXT: ret i32 1
;
entry:
br label %for.cond
@@ -279,6 +542,23 @@ define i32 @multiple_exit_switch(i16* %p, i32 %n) {
; CHECK-NEXT: [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[FOR_COND]] ], [ [[I]], [[FOR_COND]] ]
; CHECK-NEXT: ret i32 [[I_LCSSA]]
;
; TAILFOLD-LABEL: @multiple_exit_switch(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ]
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: switch i32 [[I]], label [[FOR_COND]] [
; TAILFOLD-NEXT: i32 2096, label [[IF_END:%.*]]
; TAILFOLD-NEXT: i32 2097, label [[IF_END]]
; TAILFOLD-NEXT: ]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[FOR_COND]] ], [ [[I]], [[FOR_COND]] ]
; TAILFOLD-NEXT: ret i32 [[I_LCSSA]]
;
entry:
br label %for.cond
@@ -318,6 +598,24 @@ define i32 @multiple_exit_switch2(i16* %p, i32 %n) {
; CHECK: if.end2:
; CHECK-NEXT: ret i32 1
;
; TAILFOLD-LABEL: @multiple_exit_switch2(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_COND:%.*]]
; TAILFOLD: for.cond:
; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ]
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1
; TAILFOLD-NEXT: switch i32 [[I]], label [[FOR_COND]] [
; TAILFOLD-NEXT: i32 2096, label [[IF_END:%.*]]
; TAILFOLD-NEXT: i32 2097, label [[IF_END2:%.*]]
; TAILFOLD-NEXT: ]
; TAILFOLD: if.end:
; TAILFOLD-NEXT: ret i32 0
; TAILFOLD: if.end2:
; TAILFOLD-NEXT: ret i32 1
;
entry:
br label %for.cond
@@ -359,6 +657,25 @@ define i32 @multiple_latch1(i16* %p) {
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
; TAILFOLD-LABEL: @multiple_latch1(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_BODY:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[I_02:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY_BACKEDGE:%.*]] ]
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I_02]], 1
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 16
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY_BACKEDGE]], label [[FOR_SECOND:%.*]]
; TAILFOLD: for.second:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I_02]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16
; TAILFOLD-NEXT: br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]]
; TAILFOLD: for.body.backedge:
; TAILFOLD-NEXT: br label [[FOR_BODY]]
; TAILFOLD: for.end:
; TAILFOLD-NEXT: ret i32 0
;
entry:
br label %for.body
@@ -405,6 +722,25 @@ define i32 @multiple_latch2(i16* %p) {
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
; TAILFOLD-LABEL: @multiple_latch2(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[FOR_BODY:%.*]]
; TAILFOLD: for.body:
; TAILFOLD-NEXT: [[I_02:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY_BACKEDGE:%.*]] ]
; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I_02]], 1
; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 16
; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_BODY_BACKEDGE]], label [[FOR_SECOND:%.*]]
; TAILFOLD: for.body.backedge:
; TAILFOLD-NEXT: br label [[FOR_BODY]]
; TAILFOLD: for.second:
; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I_02]] to i64
; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[IPROM]]
; TAILFOLD-NEXT: store i16 0, i16* [[B]], align 4
; TAILFOLD-NEXT: [[CMPS:%.*]] = icmp sgt i32 [[INC]], 16
; TAILFOLD-NEXT: br i1 [[CMPS]], label [[FOR_BODY_BACKEDGE]], label [[FOR_END:%.*]]
; TAILFOLD: for.end:
; TAILFOLD-NEXT: ret i32 0
;
entry:
br label %for.body
@@ -425,4 +761,111 @@ for.end:
ret i32 0
}
declare void @foo()
; Check interaction between block predication and early exits. We need the
; condition on the early exit to remain dead (i.e. not be used when forming
; the predicate mask).
define void @scalar_predication(float* %addr) {
; CHECK-LABEL: @scalar_predication(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fcmp oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: store float 1.000000e+01, float* [[TMP1]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
; CHECK: pred.store.if1:
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[ADDR]], i64 [[TMP8]]
; CHECK-NEXT: store float 1.000000e+01, float* [[TMP9]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
; CHECK: pred.store.continue2:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 201, 200
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, float* [[ADDR]], i64 [[IV]]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_BODY:%.*]]
; CHECK: loop.body:
; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[GEP]], align 4
; CHECK-NEXT: [[PRED:%.*]] = fcmp oeq float [[TMP11]], 0.000000e+00
; CHECK-NEXT: br i1 [[PRED]], label [[LOOP_LATCH]], label [[THEN:%.*]]
; CHECK: then:
; CHECK-NEXT: store float 1.000000e+01, float* [[GEP]], align 4
; CHECK-NEXT: br label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: br label [[LOOP_HEADER]], [[LOOP9:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; TAILFOLD-LABEL: @scalar_predication(
; TAILFOLD-NEXT: entry:
; TAILFOLD-NEXT: br label [[LOOP_HEADER:%.*]]
; TAILFOLD: loop.header:
; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr float, float* [[ADDR:%.*]], i64 [[IV]]
; TAILFOLD-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
; TAILFOLD-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]]
; TAILFOLD: loop.body:
; TAILFOLD-NEXT: [[TMP0:%.*]] = load float, float* [[GEP]], align 4
; TAILFOLD-NEXT: [[PRED:%.*]] = fcmp oeq float [[TMP0]], 0.000000e+00
; TAILFOLD-NEXT: br i1 [[PRED]], label [[LOOP_LATCH]], label [[THEN:%.*]]
; TAILFOLD: then:
; TAILFOLD-NEXT: store float 1.000000e+01, float* [[GEP]], align 4
; TAILFOLD-NEXT: br label [[LOOP_LATCH]]
; TAILFOLD: loop.latch:
; TAILFOLD-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TAILFOLD-NEXT: br label [[LOOP_HEADER]]
; TAILFOLD: exit:
; TAILFOLD-NEXT: ret void
;
entry:
br label %loop.header
loop.header:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
%gep = getelementptr float, float* %addr, i64 %iv
%exitcond.not = icmp eq i64 %iv, 200
br i1 %exitcond.not, label %exit, label %loop.body
loop.body:
%0 = load float, float* %gep, align 4
%pred = fcmp oeq float %0, 0.0
br i1 %pred, label %loop.latch, label %then
then:
store float 10.0, float* %gep, align 4
br label %loop.latch
loop.latch:
%iv.next = add nuw nsw i64 %iv, 1
br label %loop.header
exit:
ret void
}


@@ -1,30 +1,9 @@
 ; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -S -disable-output 2>&1 | FileCheck %s
 ; REQUIRES: asserts
-; Make sure LV legal bails out when the exiting block != loop latch.
-; CHECK-LABEL: "latch_is_not_exiting"
-; CHECK: LV: Not vectorizing: The exiting block is not the loop latch.
-define i32 @latch_is_not_exiting() {
-entry:
-  br label %for.body
-for.body:
-  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ], [%inc, %for.second]
-  %inc = add nsw i32 %i.02, 1
-  %cmp = icmp slt i32 %inc, 16
-  br i1 %cmp, label %for.body, label %for.second
-for.second:
-  %cmps = icmp sgt i32 %inc, 16
-  br i1 %cmps, label %for.body, label %for.end
-for.end:
-  ret i32 0
-}
-
 ; Make sure LV legal bails out when there is no exiting block
 ; CHECK-LABEL: "no_exiting_block"
-; CHECK: LV: Not vectorizing: The loop must have an exiting block.
+; CHECK: LV: Not vectorizing: The loop must have a unique exit block.
 define i32 @no_exiting_block() {
 entry:
   br label %for.body