[VPlan] Simplify branch on False in VPlan transform (NFC). (#140409)

Simplify branch on false, starting with the branch from the middle block
to the scalar preheader. Initially this helps simplifying the initial
VPlan construction.

Depends on https://github.com/llvm/llvm-project/pull/140405.

PR: https://github.com/llvm/llvm-project/pull/140409
This commit is contained in:
Florian Hahn
2025-05-31 20:32:45 +01:00
committed by GitHub
parent 29f79ea3c5
commit 0f00a96fed
6 changed files with 105 additions and 78 deletions

View File

@@ -2380,10 +2380,12 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
// We just connected a new block to the scalar preheader. Update all
// VPPhis by adding an incoming value for it, replicating the last value.
unsigned NumPredecessors = ScalarPH->getNumPredecessors();
for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
auto *ResumePhi = cast<VPPhi>(&R);
ResumePhi->addOperand(
ResumePhi->getOperand(ResumePhi->getNumOperands() - 1));
assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
"must have incoming values for all operands");
R.addOperand(R.getOperand(NumPredecessors - 2));
}
}

View File

@@ -1136,6 +1136,10 @@ public:
return getAsRecipe()->getNumOperands();
}
/// Removes the incoming value for \p IncomingBlock, which must be a
/// predecessor.
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const;
@@ -3545,14 +3549,13 @@ template <> struct CastIsPossible<VPPhiAccessors, const VPRecipeBase *> {
};
/// Support casting from VPRecipeBase -> VPPhiAccessors, by down-casting to the
/// recipe types implementing VPPhiAccessors. Used by cast<>, dyn_cast<> & co.
template <>
struct CastInfo<VPPhiAccessors, const VPRecipeBase *>
: public CastIsPossible<VPPhiAccessors, const VPRecipeBase *> {
template <typename SrcTy>
struct CastInfoVPPhiAccessors : public CastIsPossible<VPPhiAccessors, SrcTy> {
using Self = CastInfo<VPPhiAccessors, const VPRecipeBase *>;
using Self = CastInfo<VPPhiAccessors, SrcTy>;
/// doCast is used by cast<>.
static inline VPPhiAccessors *doCast(const VPRecipeBase *R) {
static inline VPPhiAccessors *doCast(SrcTy R) {
return const_cast<VPPhiAccessors *>([R]() -> const VPPhiAccessors * {
switch (R->getVPDefID()) {
case VPDef::VPInstructionSC:
@@ -3568,12 +3571,18 @@ struct CastInfo<VPPhiAccessors, const VPRecipeBase *>
}
/// doCastIfPossible is used by dyn_cast<>.
static inline VPPhiAccessors *doCastIfPossible(const VPRecipeBase *f) {
static inline VPPhiAccessors *doCastIfPossible(SrcTy f) {
if (!Self::isPossible(f))
return nullptr;
return doCast(f);
}
};
template <>
struct CastInfo<VPPhiAccessors, VPRecipeBase *>
: CastInfoVPPhiAccessors<VPRecipeBase *> {};
template <>
struct CastInfo<VPPhiAccessors, const VPRecipeBase *>
: CastInfoVPPhiAccessors<const VPRecipeBase *> {};
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
/// holds a sequence of zero or more VPRecipe's each representing a sequence of

View File

@@ -501,8 +501,10 @@ void VPlanTransforms::prepareForVectorization(
cast<VPBasicBlock>(HeaderVPB),
cast<VPBasicBlock>(LatchVPB), Range);
HandledUncountableEarlyExit = true;
} else {
for (VPRecipeBase &R : EB->phis())
cast<VPIRPhi>(&R)->removeIncomingValueFor(Pred);
}
cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
VPBlockUtils::disconnectBlocks(Pred, EB);
}
@@ -526,32 +528,6 @@ void VPlanTransforms::prepareForVectorization(
VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader());
// If needed, add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop. Three cases:
// 1) If we require a scalar epilogue, there is no conditional branch as
// we unconditionally branch to the scalar preheader. Remove the recipes
// from the exit blocks.
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
// Thus if tail is to be folded, we know we don't need to run the
// remainder and we can set the condition to true.
// 3) Otherwise, construct a runtime check.
if (!RequiresScalarEpilogueCheck) {
if (auto *LatchExitVPB = MiddleVPBB->getSingleSuccessor())
VPBlockUtils::disconnectBlocks(MiddleVPBB, LatchExitVPB);
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
Plan.getEntry()->swapSuccessors();
// The exit blocks are unreachable, remove their recipes to make sure no
// users remain that may pessimize transforms.
for (auto *EB : Plan.getExitBlocks()) {
for (VPRecipeBase &R : make_early_inc_range(*EB))
R.eraseFromParent();
}
return;
}
// The connection order corresponds to the operands of the conditional branch,
// with the middle block already connected to the exit block.
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -561,21 +537,45 @@ void VPlanTransforms::prepareForVectorization(
VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
Plan.getEntry()->swapSuccessors();
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
// different line numbers and we want to avoid awkward line stepping while
// debugging. Eg. if the compare has got a line number inside the loop.
// If MiddleVPBB has a single successor then the original loop does not exit
// via the latch and the single successor must be the scalar preheader.
// There's no need to add a runtime check to MiddleVPBB.
if (MiddleVPBB->getNumSuccessors() == 1) {
assert(MiddleVPBB->getSingleSuccessor() == ScalarPH &&
"must have ScalarPH as single successor");
return;
}
assert(MiddleVPBB->getNumSuccessors() == 2 && "must have 2 successors");
// Add a check in the middle block to see if we have completed all of the
// iterations in the first vector loop.
//
// Three cases:
// 1) If we require a scalar epilogue, the scalar ph must execute. Set the
// condition to false.
// 2) If (N - N%VF) == N, then we *don't* need to run the
// remainder. Thus if tail is to be folded, we know we don't need to run
// the remainder and we can set the condition to true.
// 3) Otherwise, construct a runtime check.
// We use the same DebugLoc as the scalar loop latch terminator instead of
// the corresponding compare because they may have ended up with different
// line numbers and we want to avoid awkward line stepping while debugging.
// E.g., if the compare has got a line number inside the loop.
DebugLoc LatchDL = TheLoop->getLoopLatch()->getTerminator()->getDebugLoc();
VPBuilder Builder(MiddleVPBB);
VPValue *Cmp =
TailFolded
? Plan.getOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(TripCount->getType()->getContext())))
: Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(),
ScalarLatchTerm->getDebugLoc(), "cmp.n");
Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp},
ScalarLatchTerm->getDebugLoc());
VPValue *Cmp;
if (!RequiresScalarEpilogueCheck)
Cmp = Plan.getOrAddLiveIn(ConstantInt::getFalse(
IntegerType::getInt1Ty(TripCount->getType()->getContext())));
else if (TailFolded)
Cmp = Plan.getOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(TripCount->getType()->getContext())));
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(), LatchDL, "cmp.n");
Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp}, LatchDL);
}
void VPlanTransforms::createLoopRegions(VPlan &Plan) {

View File

@@ -1185,6 +1185,14 @@ void VPIRPhi::execute(VPTransformState &State) {
State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
}
void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
"Number of phi operands must match number of predecessors");
unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
R->removeOperand(Position);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
VPSlotTracker &SlotTracker) const {

View File

@@ -1841,40 +1841,37 @@ void VPlanTransforms::truncateToMinimalBitwidths(
}
}
/// Remove BranchOnCond recipes with true conditions together with removing
/// dead edges to their successors.
static void removeBranchOnCondTrue(VPlan &Plan) {
/// Remove BranchOnCond recipes with true or false conditions together with
/// removing dead edges to their successors.
static void removeBranchOnConst(VPlan &Plan) {
using namespace llvm::VPlanPatternMatch;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry()))) {
VPValue *Cond;
if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
!match(&VPBB->back(), m_BranchOnCond(m_True())))
!match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
continue;
VPBasicBlock *RemovedSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
unsigned DeadIdx = RemovedSucc->getIndexForPredecessor(VPBB);
unsigned RemovedIdx;
if (match(Cond, m_True()))
RemovedIdx = 1;
else if (match(Cond, m_False()))
RemovedIdx = 0;
else
continue;
// Values coming from VPBB into ResumePhi recipes of RemoveSucc are removed
// from these recipes.
for (VPRecipeBase &R : make_early_inc_range(*RemovedSucc)) {
assert((!isa<VPIRInstruction>(&R) ||
!isa<PHINode>(cast<VPIRInstruction>(&R)->getInstruction())) &&
!isa<VPHeaderPHIRecipe>(&R) &&
"Cannot update VPIRInstructions wrapping phis or header phis yet");
auto *VPI = dyn_cast<VPPhi>(&R);
if (!VPI)
break;
VPBuilder B(VPI);
SmallVector<VPValue *> NewOperands;
// Create new operand list, with the dead incoming value filtered out.
for (const auto &[Idx, Op] : enumerate(VPI->operands())) {
if (Idx == DeadIdx)
continue;
NewOperands.push_back(Op);
}
VPI->replaceAllUsesWith(
B.createScalarPhi(NewOperands, VPI->getDebugLoc(), VPI->getName()));
VPI->eraseFromParent();
VPBasicBlock *RemovedSucc =
cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
const auto &Preds = RemovedSucc->getPredecessors();
assert(count(Preds, VPBB) == 1 &&
"There must be a single edge between VPBB and its successor");
// Values coming from VPBB into phi recipes of RemoveSucc are removed from
// these recipes.
for (VPRecipeBase &R : RemovedSucc->phis()) {
auto *Phi = cast<VPPhiAccessors>(&R);
assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
"VPIRPhis must have a single predecessor");
Phi->removeIncomingValueFor(VPBB);
}
// Disconnect blocks and remove the terminator. RemovedSucc will be deleted
// automatically on VPlan destruction if it becomes unreachable.
@@ -1894,7 +1891,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
runPass(legalizeAndOptimizeInductions, Plan);
runPass(removeRedundantExpandSCEVRecipes, Plan);
runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
runPass(removeBranchOnCondTrue, Plan);
runPass(removeBranchOnConst, Plan);
runPass(removeDeadRecipes, Plan);
runPass(createAndOptimizeReplicateRegions, Plan);

View File

@@ -38,6 +38,7 @@ class VPSlotTracker;
class VPUser;
class VPRecipeBase;
class VPInterleaveRecipe;
class VPPhiAccessors;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
// flow into, within and out of the VPlan. VPValues can stand for live-ins
@@ -199,8 +200,18 @@ raw_ostream &operator<<(raw_ostream &OS, const VPRecipeBase &R);
/// This class augments VPValue with operands which provide the inverse def-use
/// edges from VPValue's users to their defs.
class VPUser {
/// Grant access to removeOperand for VPPhiAccessors, the only supported user.
friend class VPPhiAccessors;
SmallVector<VPValue *, 2> Operands;
/// Removes the operand at index \p Idx. This also removes the VPUser from the
/// use-list of the operand.
void removeOperand(unsigned Idx) {
getOperand(Idx)->removeUser(*this);
Operands.erase(Operands.begin() + Idx);
}
protected:
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the operands to \p O.