[VPlan] Build initial VPlan 0 using HCFGBuilder for inner loops. (NFC) (#124432)
Use HCFGBuilder to build an initial VPlan 0, which wraps all input instructions in VPInstructions and update tryToBuildVPlanWithVPRecipes to replace the VPInstructions with widened recipes. At the moment, widened recipes are created based on the underlying instruction of the VPInstruction. Masks are also still created based on the input IR basic blocks and the loop CFG is flattened in the main loop processing the VPInstructions. This patch also includes support for Switch instructions in HCFGBuilder using just a VPInstruction with Instruction::Switch opcode. There are multiple follow-ups planned: * Perform predication on the VPlan directly, * Unify code constructing VPlan 0 to be shared by both inner and outer loop code paths. * Construct VPlan 0 once, clone subsequent ones for VFs PR: https://github.com/llvm/llvm-project/pull/124432
This commit is contained in:
@@ -9298,6 +9298,7 @@ static void addExitUsersForFirstOrderRecurrences(
|
||||
VPlanPtr
|
||||
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
|
||||
|
||||
using namespace llvm::VPlanPatternMatch;
|
||||
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -9321,6 +9322,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
|
||||
PSE, RequiresScalarEpilogueCheck,
|
||||
CM.foldTailByMasking(), OrigLoop);
|
||||
|
||||
// Build hierarchical CFG.
|
||||
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
|
||||
HCFGBuilder.buildHierarchicalCFG();
|
||||
|
||||
// Don't use getDecisionAndClampRange here, because we don't know the UF
|
||||
// so this function is better to be conservative, rather than to split
|
||||
// it up into different VPlans.
|
||||
@@ -9371,12 +9376,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
|
||||
// Construct recipes for the instructions in the loop
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Scan the body of the loop in a topological order to visit each basic block
|
||||
// after having visited its predecessor basic blocks.
|
||||
LoopBlocksDFS DFS(OrigLoop);
|
||||
DFS.perform(LI);
|
||||
|
||||
VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
|
||||
VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
|
||||
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
|
||||
VPBasicBlock *VPBB = HeaderVPBB;
|
||||
BasicBlock *HeaderBB = OrigLoop->getHeader();
|
||||
bool NeedsMasks =
|
||||
@@ -9389,26 +9390,70 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
|
||||
RecipeBuilder.collectScaledReductions(Range);
|
||||
|
||||
auto *MiddleVPBB = Plan->getMiddleBlock();
|
||||
|
||||
// Scan the body of the loop in a topological order to visit each basic block
|
||||
// after having visited its predecessor basic blocks.
|
||||
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
|
||||
HeaderVPBB);
|
||||
|
||||
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
|
||||
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
|
||||
// Relevant instructions from basic block BB will be grouped into VPRecipe
|
||||
// ingredients and fill a new VPBasicBlock.
|
||||
if (VPBB != HeaderVPBB)
|
||||
VPBB->setName(BB->getName());
|
||||
Builder.setInsertPoint(VPBB);
|
||||
VPBlockBase *PrevVPBB = nullptr;
|
||||
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
|
||||
// Handle VPBBs down to the latch.
|
||||
if (VPBB == LoopRegion->getExiting()) {
|
||||
assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
|
||||
"the latch block shouldn't have a corresponding IRBB");
|
||||
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
|
||||
break;
|
||||
}
|
||||
|
||||
if (VPBB == HeaderVPBB)
|
||||
// Create mask based on the IR BB corresponding to VPBB.
|
||||
// TODO: Predicate directly based on VPlan.
|
||||
Builder.setInsertPoint(VPBB, VPBB->begin());
|
||||
if (VPBB == HeaderVPBB) {
|
||||
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
|
||||
RecipeBuilder.createHeaderMask();
|
||||
else if (NeedsMasks)
|
||||
RecipeBuilder.createBlockInMask(BB);
|
||||
} else if (NeedsMasks) {
|
||||
// FIXME: At the moment, masks need to be placed at the beginning of the
|
||||
// block, as blends introduced for phi nodes need to use it. The created
|
||||
// blends should be sunk after the mask recipes.
|
||||
RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
|
||||
}
|
||||
|
||||
// Introduce each ingredient into VPlan.
|
||||
// TODO: Model and preserve debug intrinsics in VPlan.
|
||||
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
|
||||
Instruction *Instr = &I;
|
||||
// Convert input VPInstructions to widened recipes.
|
||||
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
|
||||
auto *SingleDef = cast<VPSingleDefRecipe>(&R);
|
||||
auto *UnderlyingValue = SingleDef->getUnderlyingValue();
|
||||
// Skip recipes that do not need transforming, including canonical IV,
|
||||
// wide canonical IV and VPInstructions without underlying values. The
|
||||
// latter are added above for masking.
|
||||
// FIXME: Migrate code relying on the underlying instruction from VPlan0
|
||||
// to construct recipes below to not use the underlying instruction.
|
||||
if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
|
||||
(isa<VPInstruction>(&R) && !UnderlyingValue))
|
||||
continue;
|
||||
|
||||
// FIXME: VPlan0, which models a copy of the original scalar loop, should
|
||||
// not use VPWidenPHIRecipe to model the phis.
|
||||
assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
|
||||
UnderlyingValue && "unsupported recipe");
|
||||
|
||||
if (isa<VPInstruction>(&R) &&
|
||||
(cast<VPInstruction>(&R)->getOpcode() ==
|
||||
VPInstruction::BranchOnCond ||
|
||||
(cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
|
||||
R.eraseFromParent();
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: Gradually replace uses of underlying instruction by analyses on
|
||||
// VPlan.
|
||||
Instruction *Instr = cast<Instruction>(UnderlyingValue);
|
||||
Builder.setInsertPoint(SingleDef);
|
||||
SmallVector<VPValue *, 4> Operands;
|
||||
auto *Phi = dyn_cast<PHINode>(Instr);
|
||||
if (Phi && Phi->getParent() == HeaderBB) {
|
||||
// The backedge value will be added in fixHeaderPhis later.
|
||||
Operands.push_back(Plan->getOrAddLiveIn(
|
||||
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
|
||||
} else {
|
||||
@@ -9420,15 +9465,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
|
||||
// in the exit block, a uniform store recipe will be created for the final
|
||||
// invariant store of the reduction.
|
||||
StoreInst *SI;
|
||||
if ((SI = dyn_cast<StoreInst>(&I)) &&
|
||||
if ((SI = dyn_cast<StoreInst>(Instr)) &&
|
||||
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
|
||||
// Only create recipe for the final invariant store of the reduction.
|
||||
if (!Legal->isInvariantStoreOfReduction(SI))
|
||||
continue;
|
||||
auto *Recipe = new VPReplicateRecipe(
|
||||
SI, make_range(Operands.begin(), Operands.end()),
|
||||
true /* IsUniform */);
|
||||
Recipe->insertBefore(*MiddleVPBB, MBIP);
|
||||
if (Legal->isInvariantStoreOfReduction(SI)) {
|
||||
auto *Recipe = new VPReplicateRecipe(
|
||||
SI, make_range(Operands.begin(), Operands.end()),
|
||||
true /* IsUniform */);
|
||||
Recipe->insertBefore(*MiddleVPBB, MBIP);
|
||||
}
|
||||
R.eraseFromParent();
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -9438,25 +9484,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
|
||||
Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
|
||||
|
||||
RecipeBuilder.setRecipe(Instr, Recipe);
|
||||
if (isa<VPHeaderPHIRecipe>(Recipe)) {
|
||||
// VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
|
||||
// the following cases, VPHeaderPHIRecipes may be created after non-phi
|
||||
// recipes and need to be moved to the phi section of HeaderVPBB:
|
||||
// * tail-folding (non-phi recipes computing the header mask are
|
||||
// introduced earlier than regular header phi recipes, and should appear
|
||||
// after them)
|
||||
// * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
|
||||
|
||||
assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
|
||||
CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
|
||||
"unexpected recipe needs moving");
|
||||
if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
|
||||
// Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
|
||||
// moved to the phi section in the header.
|
||||
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
|
||||
} else
|
||||
VPBB->appendRecipe(Recipe);
|
||||
} else {
|
||||
Builder.insert(Recipe);
|
||||
}
|
||||
if (Recipe->getNumDefinedValues() == 1)
|
||||
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
|
||||
else
|
||||
assert(Recipe->getNumDefinedValues() == 0 &&
|
||||
"Unexpected multidef recipe");
|
||||
R.eraseFromParent();
|
||||
}
|
||||
|
||||
VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
|
||||
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
|
||||
// Flatten the CFG in the loop. Masks for blocks have already been generated
|
||||
// and added to recipes as needed. To do so, first disconnect VPBB from its
|
||||
// successors. Then connect VPBB to the previously visited VPBB.
|
||||
for (auto *Succ : to_vector(VPBB->getSuccessors()))
|
||||
VPBlockUtils::disconnectBlocks(VPBB, Succ);
|
||||
if (PrevVPBB)
|
||||
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
|
||||
PrevVPBB = VPBB;
|
||||
}
|
||||
|
||||
// After here, VPBB should not be used.
|
||||
|
||||
@@ -600,19 +600,28 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
|
||||
}
|
||||
|
||||
const VPRecipeBase *R = &VPBB->back();
|
||||
bool IsSwitch = isa<VPInstruction>(R) &&
|
||||
cast<VPInstruction>(R)->getOpcode() == Instruction::Switch;
|
||||
bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
|
||||
match(R, m_BranchOnCond(m_VPValue())) ||
|
||||
match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
|
||||
(void)IsCondBranch;
|
||||
|
||||
if (VPBB->getNumSuccessors() >= 2 ||
|
||||
(void)IsSwitch;
|
||||
if (VPBB->getNumSuccessors() == 2 ||
|
||||
(VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
|
||||
assert(IsCondBranch && "block with multiple successors not terminated by "
|
||||
"conditional branch recipe");
|
||||
assert((IsCondBranch || IsSwitch) &&
|
||||
"block with multiple successors not terminated by "
|
||||
"conditional branch nor switch recipe");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (VPBB->getNumSuccessors() > 2) {
|
||||
assert(IsSwitch && "block with more than 2 successors not terminated by "
|
||||
"a switch recipe");
|
||||
return true;
|
||||
}
|
||||
|
||||
assert(
|
||||
!IsCondBranch &&
|
||||
"block with 0 or 1 successors terminated by conditional branch recipe");
|
||||
|
||||
@@ -75,7 +75,7 @@ public:
|
||||
: TheLoop(Lp), LI(LI), Plan(P) {}
|
||||
|
||||
/// Build plain CFG for TheLoop and connects it to Plan's entry.
|
||||
void buildPlainCFG();
|
||||
void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
@@ -242,10 +242,10 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
|
||||
// Instruction definition is in outermost loop PH.
|
||||
return false;
|
||||
|
||||
// Check whether Instruction definition is in the loop exit.
|
||||
BasicBlock *Exit = TheLoop->getUniqueExitBlock();
|
||||
assert(Exit && "Expected loop with single exit.");
|
||||
if (InstParent == Exit) {
|
||||
// Check whether Instruction definition is in a loop exit.
|
||||
SmallVector<BasicBlock *> ExitBlocks;
|
||||
TheLoop->getExitBlocks(ExitBlocks);
|
||||
if (is_contained(ExitBlocks, InstParent)) {
|
||||
// Instruction definition is in outermost loop exit.
|
||||
return false;
|
||||
}
|
||||
@@ -288,6 +288,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
|
||||
void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
|
||||
BasicBlock *BB) {
|
||||
VPIRBuilder.setInsertPoint(VPBB);
|
||||
// TODO: Model and preserve debug intrinsics in VPlan.
|
||||
for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
|
||||
Instruction *Inst = &InstRef;
|
||||
|
||||
@@ -313,6 +314,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
|
||||
SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
|
||||
for (auto Case : SI->cases())
|
||||
Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
|
||||
VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
|
||||
continue;
|
||||
}
|
||||
|
||||
VPValue *NewVPV;
|
||||
if (auto *Phi = dyn_cast<PHINode>(Inst)) {
|
||||
// Phi node's operands may have not been visited at this point. We create
|
||||
@@ -339,7 +348,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
|
||||
}
|
||||
|
||||
// Main interface to build the plain CFG.
|
||||
void PlainCFGBuilder::buildPlainCFG() {
|
||||
void PlainCFGBuilder::buildPlainCFG(
|
||||
DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
|
||||
// 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
|
||||
// skeleton. These were created directly rather than via getOrCreateVPBB(),
|
||||
// revisit them now to update BB2VPBB. Note that header/entry and
|
||||
@@ -428,6 +438,14 @@ void PlainCFGBuilder::buildPlainCFG() {
|
||||
// Set VPBB successors. We create empty VPBBs for successors if they don't
|
||||
// exist already. Recipes will be created when the successor is visited
|
||||
// during the RPO traversal.
|
||||
if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
|
||||
SmallVector<VPBlockBase *> Succs = {
|
||||
getOrCreateVPBB(SI->getDefaultDest())};
|
||||
for (auto Case : SI->cases())
|
||||
Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
|
||||
VPBB->setSuccessors(Succs);
|
||||
continue;
|
||||
}
|
||||
auto *BI = cast<BranchInst>(BB->getTerminator());
|
||||
unsigned NumSuccs = succ_size(BB);
|
||||
if (NumSuccs == 1) {
|
||||
@@ -481,11 +499,14 @@ void PlainCFGBuilder::buildPlainCFG() {
|
||||
// have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
|
||||
// VPlan operands.
|
||||
fixPhiNodes();
|
||||
|
||||
for (const auto &[IRBB, VPB] : BB2VPBB)
|
||||
VPB2IRBB[VPB] = IRBB;
|
||||
}
|
||||
|
||||
void VPlanHCFGBuilder::buildPlainCFG() {
|
||||
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
|
||||
PCFGBuilder.buildPlainCFG();
|
||||
PCFGBuilder.buildPlainCFG(VPB2IRBB);
|
||||
}
|
||||
|
||||
// Public interface to build a H-CFG.
|
||||
|
||||
@@ -53,6 +53,10 @@ private:
|
||||
// are introduced.
|
||||
VPDominatorTree VPDomTree;
|
||||
|
||||
/// Map of created VP blocks to their input IR basic blocks, if they have been
|
||||
/// created for an input IR basic block.
|
||||
DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
|
||||
|
||||
/// Build plain CFG for TheLoop and connects it to Plan's entry.
|
||||
void buildPlainCFG();
|
||||
|
||||
@@ -62,6 +66,14 @@ public:
|
||||
|
||||
/// Build H-CFG for TheLoop and update Plan accordingly.
|
||||
void buildHierarchicalCFG();
|
||||
|
||||
/// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
|
||||
/// there is no such corresponding block.
|
||||
/// FIXME: This is a temporary workaround to drive the createBlockInMask.
|
||||
/// Remove once mask creation is done on VPlan.
|
||||
BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
|
||||
// Query the VPB2IRBB member map; this does not modify the map (the method
// is const). NOTE(review): the nullptr-on-miss behaviour documented for this
// accessor presumably relies on DenseMap::lookup returning a
// default-constructed value for absent keys -- confirm against DenseMap.
return VPB2IRBB.lookup(VPB);
|
||||
}
|
||||
};
|
||||
} // namespace llvm
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
|
||||
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
|
||||
; CHECK-NEXT: LV: Using user VF vscale x 4.
|
||||
; CHECK-NEXT: LV: Loop does not require scalar epilogue
|
||||
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
|
||||
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
|
||||
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
|
||||
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
|
||||
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
|
||||
@@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
|
||||
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
|
||||
; CHECK-NEXT: LV: Using user VF vscale x 4.
|
||||
; CHECK-NEXT: LV: Loop does not require scalar epilogue
|
||||
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
|
||||
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
|
||||
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
|
||||
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
|
||||
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
|
||||
|
||||
Reference in New Issue
Block a user