[VPlan] Remove unneeded State.UF after 8ec406757c (NFC).

State.UF is no longer needed after 8ec406757c
(https://github.com/llvm/llvm-project/pull/95842). Clean it up,
simplifying ::execute of existing recipes.
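Schematically, the cleanup removes the per-part loop that recipe ::execute implementations used to carry: after the explicit unrollByUF VPlan transform (PR #95842), the plan already contains a copy of each recipe per unroll part, so execution only ever materializes part 0. The sketch below is a minimal, self-contained illustration of that before/after shape using placeholder types, not the real VPlan classes.

// Simplified, hypothetical illustration of the pattern removed by this
// commit; the types below are placeholders, not LLVM's.
#include <cstdio>

struct VPTransformStateSketch {
  unsigned VF = 4; // vectorization factor (fixed width for simplicity)
};

// Before: each recipe's ::execute iterated over all unroll parts itself.
void executeBeforeUnrollByUF(VPTransformStateSketch &State, unsigned UF) {
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < State.VF; ++Lane)
      std::printf("scalarize part %u lane %u\n", Part, Lane);
}

// After: unrollByUF has already cloned recipes per part in the VPlan, so a
// single recipe only generates part 0 and the per-part loop disappears.
void executeAfterUnrollByUF(VPTransformStateSketch &State) {
  for (unsigned Lane = 0; Lane < State.VF; ++Lane)
    std::printf("scalarize part 0 lane %u\n", Lane);
}

int main() {
  VPTransformStateSketch State;
  executeBeforeUnrollByUF(State, /*UF=*/2); // old: 2 parts x 4 lanes
  executeAfterUnrollByUF(State);            // new: 1 part x 4 lanes
}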
Florian Hahn
2024-09-22 20:42:37 +01:00
parent 76cffc2aa5
commit 06c3a7d2d7
4 changed files with 508 additions and 704 deletions


@@ -7440,7 +7440,7 @@ static void createAndCollectMergePhiForReduction(
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
 
   Value *FinalValue =
-      State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
+      State.get(RedResult, VPIteration(0, VPLane::getFirstLane()));
   auto *ResumePhi =
       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
   if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
@@ -9453,24 +9453,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   if (IsUniform) {
-    // If the recipe is uniform across all parts (instead of just per VF), only
-    // generate a single instance.
-    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
-        all_of(operands(),
-               [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) {
-      State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
-      if (user_begin() != user_end()) {
-        for (unsigned Part = 1; Part < State.UF; ++Part)
-          State.set(this, State.get(this, VPIteration(0, 0)),
-                    VPIteration(Part, 0));
-      }
-      return;
-    }
-
-    // Uniform within VL means we need to generate lane 0 only for each
-    // unrolled copy.
-    for (unsigned Part = 0; Part < State.UF; ++Part)
-      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
+    // Uniform within VL means we need to generate lane 0.
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
     return;
   }
@@ -9479,17 +9463,15 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   if (isa<StoreInst>(UI) &&
       vputils::isUniformAfterVectorization(getOperand(1))) {
     auto Lane = VPLane::getLastLaneForVF(State.VF);
-    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
-                                    State);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(0, Lane), State);
     return;
   }
 
   // Generate scalar instances for all VF lanes of all UF parts.
   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
   const unsigned EndLane = State.VF.getKnownMinValue();
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
+  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(0, Lane), State);
 }


@@ -225,7 +225,7 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
 VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                                    DominatorTree *DT, IRBuilderBase &Builder,
                                    InnerLoopVectorizer *ILV, VPlan *Plan)
-    : VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
+    : VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
       LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {}
 
 Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
@@ -772,9 +772,6 @@ void VPRegionBlock::execute(VPTransformState *State) {
   // Enter replicating mode.
   State->Instance = VPIteration(0, 0);
-  for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
-    State->Instance->Part = Part;
   assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
   for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
        ++Lane) {
@@ -784,7 +781,6 @@ void VPRegionBlock::execute(VPTransformState *State) {
       LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
       Block->execute(State);
     }
-    }
   }
 
   // Exit replicating mode.
@@ -963,16 +959,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
   assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
+  unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
     VF.setUnderlyingValue(RuntimeVF);
     VFxUF.setUnderlyingValue(
-        State.UF > 1
-            ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, State.UF))
-            : RuntimeVF);
+        UF > 1 ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, UF))
+               : RuntimeVF);
   } else {
-    VFxUF.setUnderlyingValue(
-        createStepForVF(Builder, TCTy, State.VF, State.UF));
+    VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
   }
 
   // When vectorizing the epilogue loop, the canonical induction start value
@@ -1019,10 +1014,6 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
-  // Set UF to 1, as the unrollByUF VPlan transform already explicitly unrolled
-  // the VPlan.
-  // TODO: Remove State::UF and all uses.
-  State->UF = 1;
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
@@ -1106,28 +1097,13 @@ void VPlan::execute(VPTransformState *State) {
     }
 
     auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
-    // For canonical IV, first-order recurrences and in-order reduction phis,
-    // only a single part is generated, which provides the last part from the
-    // previous iteration. For non-ordered reductions all UF parts are
-    // generated.
-    bool SinglePartNeeded =
-        isa<VPCanonicalIVPHIRecipe>(PhiR) ||
-        isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
-        (isa<VPReductionPHIRecipe>(PhiR) &&
-         cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
     bool NeedsScalar =
         isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
         (isa<VPReductionPHIRecipe>(PhiR) &&
          cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
-    unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
-    for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
-      Value *Phi = State->get(PhiR, Part, NeedsScalar);
-      Value *Val =
-          State->get(PhiR->getBackedgeValue(),
-                     SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
-      cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
-    }
+    Value *Phi = State->get(PhiR, 0, NeedsScalar);
+    Value *Val = State->get(PhiR->getBackedgeValue(), 0, NeedsScalar);
+    cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
   }
 
   State->CFG.DTU.flush();


@@ -256,7 +256,6 @@ struct VPTransformState {
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
   ElementCount VF;
-  unsigned UF;
 
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
@@ -309,7 +308,7 @@ struct VPTransformState {
     assert((VF.isScalar() || V->getType()->isVectorTy()) &&
            "scalar values must be stored as (Part, 0)");
     if (!Data.PerPartOutput.count(Def)) {
-      DataState::PerPartValuesTy Entry(UF);
+      DataState::PerPartValuesTy Entry(1);
       Data.PerPartOutput[Def] = Entry;
     }
     Data.PerPartOutput[Def][Part] = V;
@@ -1306,11 +1305,10 @@ private:
   /// needed.
   bool canGenerateScalarForFirstLane() const;
 
-  /// Utility methods serving execute(): generates a single instance of the
-  /// modeled instruction for a given part. \returns the generated value for \p
-  /// Part. In some cases an existing value is returned rather than a generated
-  /// one.
-  Value *generatePerPart(VPTransformState &State, unsigned Part);
+  /// Utility methods serving execute(): generates a single vector instance of
+  /// the modeled instruction. \returns the generated value. . In some cases an
+  /// existing value is returned rather than a generated one.
+  Value *generate(VPTransformState &State);
 
   /// Utility methods serving execute(): generates a scalar single instance of
   /// the modeled instruction for a given lane. \returns the scalar generated
@@ -1616,7 +1614,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
   Type *ResultTy;
 
-  Value *generate(VPTransformState &State, unsigned Part);
+  Value *generate(VPTransformState &State);
 
 public:
   VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)

File diff suppressed because it is too large.