[LV] Use getFixedValue instead of getKnownMinValue when appropriate (#143526)

There are many places in VPlan and LoopVectorize where we use
getKnownMinValue to discover the number of elements in a vector. Where
we expect the vector to have a fixed length, I have used the stronger
getFixedValue call. I believe this is clearer and adds extra protection
in the form of an assert in getFixedValue that the vector is not
scalable.
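
For illustration, here is a minimal sketch of the difference using LLVM's
ElementCount API (the function name is hypothetical and the comments
paraphrase the assert behaviour):

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void elementCountExample() {
    // A fixed VF such as <4 x i32> has exactly 4 elements.
    ElementCount FixedVF = ElementCount::getFixed(4);
    unsigned N0 = FixedVF.getKnownMinValue(); // 4
    unsigned N1 = FixedVF.getFixedValue();    // also 4; asserts !isScalable()

    // A scalable VF such as <vscale x 4 x i32> has vscale * 4 elements,
    // where vscale is unknown at compile time; 4 is only a lower bound.
    ElementCount ScalableVF = ElementCount::getScalable(4);
    unsigned N2 = ScalableVF.getKnownMinValue(); // 4 (minimum, not exact)
    // ScalableVF.getFixedValue() would trip the assert in an asserts build.
    (void)N0; (void)N1; (void)N2;
  }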

While looking at VPFirstOrderRecurrencePHIRecipe::computeCost I also
took the liberty of simplifying the code.
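
The simplification rewrites an explicit scalable-and-min-is-1 check as a
direct ElementCount comparison. A small sketch of the equivalence (the
function names are illustrative only):

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  // Both predicates accept exactly the VF <vscale x 1 x ...>.
  bool oldForm(ElementCount VF) {
    return VF.isScalable() && VF.getKnownMinValue() == 1;
  }
  bool newForm(ElementCount VF) {
    return VF == ElementCount::getScalable(1);
  }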

In theory this patch should be NFC, but I'm reluctant to add that to
the title in case we're just missing tests for some of the VPlan
changes. I built and ran the LLVM test suite targeting neoverse-v1 and
everything looked fine.
Author: David Sherwood
Date:   2025-06-13 11:43:50 +01:00
Committed by: GitHub
Parent: 2019553a0b
Commit: 541e5118ce
3 changed files with 31 additions and 28 deletions

@@ -3116,12 +3116,13 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // that we will create. This cost is likely to be zero. The phi node
     // cost, if any, should be scaled by the block probability because it
     // models a copy at the end of each predicated block.
-    ScalarizationCost += VF.getKnownMinValue() *
-                         TTI.getCFInstrCost(Instruction::PHI, CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
 
     // The cost of the non-predicated instruction.
-    ScalarizationCost += VF.getKnownMinValue() *
-        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() *
+        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
@@ -4289,7 +4290,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       return NumLegalParts <= VF.getKnownMinValue();
     }
     // Two or more elements that share a register - are vectorized.
-    return NumLegalParts < VF.getKnownMinValue();
+    return NumLegalParts < VF.getFixedValue();
   };
   // If no def nor is a store, e.g., branches, continue - no value to check.
@@ -4574,8 +4575,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     assert(!isa<SCEVCouldNotCompute>(TC) &&
            "Trip count SCEV must be computable");
     RemainingIterations = SE.getURemExpr(
-        TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
-    MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
+        TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+    MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
     if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                             SE.getConstant(TCType, MaxTripCount))) {
       MaxTripCount =
@@ -4586,7 +4587,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     }
     if (SE.isKnownPredicate(
             CmpInst::ICMP_UGT,
-            SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+            SE.getConstant(TCType, NextVF.Width.getFixedValue()),
             RemainingIterations))
       continue;
   }
@@ -5257,14 +5258,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // Get the cost of the scalar memory instruction and address computation.
   InstructionCost Cost =
-      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+      VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
-                                                      ValTy->getScalarType(),
-                                                      Alignment, AS, CostKind);
+  Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                   ValTy->getScalarType(),
+                                                   Alignment, AS, CostKind);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5280,7 +5281,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   auto *VecI1Ty =
       VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
   Cost += TTI.getScalarizationOverhead(
-      VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+      VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
       /*Insert=*/false, /*Extract=*/true, CostKind);
   Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
@@ -5341,6 +5342,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);
   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
+  // TODO: We have existing tests that request the cost of extracting element
+  // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
+  // the actual generated code, which involves extracting the last element of
+  // a scalable vector where the lane to extract is unknown at compile time.
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
@@ -5623,7 +5628,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
   for (Type *VectorTy : getContainedTypes(RetTy)) {
     Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
         /*Insert=*/true,
         /*Extract=*/false, CostKind);
   }

@@ -331,7 +331,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
   bool IsSingleScalar = vputils::isSingleScalar(Def);
-  VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
+  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
   // Check if there is a scalar value for the selected lane.
   if (!hasScalarValue(Def, LastLane)) {
     // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
@@ -368,7 +368,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
     Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     set(Def, Undef);
-    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
       packScalarIntoVectorizedValue(Def, Lane);
     VectorValue = get(Def);
   }
@@ -789,8 +789,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   State->Lane = VPLane(0);
-  for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
-       ++Lane) {
+  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
     State->Lane = VPLane(Lane, VPLane::Kind::First);
     // Visit the VPBlocks connected to \p this, starting from it.
     for (VPBlockBase *Block : RPOT) {

@@ -871,7 +871,7 @@ void VPInstruction::execute(VPTransformState &State) {
          isVectorToScalar() || isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   if (GeneratesPerAllLanes) {
-    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
          Lane != NumLanes; ++Lane) {
       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
       assert(GeneratedValue && "generatePerLane must produce a value");
@@ -2787,8 +2787,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
+  const unsigned EndLane = State.VF.getFixedValue();
   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
     scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
@@ -2841,7 +2840,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
              UI->getOpcode(), ResultTy, CostKind,
              {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
              Op2Info, Operands, UI, &Ctx.TLI) *
-          (isSingleScalar() ? 1 : VF.getKnownMinValue());
+          (isSingleScalar() ? 1 : VF.getFixedValue());
   }
 }
@@ -3390,7 +3389,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     Value *ResBlockInMask = State.get(BlockInMask);
     Value *ShuffledMask = State.Builder.CreateShuffleVector(
         ResBlockInMask,
-        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
         "interleaved.mask");
     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                    ShuffledMask, MaskForGaps)
@@ -3402,8 +3401,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
     if (NeedsMaskForGaps) {
-      MaskForGaps = createBitMaskForGaps(State.Builder,
-                                         State.VF.getKnownMinValue(), *Group);
+      MaskForGaps =
+          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }
@@ -3454,6 +3453,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     return;
   }
 
+  assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
 
   // For each member in the group, shuffle out the appropriate data from the
   // wide loads.
@@ -3466,13 +3466,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
       continue;
 
     auto StrideMask =
-        createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+        createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
     Value *StridedVec =
         State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
 
     // If this member has different type, cast the result type.
     if (Member->getType() != ScalarTy) {
-      assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
       VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
       StridedVec =
           createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
@@ -3808,7 +3807,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
   if (VF.isScalar())
     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 
-  if (VF.isScalable() && VF.getKnownMinValue() == 1)
+  if (VF == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   return 0;