[LV] Use getFixedValue instead of getKnownMinValue when appropriate (#143526)
There are many places in VPlan and LoopVectorize where we use getKnownMinValue to discover the number of elements in a vector. Where we expect the vector to have a fixed length, I have used the stronger getFixedValue call. I believe this is clearer and adds extra protection in the form of an assert in getFixedValue that the vector is not scalable.

While looking at VPFirstOrderRecurrencePHIRecipe::computeCost I also took the liberty of simplifying the code.

In theory I believe this patch should be NFC, but I'm reluctant to add that to the title in case we're just missing tests for some of the VPlan changes. I built and ran the LLVM test suite when targeting neoverse-v1 and it seemed ok.
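To illustrate the difference, here is a minimal sketch against the ElementCount API in llvm/include/llvm/Support/TypeSize.h (the values and names are illustrative only, not part of the patch):

  #include "llvm/Support/TypeSize.h"
  using llvm::ElementCount;

  void elementCountSketch() {
    // A fixed VF such as <4 x i32> vs. a scalable VF such as <vscale x 4 x i32>.
    ElementCount FixedVF = ElementCount::getFixed(4);
    ElementCount ScalableVF = ElementCount::getScalable(4);

    // getKnownMinValue() returns 4 for both: it is only the *minimum* number
    // of elements, so a scalable VF silently loses its vscale multiplier.
    unsigned MinFixed = FixedVF.getKnownMinValue();       // 4
    unsigned MinScalable = ScalableVF.getKnownMinValue(); // also 4

    // getFixedValue() returns the same 4 for FixedVF, but additionally
    // asserts !isScalable(), so a scalable VF reaching fixed-only code is
    // caught at the call site instead of yielding a wrong element count.
    unsigned NumElts = FixedVF.getFixedValue(); // 4
    // ScalableVF.getFixedValue();              // would trigger the assert
    (void)MinFixed; (void)MinScalable; (void)NumElts;
  }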
@@ -3116,12 +3116,13 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // that we will create. This cost is likely to be zero. The phi node
     // cost, if any, should be scaled by the block probability because it
     // models a copy at the end of each predicated block.
-    ScalarizationCost += VF.getKnownMinValue() *
-                         TTI.getCFInstrCost(Instruction::PHI, CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
 
     // The cost of the non-predicated instruction.
-    ScalarizationCost += VF.getKnownMinValue() *
-        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() *
+        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
@@ -4289,7 +4290,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       return NumLegalParts <= VF.getKnownMinValue();
     }
     // Two or more elements that share a register - are vectorized.
-    return NumLegalParts < VF.getKnownMinValue();
+    return NumLegalParts < VF.getFixedValue();
   };
 
   // If no def nor is a store, e.g., branches, continue - no value to check.
@@ -4574,8 +4575,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     assert(!isa<SCEVCouldNotCompute>(TC) &&
            "Trip count SCEV must be computable");
     RemainingIterations = SE.getURemExpr(
-        TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
-    MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
+        TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+    MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
     if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                             SE.getConstant(TCType, MaxTripCount))) {
       MaxTripCount =
@@ -4586,7 +4587,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     }
     if (SE.isKnownPredicate(
             CmpInst::ICMP_UGT,
-            SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+            SE.getConstant(TCType, NextVF.Width.getFixedValue()),
             RemainingIterations))
       continue;
   }
@@ -5257,14 +5258,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Get the cost of the scalar memory instruction and address computation.
   InstructionCost Cost =
-      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+      VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
-                                                      ValTy->getScalarType(),
-                                                      Alignment, AS, CostKind);
+  Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                   ValTy->getScalarType(),
+                                                   Alignment, AS, CostKind);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5280,7 +5281,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   auto *VecI1Ty =
       VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
   Cost += TTI.getScalarizationOverhead(
-      VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+      VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
       /*Insert=*/false, /*Extract=*/true, CostKind);
   Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
 
@@ -5341,6 +5342,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);
 
   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
+  // TODO: We have existing tests that request the cost of extracting element
+  // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
+  // the actual generated code, which involves extracting the last element of
+  // a scalable vector where the lane to extract is unknown at compile time.
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
@@ -5623,7 +5628,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
 
   for (Type *VectorTy : getContainedTypes(RetTy)) {
     Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
         /*Insert=*/true,
         /*Extract=*/false, CostKind);
   }
 
@@ -331,7 +331,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
 
   bool IsSingleScalar = vputils::isSingleScalar(Def);
 
-  VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
+  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
   // Check if there is a scalar value for the selected lane.
   if (!hasScalarValue(Def, LastLane)) {
     // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
@@ -368,7 +368,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
     Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     set(Def, Undef);
-    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
       packScalarIntoVectorizedValue(Def, Lane);
     VectorValue = get(Def);
   }
@@ -789,8 +789,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   State->Lane = VPLane(0);
-  for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
-       ++Lane) {
+  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
     State->Lane = VPLane(Lane, VPLane::Kind::First);
     // Visit the VPBlocks connected to \p this, starting from it.
     for (VPBlockBase *Block : RPOT) {
@@ -871,7 +871,7 @@ void VPInstruction::execute(VPTransformState &State) {
          isVectorToScalar() || isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   if (GeneratesPerAllLanes) {
-    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
          Lane != NumLanes; ++Lane) {
       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
       assert(GeneratedValue && "generatePerLane must produce a value");
@@ -2787,8 +2787,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
+  const unsigned EndLane = State.VF.getFixedValue();
   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
     scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
@@ -2841,7 +2840,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
             UI->getOpcode(), ResultTy, CostKind,
             {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
             Op2Info, Operands, UI, &Ctx.TLI) *
-        (isSingleScalar() ? 1 : VF.getKnownMinValue());
+        (isSingleScalar() ? 1 : VF.getFixedValue());
   }
   }
 
@@ -3390,7 +3389,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     Value *ResBlockInMask = State.get(BlockInMask);
     Value *ShuffledMask = State.Builder.CreateShuffleVector(
         ResBlockInMask,
-        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
         "interleaved.mask");
     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                    ShuffledMask, MaskForGaps)
@@ -3402,8 +3401,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
     if (NeedsMaskForGaps) {
-      MaskForGaps = createBitMaskForGaps(State.Builder,
-                                         State.VF.getKnownMinValue(), *Group);
+      MaskForGaps =
+          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }
 
@@ -3454,6 +3453,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
 
     return;
   }
+  assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
 
   // For each member in the group, shuffle out the appropriate data from the
   // wide loads.
@@ -3466,13 +3466,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
       continue;
 
     auto StrideMask =
-        createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+        createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
     Value *StridedVec =
         State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
 
     // If this member has different type, cast the result type.
     if (Member->getType() != ScalarTy) {
-      assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
       VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
       StridedVec =
           createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
@@ -3808,7 +3807,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
   if (VF.isScalar())
     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 
-  if (VF.isScalable() && VF.getKnownMinValue() == 1)
+  if (VF == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   return 0;
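Regarding the VPFirstOrderRecurrencePHIRecipe::computeCost simplification above: ElementCount's operator== compares both the minimum element count and the scalability flag, so the two guards are interchangeable. A quick sanity check, assuming the same TypeSize.h API (a standalone sketch, not part of the patch):

  #include <cassert>
  #include "llvm/Support/TypeSize.h"
  using llvm::ElementCount;

  void equivalenceSketch(ElementCount VF) {
    // Old guard: scalable with a known minimum of one element.
    bool Old = VF.isScalable() && VF.getKnownMinValue() == 1;
    // New guard: direct comparison against a vscale x 1 element count.
    bool New = (VF == ElementCount::getScalable(1));
    assert(Old == New && "the two forms agree for every VF");
  }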