[LV] Use getFixedValue instead of getKnownMinValue when appropriate (#143526)
There are many places in VPlan and LoopVectorize where we use getKnownMinValue to discover the number of elements in a vector. Where we expect the vector to have a fixed length, I have used the stronger getFixedValue call. I believe this is clearer and adds extra protection in the form of an assert in getFixedValue that the vector is not scalable.

While looking at VPFirstOrderRecurrencePHIRecipe::computeCost I also took the liberty of simplifying the code.

In theory I believe this patch should be NFC, but I'm reluctant to add that to the title in case we're just missing tests for some of the VPlan changes. I built and ran the LLVM test suite when targeting neoverse-v1 and it seemed ok.
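To illustrate the difference, here is a minimal sketch against the ElementCount API in llvm/include/llvm/Support/TypeSize.h (the values and names are illustrative only, not part of the patch):

  #include "llvm/Support/TypeSize.h"
  using llvm::ElementCount;

  void elementCountSketch() {
    // A fixed VF such as <4 x i32> vs. a scalable VF such as <vscale x 4 x i32>.
    ElementCount FixedVF = ElementCount::getFixed(4);
    ElementCount ScalableVF = ElementCount::getScalable(4);

    // getKnownMinValue() returns 4 for both: it is only the *minimum* number
    // of elements, so a scalable VF silently loses its vscale multiplier.
    unsigned MinFixed = FixedVF.getKnownMinValue();       // 4
    unsigned MinScalable = ScalableVF.getKnownMinValue(); // also 4

    // getFixedValue() returns the same 4 for FixedVF, but additionally
    // asserts !isScalable(), so a scalable VF reaching fixed-only code is
    // caught at the call site instead of yielding a wrong element count.
    unsigned NumElts = FixedVF.getFixedValue(); // 4
    // ScalableVF.getFixedValue();              // would trigger the assert
    (void)MinFixed; (void)MinScalable; (void)NumElts;
  }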
@@ -3116,12 +3116,13 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // that we will create. This cost is likely to be zero. The phi node
     // cost, if any, should be scaled by the block probability because it
     // models a copy at the end of each predicated block.
-    ScalarizationCost += VF.getKnownMinValue() *
-                         TTI.getCFInstrCost(Instruction::PHI, CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
 
     // The cost of the non-predicated instruction.
-    ScalarizationCost += VF.getKnownMinValue() *
-        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() *
+        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
@@ -4289,7 +4290,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       return NumLegalParts <= VF.getKnownMinValue();
     }
     // Two or more elements that share a register - are vectorized.
-    return NumLegalParts < VF.getKnownMinValue();
+    return NumLegalParts < VF.getFixedValue();
   };
 
   // If no def nor is a store, e.g., branches, continue - no value to check.
@@ -4574,8 +4575,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     assert(!isa<SCEVCouldNotCompute>(TC) &&
            "Trip count SCEV must be computable");
     RemainingIterations = SE.getURemExpr(
-        TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
-    MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
+        TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+    MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
     if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                             SE.getConstant(TCType, MaxTripCount))) {
       MaxTripCount =
@@ -4586,7 +4587,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     }
     if (SE.isKnownPredicate(
             CmpInst::ICMP_UGT,
-            SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+            SE.getConstant(TCType, NextVF.Width.getFixedValue()),
             RemainingIterations))
       continue;
   }
@@ -5257,14 +5258,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Get the cost of the scalar memory instruction and address computation.
   InstructionCost Cost =
-      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+      VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
-                                                      ValTy->getScalarType(),
-                                                      Alignment, AS, CostKind);
+  Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                   ValTy->getScalarType(),
+                                                   Alignment, AS, CostKind);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5280,7 +5281,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   auto *VecI1Ty =
       VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
   Cost += TTI.getScalarizationOverhead(
-      VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+      VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
       /*Insert=*/false, /*Extract=*/true, CostKind);
   Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
 
@@ -5341,6 +5342,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);
 
   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
+  // TODO: We have existing tests that request the cost of extracting element
+  // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
+  // the actual generated code, which involves extracting the last element of
+  // a scalable vector where the lane to extract is unknown at compile time.
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
@@ -5623,7 +5628,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
 
   for (Type *VectorTy : getContainedTypes(RetTy)) {
     Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
         /*Insert=*/true,
         /*Extract=*/false, CostKind);
   }
 
@@ -331,7 +331,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
 
   bool IsSingleScalar = vputils::isSingleScalar(Def);
 
-  VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
+  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
   // Check if there is a scalar value for the selected lane.
   if (!hasScalarValue(Def, LastLane)) {
     // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
@@ -368,7 +368,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
     Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     set(Def, Undef);
-    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
       packScalarIntoVectorizedValue(Def, Lane);
     VectorValue = get(Def);
   }
@@ -789,8 +789,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   State->Lane = VPLane(0);
-  for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
-       ++Lane) {
+  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
     State->Lane = VPLane(Lane, VPLane::Kind::First);
     // Visit the VPBlocks connected to \p this, starting from it.
     for (VPBlockBase *Block : RPOT) {
@@ -871,7 +871,7 @@ void VPInstruction::execute(VPTransformState &State) {
          isVectorToScalar() || isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   if (GeneratesPerAllLanes) {
-    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
          Lane != NumLanes; ++Lane) {
       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
       assert(GeneratedValue && "generatePerLane must produce a value");
@@ -2787,8 +2787,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
+  const unsigned EndLane = State.VF.getFixedValue();
   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
     scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
@@ -2841,7 +2840,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
             UI->getOpcode(), ResultTy, CostKind,
             {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
             Op2Info, Operands, UI, &Ctx.TLI) *
-        (isSingleScalar() ? 1 : VF.getKnownMinValue());
+        (isSingleScalar() ? 1 : VF.getFixedValue());
   }
   }
 
@@ -3390,7 +3389,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     Value *ResBlockInMask = State.get(BlockInMask);
     Value *ShuffledMask = State.Builder.CreateShuffleVector(
         ResBlockInMask,
-        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
         "interleaved.mask");
     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                    ShuffledMask, MaskForGaps)
@@ -3402,8 +3401,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
     if (NeedsMaskForGaps) {
-      MaskForGaps = createBitMaskForGaps(State.Builder,
-                                         State.VF.getKnownMinValue(), *Group);
+      MaskForGaps =
+          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }
 
@@ -3454,6 +3453,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
 
     return;
   }
+  assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
 
   // For each member in the group, shuffle out the appropriate data from the
   // wide loads.
@@ -3466,13 +3466,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
       continue;
 
     auto StrideMask =
-        createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+        createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
     Value *StridedVec =
         State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
 
     // If this member has different type, cast the result type.
     if (Member->getType() != ScalarTy) {
-      assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
       VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
       StridedVec =
           createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
@@ -3808,7 +3807,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
   if (VF.isScalar())
     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 
-  if (VF.isScalable() && VF.getKnownMinValue() == 1)
+  if (VF == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   return 0;
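Regarding the VPFirstOrderRecurrencePHIRecipe::computeCost simplification above: ElementCount's operator== compares both the minimum element count and the scalability flag, so the two guards are interchangeable. A quick sanity check, assuming the same TypeSize.h API (a standalone sketch, not part of the patch):

  #include <cassert>
  #include "llvm/Support/TypeSize.h"
  using llvm::ElementCount;

  void equivalenceSketch(ElementCount VF) {
    // Old guard: scalable with a known minimum of one element.
    bool Old = VF.isScalable() && VF.getKnownMinValue() == 1;
    // New guard: direct comparison against a vscale x 1 element count.
    bool New = (VF == ElementCount::getScalable(1));
    assert(Old == New && "the two forms agree for every VF");
  }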