[LV] Update call widening decision when scalarizing calls.

collectInstsToScalarize may decide to scalarize a call. If so, we have
to update the widening decision for the call; otherwise the call won't
be scalarized as expected during VPlan construction.

This issue was uncovered by f82543d509.
Author: Florian Hahn
Date:   2024-09-03 14:12:40 +01:00
Parent: 0797c184c6
Commit: dd94537b40

2 changed files with 64 additions and 1 deletion


@@ -5386,8 +5386,18 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
         // 3. Emulated masked memrefs, if a hacked cost is needed.
         if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
             !useEmulatedMaskMemRefHack(&I, VF) &&
-            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+            computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+          // Check if we decided to scalarize a call. If so, update the widening
+          // decision of the call to CM_Scalarize with the computed scalar cost.
+          for (const auto &[I, _] : ScalarCosts) {
+            auto *CI = dyn_cast<CallInst>(I);
+            if (!CI || !CallWideningDecisions.contains({CI, VF}))
+              continue;
+            CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
+            CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
+          }
+        }
         // Remember that BB will remain after vectorization.
         PredicatedBBsAfterVectorization[VF].insert(BB);
         for (auto *Pred : predecessors(BB)) {
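For context: the recorded call widening decisions are what later VPlan recipe construction consults; it does not re-run the scalarization analysis. The sketch below is a minimal, self-contained model of that interaction using hypothetical names (it is not the actual LoopVectorize.cpp API): if the map still records a "widen" decision while the call was placed in the scalarization set, the two sides disagree and the call is not replicated as expected.

#include <cassert>
#include <map>
#include <utility>

// Simplified stand-ins for the cost model's bookkeeping (hypothetical names).
enum class WideningKind { WidenIntrinsic, Scalarize };

struct CallDecision {
  WideningKind Kind;
  unsigned Cost;
};

// Keyed by (call id, vectorization factor), mirroring the {CI, VF} pairs
// used in the patch above.
using DecisionMap = std::map<std::pair<int, int>, CallDecision>;

// Recipe construction only reads the recorded decision; it does not
// recompute profitability. If the scalarization analysis marks a call for
// scalarization but leaves the map untouched, this still returns "widen".
WideningKind decideRecipe(const DecisionMap &Decisions, int CallId, int VF) {
  auto It = Decisions.find({CallId, VF});
  assert(It != Decisions.end() && "expected a recorded decision");
  return It->second.Kind;
}

int main() {
  DecisionMap Decisions;
  int CallId = 1, VF = 4;

  // Initial decision from the earlier call-cost analysis.
  Decisions[{CallId, VF}] = {WideningKind::WidenIntrinsic, /*Cost=*/10};

  // The scalarization analysis later computes a scalar cost and decides to
  // scalarize; with the patch, the recorded decision is updated in place.
  unsigned ScalarCost = 6;
  Decisions[{CallId, VF}].Kind = WideningKind::Scalarize;
  Decisions[{CallId, VF}].Cost = ScalarCost;

  // Recipe construction and the scalarization analysis now agree.
  assert(decideRecipe(Decisions, CallId, VF) == WideningKind::Scalarize);
  return 0;
}

In the real code, the map entry is the {CI, VF}-keyed CallWideningDecisions updated in the hunk above; the sketch only makes the lookup/update ordering explicit.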


@@ -126,6 +126,59 @@ exit:
   ret void
 }
+
+define void @call_scalarized(ptr noalias %src, ptr noalias %dst, double %0) {
+; CHECK-LABEL: define void @call_scalarized(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], double [[TMP0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 100, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[GEP_SRC]], align 8
+; CHECK-NEXT:    [[CMP295:%.*]] = fcmp ugt double [[TMP0]], 0.000000e+00
+; CHECK-NEXT:    [[CMP299:%.*]] = fcmp ugt double [[L]], 0.000000e+00
+; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[CMP295]], [[CMP299]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label %[[LOOP_LATCH]], label %[[THEN:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[L]])
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    store double [[SQRT]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 100, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i64 %iv, -1
+  %gep.src = getelementptr double, ptr %src, i64 %iv.next
+  %l = load double, ptr %gep.src, align 8
+  %cmp295 = fcmp ugt double %0, 0.000000e+00
+  %cmp299 = fcmp ugt double %l, 0.000000e+00
+  %or.cond = or i1 %cmp295, %cmp299
+  br i1 %or.cond, label %loop.latch, label %then
+
+then:
+  %sqrt = call double @llvm.sqrt.f64(double %l)
+  %gep.dst = getelementptr double, ptr %dst, i64 %iv.next
+  store double %sqrt, ptr %gep.dst, align 8
+  br label %loop.latch
+
+loop.latch:
+  %tobool.not = icmp eq i64 %iv.next, 0
+  br i1 %tobool.not, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
 declare double @llvm.sqrt.f64(double) #0
 declare double @llvm.powi.f64.i32(double, i32)
 declare i64 @llvm.fshl.i64(i64, i64, i64)
 ;.