[X86][CodeGenPrepare] Try to reuse IV's incremented value instead of adding the offset, part 1

While optimizing a memory instruction, we sometimes need to add an
offset to the value of an `IV`. We can avoid doing so if `IV.next` is
already defined at the point of interest. Reusing it gives us two
potential advantages:

- If the `IV` step happens to match the offset, we don't need to add the
  offset at all (see the sketch after this list);
- We reduce the overlap of the live ranges of `IV` and `IV.next`. They may stop
  overlapping altogether, which leads to better register allocation. Even if the
  overlap persists, we are not introducing a new one, so the transform should be
  neutral (this part is disabled in this patch and will come in a follow-up).
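As a rough illustration of the first advantage (a plain C++ analogue, not the
pass itself; the function names are made up):

```cpp
#include <cstdint>

// Before: the address is formed from the IV Phi, so a displacement is needed:
//   addr = p + 4*iv + (-4)
int32_t load_via_phi(const int32_t *p, int64_t iv) { return p[iv - 1]; }

// After: the address is formed from the already-computed decremented value,
// so the displacement disappears:
//   addr = p + 4*iv_next
int32_t load_via_next(const int32_t *p, int64_t iv_next) { return p[iv_next]; }
```

On X86 this is the difference between `-4(%rdi,%rsi,4)` and `(%rdi,%rsi,4)` in
the llc test updates further down.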

Currently I've only added support for IVs that are decremented via the `usub`
intrinsic. We could also support `AddInstr`, but there is an odd interaction
with another transform that may lead to infinite compilation in that case (it
looks like the same transform is repeatedly done and undone). I need to
investigate why that happens, but in principle we could handle it too.
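The hunks below rely on two helpers, `isIVIncrement` and `getIVIncrement`,
whose definitions are not part of this diff. As a rough sketch of the general
idea they implement — recognizing the back-edge increment of a loop-header
Phi — here is a simplified, hypothetical version written for a plain add/sub
recurrence (names and exact checks are illustrative only; the real helpers
also handle the intrinsic-based increment mentioned above):

```cpp
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical simplified check: is BO the "iv.next = iv + step" back-edge
// value of a recurrence, i.e. the increment of a loop-header Phi?
static bool isSimpleIVIncrement(const BinaryOperator *BO, const LoopInfo *LI) {
  if (BO->getOpcode() != Instruction::Add &&
      BO->getOpcode() != Instruction::Sub)
    return false;
  const auto *PN = dyn_cast<PHINode>(BO->getOperand(0));
  if (!PN)
    return false;
  const Loop *L = LI->getLoopFor(BO->getParent());
  if (!L || L->getHeader() != PN->getParent())
    return false;
  // BO must be the value flowing back into the Phi from the latch.
  const BasicBlock *Latch = L->getLoopLatch();
  return Latch && PN->getIncomingValueForBlock(Latch) == BO;
}
```

`getIVIncrement` then returns that increment instruction together with its
step, so the matcher can compare the step against the pending offset.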

The first part only handles the case where this reuse fully eliminates the offset.

Differential Revision: https://reviews.llvm.org/D96399
Reviewed By: reames
Max Kazantsev
2021-03-04 12:01:39 +07:00
parent b15ce2f344
commit d9e93e8e57
4 changed files with 97 additions and 49 deletions

@@ -3064,6 +3064,8 @@ class AddressingModeMatcher {
const TargetLowering &TLI;
const TargetRegisterInfo &TRI;
const DataLayout &DL;
const LoopInfo &LI;
const DominatorTree &DT;
/// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
/// the memory instruction that we're computing this address for.
@@ -3099,16 +3101,17 @@ class AddressingModeMatcher {
AddressingModeMatcher(
SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
const TargetRegisterInfo &TRI, const LoopInfo &LI,
const DominatorTree &DT, Type *AT, unsigned AS, Instruction *MI,
ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
: AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
OptSize(OptSize), PSI(PSI), BFI(BFI) {
DL(MI->getModule()->getDataLayout()), LI(LI), DT(DT), AccessTy(AT),
AddrSpace(AS), MemoryInst(MI), AddrMode(AM),
InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT),
LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {
IgnoreProfitability = false;
}
@@ -3123,18 +3126,17 @@ public:
static ExtAddrMode
Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
SmallVectorImpl<Instruction *> &AddrModeInsts,
const TargetLowering &TLI, const TargetRegisterInfo &TRI,
const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
TypePromotionTransaction &TPT,
const TargetLowering &TLI, const LoopInfo &LI, const DominatorTree &DT,
const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
ExtAddrMode Result;
bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
MemoryInst, Result, InsertedInsts,
PromotedInsts, TPT, LargeOffsetGEP,
OptSize, PSI, BFI)
.matchAddr(V, 0);
bool Success = AddressingModeMatcher(
AddrModeInsts, TLI, TRI, LI, DT, AccessTy, AS, MemoryInst, Result,
InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
BFI).matchAddr(V, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
return Result;
}
@@ -3830,10 +3832,12 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
// Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
// to see if ScaleReg is actually X+C. If so, we can turn this into adding
// X*Scale + C*Scale to addr mode.
// X*Scale + C*Scale to addr mode. If we found an available IV increment, do
// not go any further: we can reuse it and cannot eliminate it.
ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
if (isa<Instruction>(ScaleReg) && // not a constant expr.
match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
!isIVIncrement(cast<BinaryOperator>(ScaleReg), &LI) &&
CI->getValue().isSignedIntN(64)) {
TestAddrMode.InBounds = false;
TestAddrMode.ScaledReg = AddLHS;
@@ -3846,9 +3850,63 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
AddrMode = TestAddrMode;
return true;
}
// Restore status quo.
TestAddrMode = AddrMode;
}
// Otherwise, not (x+c)*scale, just return what we have.
auto GetConstantStep = [this](const Value *V)
-> Optional<std::pair<Instruction *, APInt>> {
auto *PN = dyn_cast<PHINode>(V);
if (!PN)
return None;
auto IVInc = getIVIncrement(PN, &LI);
if (!IVInc)
return None;
// TODO: The result of the intrinsics above is two's complement. However, when
// the IV inc is expressed as add or sub, iv.next is potentially a poison value.
// If it has nuw or nsw flags, we need to make sure that these flags are
// inferable at the point of the memory instruction. Otherwise we are replacing
// a well-defined two's complement computation with poison. Currently, to avoid
// the potentially complex analysis needed to prove this, we reject such cases.
if (auto *OIVInc = dyn_cast<OverflowingBinaryOperator>(IVInc->first))
if (OIVInc->hasNoSignedWrap() || OIVInc->hasNoUnsignedWrap())
return None;
if (auto *ConstantStep = dyn_cast<ConstantInt>(IVInc->second))
return std::make_pair(IVInc->first, ConstantStep->getValue());
return None;
};
// Try to account for the following special case:
// 1. ScaleReg is an induction variable;
// 2. We use it with a non-zero offset;
// 3. The IV's increment is available at the point of the memory instruction.
//
// In this case, we may reuse the IV increment instead of the IV Phi to
// achieve the following advantages:
// 1. If the IV step matches the offset, we will not need the offset at all;
if (AddrMode.BaseOffs) {
if (auto IVStep = GetConstantStep(ScaleReg)) {
Instruction *IVInc = IVStep->first;
APInt Step = IVStep->second;
APInt Offset = Step * AddrMode.Scale;
if (Offset.isSignedIntN(64) && TestAddrMode.BaseOffs == Offset &&
DT.dominates(IVInc, MemoryInst)) {
TestAddrMode.InBounds = false;
TestAddrMode.ScaledReg = IVInc;
TestAddrMode.BaseOffs -= Offset.getLimitedValue();
// If this addressing mode is legal, commit it.
if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
AddrModeInsts.push_back(cast<Instruction>(IVInc));
AddrMode = TestAddrMode;
return true;
}
// Restore status quo.
TestAddrMode = AddrMode;
}
}
}
// Otherwise, just return what we have.
return true;
}
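To make the new path concrete with the numbers from `test_01` in the tests
below: the IV steps by -1 each iteration, the access is scaled by 4, and the
original addressing mode carries a -4 displacement, so `Offset` exactly cancels
`BaseOffs` and the matcher may switch `ScaledReg` to the increment. A
standalone sketch of that arithmetic with plain integers instead of `APInt`
(values assumed from the test for illustration):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Values as they occur for test_01 below.
  const int64_t Step = -1;     // the IV is decremented by 1 per iteration
  const int64_t Scale = 4;     // i32 elements -> 4-byte stride
  const int64_t BaseOffs = -4; // displacement of the original p[iv - 1] access

  // Mirror of the check in the hunk above: reuse IV.next only when the scaled
  // step cancels the displacement completely.
  const int64_t Offset = Step * Scale;
  assert(BaseOffs == Offset && "reuse only fires if the offset is eliminated");
  const int64_t NewBaseOffs = BaseOffs - Offset; // becomes 0
  return static_cast<int>(NewBaseOffs);
}
```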
@@ -4938,9 +4996,10 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
0);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
AddressingModeMatcher Matcher(
MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, DT,
AddressAccessTy, AS, MemoryInst, Result,
InsertedInsts, PromotedInsts, TPT,
LargeOffsetGEP, OptSize, PSI, BFI);
Matcher.IgnoreProfitability = true;
bool Success = Matcher.matchAddr(Address, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
@@ -5043,9 +5102,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
AddrModeInsts.clear();
std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
0);
Function *F = MemoryInst->getParent()->getParent();
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDT(*F),
*TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
BFI.get());
GetElementPtrInst *GEP = LargeOffsetGEP.first;

@@ -1,20 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
; TODO: We can get rid of movq here by using a different offset and %rax.
define i32 @test_01(i32* %p, i64 %len, i32 %x) {
; CHECK-LABEL: test_01:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_1: ## %loop
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subq $1, %rax
; CHECK-NEXT: subq $1, %rsi
; CHECK-NEXT: jb LBB0_4
; CHECK-NEXT: ## %bb.2: ## %backedge
; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: cmpl %edx, (%rdi,%rsi,4)
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.3: ## %failure
; CHECK-NEXT: ud2
@@ -89,16 +86,14 @@ failure: ; preds = %backedge
define i32 @test_02(i32* %p, i64 %len, i32 %x) {
; CHECK-LABEL: test_02:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB2_1: ## %loop
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subq $1, %rax
; CHECK-NEXT: subq $1, %rsi
; CHECK-NEXT: jb LBB2_4
; CHECK-NEXT: ## %bb.2: ## %backedge
; CHECK-NEXT: ## in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: cmpl %edx, (%rdi,%rsi,4)
; CHECK-NEXT: jne LBB2_1
; CHECK-NEXT: ## %bb.3: ## %failure
; CHECK-NEXT: ud2
@@ -133,16 +128,14 @@ failure: ; preds = %backedge
define i32 @test_03(i32* %p, i64 %len, i32 %x) {
; CHECK-LABEL: test_03:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB3_1: ## %loop
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subq $1, %rax
; CHECK-NEXT: subq $1, %rsi
; CHECK-NEXT: jb LBB3_4
; CHECK-NEXT: ## %bb.2: ## %backedge
; CHECK-NEXT: ## in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: cmpl %edx, (%rdi,%rsi,4)
; CHECK-NEXT: jne LBB3_1
; CHECK-NEXT: ## %bb.3: ## %failure
; CHECK-NEXT: ud2

@@ -14,11 +14,10 @@ define i32 @test_01(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR3:%.*]] = mul i64 [[MATH]], 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[SUNKADDR4:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR3]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR4]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]

@@ -12,11 +12,10 @@ define i32 @test_01(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
@@ -108,11 +107,10 @@ define i32 @test_02(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
@@ -161,11 +159,10 @@ define i32 @test_03_neg(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV_NEXT]], 4
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE]], label [[LOOP]]
@@ -272,11 +269,10 @@ define i32 @test_05_neg(i32* %p, i64 %len, i32 %x, i1 %cond) {
; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV_NEXT]], 4
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]