[X86][CodeGenPrepare] Try to reuse IV's incremented value instead of adding the offset, part 1

While optimizing a memory instruction, we sometimes need to add an
offset to the value of an `IV`. We can avoid doing so if `IV.next` is
already defined at the point of interest. Reusing it gives us two
potential advantages:

- If the `IV` step happens to match the offset, we don't need to add the
  offset at all (see the sketch after this list);
- We reduce the overlap of the live ranges of `IV` and `IV.next`. They may stop
  overlapping altogether, which leads to better register allocation. Even if the
  overlap persists, we are not introducing a new one, so the transform should be
  neutral (this part is disabled in this patch and will come in a follow-up).
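As a rough illustration of the first advantage (a plain C++ analogue, not the
pass itself; the function names are made up):

```cpp
#include <cstdint>

// Before: the address is formed from the IV Phi, so a displacement is needed:
//   addr = p + 4*iv + (-4)
int32_t load_via_phi(const int32_t *p, int64_t iv) { return p[iv - 1]; }

// After: the address is formed from the already-computed decremented value,
// so the displacement disappears:
//   addr = p + 4*iv_next
int32_t load_via_next(const int32_t *p, int64_t iv_next) { return p[iv_next]; }
```

On X86 this is the difference between `-4(%rdi,%rsi,4)` and `(%rdi,%rsi,4)` in
the llc test updates further down.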

Currently I've only added support for IVs that are decremented via the `usub`
intrinsic. We could also support `AddInstr`, but there is an odd interaction
with another transform that may lead to infinite compilation in that case (it
looks like the same transform is repeatedly done and undone). I need to
investigate why that happens, but in principle we could handle it too.
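The hunks below rely on two helpers, `isIVIncrement` and `getIVIncrement`,
whose definitions are not part of this diff. As a rough sketch of the general
idea they implement — recognizing the back-edge increment of a loop-header
Phi — here is a simplified, hypothetical version written for a plain add/sub
recurrence (names and exact checks are illustrative only; the real helpers
also handle the intrinsic-based increment mentioned above):

```cpp
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical simplified check: is BO the "iv.next = iv + step" back-edge
// value of a recurrence, i.e. the increment of a loop-header Phi?
static bool isSimpleIVIncrement(const BinaryOperator *BO, const LoopInfo *LI) {
  if (BO->getOpcode() != Instruction::Add &&
      BO->getOpcode() != Instruction::Sub)
    return false;
  const auto *PN = dyn_cast<PHINode>(BO->getOperand(0));
  if (!PN)
    return false;
  const Loop *L = LI->getLoopFor(BO->getParent());
  if (!L || L->getHeader() != PN->getParent())
    return false;
  // BO must be the value flowing back into the Phi from the latch.
  const BasicBlock *Latch = L->getLoopLatch();
  return Latch && PN->getIncomingValueForBlock(Latch) == BO;
}
```

`getIVIncrement` then returns that increment instruction together with its
step, so the matcher can compare the step against the pending offset.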

The first part only handles the case where this reuse fully eliminates the offset.

Differential Revision: https://reviews.llvm.org/D96399
Reviewed By: reames
Max Kazantsev
2021-03-04 12:01:39 +07:00
parent b15ce2f344
commit d9e93e8e57
4 changed files with 97 additions and 49 deletions

@@ -3064,6 +3064,8 @@ class AddressingModeMatcher {
const TargetLowering &TLI;
const TargetRegisterInfo &TRI;
const DataLayout &DL;
const LoopInfo &LI;
const DominatorTree &DT;
/// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
/// the memory instruction that we're computing this address for.
@@ -3099,16 +3101,17 @@ class AddressingModeMatcher {
AddressingModeMatcher(
SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
const TargetRegisterInfo &TRI, const LoopInfo &LI,
const DominatorTree &DT, Type *AT, unsigned AS, Instruction *MI,
ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
: AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
OptSize(OptSize), PSI(PSI), BFI(BFI) {
DL(MI->getModule()->getDataLayout()), LI(LI), DT(DT), AccessTy(AT),
AddrSpace(AS), MemoryInst(MI), AddrMode(AM),
InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT),
LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {
IgnoreProfitability = false;
}
@@ -3123,18 +3126,17 @@ public:
static ExtAddrMode
Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
SmallVectorImpl<Instruction *> &AddrModeInsts,
const TargetLowering &TLI, const TargetRegisterInfo &TRI,
const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
TypePromotionTransaction &TPT,
const TargetLowering &TLI, const LoopInfo &LI, const DominatorTree &DT,
const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
ExtAddrMode Result;
bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
MemoryInst, Result, InsertedInsts,
PromotedInsts, TPT, LargeOffsetGEP,
OptSize, PSI, BFI)
.matchAddr(V, 0);
bool Success = AddressingModeMatcher(
AddrModeInsts, TLI, TRI, LI, DT, AccessTy, AS, MemoryInst, Result,
InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
BFI).matchAddr(V, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
return Result;
}
@@ -3830,10 +3832,12 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
// Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
// to see if ScaleReg is actually X+C. If so, we can turn this into adding
// X*Scale + C*Scale to addr mode.
// X*Scale + C*Scale to addr mode. If we found an available IV increment, do
// not go any further: we can reuse it and cannot eliminate it.
ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
if (isa<Instruction>(ScaleReg) && // not a constant expr.
match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
!isIVIncrement(cast<BinaryOperator>(ScaleReg), &LI) &&
CI->getValue().isSignedIntN(64)) {
TestAddrMode.InBounds = false;
TestAddrMode.ScaledReg = AddLHS;
@@ -3846,9 +3850,63 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
AddrMode = TestAddrMode;
return true;
}
// Restore status quo.
TestAddrMode = AddrMode;
}
// Otherwise, not (x+c)*scale, just return what we have.
auto GetConstantStep = [this](const Value *V)
-> Optional<std::pair<Instruction *, APInt>> {
auto *PN = dyn_cast<PHINode>(V);
if (!PN)
return None;
auto IVInc = getIVIncrement(PN, &LI);
if (!IVInc)
return None;
// TODO: The result of the intrinsics above is two's complement. However, when
// the IV inc is expressed as add or sub, iv.next is potentially a poison value.
// If it has nuw or nsw flags, we need to make sure that these flags are
// inferable at the point of the memory instruction. Otherwise we are replacing
// a well-defined two's complement computation with poison. Currently, to avoid
// the potentially complex analysis needed to prove this, we reject such cases.
if (auto *OIVInc = dyn_cast<OverflowingBinaryOperator>(IVInc->first))
if (OIVInc->hasNoSignedWrap() || OIVInc->hasNoUnsignedWrap())
return None;
if (auto *ConstantStep = dyn_cast<ConstantInt>(IVInc->second))
return std::make_pair(IVInc->first, ConstantStep->getValue());
return None;
};
// Try to account for the following special case:
// 1. ScaleReg is an induction variable;
// 2. We use it with a non-zero offset;
// 3. The IV's increment is available at the point of the memory instruction.
//
// In this case, we may reuse the IV increment instead of the IV Phi to
// achieve the following advantages:
// 1. If the IV step matches the offset, we will not need the offset at all;
if (AddrMode.BaseOffs) {
if (auto IVStep = GetConstantStep(ScaleReg)) {
Instruction *IVInc = IVStep->first;
APInt Step = IVStep->second;
APInt Offset = Step * AddrMode.Scale;
if (Offset.isSignedIntN(64) && TestAddrMode.BaseOffs == Offset &&
DT.dominates(IVInc, MemoryInst)) {
TestAddrMode.InBounds = false;
TestAddrMode.ScaledReg = IVInc;
TestAddrMode.BaseOffs -= Offset.getLimitedValue();
// If this addressing mode is legal, commit it.
if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
AddrModeInsts.push_back(cast<Instruction>(IVInc));
AddrMode = TestAddrMode;
return true;
}
// Restore status quo.
TestAddrMode = AddrMode;
}
}
}
// Otherwise, just return what we have.
return true;
}
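To make the new path concrete with the numbers from `test_01` in the tests
below: the IV steps by -1 each iteration, the access is scaled by 4, and the
original addressing mode carries a -4 displacement, so `Offset` exactly cancels
`BaseOffs` and the matcher may switch `ScaledReg` to the increment. A
standalone sketch of that arithmetic with plain integers instead of `APInt`
(values assumed from the test for illustration):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Values as they occur for test_01 below.
  const int64_t Step = -1;     // the IV is decremented by 1 per iteration
  const int64_t Scale = 4;     // i32 elements -> 4-byte stride
  const int64_t BaseOffs = -4; // displacement of the original p[iv - 1] access

  // Mirror of the check in the hunk above: reuse IV.next only when the scaled
  // step cancels the displacement completely.
  const int64_t Offset = Step * Scale;
  assert(BaseOffs == Offset && "reuse only fires if the offset is eliminated");
  const int64_t NewBaseOffs = BaseOffs - Offset; // becomes 0
  return static_cast<int>(NewBaseOffs);
}
```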
@@ -4938,9 +4996,10 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
0);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
AddressingModeMatcher Matcher(
MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, DT,
AddressAccessTy, AS, MemoryInst, Result,
InsertedInsts, PromotedInsts, TPT,
LargeOffsetGEP, OptSize, PSI, BFI);
Matcher.IgnoreProfitability = true;
bool Success = Matcher.matchAddr(Address, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
@@ -5043,9 +5102,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
AddrModeInsts.clear();
std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
0);
Function *F = MemoryInst->getParent()->getParent();
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDT(*F),
*TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
BFI.get());
GetElementPtrInst *GEP = LargeOffsetGEP.first;

@@ -1,20 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
; TODO: We can get rid of movq here by using a different offset and %rax.
define i32 @test_01(i32* %p, i64 %len, i32 %x) {
; CHECK-LABEL: test_01:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_1: ## %loop
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subq $1, %rax
; CHECK-NEXT: subq $1, %rsi
; CHECK-NEXT: jb LBB0_4
; CHECK-NEXT: ## %bb.2: ## %backedge
; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: cmpl %edx, (%rdi,%rsi,4)
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.3: ## %failure
; CHECK-NEXT: ud2
@@ -89,16 +86,14 @@ failure: ; preds = %backedge
define i32 @test_02(i32* %p, i64 %len, i32 %x) {
; CHECK-LABEL: test_02:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB2_1: ## %loop
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subq $1, %rax
; CHECK-NEXT: subq $1, %rsi
; CHECK-NEXT: jb LBB2_4
; CHECK-NEXT: ## %bb.2: ## %backedge
; CHECK-NEXT: ## in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: cmpl %edx, (%rdi,%rsi,4)
; CHECK-NEXT: jne LBB2_1
; CHECK-NEXT: ## %bb.3: ## %failure
; CHECK-NEXT: ud2
@@ -133,16 +128,14 @@ failure: ; preds = %backedge
define i32 @test_03(i32* %p, i64 %len, i32 %x) {
; CHECK-LABEL: test_03:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB3_1: ## %loop
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subq $1, %rax
; CHECK-NEXT: subq $1, %rsi
; CHECK-NEXT: jb LBB3_4
; CHECK-NEXT: ## %bb.2: ## %backedge
; CHECK-NEXT: ## in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: cmpl %edx, (%rdi,%rsi,4)
; CHECK-NEXT: jne LBB3_1
; CHECK-NEXT: ## %bb.3: ## %failure
; CHECK-NEXT: ud2

@@ -14,11 +14,10 @@ define i32 @test_01(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR3:%.*]] = mul i64 [[MATH]], 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[SUNKADDR4:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR3]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR4]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]

@@ -12,11 +12,10 @@ define i32 @test_01(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
@@ -108,11 +107,10 @@ define i32 @test_02(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
@@ -161,11 +159,10 @@ define i32 @test_03_neg(i32* %p, i64 %len, i32 %x) {
; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV_NEXT]], 4
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE]], label [[LOOP]]
@@ -272,11 +269,10 @@ define i32 @test_05_neg(i32* %p, i64 %len, i32 %x, i1 %cond) {
; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
; CHECK: backedge:
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV_NEXT]], 4
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[SUNKADDR]]
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR1]] to i32*
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]