[LICM] Fold associative binary ops to promote code hoisting (#81608)

Perform the transformation

  "(LV op C1) op C2" ==> "LV op (C1 op C2)"

where op is an associative binary op, LV is a loop variant, and C1 and
C2 are loop invariants to hoist.

Similar patterns could also be folded (they are listed in a TODO comment in
the code), but this one seems to be the most impactful.
This commit is contained in:
Ricardo Jesus
2024-07-23 10:03:26 +01:00
committed by GitHub
parent 528a662d3a
commit f2ccf80136
6 changed files with 335 additions and 163 deletions

View File

@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
STATISTIC(NumIntAssociationsHoisted,
"Number of invariant int expressions "
"reassociated and hoisted out of the loop");
// Incremented by hoistArithmetics() each time hoistBOAssociation() succeeds.
STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
"reassociated and hoisted out of the loop");
/// Memory promotion is enabled by default.
static cl::opt<bool>
@@ -2779,6 +2781,60 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
return true;
}
/// Reassociate general associative binary expressions of the form
///
/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
///
/// where op is an associative binary op, LV is a loop variant, and C1 and C2
/// are loop invariants that we want to hoist.
///
/// On success "C1 op C2" is emitted in front of the preheader terminator (as
/// "invariant.op"), \p I is replaced by "LV op invariant.op" and erased, and
/// the inner "(LV op C1)" is erased as well if it has no remaining uses.
///
/// Flags are not copied wholesale from \p I, because reassociation does not
/// preserve them in general:
///  * nsw (and nuw on anything but add) is dropped. E.g. with LV = -5,
///    C1 = INT_MAX, C2 = 1 both original nsw adds are fine, yet
///    "LV + (C1 + C2)" overflows.
///  * nuw on add is kept when both original adds carry it: in that case
///    LV + C1 + C2 does not wrap, hence neither does the new add.
///  * For FP ops the new instruction gets only the fast-math flags common to
///    both original instructions.
/// The hoisted "invariant.op" itself is created without any flags.
///
/// \returns true if the IR was changed.
///
/// TODO: This can be extended to more cases such as
/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
static bool hoistBOAssociation(Instruction &I, Loop &L,
                               ICFLoopSafetyInfo &SafetyInfo,
                               MemorySSAUpdater &MSSAU, AssumptionCache *AC,
                               DominatorTree *DT) {
  BinaryOperator *BO = dyn_cast<BinaryOperator>(&I);
  if (!BO || !BO->isAssociative())
    return false;

  Instruction::BinaryOps Opcode = BO->getOpcode();
  BinaryOperator *Op0 = dyn_cast<BinaryOperator>(BO->getOperand(0));

  // Both instructions are reassociated, so both must be associative. For FP
  // ops this depends on the fast-math flags of each individual instruction,
  // so checking only the outer one is not enough.
  if (!Op0 || Op0->getOpcode() != Opcode || !Op0->isAssociative())
    return false;

  // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
  Value *LV = Op0->getOperand(0);
  Value *C1 = Op0->getOperand(1);
  Value *C2 = BO->getOperand(1);

  if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) ||
      !L.isLoopInvariant(C2))
    return false;

  auto *Preheader = L.getLoopPreheader();
  assert(Preheader && "Loop is not in simplify form?");

  IRBuilder<> Builder(Preheader->getTerminator());
  Value *Inv = Builder.CreateBinOp(Opcode, C1, C2, "invariant.op");

  auto *NewBO =
      BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO);

  // Only propagate flags that are known to survive the reassociation.
  if (Opcode == Instruction::FAdd || Opcode == Instruction::FMul) {
    FastMathFlags CommonFMF = BO->getFastMathFlags();
    CommonFMF &= Op0->getFastMathFlags();
    NewBO->setFastMathFlags(CommonFMF);
  } else if (Opcode == Instruction::Add && BO->hasNoUnsignedWrap() &&
             Op0->hasNoUnsignedWrap()) {
    NewBO->setHasNoUnsignedWrap(true);
  }

  BO->replaceAllUsesWith(NewBO);
  eraseInstruction(*BO, SafetyInfo, MSSAU);

  // Note: (LV op C1) might not be erased if it has more uses than the one we
  // just replaced.
  if (Op0->use_empty())
    eraseInstruction(*Op0, SafetyInfo, MSSAU);

  return true;
}
static bool hoistArithmetics(Instruction &I, Loop &L,
ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2816,6 +2872,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
return true;
}
if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
++NumHoisted;
++NumBOAssociationsHoisted;
return true;
}
return false;
}

View File

@@ -642,8 +642,8 @@ define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2
; CHECK-NEXT: cmpdi r7, 0
; CHECK-NEXT: ble cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: add r5, r5, r4
; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
@@ -743,214 +743,219 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64
; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r4, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r6, r6, 2
; CHECK-NEXT: li r7, 1
; CHECK-NEXT: mr r30, r10
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: iselgt r7, r6, r7
; CHECK-NEXT: addi r8, r7, -1
; CHECK-NEXT: clrldi r6, r7, 63
; CHECK-NEXT: cmpldi r8, 3
; CHECK-NEXT: sldi r4, r6, 2
; CHECK-NEXT: li r6, 1
; CHECK-NEXT: mr r0, r10
; CHECK-NEXT: std r10, -192(r1) # 8-byte Folded Spill
; CHECK-NEXT: cmpdi r4, 1
; CHECK-NEXT: iselgt r4, r4, r6
; CHECK-NEXT: addi r7, r4, -1
; CHECK-NEXT: clrldi r6, r4, 63
; CHECK-NEXT: cmpldi r7, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: mulli r24, r30, 24
; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: rldicl r0, r7, 62, 2
; CHECK-NEXT: sldi r11, r30, 5
; CHECK-NEXT: sldi r19, r30, 4
; CHECK-NEXT: sldi r7, r14, 3
; CHECK-NEXT: add r14, r30, r14
; CHECK-NEXT: sldi r10, r16, 3
; CHECK-NEXT: sldi r12, r15, 3
; CHECK-NEXT: add r16, r30, r16
; CHECK-NEXT: add r15, r30, r15
; CHECK-NEXT: add r27, r11, r7
; CHECK-NEXT: add r22, r24, r7
; CHECK-NEXT: add r17, r19, r7
; CHECK-NEXT: sldi r2, r14, 3
; CHECK-NEXT: add r26, r24, r10
; CHECK-NEXT: add r25, r24, r12
; CHECK-NEXT: add r21, r19, r10
; CHECK-NEXT: add r20, r19, r12
; CHECK-NEXT: add r8, r11, r10
; CHECK-NEXT: sldi r16, r16, 3
; CHECK-NEXT: add r29, r5, r27
; CHECK-NEXT: add r28, r4, r27
; CHECK-NEXT: add r27, r3, r27
; CHECK-NEXT: add r24, r5, r22
; CHECK-NEXT: add r23, r4, r22
; CHECK-NEXT: add r22, r3, r22
; CHECK-NEXT: add r19, r5, r17
; CHECK-NEXT: add r18, r4, r17
; CHECK-NEXT: add r17, r3, r17
; CHECK-NEXT: add r14, r5, r2
; CHECK-NEXT: add r31, r4, r2
; CHECK-NEXT: add r2, r3, r2
; CHECK-NEXT: add r9, r5, r8
; CHECK-NEXT: add r8, r11, r12
; CHECK-NEXT: ld r0, -192(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r30, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r8, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: rldicl r7, r4, 62, 2
; CHECK-NEXT: ld r9, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r11, r0, r30
; CHECK-NEXT: add r4, r0, r0
; CHECK-NEXT: mulli r23, r0, 24
; CHECK-NEXT: add r14, r0, r8
; CHECK-NEXT: sldi r12, r0, 5
; CHECK-NEXT: add r31, r0, r9
; CHECK-NEXT: sldi r9, r9, 3
; CHECK-NEXT: sldi r18, r0, 4
; CHECK-NEXT: sldi r8, r8, 3
; CHECK-NEXT: add r10, r4, r4
; CHECK-NEXT: sldi r4, r30, 3
; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: add r26, r12, r9
; CHECK-NEXT: add r16, r18, r9
; CHECK-NEXT: add r29, r12, r8
; CHECK-NEXT: add r19, r18, r8
; CHECK-NEXT: add r30, r12, r4
; CHECK-NEXT: mr r20, r4
; CHECK-NEXT: std r4, -200(r1) # 8-byte Folded Spill
; CHECK-NEXT: ld r4, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r15, r5, r11
; CHECK-NEXT: sldi r11, r14, 3
; CHECK-NEXT: add r29, r5, r29
; CHECK-NEXT: add r28, r3, r26
; CHECK-NEXT: add r19, r5, r19
; CHECK-NEXT: add r21, r23, r9
; CHECK-NEXT: add r24, r23, r8
; CHECK-NEXT: add r14, r5, r11
; CHECK-NEXT: sldi r11, r31, 3
; CHECK-NEXT: add r25, r23, r20
; CHECK-NEXT: add r20, r18, r20
; CHECK-NEXT: add r30, r5, r30
; CHECK-NEXT: add r18, r3, r16
; CHECK-NEXT: add r24, r5, r24
; CHECK-NEXT: add r23, r3, r21
; CHECK-NEXT: add r27, r4, r26
; CHECK-NEXT: add r22, r4, r21
; CHECK-NEXT: add r17, r4, r16
; CHECK-NEXT: add r2, r4, r11
; CHECK-NEXT: rldicl r4, r7, 2, 1
; CHECK-NEXT: sub r7, r8, r9
; CHECK-NEXT: ld r8, -200(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r26, r5, r26
; CHECK-NEXT: add r25, r5, r25
; CHECK-NEXT: add r21, r5, r21
; CHECK-NEXT: add r20, r5, r20
; CHECK-NEXT: add r16, r5, r16
; CHECK-NEXT: add r8, r5, r8
; CHECK-NEXT: rldicl r3, r0, 2, 1
; CHECK-NEXT: addi r3, r3, -4
; CHECK-NEXT: sub r0, r12, r7
; CHECK-NEXT: sub r12, r10, r7
; CHECK-NEXT: li r7, 0
; CHECK-NEXT: mr r10, r30
; CHECK-NEXT: sldi r15, r15, 3
; CHECK-NEXT: add r15, r5, r15
; CHECK-NEXT: rldicl r3, r3, 62, 2
; CHECK-NEXT: addi r3, r3, 1
; CHECK-NEXT: mtctr r3
; CHECK-NEXT: add r31, r5, r11
; CHECK-NEXT: add r11, r3, r11
; CHECK-NEXT: addi r4, r4, -4
; CHECK-NEXT: rldicl r4, r4, 62, 2
; CHECK-NEXT: sub r8, r8, r9
; CHECK-NEXT: li r9, 0
; CHECK-NEXT: addi r4, r4, 1
; CHECK-NEXT: mtctr r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: lfd f0, 0(r2)
; CHECK-NEXT: lfd f0, 0(r11)
; CHECK-NEXT: lfd f1, 0(r2)
; CHECK-NEXT: add r0, r0, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r31)
; CHECK-NEXT: add r3, r10, r30
; CHECK-NEXT: add r3, r3, r30
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r14)
; CHECK-NEXT: add r3, r3, r30
; CHECK-NEXT: add r10, r3, r30
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r14)
; CHECK-NEXT: add r14, r14, r11
; CHECK-NEXT: lfdx f0, r2, r0
; CHECK-NEXT: lfdx f1, r31, r0
; CHECK-NEXT: stfd f0, 0(r31)
; CHECK-NEXT: add r31, r31, r12
; CHECK-NEXT: lfdx f0, r11, r7
; CHECK-NEXT: lfdx f1, r2, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r15, r7
; CHECK-NEXT: lfdx f1, r14, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r15, r7
; CHECK-NEXT: lfdx f0, r2, r12
; CHECK-NEXT: lfdx f1, r31, r12
; CHECK-NEXT: add r2, r2, r11
; CHECK-NEXT: add r31, r31, r11
; CHECK-NEXT: stfdx f0, r14, r9
; CHECK-NEXT: lfdx f0, r11, r8
; CHECK-NEXT: lfdx f1, r2, r8
; CHECK-NEXT: add r11, r11, r12
; CHECK-NEXT: add r2, r2, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r16, r7
; CHECK-NEXT: lfdx f1, r15, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r16, r7
; CHECK-NEXT: lfd f0, 0(r17)
; CHECK-NEXT: lfd f1, 0(r18)
; CHECK-NEXT: stfdx f0, r15, r9
; CHECK-NEXT: lfd f0, 0(r18)
; CHECK-NEXT: lfd f1, 0(r17)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r19, r7
; CHECK-NEXT: lfdx f1, r16, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r19, r7
; CHECK-NEXT: lfdx f0, r17, r0
; CHECK-NEXT: lfdx f1, r18, r0
; CHECK-NEXT: stfdx f0, r16, r9
; CHECK-NEXT: lfdx f0, r18, r7
; CHECK-NEXT: lfdx f1, r17, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r20, r7
; CHECK-NEXT: lfdx f1, r19, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r20, r7
; CHECK-NEXT: lfdx f0, r17, r12
; CHECK-NEXT: lfdx f1, r18, r12
; CHECK-NEXT: add r17, r17, r11
; CHECK-NEXT: add r18, r18, r11
; CHECK-NEXT: stfdx f0, r19, r9
; CHECK-NEXT: lfdx f0, r18, r8
; CHECK-NEXT: lfdx f1, r17, r8
; CHECK-NEXT: add r18, r18, r12
; CHECK-NEXT: add r17, r17, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r21, r7
; CHECK-NEXT: lfdx f1, r20, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r21, r7
; CHECK-NEXT: lfd f0, 0(r22)
; CHECK-NEXT: lfd f1, 0(r23)
; CHECK-NEXT: stfdx f0, r20, r9
; CHECK-NEXT: lfd f0, 0(r23)
; CHECK-NEXT: lfd f1, 0(r22)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r24, r7
; CHECK-NEXT: lfdx f1, r21, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r24, r7
; CHECK-NEXT: lfdx f0, r22, r0
; CHECK-NEXT: lfdx f1, r23, r0
; CHECK-NEXT: stfdx f0, r21, r9
; CHECK-NEXT: lfdx f0, r23, r7
; CHECK-NEXT: lfdx f1, r22, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r25, r7
; CHECK-NEXT: lfdx f1, r24, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r25, r7
; CHECK-NEXT: lfdx f0, r22, r12
; CHECK-NEXT: lfdx f1, r23, r12
; CHECK-NEXT: add r22, r22, r11
; CHECK-NEXT: add r23, r23, r11
; CHECK-NEXT: stfdx f0, r24, r9
; CHECK-NEXT: lfdx f0, r23, r8
; CHECK-NEXT: lfdx f1, r22, r8
; CHECK-NEXT: add r23, r23, r12
; CHECK-NEXT: add r22, r22, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r26, r7
; CHECK-NEXT: lfdx f1, r25, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r26, r7
; CHECK-NEXT: lfd f0, 0(r27)
; CHECK-NEXT: lfd f1, 0(r28)
; CHECK-NEXT: stfdx f0, r25, r9
; CHECK-NEXT: lfd f0, 0(r28)
; CHECK-NEXT: lfd f1, 0(r27)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r29, r7
; CHECK-NEXT: lfdx f1, r26, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r29, r7
; CHECK-NEXT: lfdx f0, r27, r0
; CHECK-NEXT: lfdx f1, r28, r0
; CHECK-NEXT: stfdx f0, r26, r9
; CHECK-NEXT: lfdx f0, r28, r7
; CHECK-NEXT: lfdx f1, r27, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r8, r7
; CHECK-NEXT: lfdx f1, r29, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r8, r7
; CHECK-NEXT: lfdx f0, r27, r12
; CHECK-NEXT: lfdx f1, r28, r12
; CHECK-NEXT: add r27, r27, r11
; CHECK-NEXT: add r28, r28, r11
; CHECK-NEXT: stfdx f0, r29, r9
; CHECK-NEXT: lfdx f0, r28, r8
; CHECK-NEXT: lfdx f1, r27, r8
; CHECK-NEXT: add r28, r28, r12
; CHECK-NEXT: add r27, r27, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r9, r7
; CHECK-NEXT: lfdx f1, r30, r9
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r9, r7
; CHECK-NEXT: add r7, r7, r11
; CHECK-NEXT: stfdx f0, r30, r9
; CHECK-NEXT: add r9, r9, r12
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: ld r7, -192(r1) # 8-byte Folded Reload
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: sldi r8, r30, 3
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r7, r5, r3
; CHECK-NEXT: add r9, r4, r3
; CHECK-NEXT: add r11, r0, r3
; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r12, r5, r3
; CHECK-NEXT: add r30, r4, r3
; CHECK-NEXT: add r29, r0, r3
; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: li r10, 0
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r5, r5, r3
; CHECK-NEXT: add r4, r4, r3
; CHECK-NEXT: add r3, r0, r3
; CHECK-NEXT: ld r4, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: mr r30, r3
; CHECK-NEXT: sldi r7, r7, 3
; CHECK-NEXT: add r4, r0, r4
; CHECK-NEXT: sldi r4, r4, 3
; CHECK-NEXT: add r3, r5, r4
; CHECK-NEXT: add r8, r29, r4
; CHECK-NEXT: add r9, r30, r4
; CHECK-NEXT: ld r4, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r4, r0, r4
; CHECK-NEXT: sldi r4, r4, 3
; CHECK-NEXT: add r10, r5, r4
; CHECK-NEXT: add r11, r29, r4
; CHECK-NEXT: add r12, r30, r4
; CHECK-NEXT: ld r4, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r4, r0, r4
; CHECK-NEXT: sldi r0, r4, 3
; CHECK-NEXT: add r5, r5, r0
; CHECK-NEXT: add r4, r29, r0
; CHECK-NEXT: add r30, r30, r0
; CHECK-NEXT: li r0, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: #
; CHECK-NEXT: lfdx f0, r3, r10
; CHECK-NEXT: lfdx f1, r4, r10
; CHECK-NEXT: lfdx f0, r30, r0
; CHECK-NEXT: lfdx f1, r4, r0
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r5)
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: lfdx f0, r29, r10
; CHECK-NEXT: lfdx f1, r30, r10
; CHECK-NEXT: add r5, r5, r7
; CHECK-NEXT: lfdx f0, r12, r0
; CHECK-NEXT: lfdx f1, r11, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r12, r10
; CHECK-NEXT: lfdx f1, r10, r0
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r12, r10
; CHECK-NEXT: lfdx f0, r11, r10
; CHECK-NEXT: lfdx f1, r9, r10
; CHECK-NEXT: stfdx f0, r10, r0
; CHECK-NEXT: lfdx f0, r9, r0
; CHECK-NEXT: lfdx f1, r8, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r7, r10
; CHECK-NEXT: lfdx f1, r3, r0
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r7, r10
; CHECK-NEXT: add r10, r10, r8
; CHECK-NEXT: stfdx f0, r3, r0
; CHECK-NEXT: add r0, r0, r7
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload

View File

@@ -30,14 +30,16 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
; CHECK-NEXT: mflr r0
; CHECK-NEXT: std r0, 16(r1)
; CHECK-NEXT: stw r12, 8(r1)
; CHECK-NEXT: stdu r1, -48(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: stdu r1, -64(r1)
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: .cfi_offset lr, 16
; CHECK-NEXT: .cfi_offset r29, -24
; CHECK-NEXT: .cfi_offset r30, -16
; CHECK-NEXT: .cfi_offset cr2, 8
; CHECK-NEXT: .cfi_offset cr3, 8
; CHECK-NEXT: .cfi_offset cr4, 8
; CHECK-NEXT: std r30, 32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, 40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, 48(r1) # 8-byte Folded Spill
; CHECK-NEXT: bl call_2@notoc
; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_13
; CHECK-NEXT: # %bb.1: # %bb
@@ -65,10 +67,11 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
; CHECK-NEXT: bc 12, 4*cr3+eq, .LBB0_11
; CHECK-NEXT: # %bb.6: # %bb32
; CHECK-NEXT: #
; CHECK-NEXT: rlwinm r30, r30, 0, 24, 22
; CHECK-NEXT: andi. r3, r30, 2
; CHECK-NEXT: rlwinm r29, r30, 0, 24, 22
; CHECK-NEXT: mcrf cr2, cr0
; CHECK-NEXT: bl call_4@notoc
; CHECK-NEXT: mr r30, r29
; CHECK-NEXT: beq+ cr2, .LBB0_3
; CHECK-NEXT: # %bb.7: # %bb37
; CHECK-NEXT: .LBB0_8: # %bb22
@@ -89,11 +92,13 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
; CHECK-BE-NEXT: stdu r1, -144(r1)
; CHECK-BE-NEXT: .cfi_def_cfa_offset 144
; CHECK-BE-NEXT: .cfi_offset lr, 16
; CHECK-BE-NEXT: .cfi_offset r28, -32
; CHECK-BE-NEXT: .cfi_offset r29, -24
; CHECK-BE-NEXT: .cfi_offset r30, -16
; CHECK-BE-NEXT: .cfi_offset cr2, 8
; CHECK-BE-NEXT: .cfi_offset cr2, 8
; CHECK-BE-NEXT: .cfi_offset cr2, 8
; CHECK-BE-NEXT: std r28, 112(r1) # 8-byte Folded Spill
; CHECK-BE-NEXT: std r29, 120(r1) # 8-byte Folded Spill
; CHECK-BE-NEXT: std r30, 128(r1) # 8-byte Folded Spill
; CHECK-BE-NEXT: bl call_2
@@ -126,11 +131,12 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
; CHECK-BE-NEXT: bc 12, 4*cr3+eq, .LBB0_11
; CHECK-BE-NEXT: # %bb.6: # %bb32
; CHECK-BE-NEXT: #
; CHECK-BE-NEXT: rlwinm r29, r29, 0, 24, 22
; CHECK-BE-NEXT: andi. r3, r29, 2
; CHECK-BE-NEXT: rlwinm r28, r29, 0, 24, 22
; CHECK-BE-NEXT: mcrf cr2, cr0
; CHECK-BE-NEXT: bl call_4
; CHECK-BE-NEXT: nop
; CHECK-BE-NEXT: mr r29, r28
; CHECK-BE-NEXT: beq+ cr2, .LBB0_3
; CHECK-BE-NEXT: # %bb.7: # %bb37
; CHECK-BE-NEXT: .LBB0_8: # %bb22

View File

@@ -0,0 +1,99 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=licm < %s | FileCheck %s
; Adapted from:
; for(long i = 0; i < n; ++i)
; a[i] = (i*k) * v;
define void @test(i64 %n, i64 %k) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_PH:%.*]]
; CHECK: for.ph:
; CHECK-NEXT: [[K_2:%.*]] = shl nuw nsw i64 [[K:%.*]], 1
; CHECK-NEXT: [[VEC_INIT:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[K]], i64 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[K_2]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add <2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ [[VEC_INIT]], [[FOR_PH]] ], [ [[VEC_IND_NEXT_REASS:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: call void @use(<2 x i64> [[STEP_ADD]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT_REASS]] = add <2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N:%.*]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.ph
for.ph:
%k.2 = shl nuw nsw i64 %k, 1
%vec.init = insertelement <2 x i64> zeroinitializer, i64 %k, i64 1
%.splatinsert = insertelement <2 x i64> poison, i64 %k.2, i64 0
%.splat = shufflevector <2 x i64> %.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
br label %for.body
for.body:
%index = phi i64 [ 0, %for.ph ], [ %index.next, %for.body ]
%vec.ind = phi <2 x i64> [ %vec.init, %for.ph ], [ %vec.ind.next, %for.body ]
%step.add = add <2 x i64> %vec.ind, %.splat
call void @use(<2 x i64> %step.add)
%index.next = add nuw i64 %index, 4
; "(%vec.ind + %.splat) + %.splat" is expected to become
; "%vec.ind + (%.splat + %.splat)", with the invariant add placed in for.ph.
; %step.add must stay in the loop because @use still consumes it.
%vec.ind.next = add <2 x i64> %step.add, %.splat
%cmp = icmp eq i64 %index.next, %n
br i1 %cmp, label %for.end, label %for.body
for.end:
ret void
}
; Same as above but `%step.add` has no other user and is thus removed.
define void @test_single_use(i64 %n, i64 %k) {
; CHECK-LABEL: @test_single_use(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_PH:%.*]]
; CHECK: for.ph:
; CHECK-NEXT: [[K_2:%.*]] = shl nuw nsw i64 [[K:%.*]], 1
; CHECK-NEXT: [[VEC_INIT:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[K]], i64 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[K_2]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add <2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ [[VEC_INIT]], [[FOR_PH]] ], [ [[VEC_IND_NEXT_REASS:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT_REASS]] = add <2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N:%.*]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.ph
for.ph:
%k.2 = shl nuw nsw i64 %k, 1
%vec.init = insertelement <2 x i64> zeroinitializer, i64 %k, i64 1
%.splatinsert = insertelement <2 x i64> poison, i64 %k.2, i64 0
%.splat = shufflevector <2 x i64> %.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
br label %for.body
for.body:
%index = phi i64 [ 0, %for.ph ], [ %index.next, %for.body ]
%vec.ind = phi <2 x i64> [ %vec.init, %for.ph ], [ %vec.ind.next, %for.body ]
; Here %step.add feeds only %vec.ind.next, so once that add is reassociated
; the expected output no longer contains %step.add at all.
%step.add = add <2 x i64> %vec.ind, %.splat
%index.next = add nuw i64 %index, 4
%vec.ind.next = add <2 x i64> %step.add, %.splat
%cmp = icmp eq i64 %index.next, %n
br i1 %cmp, label %for.end, label %for.body
for.end:
ret void
}
declare void @use(<2 x i64>)

View File

@@ -79,7 +79,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[ADD:%.*]], [[IF_END:%.*]] ]
; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[ADD_REASS:%.*]], [[IF_END:%.*]] ]
; CHECK-NEXT: [[P_ADDR_0:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[IF_END]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_ADDR_0]], [[J:%.*]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[LOOPEXIT0:%.*]]
@@ -97,7 +97,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
; CHECK-NEXT: [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD_I]], 1
; CHECK-NEXT: [[ADD_REASS]] = add nsw i32 [[I_ADDR]], 2
; CHECK-NEXT: br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
; CHECK: loopexit0:
; CHECK-NEXT: [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]

View File

@@ -2,7 +2,7 @@
define i16 @main() {
; SCEV-EXPR: Classifying expressions for: @main
; SCEV-EXPR-NEXT: %mul = phi i16 [ 1, %entry ], [ %mul.n.3, %loop ]
; SCEV-EXPR-NEXT: %mul = phi i16 [ 1, %entry ], [ %mul.n.3.reass, %loop ]
; SCEV-EXPR-NEXT: --> %mul U: [0,-15) S: [-32768,32753) Exits: 4096 LoopDispositions: { %loop: Variant }
; SCEV-EXPR-NEXT: %div = phi i16 [ 32767, %entry ], [ %div.n.3, %loop ]
; SCEV-EXPR-NEXT: --> %div U: [-2048,-32768) S: [-2048,-32768) Exits: 7 LoopDispositions: { %loop: Variant }