clang-p2996/llvm/test/CodeGen/PowerPC/common-chain.ll
esmeyi 6e0e926c2f [PowerPC] Convert to comparison against zero even when the optimization
doesn't happen in the peephole optimizer.

Summary: Converting a comparison against 1 or -1 into a comparison
against 0 can exploit record-form instructions for comparison optimization.
The conversion will happen only when a record-form instruction can be used
to replace the comparison in the peephole optimizer (see function optimizeCompareInstr).
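
For example (an illustrative C sketch with made-up names, not code from this
patch), a signed comparison such as

  long long d = a - b;
  if (d < 1)        // compare against 1
    use(d);

can be emitted as the equivalent comparison against zero

  long long d = a - b;
  if (d <= 0)       // compare against 0
    use(d);

so that a record-form subtract (subf.), which already sets CR0 from a signed
compare of its result with zero, can later make the explicit compare
redundant.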

In post-RA, we also want to optimize the comparison by using the record
form (see D131873), and that requires additional dataflow analysis to
reliably find uses of the CR register that is set.

It's reasonable to common the conversion for both the peephole optimizer and
the post-RA optimizer.

Converting to a comparison against zero even when the optimization doesn't
happen in the peephole optimizer may create additional opportunities for the
post-RA optimization.

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D131374
2022-09-15 06:06:25 -04:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
;
; chains:
; 1: base: base1 + offset, offsets: (0, offset)
; 2: base: base1 + 3*offset, offsets: (0, offset)
;
; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; long long o4 = base1 + 4 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
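; A rough C-level sketch (hypothetical name, not part of the test input) of the commoned form that
; the chains above describe: one running pointer per chain plus a single reused offset.
;
; long long two_chain_same_offset_succ_commoned(char *p, long long offset, long long base1, long long n) {
; char *q1 = p + base1 + offset; // chain 1 base
; char *q2 = p + base1 + 3 * offset; // chain 2 base
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)q1;
; unsigned long x2 = *(unsigned long *)(q1 + offset);
; unsigned long x3 = *(unsigned long *)q2;
; unsigned long x4 = *(unsigned long *)(q2 + offset);
; sum += x1 * x2 * x3 * x4;
; ++q1;
; ++q2;
; }
; return sum;
; }
;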
define i64 @two_chain_same_offset_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_same_offset_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB0_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 3
%mul4 = shl nsw i64 %offset, 2
%cmp46 = icmp sgt i64 %n, 0
br i1 %cmp46, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
%i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.047, %base1
%add.ptr9.idx = add i64 %add, %offset
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%0 = bitcast i8* %add.ptr9 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr10.idx = add i64 %add, %mul
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%2 = bitcast i8* %add.ptr10 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr11.idx = add i64 %add, %mul2
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%4 = bitcast i8* %add.ptr11 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr12.idx = add i64 %add, %mul4
%add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
%6 = bitcast i8* %add.ptr12 to i64*
%7 = load i64, i64* %6, align 8
%mul13 = mul i64 %3, %1
%mul14 = mul i64 %mul13, %5
%mul15 = mul i64 %mul14, %7
%add16 = add i64 %mul15, %sum.048
%inc = add nuw nsw i64 %i.047, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
; 5: + offset
;
; These addresses cannot be commoned into chains because one address would be left in a chain by itself.
; It is not profitable to common chains unless all of the addresses belong to a chain.
;
; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; long long o4 = base1 + 4 * offset;
; long long o5 = base1 + 5 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; char *p5 = p + o5;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; unsigned long x5 = *(unsigned long *)(p5 + i);
; sum += x1 * x2 * x3 * x4 * x5;
; }
; return sum;
; }
;
define i64 @not_perfect_chain_all_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB1_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sldi r9, r4, 2
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r10, r4, r9
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r11, r5, r7
; CHECK-NEXT: ldx r12, r5, r8
; CHECK-NEXT: ldx r0, r5, r9
; CHECK-NEXT: mulld r6, r11, r6
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r12
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: maddld r3, r6, r30, r3
; CHECK-NEXT: bdnz .LBB1_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 3
%mul4 = shl nsw i64 %offset, 2
%mul6 = mul nsw i64 %offset, 5
%cmp58 = icmp sgt i64 %n, 0
br i1 %cmp58, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
%i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.059, %base1
%add.ptr12.idx = add i64 %add, %offset
%add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
%0 = bitcast i8* %add.ptr12 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr13.idx = add i64 %add, %mul
%add.ptr13 = getelementptr inbounds i8, i8* %p, i64 %add.ptr13.idx
%2 = bitcast i8* %add.ptr13 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr14.idx = add i64 %add, %mul2
%add.ptr14 = getelementptr inbounds i8, i8* %p, i64 %add.ptr14.idx
%4 = bitcast i8* %add.ptr14 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr15.idx = add i64 %add, %mul4
%add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
%6 = bitcast i8* %add.ptr15 to i64*
%7 = load i64, i64* %6, align 8
%add.ptr16.idx = add i64 %add, %mul6
%add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
%8 = bitcast i8* %add.ptr16 to i64*
%9 = load i64, i64* %8, align 8
%mul17 = mul i64 %3, %1
%mul18 = mul i64 %mul17, %5
%mul19 = mul i64 %mul18, %7
%mul20 = mul i64 %mul19, %9
%add21 = add i64 %mul20, %sum.060
%inc = add nuw nsw i64 %i.059, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1
; 2: + 2*offset
; 3: + offset
;
; We need at least 4 addresses so that 2 chains can be commoned with at least 1 reused offset.
;
; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; sum += x1 * x2 * x3;
; }
; return sum;
; }
;
define i64 @no_enough_elements_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_enough_elements_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB2_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r4, r4, r7
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB2_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r8, r5, r7
; CHECK-NEXT: ldx r9, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: maddld r3, r6, r9, r3
; CHECK-NEXT: bdnz .LBB2_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul1 = mul nsw i64 %offset, 3
%cmp32 = icmp sgt i64 %n, 0
br i1 %cmp32, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
%i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add.ptr5.idx = add i64 %i.033, %base1
%add.ptr5 = getelementptr inbounds i8, i8* %p, i64 %add.ptr5.idx
%0 = bitcast i8* %add.ptr5 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr6.idx = add i64 %add.ptr5.idx, %mul
%add.ptr6 = getelementptr inbounds i8, i8* %p, i64 %add.ptr6.idx
%2 = bitcast i8* %add.ptr6 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
%add.ptr7 = getelementptr inbounds i8, i8* %p, i64 %add.ptr7.idx
%4 = bitcast i8* %add.ptr7 to i64*
%5 = load i64, i64* %4, align 8
%mul8 = mul i64 %3, %1
%mul9 = mul i64 %mul8, %5
%add10 = add i64 %mul9, %sum.034
%inc = add nuw nsw i64 %i.033, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1
; 2: + 2*offset
; 3: + 2*offset
; 4: + 3*offset
;
; The diff between address 2 and address 1 is 2*offset, but that offset is not reused by the other
; candidate chain (the diff between address 4 and address 3 is 3*offset), so we cannot common any chains.
;
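; With the candidate split into two equal chains, the offsets relative to each chain base would be:
; chain 1: base1, offsets: (0, 2*offset)
; chain 2: base1 + 4*offset, offsets: (0, 3*offset)
; The offset lists differ, so nothing can be reused.
;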
; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 4 * offset;
; long long o4 = base1 + 7 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
define i64 @no_reuseable_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_reuseable_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB3_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: sub r4, r9, r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB3_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r9, r5, r7
; CHECK-NEXT: ldx r10, r5, r8
; CHECK-NEXT: ldx r11, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r9, r6
; CHECK-NEXT: mulld r6, r6, r10
; CHECK-NEXT: maddld r3, r6, r11, r3
; CHECK-NEXT: bdnz .LBB3_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul1 = shl nsw i64 %offset, 2
%mul3 = mul nsw i64 %offset, 7
%cmp44 = icmp sgt i64 %n, 0
br i1 %cmp44, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
%i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add.ptr8.idx = add i64 %i.045, %base1
%add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
%0 = bitcast i8* %add.ptr8 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr9.idx = add i64 %add.ptr8.idx, %mul
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%2 = bitcast i8* %add.ptr9 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%4 = bitcast i8* %add.ptr10 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%6 = bitcast i8* %add.ptr11 to i64*
%7 = load i64, i64* %6, align 8
%mul12 = mul i64 %3, %1
%mul13 = mul i64 %mul12, %5
%mul14 = mul i64 %mul13, %7
%add15 = add i64 %mul14, %sum.046
%inc = add nuw nsw i64 %i.045, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + 3*offset
; 4: + 2*offset
; 5: + 1*offset
; 6: + 2*offset
;
; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5,
; but the diff between address 3 and address 2 (3*offset) is not the same as the diff between address 6
; and address 5 (2*offset), so we cannot common chains for these addresses.
;
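; Splitting into two equal chains, the offsets relative to each chain base would be:
; chain 1: base1 + offset, offsets: (0, offset, 4*offset)
; chain 2: base1 + 7*offset, offsets: (0, offset, 3*offset)
; The second offset in each list matches, but the third does not (4*offset vs 3*offset), so the
; chains cannot be commoned.
;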
; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 5 * offset;
; long long o4 = base1 + 7 * offset;
; long long o5 = base1 + 8 * offset;
; long long o6 = base1 + 10 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; char *p5 = p + o5;
; char *p6 = p + o6;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; unsigned long x5 = *(unsigned long *)(p5 + i);
; unsigned long x6 = *(unsigned long *)(p6 + i);
; sum += x1 * x2 * x3 * x4 * x5 * x6;
; }
; return sum;
; }
;
define i64 @not_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB4_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: mulli r11, r4, 10
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r8, r4, r8
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sub r10, r9, r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r12, r5, r7
; CHECK-NEXT: ldx r0, r5, r8
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: mulld r6, r12, r6
; CHECK-NEXT: ldx r29, r5, r9
; CHECK-NEXT: ldx r28, r5, r11
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: mulld r6, r6, r30
; CHECK-NEXT: mulld r6, r6, r29
; CHECK-NEXT: maddld r3, r6, r28, r3
; CHECK-NEXT: bdnz .LBB4_2
; CHECK-NEXT: b .LBB4_4
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .LBB4_4: # %for.cond.cleanup
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 5
%mul4 = mul nsw i64 %offset, 7
%mul6 = shl nsw i64 %offset, 3
%mul8 = mul nsw i64 %offset, 10
%cmp70 = icmp sgt i64 %n, 0
br i1 %cmp70, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
%i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.071, %base1
%add.ptr15.idx = add i64 %add, %offset
%add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
%0 = bitcast i8* %add.ptr15 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr16.idx = add i64 %add, %mul
%add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
%2 = bitcast i8* %add.ptr16 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr17.idx = add i64 %add, %mul2
%add.ptr17 = getelementptr inbounds i8, i8* %p, i64 %add.ptr17.idx
%4 = bitcast i8* %add.ptr17 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr18.idx = add i64 %add, %mul4
%add.ptr18 = getelementptr inbounds i8, i8* %p, i64 %add.ptr18.idx
%6 = bitcast i8* %add.ptr18 to i64*
%7 = load i64, i64* %6, align 8
%add.ptr19.idx = add i64 %add, %mul6
%add.ptr19 = getelementptr inbounds i8, i8* %p, i64 %add.ptr19.idx
%8 = bitcast i8* %add.ptr19 to i64*
%9 = load i64, i64* %8, align 8
%add.ptr20.idx = add i64 %add, %mul8
%add.ptr20 = getelementptr inbounds i8, i8* %p, i64 %add.ptr20.idx
%10 = bitcast i8* %add.ptr20 to i64*
%11 = load i64, i64* %10, align 8
%mul21 = mul i64 %3, %1
%mul22 = mul i64 %mul21, %5
%mul23 = mul i64 %mul22, %7
%mul24 = mul i64 %mul23, %9
%mul25 = mul i64 %mul24, %11
%add26 = add i64 %mul25, %sum.072
%inc = add nuw nsw i64 %i.071, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base1 + 4*offset, offsets: (0, 2*offset)
;
; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 3 * offset;
; long long o3 = base1 + 4 * offset;
; long long o4 = base1 + 6 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
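; A rough C-level sketch (hypothetical name) of the commoned form: both chains reuse the same
; 2*offset offset relative to their bases.
;
; long long two_chain_different_offsets_succ_commoned(char *p, long long offset, long long base1, long long n) {
; char *q1 = p + base1 + offset; // chain 1 base
; char *q2 = p + base1 + 4 * offset; // chain 2 base
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)q1;
; unsigned long x2 = *(unsigned long *)(q1 + 2 * offset);
; unsigned long x3 = *(unsigned long *)q2;
; unsigned long x4 = *(unsigned long *)(q2 + 2 * offset);
; sum += x1 * x2 * x3 * x4;
; ++q1;
; ++q2;
; }
; return sum;
; }
;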
define i64 @two_chain_different_offsets_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_different_offsets_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB5_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB5_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = mul nsw i64 %offset, 3
%mul2 = shl nsw i64 %offset, 2
%mul4 = mul nsw i64 %offset, 6
%cmp46 = icmp sgt i64 %n, 0
br i1 %cmp46, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
%i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.047, %base1
%add.ptr9.idx = add i64 %add, %offset
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%0 = bitcast i8* %add.ptr9 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr10.idx = add i64 %add, %mul
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%2 = bitcast i8* %add.ptr10 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr11.idx = add i64 %add, %mul2
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%4 = bitcast i8* %add.ptr11 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr12.idx = add i64 %add, %mul4
%add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
%6 = bitcast i8* %add.ptr12 to i64*
%7 = load i64, i64* %6, align 8
%mul13 = mul i64 %3, %1
%mul14 = mul i64 %mul13, %5
%mul15 = mul i64 %mul14, %7
%add16 = add i64 %mul15, %sum.048
%inc = add nuw nsw i64 %i.047, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + base2 - base1 - 2*offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base2 + offset, offsets: (0, 2*offset)
;
; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 3 * offset;
; long long o3 = base2 + offset;
; long long o4 = base2 + 3 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
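; A rough C-level sketch (hypothetical name) of the commoned form: the two chains have different
; bases but share the same (0, 2*offset) offset schema.
;
; long long two_chain_two_bases_succ_commoned(char *p, long long offset, long long base1, long long base2, long long n) {
; char *q1 = p + base1 + offset; // chain 1 base
; char *q2 = p + base2 + offset; // chain 2 base
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)q1;
; unsigned long x2 = *(unsigned long *)(q1 + 2 * offset);
; unsigned long x3 = *(unsigned long *)q2;
; unsigned long x4 = *(unsigned long *)(q2 + 2 * offset);
; sum += x1 * x2 * x3 * x4;
; ++q1;
; ++q2;
; }
; return sum;
; }
;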
define i64 @two_chain_two_bases_succ(i8* %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
; CHECK-LABEL: two_chain_two_bases_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r7, 0
; CHECK-NEXT: ble cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: add r5, r5, r4
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: add r6, r3, r6
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB6_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r7, 0(r5)
; CHECK-NEXT: ldx r8, r5, r4
; CHECK-NEXT: ld r9, 0(r6)
; CHECK-NEXT: ldx r10, r6, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: addi r6, r6, 1
; CHECK-NEXT: mulld r7, r8, r7
; CHECK-NEXT: mulld r7, r7, r9
; CHECK-NEXT: maddld r3, r7, r10, r3
; CHECK-NEXT: bdnz .LBB6_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB6_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = mul nsw i64 %offset, 3
%cmp44 = icmp sgt i64 %n, 0
br i1 %cmp44, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
%i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.045, %base1
%add.ptr8.idx = add i64 %add, %offset
%add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
%0 = bitcast i8* %add.ptr8 to i64*
%1 = load i64, i64* %0, align 8
%add1 = add i64 %i.045, %mul
%add.ptr9.idx = add i64 %add1, %base1
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%2 = bitcast i8* %add.ptr9 to i64*
%3 = load i64, i64* %2, align 8
%add2 = add i64 %i.045, %base2
%add.ptr10.idx = add i64 %add2, %offset
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%4 = bitcast i8* %add.ptr10 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr11.idx = add i64 %add2, %mul
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%6 = bitcast i8* %add.ptr11 to i64*
%7 = load i64, i64* %6, align 8
%mul12 = mul i64 %3, %1
%mul13 = mul i64 %mul12, %5
%mul14 = mul i64 %mul13, %7
%add15 = add i64 %mul14, %sum.046
%inc = add nuw nsw i64 %i.045, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
;
; Check that chain commoning can reduce register pressure and save register spills/reloads.
;
; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
; inc = inc4;
; #pragma unroll 4
; for (long long i = 0; i < 4 * m; i++) {
; output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
; output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
; output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
; inc = inc + inc4;
; }
; return 0;
; }
;
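; A rough C-level sketch (hypothetical name) of the addressing after commoning: each array keeps one
; chain base and the two offsets d2/d3 are reused by every chain, so fewer distinct addresses have to
; stay live across the loop.
;
; int spill_reduce_succ_commoned(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
; long long d2 = inc2 - inc1; // offset reused by every chain
; long long d3 = inc3 - inc1; // offset reused by every chain
; inc = inc4;
; for (long long i = 0; i < 4 * m; i++) {
; double *in1 = input1 + inc + inc1; // chain bases
; double *in2 = input2 + inc + inc1;
; double *out = output + inc + inc1;
; out[0] += in1[0] * in2[0];
; out[d2] += in1[d2] * in2[d2];
; out[d3] += in1[d3] * in2[d3];
; inc = inc + inc4;
; }
; return 0;
; }
;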
define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
; CHECK-LABEL: spill_reduce_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r9, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r6, r6, 2
; CHECK-NEXT: li r7, 1
; CHECK-NEXT: mr r12, r10
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: iselgt r7, r6, r7
; CHECK-NEXT: addi r8, r7, -1
; CHECK-NEXT: clrldi r6, r7, 63
; CHECK-NEXT: cmpldi r8, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
; CHECK-NEXT: rldicl r7, r7, 62, 2
; CHECK-NEXT: sldi r10, r12, 2
; CHECK-NEXT: ld r2, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: rldicl r7, r7, 2, 1
; CHECK-NEXT: std r7, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r8, r7, r10
; CHECK-NEXT: mr r22, r7
; CHECK-NEXT: mr r7, r4
; CHECK-NEXT: mr r4, r3
; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: sldi r8, r8, 3
; CHECK-NEXT: add r9, r5, r8
; CHECK-NEXT: add r8, r3, r10
; CHECK-NEXT: add r10, r2, r10
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: sldi r8, r8, 3
; CHECK-NEXT: add r30, r5, r10
; CHECK-NEXT: add r29, r7, r10
; CHECK-NEXT: add r28, r4, r10
; CHECK-NEXT: sldi r10, r12, 1
; CHECK-NEXT: add r8, r5, r8
; CHECK-NEXT: add r11, r12, r10
; CHECK-NEXT: add r0, r22, r11
; CHECK-NEXT: sldi r0, r0, 3
; CHECK-NEXT: add r27, r5, r0
; CHECK-NEXT: add r0, r3, r11
; CHECK-NEXT: add r11, r2, r11
; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: sldi r0, r0, 3
; CHECK-NEXT: add r25, r5, r11
; CHECK-NEXT: add r24, r7, r11
; CHECK-NEXT: add r23, r4, r11
; CHECK-NEXT: add r11, r22, r10
; CHECK-NEXT: add r26, r5, r0
; CHECK-NEXT: mr r0, r22
; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: add r22, r5, r11
; CHECK-NEXT: add r11, r3, r10
; CHECK-NEXT: add r10, r2, r10
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: add r20, r5, r10
; CHECK-NEXT: add r19, r7, r10
; CHECK-NEXT: add r18, r4, r10
; CHECK-NEXT: add r10, r12, r0
; CHECK-NEXT: add r21, r5, r11
; CHECK-NEXT: sldi r11, r2, 3
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r17, r5, r10
; CHECK-NEXT: add r10, r12, r3
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r16, r5, r10
; CHECK-NEXT: add r10, r12, r2
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r15, r5, r10
; CHECK-NEXT: add r14, r7, r10
; CHECK-NEXT: add r31, r4, r10
; CHECK-NEXT: sldi r10, r3, 3
; CHECK-NEXT: mr r3, r4
; CHECK-NEXT: mr r4, r7
; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: sub r0, r10, r11
; CHECK-NEXT: sldi r10, r7, 3
; CHECK-NEXT: ld r7, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: sub r2, r10, r11
; CHECK-NEXT: li r11, 0
; CHECK-NEXT: mr r10, r12
; CHECK-NEXT: addi r7, r7, -4
; CHECK-NEXT: rldicl r7, r7, 62, 2
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r7, r12, 5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: lfd f0, 0(r31)
; CHECK-NEXT: lfd f1, 0(r14)
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r15)
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r15)
; CHECK-NEXT: add r15, r15, r7
; CHECK-NEXT: lfdx f0, r31, r0
; CHECK-NEXT: lfdx f1, r14, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r16, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r16, r11
; CHECK-NEXT: lfdx f0, r31, r2
; CHECK-NEXT: lfdx f1, r14, r2
; CHECK-NEXT: add r31, r31, r7
; CHECK-NEXT: add r14, r14, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r17, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r17, r11
; CHECK-NEXT: lfd f0, 0(r18)
; CHECK-NEXT: lfd f1, 0(r19)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r20, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r20, r11
; CHECK-NEXT: lfdx f0, r18, r0
; CHECK-NEXT: lfdx f1, r19, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r21, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r21, r11
; CHECK-NEXT: lfdx f0, r18, r2
; CHECK-NEXT: lfdx f1, r19, r2
; CHECK-NEXT: add r18, r18, r7
; CHECK-NEXT: add r19, r19, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r22, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r22, r11
; CHECK-NEXT: lfd f0, 0(r23)
; CHECK-NEXT: lfd f1, 0(r24)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r25, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r25, r11
; CHECK-NEXT: lfdx f0, r23, r0
; CHECK-NEXT: lfdx f1, r24, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r26, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r26, r11
; CHECK-NEXT: lfdx f0, r23, r2
; CHECK-NEXT: lfdx f1, r24, r2
; CHECK-NEXT: add r23, r23, r7
; CHECK-NEXT: add r24, r24, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r27, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r27, r11
; CHECK-NEXT: lfd f0, 0(r28)
; CHECK-NEXT: lfd f1, 0(r29)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r30, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r30, r11
; CHECK-NEXT: lfdx f0, r28, r0
; CHECK-NEXT: lfdx f1, r29, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r8, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r8, r11
; CHECK-NEXT: lfdx f0, r28, r2
; CHECK-NEXT: lfdx f1, r29, r2
; CHECK-NEXT: add r28, r28, r7
; CHECK-NEXT: add r29, r29, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r9, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r9, r11
; CHECK-NEXT: add r11, r11, r7
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
; CHECK-NEXT: sldi r8, r12, 3
; CHECK-NEXT: ld r12, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r12, r10, r12
; CHECK-NEXT: add r7, r10, r7
; CHECK-NEXT: sldi r0, r12, 3
; CHECK-NEXT: sldi r11, r7, 3
; CHECK-NEXT: add r12, r5, r0
; CHECK-NEXT: add r30, r4, r0
; CHECK-NEXT: add r29, r3, r0
; CHECK-NEXT: ld r0, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r7, r5, r11
; CHECK-NEXT: add r9, r4, r11
; CHECK-NEXT: add r11, r3, r11
; CHECK-NEXT: add r10, r10, r0
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r5, r5, r10
; CHECK-NEXT: add r4, r4, r10
; CHECK-NEXT: add r3, r3, r10
; CHECK-NEXT: li r10, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: #
; CHECK-NEXT: lfdx f0, r3, r10
; CHECK-NEXT: lfdx f1, r4, r10
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r5)
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: lfdx f0, r29, r10
; CHECK-NEXT: lfdx f1, r30, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r12, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r12, r10
; CHECK-NEXT: lfdx f0, r11, r10
; CHECK-NEXT: lfdx f1, r9, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r7, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r7, r10
; CHECK-NEXT: add r10, r10, r8
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
entry:
%cmp49 = icmp sgt i64 %m, 0
br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%0 = shl i64 %m, 2
%smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
%1 = add nsw i64 %smax52, -1
%xtraiter = and i64 %smax52, 1
%2 = icmp ult i64 %1, 3
br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
for.body.preheader.new: ; preds = %for.body.preheader
%unroll_iter = and i64 %smax52, 9223372036854775804
br label %for.body
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
%inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
%inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
%add.epil = add nsw i64 %inc.addr.050.epil, %inc1
%arrayidx.epil = getelementptr inbounds double, double* %input1, i64 %add.epil
%3 = load double, double* %arrayidx.epil, align 8
%arrayidx2.epil = getelementptr inbounds double, double* %input2, i64 %add.epil
%4 = load double, double* %arrayidx2.epil, align 8
%mul3.epil = fmul double %3, %4
%arrayidx5.epil = getelementptr inbounds double, double* %output, i64 %add.epil
%5 = load double, double* %arrayidx5.epil, align 8
%add6.epil = fadd double %5, %mul3.epil
store double %add6.epil, double* %arrayidx5.epil, align 8
%add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
%arrayidx8.epil = getelementptr inbounds double, double* %input1, i64 %add7.epil
%6 = load double, double* %arrayidx8.epil, align 8
%arrayidx10.epil = getelementptr inbounds double, double* %input2, i64 %add7.epil
%7 = load double, double* %arrayidx10.epil, align 8
%mul11.epil = fmul double %6, %7
%arrayidx13.epil = getelementptr inbounds double, double* %output, i64 %add7.epil
%8 = load double, double* %arrayidx13.epil, align 8
%add14.epil = fadd double %8, %mul11.epil
store double %add14.epil, double* %arrayidx13.epil, align 8
%add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
%arrayidx16.epil = getelementptr inbounds double, double* %input1, i64 %add15.epil
%9 = load double, double* %arrayidx16.epil, align 8
%arrayidx18.epil = getelementptr inbounds double, double* %input2, i64 %add15.epil
%10 = load double, double* %arrayidx18.epil, align 8
%mul19.epil = fmul double %9, %10
%arrayidx21.epil = getelementptr inbounds double, double* %output, i64 %add15.epil
%11 = load double, double* %arrayidx21.epil, align 8
%add22.epil = fadd double %11, %mul19.epil
store double %add22.epil, double* %arrayidx21.epil, align 8
%add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
%epil.iter.sub = add nsw i64 %epil.iter, -1
%epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil
for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
ret i32 0
for.body: ; preds = %for.body, %for.body.preheader.new
%inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
%niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
%add = add nsw i64 %inc.addr.050, %inc1
%arrayidx = getelementptr inbounds double, double* %input1, i64 %add
%12 = load double, double* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds double, double* %input2, i64 %add
%13 = load double, double* %arrayidx2, align 8
%mul3 = fmul double %12, %13
%arrayidx5 = getelementptr inbounds double, double* %output, i64 %add
%14 = load double, double* %arrayidx5, align 8
%add6 = fadd double %14, %mul3
store double %add6, double* %arrayidx5, align 8
%add7 = add nsw i64 %inc.addr.050, %inc2
%arrayidx8 = getelementptr inbounds double, double* %input1, i64 %add7
%15 = load double, double* %arrayidx8, align 8
%arrayidx10 = getelementptr inbounds double, double* %input2, i64 %add7
%16 = load double, double* %arrayidx10, align 8
%mul11 = fmul double %15, %16
%arrayidx13 = getelementptr inbounds double, double* %output, i64 %add7
%17 = load double, double* %arrayidx13, align 8
%add14 = fadd double %17, %mul11
store double %add14, double* %arrayidx13, align 8
%add15 = add nsw i64 %inc.addr.050, %inc3
%arrayidx16 = getelementptr inbounds double, double* %input1, i64 %add15
%18 = load double, double* %arrayidx16, align 8
%arrayidx18 = getelementptr inbounds double, double* %input2, i64 %add15
%19 = load double, double* %arrayidx18, align 8
%mul19 = fmul double %18, %19
%arrayidx21 = getelementptr inbounds double, double* %output, i64 %add15
%20 = load double, double* %arrayidx21, align 8
%add22 = fadd double %20, %mul19
store double %add22, double* %arrayidx21, align 8
%add23 = add nsw i64 %inc.addr.050, %inc4
%add.1 = add nsw i64 %add23, %inc1
%arrayidx.1 = getelementptr inbounds double, double* %input1, i64 %add.1
%21 = load double, double* %arrayidx.1, align 8
%arrayidx2.1 = getelementptr inbounds double, double* %input2, i64 %add.1
%22 = load double, double* %arrayidx2.1, align 8
%mul3.1 = fmul double %21, %22
%arrayidx5.1 = getelementptr inbounds double, double* %output, i64 %add.1
%23 = load double, double* %arrayidx5.1, align 8
%add6.1 = fadd double %23, %mul3.1
store double %add6.1, double* %arrayidx5.1, align 8
%add7.1 = add nsw i64 %add23, %inc2
%arrayidx8.1 = getelementptr inbounds double, double* %input1, i64 %add7.1
%24 = load double, double* %arrayidx8.1, align 8
%arrayidx10.1 = getelementptr inbounds double, double* %input2, i64 %add7.1
%25 = load double, double* %arrayidx10.1, align 8
%mul11.1 = fmul double %24, %25
%arrayidx13.1 = getelementptr inbounds double, double* %output, i64 %add7.1
%26 = load double, double* %arrayidx13.1, align 8
%add14.1 = fadd double %26, %mul11.1
store double %add14.1, double* %arrayidx13.1, align 8
%add15.1 = add nsw i64 %add23, %inc3
%arrayidx16.1 = getelementptr inbounds double, double* %input1, i64 %add15.1
%27 = load double, double* %arrayidx16.1, align 8
%arrayidx18.1 = getelementptr inbounds double, double* %input2, i64 %add15.1
%28 = load double, double* %arrayidx18.1, align 8
%mul19.1 = fmul double %27, %28
%arrayidx21.1 = getelementptr inbounds double, double* %output, i64 %add15.1
%29 = load double, double* %arrayidx21.1, align 8
%add22.1 = fadd double %29, %mul19.1
store double %add22.1, double* %arrayidx21.1, align 8
%add23.1 = add nsw i64 %add23, %inc4
%add.2 = add nsw i64 %add23.1, %inc1
%arrayidx.2 = getelementptr inbounds double, double* %input1, i64 %add.2
%30 = load double, double* %arrayidx.2, align 8
%arrayidx2.2 = getelementptr inbounds double, double* %input2, i64 %add.2
%31 = load double, double* %arrayidx2.2, align 8
%mul3.2 = fmul double %30, %31
%arrayidx5.2 = getelementptr inbounds double, double* %output, i64 %add.2
%32 = load double, double* %arrayidx5.2, align 8
%add6.2 = fadd double %32, %mul3.2
store double %add6.2, double* %arrayidx5.2, align 8
%add7.2 = add nsw i64 %add23.1, %inc2
%arrayidx8.2 = getelementptr inbounds double, double* %input1, i64 %add7.2
%33 = load double, double* %arrayidx8.2, align 8
%arrayidx10.2 = getelementptr inbounds double, double* %input2, i64 %add7.2
%34 = load double, double* %arrayidx10.2, align 8
%mul11.2 = fmul double %33, %34
%arrayidx13.2 = getelementptr inbounds double, double* %output, i64 %add7.2
%35 = load double, double* %arrayidx13.2, align 8
%add14.2 = fadd double %35, %mul11.2
store double %add14.2, double* %arrayidx13.2, align 8
%add15.2 = add nsw i64 %add23.1, %inc3
%arrayidx16.2 = getelementptr inbounds double, double* %input1, i64 %add15.2
%36 = load double, double* %arrayidx16.2, align 8
%arrayidx18.2 = getelementptr inbounds double, double* %input2, i64 %add15.2
%37 = load double, double* %arrayidx18.2, align 8
%mul19.2 = fmul double %36, %37
%arrayidx21.2 = getelementptr inbounds double, double* %output, i64 %add15.2
%38 = load double, double* %arrayidx21.2, align 8
%add22.2 = fadd double %38, %mul19.2
store double %add22.2, double* %arrayidx21.2, align 8
%add23.2 = add nsw i64 %add23.1, %inc4
%add.3 = add nsw i64 %add23.2, %inc1
%arrayidx.3 = getelementptr inbounds double, double* %input1, i64 %add.3
%39 = load double, double* %arrayidx.3, align 8
%arrayidx2.3 = getelementptr inbounds double, double* %input2, i64 %add.3
%40 = load double, double* %arrayidx2.3, align 8
%mul3.3 = fmul double %39, %40
%arrayidx5.3 = getelementptr inbounds double, double* %output, i64 %add.3
%41 = load double, double* %arrayidx5.3, align 8
%add6.3 = fadd double %41, %mul3.3
store double %add6.3, double* %arrayidx5.3, align 8
%add7.3 = add nsw i64 %add23.2, %inc2
%arrayidx8.3 = getelementptr inbounds double, double* %input1, i64 %add7.3
%42 = load double, double* %arrayidx8.3, align 8
%arrayidx10.3 = getelementptr inbounds double, double* %input2, i64 %add7.3
%43 = load double, double* %arrayidx10.3, align 8
%mul11.3 = fmul double %42, %43
%arrayidx13.3 = getelementptr inbounds double, double* %output, i64 %add7.3
%44 = load double, double* %arrayidx13.3, align 8
%add14.3 = fadd double %44, %mul11.3
store double %add14.3, double* %arrayidx13.3, align 8
%add15.3 = add nsw i64 %add23.2, %inc3
%arrayidx16.3 = getelementptr inbounds double, double* %input1, i64 %add15.3
%45 = load double, double* %arrayidx16.3, align 8
%arrayidx18.3 = getelementptr inbounds double, double* %input2, i64 %add15.3
%46 = load double, double* %arrayidx18.3, align 8
%mul19.3 = fmul double %45, %46
%arrayidx21.3 = getelementptr inbounds double, double* %output, i64 %add15.3
%47 = load double, double* %arrayidx21.3, align 8
%add22.3 = fadd double %47, %mul19.3
store double %add22.3, double* %arrayidx21.3, align 8
%add23.3 = add nsw i64 %add23.2, %inc4
%niter.nsub.3 = add i64 %niter, -4
%niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}
declare i64 @llvm.smax.i64(i64, i64)