RegAllocGreedy uses SlotIndexes::getApproxInstrDistance to approximate the length of a live range for its heuristics. Renumbering all slot indexes with the default instruction distance ensures that this estimate will be as accurate as possible, and will not depend on the history of how instructions have been added to and removed from SlotIndexes's maps. This also means that enabling -early-live-intervals, which runs the SlotIndexes analysis earlier, will not cause large amounts of churn due to different register allocator decisions.
267 lines
13 KiB
LLVM
267 lines
13 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; Runs llc on this file with the complxnum and neon target features enabled and
; pipes the resulting assembly to FileCheck.
; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
|
|
|
|
target triple = "aarch64"
|
|
|
|
; Return type of the complex_mul_* functions below: a struct wrapping a pair of
; doubles. Their header comments model it as C++ complex<double>.
%"struct.std::complex" = type { { double, double } }
|
|
|
|
; Zero initialized reduction
|
|
;
|
|
; complex<double> x = 0.0 + 0.0i;
|
|
; for (int i = 0; i < 100; ++i)
|
|
; x += a[i] * b[i];
|
|
;
|
|
; Both accumulator phis in the loop start from zeroinitializer. The expected
; assembly zeroes two vector registers with movi, accumulates into them with
; fcmla at rotations #0 and #90 inside the loop, and after the loop splits the
; lanes with zip1/zip2 and sums each half with faddp.
define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
|
|
; CHECK-LABEL: complex_mul_v2f64:
|
|
; CHECK: // %bb.0: // %entry
|
|
; CHECK-NEXT: movi v0.2d, #0000000000000000
|
|
; CHECK-NEXT: movi v1.2d, #0000000000000000
|
|
; CHECK-NEXT: mov x8, xzr
|
|
; CHECK-NEXT: .LBB0_1: // %vector.body
|
|
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: add x9, x0, x8
|
|
; CHECK-NEXT: add x10, x1, x8
|
|
; CHECK-NEXT: add x8, x8, #32
|
|
; CHECK-NEXT: ldp q3, q2, [x9]
|
|
; CHECK-NEXT: cmp x8, #1600
|
|
; CHECK-NEXT: ldp q5, q4, [x10]
|
|
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0
|
|
; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0
|
|
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90
|
|
; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90
|
|
; CHECK-NEXT: b.ne .LBB0_1
|
|
; CHECK-NEXT: // %bb.2: // %middle.block
|
|
; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d
|
|
; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
|
|
; CHECK-NEXT: faddp d0, v0.2d
|
|
; CHECK-NEXT: faddp d1, v2.2d
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %entry
|
|
; %lsr.iv is a byte offset applied to both %a and %b. It advances by 32 per
; iteration and the loop exits once it equals 1600.
%lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
|
|
; Accumulators: %vec.phi carries %7 and %vec.phi27 carries %5 across iterations.
%vec.phi = phi <2 x double> [ zeroinitializer, %entry ], [ %7, %vector.body ]
|
|
%vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %5, %vector.body ]
|
|
%scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
|
|
%scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
|
|
; Load four doubles from %a and split them into the even lanes (0, 2) and the
; odd lanes (1, 3).
%wide.vec = load <4 x double>, ptr %scevgep, align 8
|
|
%strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
; Same even/odd split for the four doubles loaded from %b.
%wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
|
|
%strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
; With a.even/a.odd and b.even/b.odd naming the split halves above:
;   %5 = %vec.phi27 + b.even * a.even - b.odd * a.odd
;   %7 = %vec.phi   + b.even * a.odd  + b.odd * a.even
%0 = fmul fast <2 x double> %strided.vec31, %strided.vec
|
|
%1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
|
|
%2 = fmul fast <2 x double> %strided.vec30, %strided.vec
|
|
%3 = fadd fast <2 x double> %2, %vec.phi27
|
|
%4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
|
|
%5 = fsub fast <2 x double> %3, %4
|
|
%6 = fadd fast <2 x double> %1, %vec.phi
|
|
%7 = fadd fast <2 x double> %6, %0
|
|
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
|
|
%8 = icmp eq i64 %lsr.iv.next, 1600
|
|
br i1 %8, label %middle.block, label %vector.body
|
|
|
|
middle.block: ; preds = %vector.body
|
|
; Reduce each accumulator to a scalar; the reduction of %5 becomes struct field
; 0,0 and the reduction of %7 becomes struct field 0,1.
%9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
|
|
%10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
|
|
%.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
|
|
%.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
|
|
ret %"struct.std::complex" %.fca.0.1.insert
|
|
}
|
|
|
|
; Fixed value initialized reduction
|
|
;
|
|
; complex<double> x = 2.0 + 1.0i;
|
|
; for (int i = 0; i < 100; ++i)
|
|
; x += a[i] * b[i];
|
|
;
|
|
; Same loop body as complex_mul_v2f64, but the accumulator phis start from
; <1.0, 0.0> and <2.0, 0.0> instead of zero. The expected assembly loads one
; accumulator register from a constant-pool entry (.LCPI1_0), zeroes the other
; with movi, and still uses fcmla at rotations #0 and #90 in the loop.
define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
|
|
; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
|
|
; CHECK: // %bb.0: // %entry
|
|
; CHECK-NEXT: movi v0.2d, #0000000000000000
|
|
; CHECK-NEXT: adrp x8, .LCPI1_0
|
|
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
|
|
; CHECK-NEXT: mov x8, xzr
|
|
; CHECK-NEXT: .LBB1_1: // %vector.body
|
|
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: add x9, x0, x8
|
|
; CHECK-NEXT: add x10, x1, x8
|
|
; CHECK-NEXT: add x8, x8, #32
|
|
; CHECK-NEXT: ldp q3, q2, [x9]
|
|
; CHECK-NEXT: cmp x8, #1600
|
|
; CHECK-NEXT: ldp q5, q4, [x10]
|
|
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0
|
|
; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0
|
|
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90
|
|
; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90
|
|
; CHECK-NEXT: b.ne .LBB1_1
|
|
; CHECK-NEXT: // %bb.2: // %middle.block
|
|
; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d
|
|
; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d
|
|
; CHECK-NEXT: faddp d0, v0.2d
|
|
; CHECK-NEXT: faddp d1, v2.2d
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %entry
|
|
; Byte offset into %a and %b; steps by 32 and the loop exits at 1600.
%lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
|
|
; %vec.phi (carries %7, returned via field 0,1) starts at <1.0, 0.0>;
; %vec.phi27 (carries %5, returned via field 0,0) starts at <2.0, 0.0>.
%vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %7, %vector.body ]
|
|
%vec.phi27 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %5, %vector.body ]
|
|
%scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
|
|
%scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
|
|
; Load four doubles from %a and split them into even lanes (0, 2) and odd
; lanes (1, 3).
%wide.vec = load <4 x double>, ptr %scevgep, align 8
|
|
%strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
; Same even/odd split for the four doubles loaded from %b.
%wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
|
|
%strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
; With a.even/a.odd and b.even/b.odd naming the split halves above:
;   %5 = %vec.phi27 + b.even * a.even - b.odd * a.odd
;   %7 = %vec.phi   + b.even * a.odd  + b.odd * a.even
%0 = fmul fast <2 x double> %strided.vec31, %strided.vec
|
|
%1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
|
|
%2 = fmul fast <2 x double> %strided.vec30, %strided.vec
|
|
%3 = fadd fast <2 x double> %2, %vec.phi27
|
|
%4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
|
|
%5 = fsub fast <2 x double> %3, %4
|
|
%6 = fadd fast <2 x double> %1, %vec.phi
|
|
%7 = fadd fast <2 x double> %6, %0
|
|
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
|
|
%8 = icmp eq i64 %lsr.iv.next, 1600
|
|
br i1 %8, label %middle.block, label %vector.body
|
|
|
|
middle.block: ; preds = %vector.body
|
|
; Reduce each accumulator to a scalar and pack the two results into the
; returned struct (%5 -> field 0,0; %7 -> field 0,1).
%9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
|
|
%10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
|
|
%.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
|
|
%.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
|
|
ret %"struct.std::complex" %.fca.0.1.insert
|
|
}
|
|
|
|
; Loop unrolled with factor 2
|
|
;
|
|
; Each iteration performs two <4 x double> loads from each input pointer and
; updates four accumulator phis; two of them start from non-zero constants
; (<1.0, 0.0> and <2.0, 0.0>) and two from zero. The expected assembly has four
; ldp and eight fcmla (rotations #0 and #90) in the loop, loads one accumulator
; register from a constant-pool entry (.LCPI2_0) and zeroes the other three with
; movi, then combines the accumulators with zip1/zip2, fadd and faddp.
define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
|
|
; CHECK-LABEL: complex_mul_v2f64_unrolled:
|
|
; CHECK: // %bb.0: // %entry
|
|
; CHECK-NEXT: movi v0.2d, #0000000000000000
|
|
; CHECK-NEXT: movi v1.2d, #0000000000000000
|
|
; CHECK-NEXT: adrp x8, .LCPI2_0
|
|
; CHECK-NEXT: movi v3.2d, #0000000000000000
|
|
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
|
|
; CHECK-NEXT: mov x8, xzr
|
|
; CHECK-NEXT: .LBB2_1: // %vector.body
|
|
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: add x9, x0, x8
|
|
; CHECK-NEXT: add x10, x1, x8
|
|
; CHECK-NEXT: add x8, x8, #64
|
|
; CHECK-NEXT: ldp q5, q4, [x9]
|
|
; CHECK-NEXT: cmp x8, #1600
|
|
; CHECK-NEXT: ldp q7, q6, [x10]
|
|
; CHECK-NEXT: ldp q17, q16, [x9, #32]
|
|
; CHECK-NEXT: ldp q19, q18, [x10, #32]
|
|
; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0
|
|
; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0
|
|
; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0
|
|
; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0
|
|
; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90
|
|
; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90
|
|
; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90
|
|
; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90
|
|
; CHECK-NEXT: b.ne .LBB2_1
|
|
; CHECK-NEXT: // %bb.2: // %middle.block
|
|
; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d
|
|
; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d
|
|
; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d
|
|
; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d
|
|
; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d
|
|
; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d
|
|
; CHECK-NEXT: faddp d0, v0.2d
|
|
; CHECK-NEXT: faddp d1, v1.2d
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
; The loop pointers start 32 bytes past %a and %b; the loop body reads at
; offsets -32 and 0 from them.
%scevgep = getelementptr i8, ptr %a, i64 32
|
|
%scevgep49 = getelementptr i8, ptr %b, i64 32
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %entry
|
|
; %lsr.iv54 counts down from 100 in steps of 4; the loop exits when it hits 0.
%lsr.iv54 = phi i64 [ %lsr.iv.next, %vector.body ], [ 100, %entry ]
|
|
; %lsr.iv50 walks %b and %lsr.iv walks %a, each advancing 64 bytes per iteration.
%lsr.iv50 = phi ptr [ %scevgep51, %vector.body ], [ %scevgep49, %entry ]
|
|
%lsr.iv = phi ptr [ %scevgep48, %vector.body ], [ %scevgep, %entry ]
|
|
; Accumulators: %vec.phi carries %14, %vec.phi27 carries %15, %vec.phi28
; carries %10 and %vec.phi29 carries %11.
%vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %14, %vector.body ]
|
|
%vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %15, %vector.body ]
|
|
%vec.phi28 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %10, %vector.body ]
|
|
%vec.phi29 = phi <2 x double> [ zeroinitializer, %entry ], [ %11, %vector.body ]
|
|
%scevgep52 = getelementptr i8, ptr %lsr.iv, i64 -32
|
|
%scevgep53 = getelementptr i8, ptr %lsr.iv50, i64 -32
|
|
; Two loads from the %a pointer, each split into even lanes (0, 2) and odd
; lanes (1, 3).
%wide.vec = load <4 x double>, ptr %scevgep52, align 8
|
|
%wide.vec30 = load <4 x double>, ptr %lsr.iv, align 8
|
|
%strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec31 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec32 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
%strided.vec33 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
; Two loads from the %b pointer, split the same way.
%wide.vec34 = load <4 x double>, ptr %scevgep53, align 8
|
|
%wide.vec35 = load <4 x double>, ptr %lsr.iv50, align 8
|
|
%strided.vec36 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec37 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 0, i32 2>
|
|
%strided.vec38 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
%strided.vec39 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 1, i32 3>
|
|
; Per load pair (first pair -> %10/%14, second pair -> %11/%15):
;   %10, %11 = phi + b.even * a.even - b.odd * a.odd
;   %14, %15 = phi + b.even * a.odd  + b.odd * a.even
%0 = fmul fast <2 x double> %strided.vec38, %strided.vec
|
|
%1 = fmul fast <2 x double> %strided.vec39, %strided.vec31
|
|
%2 = fmul fast <2 x double> %strided.vec36, %strided.vec32
|
|
%3 = fmul fast <2 x double> %strided.vec37, %strided.vec33
|
|
%4 = fmul fast <2 x double> %strided.vec36, %strided.vec
|
|
%5 = fmul fast <2 x double> %strided.vec37, %strided.vec31
|
|
%6 = fadd fast <2 x double> %4, %vec.phi28
|
|
%7 = fadd fast <2 x double> %5, %vec.phi29
|
|
%8 = fmul fast <2 x double> %strided.vec38, %strided.vec32
|
|
%9 = fmul fast <2 x double> %strided.vec39, %strided.vec33
|
|
%10 = fsub fast <2 x double> %6, %8
|
|
%11 = fsub fast <2 x double> %7, %9
|
|
%12 = fadd fast <2 x double> %2, %vec.phi
|
|
%13 = fadd fast <2 x double> %3, %vec.phi27
|
|
%14 = fadd fast <2 x double> %12, %0
|
|
%15 = fadd fast <2 x double> %13, %1
|
|
%scevgep48 = getelementptr i8, ptr %lsr.iv, i64 64
|
|
%scevgep51 = getelementptr i8, ptr %lsr.iv50, i64 64
|
|
%lsr.iv.next = add nsw i64 %lsr.iv54, -4
|
|
%16 = icmp eq i64 %lsr.iv.next, 0
|
|
br i1 %16, label %middle.block, label %vector.body
|
|
|
|
middle.block: ; preds = %vector.body
|
|
; Add the two accumulators of each kind together before the scalar reduction:
; %11 + %10 feeds struct field 0,0 and %15 + %14 feeds struct field 0,1.
%bin.rdx40 = fadd fast <2 x double> %11, %10
|
|
%17 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx40)
|
|
%bin.rdx = fadd fast <2 x double> %15, %14
|
|
%18 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx)
|
|
%.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %17, 0, 0
|
|
%.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %18, 0, 1
|
|
ret %"struct.std::complex" %.fca.0.1.insert
|
|
}
|
|
|
|
; The reduced bug from D153355. Shows that reduction was detected where it did not exist.
|
|
; The two phis below are fed on the back edge by constant-only shufflevector
; splats rather than by arithmetic on the phis themselves. The expected assembly
; is just the tbz loop on w0 followed by ret, with no vector instructions.
define void @incorrect_reduction_pattern(i1 %exitcond.not) {
|
|
; CHECK-LABEL: incorrect_reduction_pattern:
|
|
; CHECK: // %bb.0: // %entry
|
|
; CHECK-NEXT: .LBB3_1: // %for.body
|
|
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: tbz w0, #0, .LBB3_1
|
|
; CHECK-NEXT: // %bb.2: // %for.end.loopexit
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
br label %for.body
|
|
|
|
for.body: ; preds = %for.body, %entry
|
|
%vec_r = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_r, %for.body ]
|
|
%vec_i = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_i, %for.body ]
|
|
%add = fadd <4 x float> %vec_r, %vec_i
|
|
; %lane_r and %lane_i are identical: an all-zero mask selects lane 0 of the
; first operand (1.0) for every result lane.
%lane_r = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
|
|
%lane_i = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
|
|
; The loop condition is the function argument itself, so it never changes
; between iterations.
br i1 %exitcond.not, label %for.end.loopexit, label %for.body
|
|
|
|
for.end.loopexit: ; preds = %for.body
|
|
; %mul.r and %mul.i have no users; the function returns void.
%mul.r = fadd <4 x float> %lane_r, %add
|
|
%mul.i = fadd <4 x float> %lane_i, %add
|
|
ret void
|
|
}
|
|
|
|
; Floating-point add reduction intrinsic called from the middle.block of each
; complex_mul_* function; every call in this file passes -0.0 as the scalar
; (first) operand.
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
|