Files
clang-p2996/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
Jay Foad e0919b189b [CodeGen] Renumber slot indexes before register allocation (#66334)
RegAllocGreedy uses SlotIndexes::getApproxInstrDistance to approximate
the length of a live range for its heuristics. Renumbering all slot
indexes with the default instruction distance ensures that this estimate
will be as accurate as possible, and will not depend on the history of
how instructions have been added to and removed from SlotIndexes's maps.

This also means that enabling -early-live-intervals, which runs the
SlotIndexes analysis earlier, will not cause large amounts of churn due
to different register allocator decisions.
2023-09-19 11:18:12 +01:00

267 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
; Code generation test for vectorized complex multiply-accumulate reduction
; loops on AArch64 with the complxnum and neon features enabled. The expected
; assembly for the reduction loops uses fcmla on the accumulator registers.
target triple = "aarch64"
; Return type of the reduction functions: a struct wrapping a pair of doubles.
; The functions below fill field {0, 0} and field {0, 1} with insertvalue.
%"struct.std::complex" = type { { double, double } }
; Zero initialized reduction
;
; complex<double> x = 0.0 + 0.0i;
; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
;
; The IR below is the already-vectorized form of that loop: every iteration
; loads four doubles (32 bytes) from each of %a and %b, splits them into
; even-indexed and odd-indexed elements with shufflevector, and updates two
; <2 x double> accumulators that both start at zero. The expected assembly
; keeps the data interleaved and performs the update with fcmla #0 / fcmla #90
; pairs, then separates the lanes with zip1/zip2 before the final faddp.
define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: add x10, x1, x8
; CHECK-NEXT: add x8, x8, #32
; CHECK-NEXT: ldp q3, q2, [x9]
; CHECK-NEXT: cmp x8, #1600
; CHECK-NEXT: ldp q5, q4, [x10]
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0
; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0
; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90
; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %middle.block
; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d
; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: faddp d1, v2.2d
; CHECK-NEXT: ret
entry:
br label %vector.body
vector.body: ; preds = %vector.body, %entry
; %lsr.iv is a byte offset applied to both %a and %b; it advances by 32 per
; iteration.
%lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
; Accumulator whose lane sum is returned in field {0, 1} (the imaginary part
; in terms of the pseudo code above).
%vec.phi = phi <2 x double> [ zeroinitializer, %entry ], [ %7, %vector.body ]
; Accumulator whose lane sum is returned in field {0, 0} (the real part in
; terms of the pseudo code above).
%vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %5, %vector.body ]
%scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
%scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
; Deinterleave the data from %a: elements 0 and 2, then elements 1 and 3.
%wide.vec = load <4 x double>, ptr %scevgep, align 8
%strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; Deinterleave the data from %b in the same way.
%wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
%strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; The four partial products of the complex multiplication, folded into the
; accumulators:
;   %5 = %vec.phi27 + b.even * a.even - b.odd * a.odd
;   %7 = %vec.phi   + b.even * a.odd  + b.odd * a.even
%0 = fmul fast <2 x double> %strided.vec31, %strided.vec
%1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
%2 = fmul fast <2 x double> %strided.vec30, %strided.vec
%3 = fadd fast <2 x double> %2, %vec.phi27
%4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
%5 = fsub fast <2 x double> %3, %4
%6 = fadd fast <2 x double> %1, %vec.phi
%7 = fadd fast <2 x double> %6, %0
; Leave the loop once the byte offset reaches 1600, i.e. after 50 iterations
; of 32 bytes each.
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
%8 = icmp eq i64 %lsr.iv.next, 1600
br i1 %8, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Sum the two lanes of each accumulator and pack the two scalars into the
; returned struct.
%9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
%10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
%.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
%.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
ret %"struct.std::complex" %.fca.0.1.insert
}
; Fixed value initialized reduction
;
; complex<double> x = 2.0 + 1.0i;
; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
;
; Same loop body as the zero-initialized case, but the accumulator phis start
; from constants: 2.0 in lane 0 of %vec.phi27 and 1.0 in lane 0 of %vec.phi,
; with lane 1 of both set to zero. In the expected assembly one accumulator
; register is loaded from a constant pool entry and the other is zeroed with
; movi.
define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: add x10, x1, x8
; CHECK-NEXT: add x8, x8, #32
; CHECK-NEXT: ldp q3, q2, [x9]
; CHECK-NEXT: cmp x8, #1600
; CHECK-NEXT: ldp q5, q4, [x10]
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0
; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0
; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90
; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %middle.block
; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d
; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: faddp d1, v2.2d
; CHECK-NEXT: ret
entry:
br label %vector.body
vector.body: ; preds = %vector.body, %entry
; Byte offset into %a and %b, advanced by 32 per iteration.
%lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
; Accumulator returned in field {0, 1}; seeded with 1.0 in lane 0 only, so the
; lane sum in middle.block includes the initial value exactly once.
%vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %7, %vector.body ]
; Accumulator returned in field {0, 0}; seeded with 2.0 in lane 0 only.
%vec.phi27 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %5, %vector.body ]
%scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
%scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
; Deinterleave the data from %a: elements 0 and 2, then elements 1 and 3.
%wide.vec = load <4 x double>, ptr %scevgep, align 8
%strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; Deinterleave the data from %b in the same way.
%wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
%strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; Accumulator updates:
;   %5 = %vec.phi27 + b.even * a.even - b.odd * a.odd
;   %7 = %vec.phi   + b.even * a.odd  + b.odd * a.even
%0 = fmul fast <2 x double> %strided.vec31, %strided.vec
%1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
%2 = fmul fast <2 x double> %strided.vec30, %strided.vec
%3 = fadd fast <2 x double> %2, %vec.phi27
%4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
%5 = fsub fast <2 x double> %3, %4
%6 = fadd fast <2 x double> %1, %vec.phi
%7 = fadd fast <2 x double> %6, %0
; Leave the loop once the byte offset reaches 1600 (50 iterations of 32 bytes).
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
%8 = icmp eq i64 %lsr.iv.next, 1600
br i1 %8, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Sum the two lanes of each accumulator and pack the two scalars into the
; returned struct.
%9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
%10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
%.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
%.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
ret %"struct.std::complex" %.fca.0.1.insert
}
; Loop unrolled with factor 2
;
; The same multiply-accumulate reduction, with two <4 x double> loads from
; each input per iteration and two independent pairs of accumulators. The
; first pair (%vec.phi28 / %vec.phi) is seeded with 2.0 and 1.0 in lane 0, the
; second pair (%vec.phi29 / %vec.phi27) with zero. The pairs are added
; together in middle.block before the lane sums are taken. The expected
; assembly uses four accumulator registers, each updated by an fcmla #0 /
; fcmla #90 pair.
define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64_unrolled:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: add x10, x1, x8
; CHECK-NEXT: add x8, x8, #64
; CHECK-NEXT: ldp q5, q4, [x9]
; CHECK-NEXT: cmp x8, #1600
; CHECK-NEXT: ldp q7, q6, [x10]
; CHECK-NEXT: ldp q17, q16, [x9, #32]
; CHECK-NEXT: ldp q19, q18, [x10, #32]
; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0
; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0
; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0
; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0
; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90
; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90
; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90
; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %middle.block
; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d
; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d
; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d
; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d
; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d
; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: faddp d1, v1.2d
; CHECK-NEXT: ret
entry:
; The loop pointers are biased by +32 bytes; the first load of each iteration
; reads at offset -32 and the second at offset 0.
%scevgep = getelementptr i8, ptr %a, i64 32
%scevgep49 = getelementptr i8, ptr %b, i64 32
br label %vector.body
vector.body: ; preds = %vector.body, %entry
; Down-counter: starts at 100 and drops by 4 per iteration until it hits 0
; (25 iterations).
%lsr.iv54 = phi i64 [ %lsr.iv.next, %vector.body ], [ 100, %entry ]
; Running pointers into %b and %a, each advanced by 64 bytes per iteration.
%lsr.iv50 = phi ptr [ %scevgep51, %vector.body ], [ %scevgep49, %entry ]
%lsr.iv = phi ptr [ %scevgep48, %vector.body ], [ %scevgep, %entry ]
; Accumulators feeding field {0, 1}: %vec.phi (first half, seeded with 1.0 in
; lane 0) and %vec.phi27 (second half, zero).
%vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %14, %vector.body ]
%vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %15, %vector.body ]
; Accumulators feeding field {0, 0}: %vec.phi28 (first half, seeded with 2.0
; in lane 0) and %vec.phi29 (second half, zero).
%vec.phi28 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %10, %vector.body ]
%vec.phi29 = phi <2 x double> [ zeroinitializer, %entry ], [ %11, %vector.body ]
%scevgep52 = getelementptr i8, ptr %lsr.iv, i64 -32
%scevgep53 = getelementptr i8, ptr %lsr.iv50, i64 -32
; Two consecutive 32-byte loads from %a, each deinterleaved into elements
; (0, 2) and elements (1, 3).
%wide.vec = load <4 x double>, ptr %scevgep52, align 8
%wide.vec30 = load <4 x double>, ptr %lsr.iv, align 8
%strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec31 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec32 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
%strided.vec33 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; The matching two loads from %b, deinterleaved the same way.
%wide.vec34 = load <4 x double>, ptr %scevgep53, align 8
%wide.vec35 = load <4 x double>, ptr %lsr.iv50, align 8
%strided.vec36 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec37 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 0, i32 2>
%strided.vec38 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 1, i32 3>
%strided.vec39 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; Partial products for both halves, interleaved pairwise (even-numbered
; values belong to the first half, odd-numbered values to the second half).
%0 = fmul fast <2 x double> %strided.vec38, %strided.vec
%1 = fmul fast <2 x double> %strided.vec39, %strided.vec31
%2 = fmul fast <2 x double> %strided.vec36, %strided.vec32
%3 = fmul fast <2 x double> %strided.vec37, %strided.vec33
%4 = fmul fast <2 x double> %strided.vec36, %strided.vec
%5 = fmul fast <2 x double> %strided.vec37, %strided.vec31
%6 = fadd fast <2 x double> %4, %vec.phi28
%7 = fadd fast <2 x double> %5, %vec.phi29
%8 = fmul fast <2 x double> %strided.vec38, %strided.vec32
%9 = fmul fast <2 x double> %strided.vec39, %strided.vec33
; %10 / %11: acc + b.even * a.even - b.odd * a.odd for each half.
%10 = fsub fast <2 x double> %6, %8
%11 = fsub fast <2 x double> %7, %9
%12 = fadd fast <2 x double> %2, %vec.phi
%13 = fadd fast <2 x double> %3, %vec.phi27
; %14 / %15: acc + b.even * a.odd + b.odd * a.even for each half.
%14 = fadd fast <2 x double> %12, %0
%15 = fadd fast <2 x double> %13, %1
%scevgep48 = getelementptr i8, ptr %lsr.iv, i64 64
%scevgep51 = getelementptr i8, ptr %lsr.iv50, i64 64
%lsr.iv.next = add nsw i64 %lsr.iv54, -4
%16 = icmp eq i64 %lsr.iv.next, 0
br i1 %16, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; Merge the two halves of each accumulator pair, then sum the lanes and pack
; the two scalars into the returned struct.
%bin.rdx40 = fadd fast <2 x double> %11, %10
%17 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx40)
%bin.rdx = fadd fast <2 x double> %15, %14
%18 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx)
%.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %17, 0, 0
%.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %18, 0, 1
ret %"struct.std::complex" %.fca.0.1.insert
}
; Reduced test case for the bug from D153355, where a reduction was detected
; even though none exists. The two phis are fed by shufflevector splats of a
; constant that do not depend on the phis themselves, and none of the vector
; results is used by the return, so the expected assembly contains only the
; loop branch and the return.
define void @incorrect_reduction_pattern(i1 %exitcond.not) {
; CHECK-LABEL: incorrect_reduction_pattern:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: .LBB3_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: tbz w0, #0, .LBB3_1
; CHECK-NEXT: // %bb.2: // %for.end.loopexit
; CHECK-NEXT: ret
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
; Loop-carried phis whose back-edge values (%lane_r / %lane_i) are computed
; from constants only, not from the phis.
%vec_r = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_r, %for.body ]
%vec_i = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_i, %for.body ]
%add = fadd <4 x float> %vec_r, %vec_i
; Splat of element 0 (1.0) of a constant vector.
%lane_r = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
%lane_i = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
; The loop exits when the i1 argument is true.
br i1 %exitcond.not, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
; Both sums are computed but never used.
%mul.r = fadd <4 x float> %lane_r, %add
%mul.i = fadd <4 x float> %lane_i, %add
ret void
}
; Floating-point add reduction intrinsic used by the middle.block of each
; reduction test in this file to sum the two lanes of an accumulator vector.
; Every call in this file passes -0.0 as the scalar operand and the accumulator
; as the vector operand.
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)