; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2

; Every function below builds a pointer-difference ("alias") predicate of the
; form
;
;   byte.diff = ptrtoint(src) - ptrtoint(%c)
;   diff      = byte.diff sdiv <element size>        (omitted for 1-byte elements)
;   mask      = get.active.lane.mask(0, diff) | splat(byte.diff < -(<element size> - 1))
;
; The first llc invocation (+sve2) expects that pattern to be selected as a
; single WHILEWR for 1/2/4/8-byte elements. The second invocation (+sve only)
; expects the expanded arithmetic + WHILELO sequence instead.
;
; The assertion lines are generated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.

; 1-byte elements: <vscale x 16 x i1> mask, no division, compare against 0.
define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    whilewr p0.b, x1, x2
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_8:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
; CHECK-NOSVE2-NEXT:    cmp x8, #0
; CHECK-NOSVE2-NEXT:    cset w9, lt
; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x8
; CHECK-NOSVE2-NEXT:    sbfx x8, x9, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x8
; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NOSVE2-NEXT:    ret
entry:
  %c14 = ptrtoint ptr %c to i64
  %b15 = ptrtoint ptr %b to i64
  %sub.diff = sub i64 %b15, %c14
  %neg.compare = icmp slt i64 %sub.diff, 0
  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
  ret <vscale x 16 x i1> %active.lane.mask.alias
}

; Same as whilewr_8 but with the operands of the final 'or' swapped
; (splat first, lane mask second).
define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_commutative:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    whilewr p0.b, x1, x2
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_commutative:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
; CHECK-NOSVE2-NEXT:    cmp x8, #0
; CHECK-NOSVE2-NEXT:    cset w9, lt
; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x8
; CHECK-NOSVE2-NEXT:    sbfx x8, x9, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x8
; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
; CHECK-NOSVE2-NEXT:    ret
entry:
  %c14 = ptrtoint ptr %c to i64
  %b15 = ptrtoint ptr %b to i64
  %sub.diff = sub i64 %b15, %c14
  %neg.compare = icmp slt i64 %sub.diff, 0
  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
  %active.lane.mask.alias = or <vscale x 16 x i1> %.splat, %ptr.diff.lane.mask
  ret <vscale x 16 x i1> %active.lane.mask.alias
}

; 2-byte elements: <vscale x 8 x i1> mask, sdiv by 2, compare against -1.
define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    whilewr p0.h, x1, x2
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_16:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
; CHECK-NOSVE2-NEXT:    cmn x8, #1
; CHECK-NOSVE2-NEXT:    add x8, x8, x8, lsr #63
; CHECK-NOSVE2-NEXT:    cset w9, lt
; CHECK-NOSVE2-NEXT:    sbfx x9, x9, #0, #1
; CHECK-NOSVE2-NEXT:    asr x8, x8, #1
; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x9
; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x8
; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
; CHECK-NOSVE2-NEXT:    ret
entry:
  %b14 = ptrtoint ptr %b to i64
  %c15 = ptrtoint ptr %c to i64
  %sub.diff = sub i64 %b14, %c15
  %diff = sdiv i64 %sub.diff, 2
  %neg.compare = icmp slt i64 %sub.diff, -1
  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
  ret <vscale x 8 x i1> %active.lane.mask.alias
}

; 4-byte elements: <vscale x 4 x i1> mask, sdiv by 4, compare against -3.
define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    whilewr p0.s, x1, x2
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_32:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
; CHECK-NOSVE2-NEXT:    add x9, x8, #3
; CHECK-NOSVE2-NEXT:    cmp x8, #0
; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
; CHECK-NOSVE2-NEXT:    cmn x8, #3
; CHECK-NOSVE2-NEXT:    cset w8, lt
; CHECK-NOSVE2-NEXT:    asr x9, x9, #2
; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x9
; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x8
; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
; CHECK-NOSVE2-NEXT:    ret
entry:
  %b12 = ptrtoint ptr %b to i64
  %c13 = ptrtoint ptr %c to i64
  %sub.diff = sub i64 %b12, %c13
  %diff = sdiv i64 %sub.diff, 4
  %neg.compare = icmp slt i64 %sub.diff, -3
  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
  ret <vscale x 4 x i1> %active.lane.mask.alias
}

; 8-byte elements: <vscale x 2 x i1> mask, sdiv by 8, compare against -7.
define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    whilewr p0.d, x1, x2
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_64:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
; CHECK-NOSVE2-NEXT:    add x9, x8, #7
; CHECK-NOSVE2-NEXT:    cmp x8, #0
; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
; CHECK-NOSVE2-NEXT:    cmn x8, #7
; CHECK-NOSVE2-NEXT:    cset w8, lt
; CHECK-NOSVE2-NEXT:    asr x9, x9, #3
; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x9
; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x8
; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
; CHECK-NOSVE2-NEXT:    ret
entry:
  %b12 = ptrtoint ptr %b to i64
  %c13 = ptrtoint ptr %c to i64
  %sub.diff = sub i64 %b12, %c13
  %diff = sdiv i64 %sub.diff, 8
  %neg.compare = icmp slt i64 %sub.diff, -7
  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
  ret <vscale x 2 x i1> %active.lane.mask.alias
}

; Negative test: 16-byte elements (<vscale x 1 x i1> mask, sdiv by 16, compare
; against -15). Neither invocation is expected to produce WHILEWR; both expect
; the same expanded sequence.
define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: no_whilewr_128:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub x8, x1, x2
; CHECK-NEXT:    index z0.d, #0, #1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    add x9, x8, #15
; CHECK-NEXT:    cmp x8, #0
; CHECK-NEXT:    csel x9, x9, x8, lt
; CHECK-NEXT:    cmn x8, #15
; CHECK-NEXT:    asr x9, x9, #4
; CHECK-NEXT:    cset w8, lt
; CHECK-NEXT:    sbfx x8, x8, #0, #1
; CHECK-NEXT:    mov z1.d, x9
; CHECK-NEXT:    whilelo p1.d, xzr, x8
; CHECK-NEXT:    cmphi p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT:    punpklo p1.h, p1.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: no_whilewr_128:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    sub x8, x1, x2
; CHECK-NOSVE2-NEXT:    index z0.d, #0, #1
; CHECK-NOSVE2-NEXT:    ptrue p0.d
; CHECK-NOSVE2-NEXT:    add x9, x8, #15
; CHECK-NOSVE2-NEXT:    cmp x8, #0
; CHECK-NOSVE2-NEXT:    csel x9, x9, x8, lt
; CHECK-NOSVE2-NEXT:    cmn x8, #15
; CHECK-NOSVE2-NEXT:    asr x9, x9, #4
; CHECK-NOSVE2-NEXT:    cset w8, lt
; CHECK-NOSVE2-NEXT:    sbfx x8, x8, #0, #1
; CHECK-NOSVE2-NEXT:    mov z1.d, x9
; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x8
; CHECK-NOSVE2-NEXT:    cmphi p0.d, p0/z, z1.d, z0.d
; CHECK-NOSVE2-NEXT:    punpklo p1.h, p1.b
; CHECK-NOSVE2-NEXT:    punpklo p0.h, p0.b
; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NOSVE2-NEXT:    ret
entry:
  %b12 = ptrtoint ptr %b to i64
  %c13 = ptrtoint ptr %c to i64
  %sub.diff = sub i64 %b12, %c13
  %diff = sdiv i64 %sub.diff, 16
  %neg.compare = icmp slt i64 %sub.diff, -15
  %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
  ret <vscale x 1 x i1> %active.lane.mask.alias
}

; Predicated i8 loop c[i] = b[i] + a[i]. The alias mask (%b vs %c) is ANDed
; into the loop predicate on every iteration, and the induction variable
; advances by the number of active lanes in the alias mask (zext + reduce.add).
define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB6_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    whilewr p0.b, x1, x2
; CHECK-NEXT:    mov w9, w3
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    whilelo p1.b, xzr, x9
; CHECK-NEXT:    cntp x10, p0, p0.b
; CHECK-NEXT:    and x10, x10, #0xff
; CHECK-NEXT:  .LBB6_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
; CHECK-NEXT:    add z0.b, z1.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p1, [x2, x8]
; CHECK-NEXT:    add x8, x8, x10
; CHECK-NEXT:    whilelo p1.b, x8, x9
; CHECK-NEXT:    b.mi .LBB6_2
; CHECK-NEXT:  .LBB6_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_8:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    cmp w3, #1
; CHECK-NOSVE2-NEXT:    b.lt .LBB6_3
; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
; CHECK-NOSVE2-NEXT:    mov x8, xzr
; CHECK-NOSVE2-NEXT:    cmp x9, #0
; CHECK-NOSVE2-NEXT:    cset w10, lt
; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x9
; CHECK-NOSVE2-NEXT:    sbfx x9, x10, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
; CHECK-NOSVE2-NEXT:    mov w9, w3
; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.b
; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
; CHECK-NOSVE2-NEXT:  .LBB6_2: // %vector.body
; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; CHECK-NOSVE2-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
; CHECK-NOSVE2-NEXT:    add z0.b, z1.b, z0.b
; CHECK-NOSVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
; CHECK-NOSVE2-NEXT:    add x8, x8, x10
; CHECK-NOSVE2-NEXT:    whilelo p1.b, x8, x9
; CHECK-NOSVE2-NEXT:    b.mi .LBB6_2
; CHECK-NOSVE2-NEXT:  .LBB6_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT:    ret
entry:
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %c14 = ptrtoint ptr %c to i64
  %b15 = ptrtoint ptr %b to i64
  %wide.trip.count = zext nneg i32 %n to i64
  %sub.diff = sub i64 %b15, %c14
  %neg.compare = icmp slt i64 %sub.diff, 0
  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
  %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
  %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
  %2 = zext i8 %1 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
  %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
  %4 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
  %5 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
  %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
  %7 = getelementptr inbounds i8, ptr %c, i64 %index
  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
  %index.next = add i64 %index, %2
  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
  %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
  br i1 %8, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Predicated i16 loop. Here the alias mask is ANDed into the entry predicate
; once in the preheader, and the induction variable advances by vscale * 8.
define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB7_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    mov w8, w3
; CHECK-NEXT:    whilewr p1.h, x1, x2
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    whilelo p0.h, xzr, x8
; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT:  .LBB7_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
; CHECK-NEXT:    add z0.h, z1.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x2, x9, lsl #1]
; CHECK-NEXT:    inch x9
; CHECK-NEXT:    whilelo p0.h, x9, x8
; CHECK-NEXT:    b.mi .LBB7_2
; CHECK-NEXT:  .LBB7_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_16:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    cmp w3, #1
; CHECK-NOSVE2-NEXT:    b.lt .LBB7_3
; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT:    mov w9, w3
; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
; CHECK-NOSVE2-NEXT:    mov x8, xzr
; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x9
; CHECK-NOSVE2-NEXT:    cmn x10, #1
; CHECK-NOSVE2-NEXT:    add x10, x10, x10, lsr #63
; CHECK-NOSVE2-NEXT:    cset w11, lt
; CHECK-NOSVE2-NEXT:    sbfx x11, x11, #0, #1
; CHECK-NOSVE2-NEXT:    asr x10, x10, #1
; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x11
; CHECK-NOSVE2-NEXT:    whilelo p2.h, xzr, x10
; CHECK-NOSVE2-NEXT:    cnth x10
; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT:  .LBB7_2: // %vector.body
; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NOSVE2-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; CHECK-NOSVE2-NEXT:    add z0.h, z1.h, z0.h
; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; CHECK-NOSVE2-NEXT:    add x8, x8, x10
; CHECK-NOSVE2-NEXT:    whilelo p0.h, x8, x9
; CHECK-NOSVE2-NEXT:    b.mi .LBB7_2
; CHECK-NOSVE2-NEXT:  .LBB7_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT:    ret
entry:
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %b14 = ptrtoint ptr %b to i64
  %c15 = ptrtoint ptr %c to i64
  %wide.trip.count = zext nneg i32 %n to i64
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 3
  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
  %sub.diff = sub i64 %b14, %c15
  %diff = sdiv i64 %sub.diff, 2
  %neg.compare = icmp slt i64 %sub.diff, -1
  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
  %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
  %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
  %3 = getelementptr inbounds i16, ptr %a, i64 %index
  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
  %4 = getelementptr inbounds i16, ptr %b, i64 %index
  %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
  %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
  %6 = getelementptr inbounds i16, ptr %c, i64 %index
  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
  %index.next = add i64 %index, %1
  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
  %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
  br i1 %7, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Predicated i32 loop; same shape as whilewr_loop_16 with a vscale * 4 step.
define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB8_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    mov w8, w3
; CHECK-NEXT:    whilewr p1.s, x1, x2
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    whilelo p0.s, xzr, x8
; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT:  .LBB8_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
; CHECK-NEXT:    add z0.s, z1.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x2, x9, lsl #2]
; CHECK-NEXT:    incw x9
; CHECK-NEXT:    whilelo p0.s, x9, x8
; CHECK-NEXT:    b.mi .LBB8_2
; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_32:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    cmp w3, #1
; CHECK-NOSVE2-NEXT:    b.lt .LBB8_3
; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT:    mov w9, w3
; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
; CHECK-NOSVE2-NEXT:    mov x8, xzr
; CHECK-NOSVE2-NEXT:    whilelo p0.s, xzr, x9
; CHECK-NOSVE2-NEXT:    add x11, x10, #3
; CHECK-NOSVE2-NEXT:    cmp x10, #0
; CHECK-NOSVE2-NEXT:    csel x11, x11, x10, lt
; CHECK-NOSVE2-NEXT:    cmn x10, #3
; CHECK-NOSVE2-NEXT:    cset w10, lt
; CHECK-NOSVE2-NEXT:    asr x11, x11, #2
; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p2.s, xzr, x11
; CHECK-NOSVE2-NEXT:    whilelo p1.s, xzr, x10
; CHECK-NOSVE2-NEXT:    cntw x10
; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT:  .LBB8_2: // %vector.body
; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NOSVE2-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; CHECK-NOSVE2-NEXT:    add z0.s, z1.s, z0.s
; CHECK-NOSVE2-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; CHECK-NOSVE2-NEXT:    add x8, x8, x10
; CHECK-NOSVE2-NEXT:    whilelo p0.s, x8, x9
; CHECK-NOSVE2-NEXT:    b.mi .LBB8_2
; CHECK-NOSVE2-NEXT:  .LBB8_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT:    ret
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %b12 = ptrtoint ptr %b to i64
  %c13 = ptrtoint ptr %c to i64
  %wide.trip.count = zext nneg i32 %n to i64
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 2
  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
  %sub.diff = sub i64 %b12, %c13
  %diff = sdiv i64 %sub.diff, 4
  %neg.compare = icmp slt i64 %sub.diff, -3
  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
  %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
  %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
  %3 = getelementptr inbounds i32, ptr %a, i64 %index
  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
  %4 = getelementptr inbounds i32, ptr %b, i64 %index
  %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
  %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i32, ptr %c, i64 %index
  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
  %index.next = add i64 %index, %1
  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
  %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
  br i1 %7, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Predicated i64 loop; same shape as whilewr_loop_16 with a vscale * 2 step.
define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB9_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    mov w8, w3
; CHECK-NEXT:    whilewr p1.d, x1, x2
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    whilelo p0.d, xzr, x8
; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT:  .LBB9_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x2, x9, lsl #3]
; CHECK-NEXT:    incd x9
; CHECK-NEXT:    whilelo p0.d, x9, x8
; CHECK-NEXT:    b.mi .LBB9_2
; CHECK-NEXT:  .LBB9_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_64:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    cmp w3, #1
; CHECK-NOSVE2-NEXT:    b.lt .LBB9_3
; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT:    mov w9, w3
; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
; CHECK-NOSVE2-NEXT:    mov x8, xzr
; CHECK-NOSVE2-NEXT:    whilelo p0.d, xzr, x9
; CHECK-NOSVE2-NEXT:    add x11, x10, #7
; CHECK-NOSVE2-NEXT:    cmp x10, #0
; CHECK-NOSVE2-NEXT:    csel x11, x11, x10, lt
; CHECK-NOSVE2-NEXT:    cmn x10, #7
; CHECK-NOSVE2-NEXT:    cset w10, lt
; CHECK-NOSVE2-NEXT:    asr x11, x11, #3
; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p2.d, xzr, x11
; CHECK-NOSVE2-NEXT:    whilelo p1.d, xzr, x10
; CHECK-NOSVE2-NEXT:    cntd x10
; CHECK-NOSVE2-NEXT:    mov p1.b, p2/m, p2.b
; CHECK-NOSVE2-NEXT:    and p0.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT:  .LBB9_2: // %vector.body
; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NOSVE2-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; CHECK-NOSVE2-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NOSVE2-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; CHECK-NOSVE2-NEXT:    add x8, x8, x10
; CHECK-NOSVE2-NEXT:    whilelo p0.d, x8, x9
; CHECK-NOSVE2-NEXT:    b.mi .LBB9_2
; CHECK-NOSVE2-NEXT:  .LBB9_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT:    ret
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %b12 = ptrtoint ptr %b to i64
  %c13 = ptrtoint ptr %c to i64
  %wide.trip.count = zext nneg i32 %n to i64
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
  %sub.diff = sub i64 %b12, %c13
  %diff = sdiv i64 %sub.diff, 8
  %neg.compare = icmp slt i64 %sub.diff, -7
  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
  %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
  %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
  %3 = getelementptr inbounds i64, ptr %a, i64 %index
  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
  %4 = getelementptr inbounds i64, ptr %b, i64 %index
  %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
  %5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i64, ptr %c, i64 %index
  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
  %index.next = add i64 %index, %1
  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
  %7 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
  br i1 %7, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; i8 loop with no 'noalias' on %a: two alias masks are built (%a vs %c and
; %b vs %c) and ANDed together before being combined with the loop predicate.
define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_multiple_8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB10_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    whilewr p0.b, x0, x2
; CHECK-NEXT:    mov w9, w3
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    whilewr p1.b, x1, x2
; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
; CHECK-NEXT:    whilelo p1.b, xzr, x9
; CHECK-NEXT:    cntp x10, p0, p0.b
; CHECK-NEXT:    and x10, x10, #0xff
; CHECK-NEXT:  .LBB10_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
; CHECK-NEXT:    add z0.b, z1.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p1, [x2, x8]
; CHECK-NEXT:    add x8, x8, x10
; CHECK-NEXT:    whilelo p1.b, x8, x9
; CHECK-NEXT:    b.mi .LBB10_2
; CHECK-NEXT:  .LBB10_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    cmp w3, #1
; CHECK-NOSVE2-NEXT:    b.lt .LBB10_3
; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
; CHECK-NOSVE2-NEXT:    mov x8, xzr
; CHECK-NOSVE2-NEXT:    cmp x9, #0
; CHECK-NOSVE2-NEXT:    cset w10, lt
; CHECK-NOSVE2-NEXT:    whilelo p0.b, xzr, x9
; CHECK-NOSVE2-NEXT:    sub x9, x1, x2
; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x10
; CHECK-NOSVE2-NEXT:    cmp x9, #0
; CHECK-NOSVE2-NEXT:    cset w10, lt
; CHECK-NOSVE2-NEXT:    whilelo p3.b, xzr, x9
; CHECK-NOSVE2-NEXT:    mov w9, w3
; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
; CHECK-NOSVE2-NEXT:    sel p0.b, p0, p0.b, p1.b
; CHECK-NOSVE2-NEXT:    whilelo p2.b, xzr, x10
; CHECK-NOSVE2-NEXT:    sel p1.b, p3, p3.b, p2.b
; CHECK-NOSVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
; CHECK-NOSVE2-NEXT:    whilelo p1.b, xzr, x9
; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.b
; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
; CHECK-NOSVE2-NEXT:  .LBB10_2: // %vector.body
; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; CHECK-NOSVE2-NEXT:    ld1b { z1.b }, p1/z, [x1, x8]
; CHECK-NOSVE2-NEXT:    add z0.b, z1.b, z0.b
; CHECK-NOSVE2-NEXT:    st1b { z0.b }, p1, [x2, x8]
; CHECK-NOSVE2-NEXT:    add x8, x8, x10
; CHECK-NOSVE2-NEXT:    whilelo p1.b, x8, x9
; CHECK-NOSVE2-NEXT:    b.mi .LBB10_2
; CHECK-NOSVE2-NEXT:  .LBB10_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT:    ret
entry:
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %c14 = ptrtoint ptr %c to i64
  %a15 = ptrtoint ptr %a to i64
  %b16 = ptrtoint ptr %b to i64
  %wide.trip.count = zext nneg i32 %n to i64
  %sub.diff = sub i64 %a15, %c14
  %neg.compare = icmp slt i64 %sub.diff, 0
  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
  %sub.diff18 = sub i64 %b16, %c14
  %neg.compare20 = icmp slt i64 %sub.diff18, 0
  %.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
  %.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
  %ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
  %active.lane.mask.alias24 = or <vscale x 16 x i1> %ptr.diff.lane.mask23, %.splat22
  %0 = and <vscale x 16 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
  %1 = zext <vscale x 16 x i1> %0 to <vscale x 16 x i8>
  %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %1)
  %3 = zext i8 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
  %4 = and <vscale x 16 x i1> %active.lane.mask, %0
  %5 = getelementptr inbounds i8, ptr %a, i64 %index
  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
  %6 = getelementptr inbounds i8, ptr %b, i64 %index
  %wide.masked.load25 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
  %7 = add <vscale x 16 x i8> %wide.masked.load25, %wide.masked.load
  %8 = getelementptr inbounds i8, ptr %c, i64 %index
  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %7, ptr %8, i32 1, <vscale x 16 x i1> %4)
  %index.next = add i64 %index, %3
  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
  %9 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; i16 variant of whilewr_loop_multiple_8: two alias masks (%a vs %c, %b vs %c)
; over <vscale x 8 x i1>, each using sdiv by 2 and a compare against -1.
define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_multiple_16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB11_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    whilewr p0.h, x0, x2
; CHECK-NEXT:    mov w9, w3
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    whilewr p1.h, x1, x2
; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
; CHECK-NEXT:    whilelo p1.h, xzr, x9
; CHECK-NEXT:    cntp x10, p0, p0.h
; CHECK-NEXT:    and x10, x10, #0xff
; CHECK-NEXT:  .LBB11_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
; CHECK-NEXT:    add z0.h, z1.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
; CHECK-NEXT:    add x8, x8, x10
; CHECK-NEXT:    whilelo p1.h, x8, x9
; CHECK-NEXT:    b.mi .LBB11_2
; CHECK-NEXT:  .LBB11_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    cmp w3, #1
; CHECK-NOSVE2-NEXT:    b.lt .LBB11_3
; CHECK-NOSVE2-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT:    sub x9, x0, x2
; CHECK-NOSVE2-NEXT:    mov x8, xzr
; CHECK-NOSVE2-NEXT:    cmn x9, #1
; CHECK-NOSVE2-NEXT:    add x9, x9, x9, lsr #63
; CHECK-NOSVE2-NEXT:    cset w10, lt
; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
; CHECK-NOSVE2-NEXT:    asr x9, x9, #1
; CHECK-NOSVE2-NEXT:    whilelo p0.h, xzr, x10
; CHECK-NOSVE2-NEXT:    sub x10, x1, x2
; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
; CHECK-NOSVE2-NEXT:    add x9, x10, x10, lsr #63
; CHECK-NOSVE2-NEXT:    cmn x10, #1
; CHECK-NOSVE2-NEXT:    cset w10, lt
; CHECK-NOSVE2-NEXT:    asr x9, x9, #1
; CHECK-NOSVE2-NEXT:    mov p0.b, p1/m, p1.b
; CHECK-NOSVE2-NEXT:    sbfx x10, x10, #0, #1
; CHECK-NOSVE2-NEXT:    whilelo p3.h, xzr, x9
; CHECK-NOSVE2-NEXT:    mov w9, w3
; CHECK-NOSVE2-NEXT:    whilelo p2.h, xzr, x10
; CHECK-NOSVE2-NEXT:    sel p1.b, p3, p3.b, p2.b
; CHECK-NOSVE2-NEXT:    and p0.b, p0/z, p0.b, p1.b
; CHECK-NOSVE2-NEXT:    whilelo p1.h, xzr, x9
; CHECK-NOSVE2-NEXT:    cntp x10, p0, p0.h
; CHECK-NOSVE2-NEXT:    and x10, x10, #0xff
; CHECK-NOSVE2-NEXT:  .LBB11_2: // %vector.body
; CHECK-NOSVE2-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT:    and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; CHECK-NOSVE2-NEXT:    ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
; CHECK-NOSVE2-NEXT:    add z0.h, z1.h, z0.h
; CHECK-NOSVE2-NEXT:    st1h { z0.h }, p1, [x2, x8, lsl #1]
; CHECK-NOSVE2-NEXT:    add x8, x8, x10
; CHECK-NOSVE2-NEXT:    whilelo p1.h, x8, x9
; CHECK-NOSVE2-NEXT:    b.mi .LBB11_2
; CHECK-NOSVE2-NEXT:  .LBB11_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT:    ret
entry:
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %c14 = ptrtoint ptr %c to i64
  %a15 = ptrtoint ptr %a to i64
  %b16 = ptrtoint ptr %b to i64
  %wide.trip.count = zext nneg i32 %n to i64
  %sub.diff = sub i64 %a15, %c14
  %diff = sdiv i64 %sub.diff, 2
  %neg.compare = icmp slt i64 %sub.diff, -1
  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
  %sub.diff18 = sub i64 %b16, %c14
  %diff19 = sdiv i64 %sub.diff18, 2
  %neg.compare20 = icmp slt i64 %sub.diff18, -1
  %.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
  %.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
  %ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19)
  %active.lane.mask.alias24 = or <vscale x 8 x i1> %ptr.diff.lane.mask23, %.splat22
  %0 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
  %1 = zext <vscale x 8 x i1> %0 to <vscale x 8 x i8>
  %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %1)
  %3 = zext i8 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
  %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
  %4 = and <vscale x 8 x i1> %active.lane.mask, %0
  %5 = getelementptr inbounds i16, ptr %a, i64 %index
  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
  %6 = getelementptr inbounds i16, ptr %b, i64 %index
  %wide.masked.load25 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
  %7 = add <vscale x 8 x i16> %wide.masked.load25, %wide.masked.load
  %8 = getelementptr inbounds i16, ptr %c, i64 %index
  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %7, ptr %8, i32 2, <vscale x 8 x i1> %4)
  %index.next = add i64 %index, %3
  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
  %9 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_multiple_32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB12_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    whilewr p0.s, x0, x2
; CHECK-NEXT:    mov w9, w3
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    whilewr p1.s, x1, x2
; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
; CHECK-NEXT:    whilelo p1.s, xzr, x9
; CHECK-NEXT:    cntp x10, p0, p0.s
; CHECK-NEXT:    and x10, x10, #0xff
; CHECK-NEXT:  .LBB12_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
; CHECK-NEXT:    add z0.s, z1.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p1, [x2, x8, lsl #2]
; CHECK-NEXT:    add x8, x8, x10
; CHECK-NEXT:    whilelo p1.s, x8, x9
; CHECK-NEXT:    b.mi .LBB12_2
; CHECK-NEXT:  .LBB12_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
; CHECK-NOSVE2:       // %bb.0: // %entry
; CHECK-NOSVE2-NEXT:    cmp w3, #1
; CHECK-NOSVE2-NEXT:    b.lt .LBB12_3
;
CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr ; CHECK-NOSVE2-NEXT: add x10, x9, #3 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt ; CHECK-NOSVE2-NEXT: cmn x9, #3 ; CHECK-NOSVE2-NEXT: asr x9, x10, #2 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 ; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 ; CHECK-NOSVE2-NEXT: sub x9, x1, x2 ; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 ; CHECK-NOSVE2-NEXT: add x10, x9, #3 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt ; CHECK-NOSVE2-NEXT: cmn x9, #3 ; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NOSVE2-NEXT: cset w9, lt ; CHECK-NOSVE2-NEXT: asr x10, x10, #2 ; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 ; CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10 ; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9 ; CHECK-NOSVE2-NEXT: mov w9, w3 ; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b ; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9 ; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s ; CHECK-NOSVE2-NEXT: and x10, x10, #0xff ; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] ; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] ; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9 ; CHECK-NOSVE2-NEXT: b.mi .LBB12_2 ; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp9 = icmp sgt i32 %n, 0 br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup for.body.preheader: %c12 = ptrtoint ptr %c to i64 %a13 = ptrtoint ptr %a to i64 %b14 = ptrtoint ptr %b to i64 %wide.trip.count = zext nneg i32 %n to i64 %sub.diff = sub i64 %a13, %c12 %diff = sdiv 
i64 %sub.diff, 4 %neg.compare = icmp slt i64 %sub.diff, -3 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat %sub.diff16 = sub i64 %b14, %c12 %diff17 = sdiv i64 %sub.diff16, 4 %neg.compare18 = icmp slt i64 %sub.diff16, -3 %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17) %active.lane.mask.alias22 = or %ptr.diff.lane.mask21, %.splat20 %0 = and %active.lane.mask.alias, %active.lane.mask.alias22 %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) %1 = zext %0 to %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8( %1) %3 = zext i8 %2 to i64 br label %vector.body vector.body: %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] %4 = and %active.lane.mask, %0 %5 = getelementptr inbounds i32, ptr %a, i64 %index %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, %4, poison) %6 = getelementptr inbounds i32, ptr %b, i64 %index %wide.masked.load23 = tail call @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, %4, poison) %7 = add %wide.masked.load23, %wide.masked.load %8 = getelementptr inbounds i32, ptr %c, i64 %index tail call void @llvm.masked.store.nxv4i32.p0( %7, ptr %8, i32 4, %4) %index.next = add i64 %index, %3 %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) %9 = extractelement %active.lane.mask.next, i64 0 br i1 %9, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ret void } define void 
@whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_multiple_64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 ; CHECK-NEXT: b.lt .LBB13_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: whilewr p0.d, x0, x2 ; CHECK-NEXT: mov w9, w3 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: whilewr p1.d, x1, x2 ; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: whilelo p1.d, xzr, x9 ; CHECK-NEXT: cntp x10, p0, p0.d ; CHECK-NEXT: and x10, x10, #0xff ; CHECK-NEXT: .LBB13_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] ; CHECK-NEXT: add x8, x8, x10 ; CHECK-NEXT: whilelo p1.d, x8, x9 ; CHECK-NEXT: b.mi .LBB13_2 ; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 ; CHECK-NOSVE2-NEXT: b.lt .LBB13_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr ; CHECK-NOSVE2-NEXT: add x10, x9, #7 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt ; CHECK-NOSVE2-NEXT: cmn x9, #7 ; CHECK-NOSVE2-NEXT: asr x9, x10, #3 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 ; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 ; CHECK-NOSVE2-NEXT: sub x9, x1, x2 ; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 ; CHECK-NOSVE2-NEXT: add x10, x9, #7 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt ; CHECK-NOSVE2-NEXT: cmn x9, #7 ; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NOSVE2-NEXT: cset w9, lt ; CHECK-NOSVE2-NEXT: asr x10, x10, #3 ; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 ; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10 ; CHECK-NOSVE2-NEXT: 
whilelo p2.d, xzr, x9 ; CHECK-NOSVE2-NEXT: mov w9, w3 ; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b ; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9 ; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d ; CHECK-NOSVE2-NEXT: and x10, x10, #0xff ; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] ; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] ; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d ; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9 ; CHECK-NOSVE2-NEXT: b.mi .LBB13_2 ; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp9 = icmp sgt i32 %n, 0 br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup for.body.preheader: %c12 = ptrtoint ptr %c to i64 %a13 = ptrtoint ptr %a to i64 %b14 = ptrtoint ptr %b to i64 %wide.trip.count = zext nneg i32 %n to i64 %sub.diff = sub i64 %a13, %c12 %diff = sdiv i64 %sub.diff, 8 %neg.compare = icmp slt i64 %sub.diff, -7 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat %sub.diff16 = sub i64 %b14, %c12 %diff17 = sdiv i64 %sub.diff16, 8 %neg.compare18 = icmp slt i64 %sub.diff16, -7 %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17) %active.lane.mask.alias22 = or %ptr.diff.lane.mask21, %.splat20 %0 = and %active.lane.mask.alias, %active.lane.mask.alias22 %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, 
i64 %wide.trip.count) %1 = zext %0 to %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8( %1) %3 = zext i8 %2 to i64 br label %vector.body vector.body: %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] %4 = and %active.lane.mask, %0 %5 = getelementptr inbounds i64, ptr %a, i64 %index %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, %4, poison) %6 = getelementptr inbounds i64, ptr %b, i64 %index %wide.masked.load23 = tail call @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, %4, poison) %7 = add %wide.masked.load23, %wide.masked.load %8 = getelementptr inbounds i64, ptr %c, i64 %index tail call void @llvm.masked.store.nxv2i64.p0( %7, ptr %8, i32 8, %4) %index.next = add i64 %index, %3 %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count) %9 = extractelement %active.lane.mask.next, i64 0 br i1 %9, label %vector.body, label %for.cond.cleanup for.cond.cleanup: ret void } declare i64 @llvm.vscale.i64() declare @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) declare @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, , ) declare void @llvm.masked.store.nxv16i8.p0(, ptr nocapture, i32 immarg, ) declare @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) declare @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, , ) declare void @llvm.masked.store.nxv8i16.p0(, ptr nocapture, i32 immarg, ) declare @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) declare @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, , ) declare void @llvm.masked.store.nxv4i32.p0(, ptr nocapture, i32 immarg, ) declare @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) declare @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, , ) declare void @llvm.masked.store.nxv2i64.p0(, ptr nocapture, i32 immarg, )