clang-p2996/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
Florian Hahn 911055e34f [VPlan] Consistently use (Part, 0) for first lane scalar values (#80271)
At the moment, some VPInstructions create only a single scalar value,
but use VPTransformState's 'vector' storage for this value. Those
values are effectively uniform-per-VF (or in some cases
uniform-across-VF-and-UF). Using the vector/per-part storage doesn't
interact well with other recipes that more accurately use (Part,
Lane) to look up scalar values, and it prevents VPInstructions that
create scalars from interacting with other recipes working with scalars.

This PR tries to unify the handling of such scalars by using (Part, 0)
for scalar values where only the first lane is demanded. This allows
using VPInstructions with other recipes like VPScalarCastRecipe and is
also needed when using VPInstructions in more cases outside the vector
loop region to generate scalars.

Depends on https://github.com/llvm/llvm-project/pull/80269
2024-02-26 19:06:43 +00:00
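
To illustrate the (Part, Lane) bookkeeping described in the commit message, here is a minimal, self-contained C++ sketch. It is a hypothetical model, not the actual VPTransformState API: the names ScalarState, setVector, setScalar and getScalar are made up for illustration. It only shows why recording a uniform-per-part scalar at lane 0 of each part lets scalar-consuming recipes find it, which is the behavior the updated test checks below reflect.

// Hypothetical, self-contained model of the (Part, Lane) scalar bookkeeping
// described in the commit message above. ScalarState, setVector, setScalar
// and getScalar are illustrative names, not the real VPTransformState API.
#include <cassert>
#include <cstdio>
#include <map>
#include <string>
#include <utility>

struct ScalarState {
  // Per-part "vector" storage: what some VPInstructions used before the
  // change, even when they really produce a single scalar per part.
  std::map<unsigned, std::string> PerPartVectors;
  // Per-lane scalar storage keyed by (Part, Lane): where recipes that work
  // on scalars look values up.
  std::map<std::pair<unsigned, unsigned>, std::string> PerLaneScalars;

  void setVector(unsigned Part, std::string V) {
    PerPartVectors[Part] = std::move(V);
  }
  void setScalar(unsigned Part, unsigned Lane, std::string V) {
    PerLaneScalars[{Part, Lane}] = std::move(V);
  }
  const std::string &getScalar(unsigned Part, unsigned Lane) const {
    auto It = PerLaneScalars.find({Part, Lane});
    assert(It != PerLaneScalars.end() && "no scalar recorded for (Part, Lane)");
    return It->second;
  }
};

int main() {
  ScalarState State;
  const unsigned UF = 2; // unroll factor: one definition per part

  // With the change, a VPInstruction producing a uniform-per-part scalar
  // records it at (Part, /*Lane=*/0) instead of the per-part vector storage,
  // so a scalar consumer that only demands the first lane can find it.
  for (unsigned Part = 0; Part < UF; ++Part)
    State.setScalar(Part, /*Lane=*/0, "index.part" + std::to_string(Part));

  for (unsigned Part = 0; Part < UF; ++Part)
    std::printf("(Part=%u, Lane=0) -> %s\n", Part,
                State.getScalar(Part, /*Lane=*/0).c_str());
  return 0;
}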


; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control-without-rt-check < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL_NO_RT_CHECK
target triple = "aarch64-unknown-linux-gnu"
; Test the different tail folding styles.
define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features" = "+sve" {
; NONE-LABEL: @simple_memset_tailfold(
; NONE-NEXT: entry:
; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; NONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; NONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; NONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]]
; NONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NONE: vector.ph:
; NONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; NONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]]
; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; NONE-NEXT: br label [[VECTOR_BODY:%.*]]
; NONE: vector.body:
; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; NONE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0
; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]]
; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
; NONE-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4
; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]]
; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NONE: middle.block:
; NONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
; NONE-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NONE: scalar.ph:
; NONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; NONE-NEXT: br label [[WHILE_BODY:%.*]]
; NONE: while.body:
; NONE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; NONE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; NONE-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; NONE-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; NONE-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; NONE-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; NONE: while.end.loopexit:
; NONE-NEXT: ret void
;
; DATA-LABEL: @simple_memset_tailfold(
; DATA-NEXT: entry:
; DATA-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; DATA-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DATA-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA: vector.ph:
; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; DATA-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; DATA-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; DATA-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA: vector.body:
; DATA-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]])
; DATA-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
; DATA-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA: middle.block:
; DATA-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA: scalar.ph:
; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA-NEXT: br label [[WHILE_BODY:%.*]]
; DATA: while.body:
; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA: while.end.loopexit:
; DATA-NEXT: ret void
;
; DATA_NO_LANEMASK-LABEL: @simple_memset_tailfold(
; DATA_NO_LANEMASK-NEXT: entry:
; DATA_NO_LANEMASK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA_NO_LANEMASK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; DATA_NO_LANEMASK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; DATA_NO_LANEMASK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DATA_NO_LANEMASK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA_NO_LANEMASK: vector.ph:
; DATA_NO_LANEMASK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; DATA_NO_LANEMASK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; DATA_NO_LANEMASK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; DATA_NO_LANEMASK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA_NO_LANEMASK: vector.body:
; DATA_NO_LANEMASK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ]
; DATA_NO_LANEMASK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i64 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
; DATA_NO_LANEMASK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]]
; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]])
; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_NO_LANEMASK: middle.block:
; DATA_NO_LANEMASK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA_NO_LANEMASK: scalar.ph:
; DATA_NO_LANEMASK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA_NO_LANEMASK-NEXT: br label [[WHILE_BODY:%.*]]
; DATA_NO_LANEMASK: while.body:
; DATA_NO_LANEMASK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA_NO_LANEMASK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA_NO_LANEMASK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA_NO_LANEMASK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA_NO_LANEMASK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA_NO_LANEMASK: while.end.loopexit:
; DATA_NO_LANEMASK-NEXT: ret void
;
; DATA_AND_CONTROL-LABEL: @simple_memset_tailfold(
; DATA_AND_CONTROL-NEXT: entry:
; DATA_AND_CONTROL-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA_AND_CONTROL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
; DATA_AND_CONTROL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; DATA_AND_CONTROL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DATA_AND_CONTROL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA_AND_CONTROL: vector.ph:
; DATA_AND_CONTROL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; DATA_AND_CONTROL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; DATA_AND_CONTROL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; DATA_AND_CONTROL-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
; DATA_AND_CONTROL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; DATA_AND_CONTROL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA_AND_CONTROL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA_AND_CONTROL-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA_AND_CONTROL: vector.body:
; DATA_AND_CONTROL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
; DATA_AND_CONTROL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
; DATA_AND_CONTROL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; DATA_AND_CONTROL-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; DATA_AND_CONTROL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
; DATA_AND_CONTROL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_AND_CONTROL: middle.block:
; DATA_AND_CONTROL-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA_AND_CONTROL: scalar.ph:
; DATA_AND_CONTROL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA_AND_CONTROL-NEXT: br label [[WHILE_BODY:%.*]]
; DATA_AND_CONTROL: while.body:
; DATA_AND_CONTROL-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA_AND_CONTROL-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA_AND_CONTROL-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA_AND_CONTROL-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA_AND_CONTROL-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA_AND_CONTROL: while.end.loopexit:
; DATA_AND_CONTROL-NEXT: ret void
;
; DATA_AND_CONTROL_NO_RT_CHECK-LABEL: @simple_memset_tailfold(
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: entry:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DATA_AND_CONTROL_NO_RT_CHECK: vector.ph:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA_AND_CONTROL_NO_RT_CHECK: vector.body:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DATA_AND_CONTROL_NO_RT_CHECK: middle.block:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA_AND_CONTROL_NO_RT_CHECK: scalar.ph:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; DATA_AND_CONTROL_NO_RT_CHECK: while.body:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; DATA_AND_CONTROL_NO_RT_CHECK: while.end.loopexit:
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep = getelementptr i32, ptr %ptr, i64 %index
  store i32 %val, ptr %gep
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}

!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = !{!"llvm.loop.vectorize.width", i32 4}