Reland: [LV]: Teach LV to recursively (de)interleave. (#122989)

This commit relands the changes from "[LV]: Teach LV to recursively
(de)interleave. #89018"

Reason for the original revert:
- The patch exposed a bug in the InterleavedAccess (IA) pass; that bug has
  since been fixed and landed in #122643.
Commit: 9491f75e1d
Parent: e79bb8731a
Author: Hassnaa Hamdi
Date:   2025-01-17 10:34:57 +00:00 (committed by GitHub)

6 changed files with 1391 additions and 675 deletions


@@ -3505,10 +3505,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   if (hasIrregularType(ScalarTy, DL))
     return false;
 
-  // We currently only know how to emit interleave/deinterleave with
-  // Factor=2 for scalable vectors. This is purely an implementation
-  // limit.
-  if (VF.isScalable() && InterleaveFactor != 2)
+  // For scalable vectors, the only interleave factor currently supported
+  // must be power of 2 since we require the (de)interleave2 intrinsics
+  // instead of shufflevectors.
+  if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
     return false;
 
   // If the group involves a non-integral pointer, we may not be able to
@@ -9435,9 +9435,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
             CM.getWideningDecision(IG->getInsertPos(), VF) ==
                 LoopVectorizationCostModel::CM_Interleave);
       // For scalable vectors, the only interleave factor currently supported
-      // is 2 since we require the (de)interleave2 intrinsics instead of
-      // shufflevectors.
-      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+      // must be power of 2 since we require the (de)interleave2 intrinsics
+      // instead of shufflevectors.
+      assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
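
For reference, the isPowerOf2_32 predicate used by both checks lives in
llvm/include/llvm/Support/MathExtras.h. A standalone sketch of the same test
(hypothetical isPow2, shown only to illustrate which factors now take the
scalable-vector path):

#include <cstdint>

// Equivalent of llvm::isPowerOf2_32: true iff exactly one bit of V is set.
constexpr bool isPow2(std::uint32_t V) { return V != 0 && (V & (V - 1)) == 0; }

// Factors 2, 4, 8, ... now qualify for scalable-vector widening; factors such
// as 3 or 6 still bail out of interleavedAccessCanBeWidened.
static_assert(isPow2(2) && isPow2(4) && isPow2(8), "powers of two are widened");
static_assert(!isPow2(3) && !isPow2(6), "other factors are rejected");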


@@ -2863,10 +2863,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
-    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
-                                   Vals,
-                                   /*FMFSource=*/nullptr, Name);
+    assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
+                                    "scalable vectors, must be power of 2");
+    SmallVector<Value *> InterleavingValues(Vals);
+    // When interleaving, the number of values will be shrunk until we have the
+    // single final interleaved value.
+    auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
+    for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
+      InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
+      for (unsigned I = 0; I < Midpoint; ++I)
+        InterleavingValues[I] = Builder.CreateIntrinsic(
+            InterleaveTy, Intrinsic::vector_interleave2,
+            {InterleavingValues[I], InterleavingValues[Midpoint + I]},
+            /*FMFSource=*/nullptr, Name);
+    }
+    return InterleavingValues[0];
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
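
The Midpoint loop halves the number of live values each round: for Factor = 4
it first creates interleave2(Vals[0], Vals[2]) and interleave2(Vals[1],
Vals[3]), then interleaves those two results, giving lanes in member order
x0 y0 z0 t0 x1 y1 z1 t1. That matches the factor-4 CHECK lines in the tests
below, where members 0 and 2 feed one interleave2 and members 1 and 3 the
other. A runnable sketch of the same recursion on plain int lanes, where
std::vector stands in for the scalable vector and interleave2() for
llvm.vector.interleave2:

#include <cassert>
#include <cstddef>
#include <vector>

// Models llvm.vector.interleave2: produces A0 B0 A1 B1 ...
static std::vector<int> interleave2(const std::vector<int> &A,
                                    const std::vector<int> &B) {
  std::vector<int> R;
  for (std::size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

int main() {
  // Four group members, two "lanes" each; member K holds K*10 + lane.
  std::vector<std::vector<int>> Vals = {{0, 1}, {10, 11}, {20, 21}, {30, 31}};
  unsigned Factor = 4; // must be a power of 2
  // The same Midpoint loop as the patched interleaveVectors().
  for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2)
    for (unsigned I = 0; I < Midpoint; ++I)
      Vals[I] = interleave2(Vals[I], Vals[Midpoint + I]);
  // Result is element-major: x0 y0 z0 t0 x1 y1 z1 t1.
  assert((Vals[0] == std::vector<int>{0, 10, 20, 30, 1, 11, 21, 31}));
  return 0;
}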
@@ -2952,15 +2963,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor == 2 &&
+      assert(isPowerOf2_32(InterleaveFactor) &&
              "Unsupported deinterleave factor for scalable vectors");
       auto *ResBlockInMask = State.get(BlockInMask);
-      SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
-      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
-                                     State.VF.getKnownMinValue() * 2, true);
-      return State.Builder.CreateIntrinsic(
-          MaskTy, Intrinsic::vector_interleave2, Ops,
-          /*FMFSource=*/nullptr, "interleaved.mask");
+      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
+      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
     }
 
     if (!BlockInMask)
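
The mask path now reuses that recursion: replicating the block mask
InterleaveFactor times and interleaving the copies yields a wide lane mask in
which each mask bit appears Factor times in a row, one copy per group member,
which is exactly what the wide masked load/store expects. A small sketch of
that property under the same plain-int model as above:

#include <cassert>
#include <vector>

int main() {
  // Block mask for two vector iterations: lane 0 active, lane 1 inactive.
  std::vector<int> M = {1, 0};
  unsigned Factor = 4;
  // Mirrors: SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
  std::vector<std::vector<int>> Ops(Factor, M);
  for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2)
    for (unsigned I = 0; I < Midpoint; ++I) {
      std::vector<int> R;
      for (unsigned L = 0; L < Ops[I].size(); ++L) {
        R.push_back(Ops[I][L]);            // interleave2, as above
        R.push_back(Ops[Midpoint + I][L]);
      }
      Ops[I] = R;
    }
  // Each mask bit is repeated Factor times, one copy per group member.
  assert((Ops[0] == std::vector<int>{1, 1, 1, 1, 0, 0, 0, 0}));
  return 0;
}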
@@ -3000,22 +3007,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   ArrayRef<VPValue *> VPDefs = definedValues();
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
   if (VecTy->isScalableTy()) {
-    assert(InterleaveFactor == 2 &&
+    assert(isPowerOf2_32(InterleaveFactor) &&
            "Unsupported deinterleave factor for scalable vectors");
 
     // Scalable vectors cannot use arbitrary shufflevectors (only splats),
     // so must use intrinsics to deinterleave.
-    Value *DI = State.Builder.CreateIntrinsic(
-        Intrinsic::vector_deinterleave2, VecTy, NewLoad,
-        /*FMFSource=*/nullptr, "strided.vec");
-    unsigned J = 0;
-    for (unsigned I = 0; I < InterleaveFactor; ++I) {
+    SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
+    DeinterleavedValues[0] = NewLoad;
+    // For the case of InterleaveFactor > 2, we will have to do recursive
+    // deinterleaving, because the current available deinterleave intrinsic
+    // supports only Factor of 2, otherwise it will bailout after first
+    // iteration.
+    // When deinterleaving, the number of values will double until we
+    // have "InterleaveFactor".
+    for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
+         NumVectors *= 2) {
+      // Deinterleave the elements within the vector
+      SmallVector<Value *> TempDeinterleavedValues(NumVectors);
+      for (unsigned I = 0; I < NumVectors; ++I) {
+        auto *DiTy = DeinterleavedValues[I]->getType();
+        TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
+            Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
+            /*FMFSource=*/nullptr, "strided.vec");
+      }
+      // Extract the deinterleaved values:
+      for (unsigned I = 0; I < 2; ++I)
+        for (unsigned J = 0; J < NumVectors; ++J)
+          DeinterleavedValues[NumVectors * I + J] =
+              State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
+    }
+#ifndef NDEBUG
+    for (Value *Val : DeinterleavedValues)
+      assert(Val && "NULL Deinterleaved Value");
+#endif
+
+    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
       Instruction *Member = Group->getMember(I);
-
-      if (!Member)
+      Value *StridedVec = DeinterleavedValues[I];
+      if (!Member) {
+        // This value is not needed as it's not used
+        cast<Instruction>(StridedVec)->eraseFromParent();
         continue;
-
-      Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
+      }
       // If this member has different type, cast the result type.
       if (Member->getType() != ScalarTy) {
         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
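
The NumVectors loop is the inverse recursion: each round applies deinterleave2
to every live value, doubling the count until all InterleaveFactor members are
recovered, and the DeinterleavedValues[NumVectors * I + J] indexing keeps them
in member order. A runnable sketch on plain int lanes; the container names
mirror the patch but the code is otherwise hypothetical:

#include <array>
#include <cassert>
#include <vector>

int main() {
  // Wide load for Factor = 4 over two iterations: x0 y0 z0 t0 x1 y1 z1 t1.
  std::vector<int> Wide = {0, 10, 20, 30, 1, 11, 21, 31};
  unsigned Factor = 4;
  std::vector<std::vector<int>> DV(Factor);
  DV[0] = Wide;
  for (unsigned NumVectors = 1; NumVectors < Factor; NumVectors *= 2) {
    // Each Temp[I] models the two-element result struct of
    // llvm.vector.deinterleave2: [0] = even lanes, [1] = odd lanes.
    std::vector<std::array<std::vector<int>, 2>> Temp(NumVectors);
    for (unsigned I = 0; I < NumVectors; ++I)
      for (unsigned L = 0; L < DV[I].size(); ++L)
        Temp[I][L % 2].push_back(DV[I][L]);
    // DeinterleavedValues[NumVectors * I + J] = extractvalue Temp[J], I
    for (unsigned I = 0; I < 2; ++I)
      for (unsigned J = 0; J < NumVectors; ++J)
        DV[NumVectors * I + J] = Temp[J][I];
  }
  assert((DV[0] == std::vector<int>{0, 1}));   // member x
  assert((DV[1] == std::vector<int>{10, 11})); // member y
  assert((DV[2] == std::vector<int>{20, 21})); // member z
  assert((DV[3] == std::vector<int>{30, 31})); // member t
  return 0;
}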


@@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
@@ -1548,5 +1548,263 @@ end:
ret void
}
; Check vectorization on an interleaved load/store groups of factor 4
; for (int i = 0; i < 1024; ++i) {
; dst[i].x = a[i].x + b[i].x;
; dst[i].y = a[i].y - b[i].y;
; dst[i].z = a[i].z << b[i].z;
; dst[i].t = a[i].t >> b[i].t;
; }
%struct.xyzt = type { i32, i32, i32, i32 }
define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) {
; CHECK-LABEL: @interleave_deinterleave(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4
; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4
; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]]
; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4
; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4
; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]]
; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4
; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4
; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]]
; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
store i32 %add, ptr %arrayidx5, align 4
%y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
%2 = load i32, ptr %y, align 4
%y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
%3 = load i32, ptr %y11, align 4
%sub = sub nsw i32 %2, %3
%y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
store i32 %sub, ptr %y14, align 4
%z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
%4 = load i32, ptr %z, align 4
%z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
%5 = load i32, ptr %z19, align 4
%shl = shl i32 %4, %5
%z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
store i32 %shl, ptr %z22, align 4
%t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
%6 = load i32, ptr %t, align 4
%t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
%7 = load i32, ptr %t27, align 4
%shr = ashr i32 %6, %7
%t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
store i32 %shr, ptr %t30, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret void
}
; Check vectorization on a reverse interleaved load/store groups of factor 4
; for (int i = 1023; i >= 0; i--) {
; int a = A[i].x + i;
; int b = A[i].y - i;
; int c = A[i].z * i;
; int d = A[i].t << i;
; B[i].x = a;
; B[i].y = b;
; B[i].z = c;
; B[i].t = d;
; }
define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{
; CHECK-LABEL: @interleave_deinterleave_reverse(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; CHECK-NEXT: [[INDUCTION:%.*]] = sub <vscale x 4 x i32> splat (i32 1023), [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP11]])
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
; CHECK-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
; CHECK-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP16]])
; CHECK-NEXT: [[TMP17:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE3]], [[VEC_IND]]
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
; CHECK-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; CHECK-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
; CHECK-NEXT: [[REVERSE8:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-NEXT: [[REVERSE9:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE8]])
; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE9]])
; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC10]])
; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]]
;
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
%x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0
%load1 = load i32, ptr %x, align 4
%trunc = trunc i64 %indvars.iv to i32
%add = add nsw i32 %load1, %trunc
%y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1
%load2 = load i32, ptr %y, align 4
%sub = sub nsw i32 %load2, %trunc
%z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2
%load3 = load i32, ptr %z, align 4
%mul = mul nsw i32 %load3, %trunc
%t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3
%load4 = load i32, ptr %t, align 4
%shl = shl nuw nsw i32 %load4, %trunc
%x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0
store i32 %add, ptr %x5, align 4
%y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1
store i32 %sub, ptr %y8, align 4
%z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2
store i32 %mul, ptr %z5, align 4
%t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3
store i32 %shl, ptr %t8, align 4
%indvars.iv.next = add nsw i64 %indvars.iv, -1
%cmp = icmp sgt i64 %indvars.iv, 0
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
attributes #1 = { "target-features"="+sve" vscale_range(1, 16) }
attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }


@@ -529,3 +529,255 @@ for.inc:
for.end:
ret void
}
; Expected to contain interleave2/deinterleave2 instructions
;
; void masked_strided_factor4(const unsigned char* restrict p,
; unsigned char* restrict q,
; unsigned char guard) {
; for(ix=0; ix < 1024; ++ix) {
; if (ix > guard) {
; char left1 = p[4*ix];
; char right1 = p[4*ix + 1];
; char left2 = p[4*ix + 2];
; char right2 = p[4*ix + 3];
; char max1 = max(left1, right1);
; char max2 = max(left2, right2);
; q[4*ix] = max1;
; q[4*ix + 1] = 0 - max1;
; q[4*ix + 2] = max2;
; q[4*ix + 3] = 0 - max2;
; }
; }
;}
define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4
; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] {
; SCALAR_TAIL_FOLDING-NEXT: entry:
; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALAR_TAIL_FOLDING: vector.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2
; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SCALAR_TAIL_FOLDING: middle.block:
; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SCALAR_TAIL_FOLDING: scalar.ph:
; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: for.body:
; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]]
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; SCALAR_TAIL_FOLDING: if.then:
; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2
; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2
; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3
; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1
; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]])
; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]]
; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]])
; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]]
; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]]
; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]]
; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]]
; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1
; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]]
; SCALAR_TAIL_FOLDING: for.inc:
; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; SCALAR_TAIL_FOLDING: for.end:
; SCALAR_TAIL_FOLDING-NEXT: ret void
;
; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4
; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] {
; PREDICATED_TAIL_FOLDING-NEXT: entry:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PREDICATED_TAIL_FOLDING: vector.ph:
; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]]
; PREDICATED_TAIL_FOLDING: for.body:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; PREDICATED_TAIL_FOLDING: if.then:
; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]]
; PREDICATED_TAIL_FOLDING: for.inc:
; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; PREDICATED_TAIL_FOLDING: for.end:
; PREDICATED_TAIL_FOLDING-NEXT: ret void
;
entry:
%conv = zext i8 %guard to i32
br label %for.body
for.body:
%ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
%cmp1 = icmp ugt i32 %ix.024, %conv
br i1 %cmp1, label %if.then, label %for.inc
if.then:
%idx0 = shl nuw nsw i32 %ix.024, 2
%idx1 = add i32 %idx0, 1
%idx2 = add i32 %idx0, 2
%idx3 = add i32 %idx0, 3
%array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0
%0 = load i8, ptr %array1idx0, align 1
%array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1
%1 = load i8, ptr %array1idx1, align 1
%array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2
%2 = load i8, ptr %array1idx2, align 1
%array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3
%3 = load i8, ptr %array1idx3, align 1
%cmp.i1 = icmp slt i8 %0, %1
%spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0
%sub1 = sub i8 0, %spec.select.i1
%cmp.i2 = icmp slt i8 %2, %3
%spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2
%sub2 = sub i8 0, %spec.select.i2
%array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0
store i8 %spec.select.i1, ptr %array3idx0, align 1
%array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1
store i8 %sub1, ptr %array3idx1, align 1
%array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2
store i8 %spec.select.i2, ptr %array3idx2, align 1
%array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3
store i8 %sub2, ptr %array3idx3, align 1
br label %for.inc
for.inc:
%inc = add nuw nsw i32 %ix.024, 1
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}


@@ -0,0 +1,135 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
%struct.xyzt = type { i32, i32, i32, i32 }
; for (int i = 0; i < 1024; ++i) {
; dst[i].x = a[i].x + b[i].x;
; dst[i].y = a[i].y - b[i].y;
; dst[i].z = a[i].z << b[i].z;
; dst[i].t = a[i].t >> b[i].t;
; }
define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) {
; CHECK-LABEL: @interleave_deinterleave(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[LDN9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP13]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 1
; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 2
; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 3
; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), ptr [[TMP21]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]]
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4
; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4
; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]]
; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4
; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4
; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]]
; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4
; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4
; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]]
; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%gep.a = getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv
%a.0 = load i32, ptr %gep.a, align 4
%gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv
%b.0 = load i32, ptr %gep.b, align 4
%add = add nsw i32 %b.0, %a.0
%gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv
store i32 %add, ptr %gep.dst, align 4
%gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4
%a.1 = load i32, ptr %gep.a.1, align 4
%gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4
%b.1 = load i32, ptr %gep.b.1, align 4
%sub = sub nsw i32 %a.1, %b.1
%gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4
store i32 %sub, ptr %gep.dst.1, align 4
%gep.a.2 = getelementptr inbounds nuw i8, ptr %gep.a, i64 8
%a.2 = load i32, ptr %gep.a.2, align 4
%gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8
%b.2 = load i32, ptr %gep.b.2, align 4
%shl = shl i32 %a.2, %b.2
%gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8
store i32 %shl, ptr %gep.dst.2, align 4
%gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12
%a.3 = load i32, ptr %gep.a.3, align 4
%gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12
%b.3 = load i32, ptr %gep.b.3, align 4
%shr = ashr i32 %a.3, %b.3
%gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12
store i32 %shr, ptr %gep.dst.3, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret void
}