From 63e8a1b16f344eaef17c4015497326479e69d1e7 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Thu, 5 Sep 2024 07:52:27 -0700
Subject: [PATCH] [SLP] Enable reordering for non-power-of-two vectors (#106638)

This change tries to enable vector reordering during vectorization for
non-power-of-two vectors. Specifically, my goal is to be able to
vectorize reductions whose operands appear in other than identity order
(i.e. a[1] + a[0] + a[2]). In our standard pass pipeline, Reassociation
effectively canonicalizes towards this form, so for reduction
vectorization to be widely applicable, we need this feature.

This change enables the use of a non-empty ReorderIndices structure -
which is effectively required for out-of-order loads or gathers - while
leaving the ReuseShuffleIndices mechanism unused and disabled. If I've
understood the code structure correctly, the former is used to describe
implicit shuffles required by the vectorization strategy (e.g. loading
elements 0,1,3,2 in the order 0,1,2,3 and then shuffling later), while
the latter is used when trying to optimize explode/buildvectors (called
gathers in this code).

I audited all the code enabled by this change, but can't claim to
deeply understand most of it. I added a couple of bailouts in places
that appeared difficult to audit and were optional optimizations. I've
tried to do so in the least risky way I can, but am not completely
confident in this change. Careful review appreciated.
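
To make the motivating case concrete, here is a minimal scalar input of
the kind this change targets (a hand-written sketch; the function name
is made up, compare the dot_product_i32_reorder tests updated below):

  ; Reduction whose operands appear in non-identity order
  ; (a[1] + a[0] + a[2]), the shape Reassociation tends to produce.
  define i32 @sum3_reordered(ptr %a) {
    %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
    %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
    %l.a.0 = load i32, ptr %a, align 4
    %l.a.1 = load i32, ptr %gep.a.1, align 4
    %l.a.2 = load i32, ptr %gep.a.2, align 4
    %add.0 = add i32 %l.a.1, %l.a.0   ; a[1] + a[0], not a[0] + a[1]
    %add.1 = add i32 %add.0, %l.a.2
    ret i32 %add.1
  }

With non-power-of-two vectorization enabled, SLP should now be able to
turn this into a <3 x i32> load feeding @llvm.vector.reduce.add.v3i32,
as the updated RISC-V tests below check; previously the
non-power-of-two load path gave up as soon as a reorder was required.
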
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 48 +++++-----
 .../AArch64/vec3-reorder-reshuffle.ll         | 15 ++--
 .../SLPVectorizer/RISCV/vec3-base.ll          | 88 +++++++++++--------
 .../X86/vec3-reorder-reshuffle.ll             |  9 +-
 4 files changed, 89 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 60476398e5ca..74bb529b2526 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3388,6 +3388,10 @@ private:
     TreeEntry *Last = VectorizableTree.back().get();
     Last->Idx = VectorizableTree.size() - 1;
     Last->State = EntryState;
+    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
+    // for non-power-of-two vectors.
+    assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) &&
+           "Reshuffling scalars not yet supported for nodes with padding");
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     if (ReorderIndices.empty()) {
@@ -3452,11 +3456,8 @@ private:
       MustGather.insert(VL.begin(), VL.end());
     }
 
-    if (UserTreeIdx.UserTE) {
+    if (UserTreeIdx.UserTE)
       Last->UserTreeIndices.push_back(UserTreeIdx);
-      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
-             "Reordering isn't implemented for non-power-of-2 nodes yet");
-    }
 
     return Last;
   }
@@ -4731,12 +4732,6 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   auto *VecTy = getWidenedType(ScalarTy, Sz);
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
-  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (!Order.empty() && !has_single_bit(VL.size())) {
-    assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
-                                   "supported with VectorizeNonPowerOf2");
-    return LoadsState::Gather;
-  }
 
   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
   if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
@@ -4824,6 +4819,12 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   // representation is better than just gather.
   auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                 bool ProfitableGatherPointers) {
+    // FIXME: The following code has not been updated for non-power-of-2
+    // vectors. The splitting logic here does not cover the original
+    // vector if the vector factor is not a power of two.
+    if (!has_single_bit(VL.size()))
+      return false;
+
     // Compare masked gather cost and loads + insert subvector costs.
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     auto [ScalarGEPCost, VectorGEPCost] =
@@ -5195,13 +5196,13 @@ static bool areTwoInsertFromSameBuildVector(
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
-  // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-  if (TE.isNonPowOf2Vec())
-    return std::nullopt;
-
   // No need to reorder if need to shuffle reuses, still need to shuffle the
   // node.
   if (!TE.ReuseShuffleIndices.empty()) {
+    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
+    assert(!TE.isNonPowOf2Vec() &&
+           "Reshuffling scalars not yet supported for nodes with padding");
+
     if (isSplat(TE.Scalars))
       return std::nullopt;
     // Check if reuse shuffle indices can be improved by reordering.
@@ -5424,11 +5425,15 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     }
     if (isSplat(TE.Scalars))
       return std::nullopt;
-    if (TE.Scalars.size() >= 4)
+    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
-    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
-      return CurrentOrder;
+
+    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
+    // has been audited for correctness with non-power-of-two vectors.
+    if (!TE.isNonPowOf2Vec())
+      if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+        return CurrentOrder;
   }
   return std::nullopt;
 }
@@ -5580,7 +5585,7 @@ void BoUpSLP::reorderTopToBottom() {
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF /= 2) {
+       VF = bit_ceil(VF) / 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
@@ -5752,10 +5757,6 @@ bool BoUpSLP::canReorderOperands(
     TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
-  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (UserTE->isNonPowOf2Vec())
-    return false;
-
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&
@@ -5927,9 +5928,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       }
       auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
       const auto AllowsReordering = [&](const TreeEntry *TE) {
-        // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-        if (TE->isNonPowOf2Vec())
-          return false;
         if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
             (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
             (IgnoreReorder && TE->Idx == 0))
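
Concretely, a non-empty ReorderIndices shows up in the emitted IR as a
plain load of the lanes in memory order followed by a single
shufflevector that applies the recorded order. A hand-written sketch of
that shape (not actual SLP output; the function name and mask are
illustrative, reusing the 0,1,3,2 example from the message above):

  define <4 x i32> @load_then_reorder(ptr %p) {
    ; The lanes are loaded in memory order 0,1,2,3 ...
    %v = load <4 x i32>, ptr %p, align 4
    ; ... and the recorded order is realized by one explicit shuffle.
    %r = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
    ret <4 x i32> %r
  }

The reorder_indices_* and *_reorder tests below exercise the same idea
for non-power-of-two <3 x ...> types, where the shuffle can also sink
past elementwise operations down to the stored result.
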
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 9bbd314a27cb..c9b2e0ffc15f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -191,12 +191,12 @@ define i32 @reorder_indices_1(float %0) {
 ; NON-POW2-NEXT:  entry:
 ; NON-POW2-NEXT:    [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
 ; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
-; NON-POW2-NEXT:    [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fneg <3 x float> [[TMP1]]
 ; NON-POW2-NEXT:    [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
 ; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer
 ; NON-POW2-NEXT:    [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]]
-; NON-POW2-NEXT:    [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]])
+; NON-POW2-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; NON-POW2-NEXT:    [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]])
 ; NON-POW2-NEXT:    [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer)
 ; NON-POW2-NEXT:    [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer
 ; NON-POW2-NEXT:    store <3 x float> [[TMP9]], ptr [[NOR1]], align 4
@@ -263,7 +263,8 @@ define void @reorder_indices_2(ptr %spoint) {
 ; NON-POW2-NEXT:    [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
 ; NON-POW2-NEXT:    [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer)
 ; NON-POW2-NEXT:    [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer
-; NON-POW2-NEXT:    store <3 x float> [[TMP1]], ptr [[DSCO]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[DSCO]], align 4
 ; NON-POW2-NEXT:    ret void
 ;
 ; POW2-ONLY-LABEL: define void @reorder_indices_2(
@@ -566,11 +567,11 @@ define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) {
 ; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding(
 ; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
 ; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[TMP0:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
-; NON-POW2-NEXT:    [[TMP1:%.*]] = fsub <3 x float> [[TMP0]], [[TMP0]]
+; NON-POW2-NEXT:    [[TMP1:%.*]] = fsub <3 x float> [[IN]], [[IN]]
 ; NON-POW2-NEXT:    [[TMP2:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> , <3 x float> )
 ; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul <3 x float> [[TMP2]], 
+; NON-POW2-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; NON-POW2-NEXT:    store <3 x float> [[TMP4]], ptr [[A]], align 4
 ; NON-POW2-NEXT:    ret void
 ;
 ; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding(
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index d9e5655f4b4e..4e8e019e155d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -557,25 +557,34 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
 ; Same as above, except the reduction order has been perturbed. This
 ; is checking for our ability to reorder.
 define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_i32_reorder(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; CHECK-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; CHECK-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret i32 [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_i32_reorder(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
+; NON-POW2-NEXT:    ret i32 [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_i32_reorder(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
+; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
+; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
+; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
   %l.a.0 = load i32, ptr %gep.a.0, align 4
@@ -653,22 +662,31 @@ define float @dot_product_fp32(ptr %a, ptr %b) {
 ; Same as above, except the reduction order has been perturbed. This
 ; is checking for our ability to reorder.
 define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp32_reorder(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret float [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp32_reorder(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
+; NON-POW2-NEXT:    ret float [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret float [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
   %l.a.0 = load float, ptr %gep.a.0, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 1399b4c35c78..22a59d3da52a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -190,12 +190,12 @@ define i32 @reorder_indices_1(float %0) {
 ; NON-POW2-NEXT:  entry:
 ; NON-POW2-NEXT:    [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
 ; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
-; NON-POW2-NEXT:    [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fneg <3 x float> [[TMP1]]
 ; NON-POW2-NEXT:    [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
 ; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer
 ; NON-POW2-NEXT:    [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]]
-; NON-POW2-NEXT:    [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]])
+; NON-POW2-NEXT:    [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; NON-POW2-NEXT:    [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]])
 ; NON-POW2-NEXT:    [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer)
 ; NON-POW2-NEXT:    [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer
 ; NON-POW2-NEXT:    store <3 x float> [[TMP9]], ptr [[NOR1]], align 4
@@ -262,7 +262,8 @@ define void @reorder_indices_2(ptr %spoint) {
 ; NON-POW2-NEXT:    [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
 ; NON-POW2-NEXT:    [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer)
 ; NON-POW2-NEXT:    [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer
-; NON-POW2-NEXT:    store <3 x float> [[TMP1]], ptr [[DSCO]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[DSCO]], align 4
 ; NON-POW2-NEXT:    ret void
 ;
 ; POW2-ONLY-LABEL: define void @reorder_indices_2(