From cf6a452cc734e00a1207514bb1fc2b123e31bb5f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 31 Mar 2025 19:33:40 -0400 Subject: [PATCH] [SLP]Fix same/alternate analysis in split node analysis for compares getSameOpcode in some cases may consider 2 compares as having same opcode, even though previously they were considered as alternate. It may happen, because getSameOpcode looses info about previous instructions and their states. Need to use isAlternateInstruction function instead for the correct analysis. Reviewers: RKSimon, hiraditya Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/133769 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 7 +++-- .../X86/vector-reductions-logical.ll | 14 ++++----- .../SLPVectorizer/X86/reduction-logical.ll | 18 ++++++------ .../SLPVectorizer/revec-reduction-logical.ll | 29 ++++++++++++------- 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0679eac17658..18c896767b6d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9013,8 +9013,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Op1Indices.set(Idx); continue; } - InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI); - if (NewS && !NewS.isAltShuffle()) { + if ((LocalState.getAltOpcode() != LocalState.getOpcode() && + I->getOpcode() == LocalState.getOpcode()) || + (LocalState.getAltOpcode() == LocalState.getOpcode() && + !isAlternateInstruction(I, LocalState.getMainOp(), + LocalState.getAltOp(), *TLI))) { Op1.push_back(V); Op1Indices.set(Idx); continue; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll index 035f3f145bf8..ee51e467c5b8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -86,10 +86,9 @@ return: define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_merge_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x float> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 @@ -401,10 +400,9 @@ return: define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-LABEL: @test_merge_anyof_v4si( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T:%.*]], splat (i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[T]], splat (i32 255) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index e0b3ff714162..81da11dc42e8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -207,10 +207,10 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) { define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4) ; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) ; CHECK-NEXT: ret i1 [[TMP6]] @@ -239,12 +239,12 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 ; CHECK-NEXT: call void @use1(i1 [[TMP5]]) +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP8]], <4 x i1> [[TMP1]], i64 4) ; CHECK-NEXT: [[TMP6:%.*]] = freeze <8 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP6]]) ; CHECK-NEXT: ret i1 [[TMP7]] diff --git a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll index 25f161e9f127..250c60a61fea 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,X86 %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s --check-prefixes=CHECK,AARCH64 %} define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_diff_preds( @@ -28,14 +28,23 @@ define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) -; CHECK-NEXT: ret i1 [[TMP6]] +; X86-LABEL: @logical_and_icmp_clamp( +; X86-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) +; X86-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) +; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> +; X86-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4) +; X86-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] +; X86-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) +; X86-NEXT: ret i1 [[TMP6]] +; +; AARCH64-LABEL: @logical_and_icmp_clamp( +; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> +; AARCH64-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], +; AARCH64-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], +; AARCH64-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> +; AARCH64-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] +; AARCH64-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) +; AARCH64-NEXT: ret i1 [[TMP6]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1