From cf6a452cc734e00a1207514bb1fc2b123e31bb5f Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Mon, 31 Mar 2025 19:33:40 -0400
Subject: [PATCH] [SLP]Fix same/alternate analysis in split node analysis for
 compares

getSameOpcode in some cases may consider 2 compares as having same
opcode, even though previously they were considered as alternate. It may
happen, because getSameOpcode looses info about previous instructions
and their states. Need to use isAlternateInstruction function instead
for the correct analysis.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/133769
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  7 +++--
 .../X86/vector-reductions-logical.ll          | 14 ++++-----
 .../SLPVectorizer/X86/reduction-logical.ll    | 18 ++++++------
 .../SLPVectorizer/revec-reduction-logical.ll  | 29 ++++++++++++-------
 4 files changed, 39 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0679eac17658..18c896767b6d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9013,8 +9013,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         Op1Indices.set(Idx);
         continue;
       }
-      InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI);
-      if (NewS && !NewS.isAltShuffle()) {
+      if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
+           I->getOpcode() == LocalState.getOpcode()) ||
+          (LocalState.getAltOpcode() == LocalState.getOpcode() &&
+           !isAlternateInstruction(I, LocalState.getMainOp(),
+                                   LocalState.getAltOp(), *TLI))) {
         Op1.push_back(V);
         Op1Indices.set(Idx);
         continue;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
index 035f3f145bf8..ee51e467c5b8 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
@@ -86,10 +86,9 @@ return:
 define float @test_merge_anyof_v4sf(<4 x float> %t) {
 ; CHECK-LABEL: @test_merge_anyof_v4sf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[T:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <4 x float> [[T]], splat (float 1.000000e+00)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
 ; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0
@@ -401,10 +400,9 @@ return:
 define float @test_merge_anyof_v4si(<4 x i32> %t) {
 ; CHECK-LABEL: @test_merge_anyof_v4si(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt <4 x i32> [[T:%.*]], splat (i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[T]], splat (i32 255)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
 ; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index e0b3ff714162..81da11dc42e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -207,10 +207,10 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) {
 
 define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_clamp(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4)
 ; CHECK-NEXT:    [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
 ; CHECK-NEXT:    ret i1 [[TMP6]]
@@ -239,12 +239,12 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
 
 define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
 ; CHECK-NEXT:    call void @use1(i1 [[TMP5]])
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP8]], <4 x i1> [[TMP1]], i64 4)
 ; CHECK-NEXT:    [[TMP6:%.*]] = freeze <8 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP6]])
 ; CHECK-NEXT:    ret i1 [[TMP7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll
index 25f161e9f127..250c60a61fea 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,X86 %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s --check-prefixes=CHECK,AARCH64 %}
 
 define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_diff_preds(
@@ -28,14 +28,23 @@ define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
-; CHECK-NEXT:    ret i1 [[TMP6]]
+; X86-LABEL: @logical_and_icmp_clamp(
+; X86-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
+; X86-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
+; X86-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4)
+; X86-NEXT:    [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
+; X86-NEXT:    ret i1 [[TMP6]]
+;
+; AARCH64-LABEL: @logical_and_icmp_clamp(
+; AARCH64-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
+; AARCH64-NEXT:    [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
+; AARCH64-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
+; AARCH64-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
+; AARCH64-NEXT:    ret i1 [[TMP6]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1