[SLP]Fix same/alternate analysis in split node analysis for compares

getSameOpcode in some cases may consider 2 compares as having same
opcode, even though previously they were considered as alternate. It may
happen, because getSameOpcode looses info about previous instructions
and their states. Need to use isAlternateInstruction function instead
for the correct analysis.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/133769
This commit is contained in:
Alexey Bataev
2025-03-31 19:33:40 -04:00
committed by GitHub
parent 5b8d8bb90a
commit cf6a452cc7
4 changed files with 39 additions and 29 deletions

View File

@@ -9013,8 +9013,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Op1Indices.set(Idx);
continue;
}
InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI);
if (NewS && !NewS.isAltShuffle()) {
if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
I->getOpcode() == LocalState.getOpcode()) ||
(LocalState.getAltOpcode() == LocalState.getOpcode() &&
!isAlternateInstruction(I, LocalState.getMainOp(),
LocalState.getAltOp(), *TLI))) {
Op1.push_back(V);
Op1Indices.set(Idx);
continue;

View File

@@ -86,10 +86,9 @@ return:
define float @test_merge_anyof_v4sf(<4 x float> %t) {
; CHECK-LABEL: @test_merge_anyof_v4sf(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
; CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0
@@ -401,10 +400,9 @@ return:
define float @test_merge_anyof_v4si(<4 x i32> %t) {
; CHECK-LABEL: @test_merge_anyof_v4si(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T:%.*]], splat (i32 1)
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[T]], splat (i32 255)
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0

View File

@@ -207,10 +207,10 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) {
define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
; CHECK-LABEL: @logical_and_icmp_clamp(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4)
; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
; CHECK-NEXT: ret i1 [[TMP6]]
@@ -239,12 +239,12 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
; CHECK-NEXT: call void @use1(i1 [[TMP5]])
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP8]], <4 x i1> [[TMP1]], i64 4)
; CHECK-NEXT: [[TMP6:%.*]] = freeze <8 x i1> [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP6]])
; CHECK-NEXT: ret i1 [[TMP7]]

View File

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s %}
; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %}
; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,X86 %}
; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s --check-prefixes=CHECK,AARCH64 %}
define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
; CHECK-LABEL: @logical_and_icmp_diff_preds(
@@ -28,14 +28,23 @@ define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
}
define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
; CHECK-LABEL: @logical_and_icmp_clamp(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
; CHECK-NEXT: ret i1 [[TMP6]]
; X86-LABEL: @logical_and_icmp_clamp(
; X86-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
; X86-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; X86-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4)
; X86-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
; X86-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
; X86-NEXT: ret i1 [[TMP6]]
;
; AARCH64-LABEL: @logical_and_icmp_clamp(
; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; AARCH64-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; AARCH64-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
; AARCH64-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; AARCH64-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
; AARCH64-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
; AARCH64-NEXT: ret i1 [[TMP6]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1