clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
Alexey Bataev 32994cc0d6 [SLP] Improve findReusedOrderedScalars and graph rotation.
The patch syncs the code in findReusedOrderedScalars with cost
estimation/codegen, using similar logic to better determine the best
order.
Before, it just tried to find a previously vectorized node without
checking whether the vectorized value could actually be used in the
shuffle. Now it relies on a more generalized version: if it determines
that a single vector must be reordered (using the same mechanism as
codegen and cost estimation), it generates a better order.
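
A minimal LLVM IR sketch of the idea (a hypothetical example, not taken
from the patch): suppose the pair {%a, %b} is already available as a
vectorized node. A later tree entry that needs the scalars in the order
{%b, %a} can reuse the vectorized value through a single lane permutation
instead of re-gathering the scalars; and when the cost model favors that
entry, the graph can be rotated to that order so even the shuffle
disappears.

define <2 x i64> @reuse_via_shuffle(i64 %a, i64 %b) {
  ; %v models the previously vectorized node holding {a, b}.
  %v.0 = insertelement <2 x i64> poison, i64 %a, i32 0
  %v = insertelement <2 x i64> %v.0, i64 %b, i32 1
  ; A consumer needing {b, a} reuses %v with one single-vector shuffle
  ; rather than rebuilding the vector from scalars.
  %r = shufflevector <2 x i64> %v, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  ret <2 x i64> %r
}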

A comparison between the new and reference orderings:

Metric: SLP.NumVectorInstructions

Program                                                                     results  results0   diff
test-suite :: MultiSource/Benchmarks/nbench/nbench.test                      139.00    140.00   0.7%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test    344.00    346.00   0.6%
test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test             1293.00   1292.00  -0.1%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test      5176.00   5170.00  -0.1%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test              5173.00   5167.00  -0.1%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test     11692.00  11660.00  -0.3%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test           1621.00   1615.00  -0.4%
test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test                    795.00    792.00  -0.4%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test   26499.00  26338.00  -0.6%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test                     7343.00   7281.00  -0.8%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test                1104.00   1094.00  -0.9%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test                2216.00   2180.00  -1.6%
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test                   787.00    637.00 -19.1%

Diffs below 0% are better. Most of the benchmarks see more vectorized
code; the first two entries just have shuffles removed.

The ordering analysis may still require some improvements (e.g. for
alternate nodes), but this patch should produce better results.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/77529
2024-02-22 14:32:15 -05:00


; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX
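; The +avx and +avx2 runs share the AVX check prefix because both produce
; identical output for this test.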
%class.1 = type { %class.2 }
%class.2 = type { %"class.3" }
%"class.3" = type { %"struct.1", i64 }
%"struct.1" = type { [8 x i64] }
$_ZN1C10SwitchModeEv = comdat any
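; With AVX the two and/store pairs below collapse into a single <2 x i64>
; and + store, and the already-computed %or.1 is reused by inserting it into
; the vector operand instead of being re-vectorized; with only SSE2 the
; operations stay scalar.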
; Function Attrs: uwtable
define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
; SSE-LABEL: @_ZN1C10SwitchModeEv(
; SSE-NEXT: for.body.lr.ph.i:
; SSE-NEXT: [[OR_1:%.*]] = or i64 undef, 1
; SSE-NEXT: store i64 [[OR_1]], ptr undef, align 8
; SSE-NEXT: [[FOO_3:%.*]] = load i64, ptr undef, align 8
; SSE-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], ptr undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
; SSE-NEXT: [[FOO_4:%.*]] = load i64, ptr [[FOO_2]], align 8
; SSE-NEXT: [[BAR5:%.*]] = load i64, ptr undef, align 8
; SSE-NEXT: [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]]
; SSE-NEXT: [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]]
; SSE-NEXT: store i64 [[AND_2]], ptr undef, align 8
; SSE-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], ptr undef, i64 0, i32 0, i32 0, i32 0, i64 1
; SSE-NEXT: store i64 [[AND_1]], ptr [[BAR4]], align 8
; SSE-NEXT: ret void
;
; AVX-LABEL: @_ZN1C10SwitchModeEv(
; AVX-NEXT: for.body.lr.ph.i:
; AVX-NEXT: [[OR_1:%.*]] = or i64 undef, 1
; AVX-NEXT: store i64 [[OR_1]], ptr undef, align 8
; AVX-NEXT: [[BAR5:%.*]] = load i64, ptr undef, align 8
; AVX-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr undef, align 8
; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0
; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[BAR5]], i32 1
; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], [[TMP0]]
; AVX-NEXT: store <2 x i64> [[TMP3]], ptr undef, align 8
; AVX-NEXT: ret void
;
for.body.lr.ph.i:
%or.1 = or i64 undef, 1
store i64 %or.1, ptr undef, align 8
%foo.3 = load i64, ptr undef, align 8
%foo.2 = getelementptr inbounds %class.1, ptr undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
%foo.4 = load i64, ptr %foo.2, align 8
%bar5 = load i64, ptr undef, align 8
%and.2 = and i64 %or.1, %foo.3
%and.1 = and i64 %bar5, %foo.4
store i64 %and.2, ptr undef, align 8
%bar4 = getelementptr inbounds %class.2, ptr undef, i64 0, i32 0, i32 0, i32 0, i64 1
store i64 %and.1, ptr %bar4, align 8
ret void
}
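; PR35497: both lowerings vectorize the shl/and/lshr/add chains into
; <2 x i64> operations, but the SSE lowering needs an extra lane-reversing
; shufflevector to put the first vectorized pair into the order its user
; expects, while the AVX lowering builds the vector in that order up front.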
; Function Attrs: norecurse nounwind uwtable
define void @pr35497() local_unnamed_addr #0 {
; SSE-LABEL: @pr35497(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1
; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef
; SSE-NEXT: store i64 [[ADD]], ptr undef, align 1
; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 undef>, i64 [[TMP0]], i32 0
; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
; SSE-NEXT: ret void
;
; AVX-LABEL: @pr35497(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1
; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef
; AVX-NEXT: store i64 [[ADD]], ptr undef, align 1
; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1
; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1
; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1
; AVX-NEXT: ret void
;
entry:
%0 = load i64, ptr undef, align 1
%and = shl i64 %0, 2
%shl = and i64 %and, 20
%add = add i64 undef, undef
store i64 %add, ptr undef, align 1
%arrayidx2.1 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 5
%and.1 = shl i64 undef, 2
%shl.1 = and i64 %and.1, 20
%shr.1 = lshr i64 undef, 6
%add.1 = add nuw nsw i64 %shl, %shr.1
%arrayidx2.2 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
%shr.2 = lshr i64 undef, 6
%add.2 = add nuw nsw i64 %shl.1, %shr.2
%and.4 = shl i64 %add, 2
%shl.4 = and i64 %and.4, 20
%arrayidx2.5 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 1
store i64 %add.1, ptr %arrayidx2.5, align 1
%and.5 = shl nuw nsw i64 %add.1, 2
%shl.5 = and i64 %and.5, 20
%shr.5 = lshr i64 %add.1, 6
%add.5 = add nuw nsw i64 %shl.4, %shr.5
store i64 %add.5, ptr %arrayidx2.1, align 1
store i64 %add.2, ptr undef, align 1
%shr.6 = lshr i64 %add.2, 6
%add.6 = add nuw nsw i64 %shl.5, %shr.6
store i64 %add.6, ptr %arrayidx2.2, align 1
ret void
}