Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
Alexey Bataev 32994cc0d6 [SLP]Improve findReusedOrderedScalars and graph rotation.
Patch syncs the code in findReusedOrderedScalars with cost
estimation/codegen. It tries to use similar logic to better determine
best order.
Before, it just tried to find a previously vectorized node without
checking if it is possible to use the vectorized value in the shuffle.
Now it relies on the more generalized version. If it determines that
a single vector must be reordered (using the same mechanism as codegen and
cost estimation), it generates a better order.

The comparison between new/ref ordering:

Metric: SLP.NumVectorInstructions

Program                                                                                                                                                SLP.NumVectorInstructions
                                                                                                                                                       results                   results0 diff
                                                                                               test-suite :: MultiSource/Benchmarks/nbench/nbench.test   139.00                    140.00   0.7%
                                                                             test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test   344.00                    346.00   0.6%
                                                                                        test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test  1293.00                   1292.00  -0.1%
                                                                                test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test  5176.00                   5170.00  -0.1%
                                                                                        test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test  5173.00                   5167.00  -0.1%
                                                                                test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 11692.00                  11660.00  -0.3%
                                                                                     test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test  1621.00                   1615.00  -0.4%
                                                                                             test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test   795.00                    792.00  -0.4%
                                                                              test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26499.00                  26338.00  -0.6%
                                                                                               test-suite :: MultiSource/Benchmarks/Bullet/bullet.test  7343.00                   7281.00  -0.8%
                                                                                          test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test  1104.00                   1094.00  -0.9%
                                                                                          test-suite :: MultiSource/Applications/JM/lencod/lencod.test  2216.00                   2180.00  -1.6%
                                                                                            test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test   787.00                    637.00 -19.1%

Less than 0% is better.
Most of the benchmarks see more vectorized code. The first ones just
have shuffles removed.

The ordering analysis still may require some improvements (e.g. for
alternate nodes), but this one should produce better results.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/77529
2024-02-22 14:32:15 -05:00

75 lines
3.8 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s
; Scalar input for the SLP vectorizer. Four pointers are loaded from the
; consecutive slots %arg[7] .. %arg[10]; four more pointers are formed by
; indexing the parameters %i233, %i235, %i237 and %i227 with %i241. Sixteen
; `icmp ult` compares mix the loaded pointers, the indexed pointers and null;
; they are paired up by eight `or`s whose results feed one `and` chain that
; produces the return value.
; The autogenerated assertions below expect a single <4 x ptr> load, two
; <8 x ptr> compares, one <8 x i1> `or` and an llvm.vector.reduce.and call.
; Note that the loaded values (%i2271, %i2332, %i2353, %i2374) are distinct
; from the similarly named parameters (%i227, %i233, %i235, %i237).
define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) {
; CHECK-LABEL: @test(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
; CHECK-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
; CHECK-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
; CHECK-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
; CHECK-NEXT: [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false
; CHECK-NEXT: ret i1 [[OP_RDX]]
;
bb:
; Four pointer loads from consecutive slots %arg[7] .. %arg[10]. The expected
; output replaces them with one <4 x ptr> load from %i226.
%i226 = getelementptr ptr, ptr %arg, i32 7
%i2271 = load ptr, ptr %i226, align 8
%i232 = getelementptr ptr, ptr %arg, i32 8
%i2332 = load ptr, ptr %i232, align 8
%i234 = getelementptr ptr, ptr %arg, i32 9
%i2353 = load ptr, ptr %i234, align 8
%i236 = getelementptr ptr, ptr %arg, i32 10
%i2374 = load ptr, ptr %i236, align 8
; First compares, interleaved with the four address computations that index
; each pointer parameter by %i241 in units of double.
%i240 = icmp ult ptr null, %i2332
%i242 = getelementptr double, ptr %i233, i64 %i241
%i243 = icmp ult ptr %i242, null
%i245 = getelementptr double, ptr %i235, i64 %i241
%i247 = icmp ult ptr null, %i2374
%i248 = getelementptr double, ptr %i237, i64 %i241
%i249 = icmp ult ptr %i248, null
%i250 = getelementptr double, ptr %i227, i64 %i241
; Remaining compares. Several repeat earlier ones with identical operands
; (%i259 repeats %i243, %i260 repeats %i240, %i261 repeats %i247 and
; %i262 repeats %i249).
%i251 = icmp ult ptr %i250, %i2332
%i252 = icmp ult ptr %i242, %i2271
%i253 = icmp ult ptr %i250, %i2353
%i254 = icmp ult ptr %i245, %i2271
%i255 = icmp ult ptr %i250, null
%i256 = icmp ult ptr null, %i2271
%i257 = icmp ult ptr null, %i2353
%i258 = icmp ult ptr %i245, null
%i259 = icmp ult ptr %i242, null
%i260 = icmp ult ptr null, %i2332
%i261 = icmp ult ptr null, %i2374
%i262 = icmp ult ptr %i248, null
; Pairwise `or` of the compares, accumulated through a scalar `and` chain.
; The chain is seeded with a constant `false` operand, which the expected
; output keeps as the final scalar `and` (OP_RDX) applied after the vector
; reduction.
%i263 = or i1 %i240, %i243
%i265 = and i1 %i263, false
%i266 = or i1 %i247, %i249
%i267 = and i1 %i265, %i266
%i268 = or i1 %i251, %i252
%i269 = and i1 %i267, %i268
%i270 = or i1 %i253, %i254
%i271 = and i1 %i269, %i270
%i272 = or i1 %i255, %i256
%i273 = and i1 %i271, %i272
%i274 = or i1 %i257, %i258
%i275 = and i1 %i273, %i274
%i276 = or i1 %i259, %i260
%i277 = and i1 %i275, %i276
%i278 = or i1 %i261, %i262
%i279 = and i1 %i277, %i278
ret i1 %i279
}