The patch syncs the code in findReusedOrderedScalars with cost
estimation/codegen. It tries to use similar logic to better determine
the best order.
Before, it just tried to find a previously vectorized node without
checking whether the vectorized value can be used in the shuffle.
Now it relies on the more generalized version. If it determines that
a single vector must be reordered (using the same mechanism as codegen
and cost estimation), it generates a better order.
The comparison between new/ref ordering:
Metric: SLP.NumVectorInstructions
Program SLP.NumVectorInstructions
results results0 diff
test-suite :: MultiSource/Benchmarks/nbench/nbench.test 139.00 140.00 0.7%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 344.00 346.00 0.6%
test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 1293.00 1292.00 -0.1%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 5176.00 5170.00 -0.1%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 5173.00 5167.00 -0.1%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 11692.00 11660.00 -0.3%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 1621.00 1615.00 -0.4%
test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test 795.00 792.00 -0.4%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26499.00 26338.00 -0.6%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 7343.00 7281.00 -0.8%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 1104.00 1094.00 -0.9%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 2216.00 2180.00 -1.6%
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 787.00 637.00 -19.1%
Less than 0% is better.
Most of the benchmarks see more vectorized code. The first ones just
have shuffles removed.
The ordering analysis may still require some improvements (e.g. for
alternate nodes), but this one should produce better results.
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/77529
75 lines
3.8 KiB
LLVM
75 lines
3.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s
|
|
|
|
; Boolean reduction over pointer compares: sixteen `icmp ult ptr` results are
; OR-ed in pairs, and the eight `or` results are folded into one `and` chain
; whose first link is `and i1 %i263, false`. The compared operands are null,
; four pointers loaded from consecutive slots %arg[7]..%arg[10], and four
; `getelementptr double` values that all use the index %i241.
define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) {
|
|
; CHECK-LABEL: @test(
|
|
; CHECK-NEXT: bb:
|
|
; CHECK-NEXT: [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
|
|
; CHECK-NEXT: [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
|
|
; CHECK-NEXT: [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
|
|
; CHECK-NEXT: [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
|
|
; CHECK-NEXT: [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
|
|
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
|
|
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
|
|
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
|
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
|
|
; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
|
|
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
|
|
; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false
|
|
; CHECK-NEXT: ret i1 [[OP_RDX]]
|
|
;
|
|
bb:
|
|
; Four scalar pointer loads from consecutive slots %arg[7]..%arg[10]; the
; expected output above replaces them with a single <4 x ptr> load from %i226.
%i226 = getelementptr ptr, ptr %arg, i32 7
|
|
%i2271 = load ptr, ptr %i226, align 8
|
|
%i232 = getelementptr ptr, ptr %arg, i32 8
|
|
%i2332 = load ptr, ptr %i232, align 8
|
|
%i234 = getelementptr ptr, ptr %arg, i32 9
|
|
%i2353 = load ptr, ptr %i234, align 8
|
|
%i236 = getelementptr ptr, ptr %arg, i32 10
|
|
%i2374 = load ptr, ptr %i236, align 8
|
|
; Scalar compares that feed the reduction, interleaved with the GEPs that
; produce their left- or right-hand operands.
%i240 = icmp ult ptr null, %i2332
|
|
%i242 = getelementptr double, ptr %i233, i64 %i241
|
|
%i243 = icmp ult ptr %i242, null
|
|
%i245 = getelementptr double, ptr %i235, i64 %i241
|
|
%i247 = icmp ult ptr null, %i2374
|
|
%i248 = getelementptr double, ptr %i237, i64 %i241
|
|
%i249 = icmp ult ptr %i248, null
|
|
%i250 = getelementptr double, ptr %i227, i64 %i241
|
|
%i251 = icmp ult ptr %i250, %i2332
|
|
%i252 = icmp ult ptr %i242, %i2271
|
|
%i253 = icmp ult ptr %i250, %i2353
|
|
%i254 = icmp ult ptr %i245, %i2271
|
|
%i255 = icmp ult ptr %i250, null
|
|
%i256 = icmp ult ptr null, %i2271
|
|
%i257 = icmp ult ptr null, %i2353
|
|
%i258 = icmp ult ptr %i245, null
|
|
; The last four compares repeat earlier ones operand-for-operand
; (%i259 = %i243, %i260 = %i240, %i261 = %i247, %i262 = %i249).
; NOTE(review): presumably this repetition is what exercises the
; reused-scalar ordering path under test -- confirm against the pass.
%i259 = icmp ult ptr %i242, null
|
|
%i260 = icmp ult ptr null, %i2332
|
|
%i261 = icmp ult ptr null, %i2374
|
|
%i262 = icmp ult ptr %i248, null
|
|
; Reduction: each `or` joins two compares; the `and` chain starts with
; `and i1 %i263, false` and then accumulates the remaining `or` results.
%i263 = or i1 %i240, %i243
|
|
%i265 = and i1 %i263, false
|
|
%i266 = or i1 %i247, %i249
|
|
%i267 = and i1 %i265, %i266
|
|
%i268 = or i1 %i251, %i252
|
|
%i269 = and i1 %i267, %i268
|
|
%i270 = or i1 %i253, %i254
|
|
%i271 = and i1 %i269, %i270
|
|
%i272 = or i1 %i255, %i256
|
|
%i273 = and i1 %i271, %i272
|
|
%i274 = or i1 %i257, %i258
|
|
%i275 = and i1 %i273, %i274
|
|
%i276 = or i1 %i259, %i260
|
|
%i277 = and i1 %i275, %i276
|
|
%i278 = or i1 %i261, %i262
|
|
%i279 = and i1 %i277, %i278
|
|
; Final value of the `and` chain is the function result.
ret i1 %i279
|
|
}
|