This patch syncs the code in findReusedOrderedScalars with the cost
estimation/codegen: it uses similar logic to better determine the best
order.
Before, it just tried to find a previously vectorized node, without
checking whether the vectorized value can actually be used in the
shuffle. Now it relies on the more generalized version: if it determines
(using the same mechanism as codegen and cost estimation) that a single
vector must be reordered, it generates a better order.
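To illustrate what reordering a single vector means at the IR level: it
is a one-input shufflevector (second operand poison) whose mask permutes
the lanes. A minimal sketch, where %v stands for a hypothetical input
vector (the same pattern appears in the SSE check lines of @pr35497
below):

  ; Swap the two lanes of %v; choosing the node order with the same
  ; mechanism as the cost model keeps such shuffles cheap or removes
  ; them entirely.
  %reordered = shufflevector <2 x i64> %v, <2 x i64> poison, <2 x i32> <i32 1, i32 0>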
The comparison between new/ref ordering:

Metric: SLP.NumVectorInstructions

Program                                                                     results   results0   diff
test-suite :: MultiSource/Benchmarks/nbench/nbench.test                      139.00     140.00    0.7%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test    344.00     346.00    0.6%
test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test             1293.00    1292.00   -0.1%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test      5176.00    5170.00   -0.1%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test              5173.00    5167.00   -0.1%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test     11692.00   11660.00   -0.3%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test           1621.00    1615.00   -0.4%
test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test                    795.00     792.00   -0.4%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test   26499.00   26338.00   -0.6%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test                     7343.00    7281.00   -0.8%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test                1104.00    1094.00   -0.9%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test                2216.00    2180.00   -1.6%
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test                   787.00     637.00  -19.1%
Less than 0% is better.
Most of the benchmarks see more vectorized code; the first ones just
have some shuffles removed.
The ordering analysis may still require some improvements (e.g. for
alternate nodes), but this one should produce better results.
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/77529
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -mattr=+avx -S | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefix=AVX

%class.1 = type { %class.2 }
%class.2 = type { %"class.3" }
%"class.3" = type { %"struct.1", i64 }
%"struct.1" = type { [8 x i64] }

$_ZN1C10SwitchModeEv = comdat any

; Function Attrs: uwtable
define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
; SSE-LABEL: @_ZN1C10SwitchModeEv(
; SSE-NEXT:  for.body.lr.ph.i:
; SSE-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
; SSE-NEXT:    store i64 [[OR_1]], ptr undef, align 8
; SSE-NEXT:    [[FOO_3:%.*]] = load i64, ptr undef, align 8
; SSE-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], ptr undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
; SSE-NEXT:    [[FOO_4:%.*]] = load i64, ptr [[FOO_2]], align 8
; SSE-NEXT:    [[BAR5:%.*]] = load i64, ptr undef, align 8
; SSE-NEXT:    [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]]
; SSE-NEXT:    [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]]
; SSE-NEXT:    store i64 [[AND_2]], ptr undef, align 8
; SSE-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], ptr undef, i64 0, i32 0, i32 0, i32 0, i64 1
; SSE-NEXT:    store i64 [[AND_1]], ptr [[BAR4]], align 8
; SSE-NEXT:    ret void
;
; AVX-LABEL: @_ZN1C10SwitchModeEv(
; AVX-NEXT:  for.body.lr.ph.i:
; AVX-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
; AVX-NEXT:    store i64 [[OR_1]], ptr undef, align 8
; AVX-NEXT:    [[BAR5:%.*]] = load i64, ptr undef, align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr undef, align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[BAR5]], i32 1
; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], [[TMP0]]
; AVX-NEXT:    store <2 x i64> [[TMP3]], ptr undef, align 8
; AVX-NEXT:    ret void
;
for.body.lr.ph.i:
  %or.1 = or i64 undef, 1
  store i64 %or.1, ptr undef, align 8
  %foo.3 = load i64, ptr undef, align 8
  %foo.2 = getelementptr inbounds %class.1, ptr undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
  %foo.4 = load i64, ptr %foo.2, align 8
  %bar5 = load i64, ptr undef, align 8
  %and.2 = and i64 %or.1, %foo.3
  %and.1 = and i64 %bar5, %foo.4
  store i64 %and.2, ptr undef, align 8
  %bar4 = getelementptr inbounds %class.2, ptr undef, i64 0, i32 0, i32 0, i32 0, i64 1
  store i64 %and.1, ptr %bar4, align 8
  ret void
}

; Function Attrs: norecurse nounwind uwtable
define void @pr35497() local_unnamed_addr #0 {
; SSE-LABEL: @pr35497(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load i64, ptr undef, align 1
; SSE-NEXT:    [[ADD:%.*]] = add i64 undef, undef
; SSE-NEXT:    store i64 [[ADD]], ptr undef, align 1
; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 undef>, i64 [[TMP0]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; SSE-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
; SSE-NEXT:    store <2 x i64> [[TMP5]], ptr undef, align 1
; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
; SSE-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
; SSE-NEXT:    [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
; SSE-NEXT:    store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
; SSE-NEXT:    ret void
;
; AVX-LABEL: @pr35497(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[TMP0:%.*]] = load i64, ptr undef, align 1
; AVX-NEXT:    [[ADD:%.*]] = add i64 undef, undef
; AVX-NEXT:    store i64 [[ADD]], ptr undef, align 1
; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1
; AVX-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
; AVX-NEXT:    store <2 x i64> [[TMP4]], ptr undef, align 1
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1
; AVX-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
; AVX-NEXT:    [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
; AVX-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
; AVX-NEXT:    [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
; AVX-NEXT:    store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1
; AVX-NEXT:    ret void
;
entry:
  %0 = load i64, ptr undef, align 1
  %and = shl i64 %0, 2
  %shl = and i64 %and, 20
  %add = add i64 undef, undef
  store i64 %add, ptr undef, align 1
  %arrayidx2.1 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 5
  %and.1 = shl i64 undef, 2
  %shl.1 = and i64 %and.1, 20
  %shr.1 = lshr i64 undef, 6
  %add.1 = add nuw nsw i64 %shl, %shr.1
  %arrayidx2.2 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
  %shr.2 = lshr i64 undef, 6
  %add.2 = add nuw nsw i64 %shl.1, %shr.2
  %and.4 = shl i64 %add, 2
  %shl.4 = and i64 %and.4, 20
  %arrayidx2.5 = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 1
  store i64 %add.1, ptr %arrayidx2.5, align 1
  %and.5 = shl nuw nsw i64 %add.1, 2
  %shl.5 = and i64 %and.5, 20
  %shr.5 = lshr i64 %add.1, 6
  %add.5 = add nuw nsw i64 %shl.4, %shr.5
  store i64 %add.5, ptr %arrayidx2.1, align 1
  store i64 %add.2, ptr undef, align 1
  %shr.6 = lshr i64 %add.2, 6
  %add.6 = add nuw nsw i64 %shl.5, %shr.6
  store i64 %add.6, ptr %arrayidx2.2, align 1
  ret void
}