This improves the overall minbitwidth analysis in SLP. It allows analyzing
trees with store/insertelement root nodes. Also, instead of using a single
minbitwidth detected at the very first analysis stage, it tries to detect
the best one for each trunc/ext subtree in the graph and uses it for that
subtree.
This results in better code and less vector register pressure.
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0%
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0%
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16
x i16> instead of <16 x i32>, and the zext/trunc are removed. In other
places the last vector zext/sext is removed and replaced by an
extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by
reduce or <4 x i8>
Spec2017/imagick - Removed extra zext from 2 packs of the operations.
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar
zext
Spec2017/blender - the whole bunch of vector zext/sext replaced by
extractelement+scalar zext/sext, some extra code vectorized in smaller
types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.
Original Pull Request: https://github.com/llvm/llvm-project/pull/84334
This patch has the same functionality as the original patch (no test
changes, no changes in benchmarks); it only adds some compile-time
improvements and fixes for the xxhash unittest failure that was
discovered in the previous version of the patch.
Reviewers:
Pull Request: https://github.com/llvm/llvm-project/pull/84536
90 lines
3.4 KiB
LLVM
90 lines
3.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
|
|
; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
|
|
|
|
; Test input: eight scalar i16 stores to byte offsets 16, 18, ..., 30 from a
; null base pointer. Each stored value is the trunc to i16 of an i32 `or`
; tree built from zext/sext of i16 0 and `ashr i32 0, 0`.
; Per the CHECK lines below, the expected output is a single <8 x i16> store
; at offset 16, with the `or` operations performed on <8 x i1> (truncated
; from <8 x i32>) and the result zero-extended to <8 x i16>.
define void @h() {
|
|
; CHECK-LABEL: define void @h() {
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
|
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
|
|
; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
; Group 0: two `or` trees over zext/sext/ashr values, each truncated to i16
; and stored at byte offsets 16 (%arrayidx2) and 24 (%arrayidx18).
%conv9 = zext i16 0 to i32
|
|
%arrayidx2 = getelementptr i8, ptr null, i64 16
|
|
%conv310 = zext i16 0 to i32
|
|
%add4 = or i32 %conv310, %conv9
|
|
%sub = or i32 %conv9, %conv310
|
|
%conv15 = sext i16 0 to i32
|
|
%shr = ashr i32 0, 0
|
|
%arrayidx18 = getelementptr i8, ptr null, i64 24
|
|
%conv19 = sext i16 0 to i32
|
|
%sub20 = or i32 %shr, %conv19
|
|
%shr29 = ashr i32 0, 0
|
|
%add30 = or i32 %shr29, %conv15
|
|
%sub39 = or i32 %sub, %sub20
|
|
%conv40 = trunc i32 %sub39 to i16
|
|
store i16 %conv40, ptr %arrayidx2, align 2
|
|
%sub44 = or i32 %add4, %add30
|
|
%conv45 = trunc i32 %sub44 to i16
|
|
store i16 %conv45, ptr %arrayidx18, align 2
|
|
; Group 1: stores to byte offsets 18 and 26. Unlike group 0, a single zext
; is combined with a literal 0 in the first-level `or` instructions.
%arrayidx2.1 = getelementptr i8, ptr null, i64 18
|
|
%conv3.112 = zext i16 0 to i32
|
|
%add4.1 = or i32 %conv3.112, 0
|
|
%sub.1 = or i32 0, %conv3.112
|
|
%conv15.1 = sext i16 0 to i32
|
|
%shr.1 = ashr i32 0, 0
|
|
%arrayidx18.1 = getelementptr i8, ptr null, i64 26
|
|
%conv19.1 = sext i16 0 to i32
|
|
%sub20.1 = or i32 %shr.1, %conv19.1
|
|
%shr29.1 = ashr i32 0, 0
|
|
%add30.1 = or i32 %shr29.1, %conv15.1
|
|
%sub39.1 = or i32 %sub.1, %sub20.1
|
|
%conv40.1 = trunc i32 %sub39.1 to i16
|
|
store i16 %conv40.1, ptr %arrayidx2.1, align 2
|
|
%sub44.1 = or i32 %add4.1, %add30.1
|
|
%conv45.1 = trunc i32 %sub44.1 to i16
|
|
store i16 %conv45.1, ptr %arrayidx18.1, align 2
|
|
; Group 2: stores to byte offsets 20 and 28. Two separate zexts are each
; or'd with a literal 0.
%conv.213 = zext i16 0 to i32
|
|
%arrayidx2.2 = getelementptr i8, ptr null, i64 20
|
|
%conv3.214 = zext i16 0 to i32
|
|
%add4.2 = or i32 0, %conv.213
|
|
%sub.2 = or i32 0, %conv3.214
|
|
%conv15.2 = sext i16 0 to i32
|
|
%shr.2 = ashr i32 0, 0
|
|
%arrayidx18.2 = getelementptr i8, ptr null, i64 28
|
|
%conv19.2 = sext i16 0 to i32
|
|
%sub20.2 = or i32 %shr.2, %conv19.2
|
|
%shr29.2 = ashr i32 0, 0
|
|
%add30.2 = or i32 %shr29.2, %conv15.2
|
|
%sub39.2 = or i32 %sub.2, %sub20.2
|
|
%conv40.2 = trunc i32 %sub39.2 to i16
|
|
store i16 %conv40.2, ptr %arrayidx2.2, align 2
|
|
%sub44.2 = or i32 %add4.2, %add30.2
|
|
%conv45.2 = trunc i32 %sub44.2 to i16
|
|
store i16 %conv45.2, ptr %arrayidx18.2, align 2
|
|
; Group 3: stores to byte offsets 22 and 30; same instruction shape as
; group 2.
%conv.315 = zext i16 0 to i32
|
|
%arrayidx2.3 = getelementptr i8, ptr null, i64 22
|
|
%conv3.316 = zext i16 0 to i32
|
|
%add4.3 = or i32 0, %conv.315
|
|
%sub.3 = or i32 0, %conv3.316
|
|
%conv15.3 = sext i16 0 to i32
|
|
%shr.3 = ashr i32 0, 0
|
|
%arrayidx18.3 = getelementptr i8, ptr null, i64 30
|
|
%conv19.3 = sext i16 0 to i32
|
|
%sub20.3 = or i32 %shr.3, %conv19.3
|
|
%shr29.3 = ashr i32 0, 0
|
|
%add30.3 = or i32 %shr29.3, %conv15.3
|
|
%sub39.3 = or i32 %sub.3, %sub20.3
|
|
%conv40.3 = trunc i32 %sub39.3 to i16
|
|
store i16 %conv40.3, ptr %arrayidx2.3, align 2
|
|
%sub44.3 = or i32 %add4.3, %add30.3
|
|
%conv45.3 = trunc i32 %sub44.3 to i16
|
|
store i16 %conv45.3, ptr %arrayidx18.3, align 2
|
|
ret void
|
|
}
|