This improves the overall minbitwidth analysis in SLP. It allows
analyzing trees with store/insertelement root nodes. Also, instead of
using a single minbitwidth detected during the very first analysis
stage, it tries to detect the best one for each trunc/ext subtree in
the graph and uses it for that subtree.
This results in better code and less vector register pressure.
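As a minimal sketch of the store-rooted case (a hypothetical function, not
taken from any of the benchmarks below): four i8 stores are fed by i32 adds
through trunc, so the whole subtree only needs 8 bits and can be vectorized
directly in i8 instead of in i32 followed by a vector trunc.

; Hypothetical example: the adds feeding the i8 stores can be vectorized
; as <4 x i8> rather than <4 x i32> + trunc once the store-rooted tree is
; analyzed.
define void @narrow_store_tree(ptr %p, i32 %x) {
  %a0 = add i32 %x, 1
  %t0 = trunc i32 %a0 to i8
  store i8 %t0, ptr %p, align 1
  %a1 = add i32 %x, 2
  %t1 = trunc i32 %a1 to i8
  %p1 = getelementptr inbounds i8, ptr %p, i64 1
  store i8 %t1, ptr %p1, align 1
  %a2 = add i32 %x, 3
  %t2 = trunc i32 %a2 to i8
  %p2 = getelementptr inbounds i8, ptr %p, i64 2
  store i8 %t2, ptr %p2, align 1
  %a3 = add i32 %x, 4
  %t3 = trunc i32 %a3 to i8
  %p3 = getelementptr inbounds i8, ptr %p, i64 3
  store i8 %t3, ptr %p3, align 1
  ret void
}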
Metric: size..text
Program                                                                        results      results0     diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0%
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0%
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole idct4x4dc function is vectorized using <16
x i16> instead of <16 x i32>, and the zext/trunc instructions are
removed. In other places the last vector zext/sext is removed and
replaced by an extractelement + scalar zext/sext pair (see the sketch
after these notes).
MultiSource/Benchmarks/Bullet/bullet - a reduce-or over <4 x i32> is
replaced by a reduce-or over <4 x i8>.
Spec2017/imagick - removed extra zexts from two packs of operations.
Spec2017/parest - removed an extra zext, replaced by an
extractelement + scalar zext.
Spec2017/blender - a number of vector zext/sext instructions are
replaced by extractelement + scalar zext/sext, and some extra code is
vectorized in smaller types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.
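To illustrate the extractelement + scalar zext/sext pattern mentioned for
x264, parest and blender above (a hand-written sketch, not code taken from
those benchmarks): when only one lane of a widened vector is actually used,
extending just that lane as a scalar avoids extending the whole vector.

; Before: the whole vector is widened, then a single lane is read.
;   %wide = zext <4 x i16> %v to <4 x i32>
;   %lane = extractelement <4 x i32> %wide, i32 0
; After: extract the narrow lane and extend it as a scalar.
;   %lane16 = extractelement <4 x i16> %v, i32 0
;   %lane = zext i16 %lane16 to i32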
Original Pull Request: https://github.com/llvm/llvm-project/pull/84334
The patch has the same functionality as the original one (no test
changes, no changes in benchmarks); it just adds some compile-time
improvements plus fixes for the xxhash unittest issue discovered
earlier with the previous version of the patch.
Pull Request: https://github.com/llvm/llvm-project/pull/84536
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s

%"struct.std::array" = type { [32 x i8] }

; Function Attrs: nounwind uwtable
define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() unnamed_addr #0 align 2 {
; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]]
; CHECK: if.then22.i:
; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1
; CHECK-NEXT: unreachable
; CHECK: if.end50.i:
; CHECK-NEXT: ret void
;
entry:
  br i1 undef, label %if.end50.i, label %if.then22.i

if.then22.i: ; preds = %entry
  %sub.i = add nsw i32 undef, -1
  %conv31.i = and i32 undef, %sub.i
  %0 = trunc i32 %sub.i to i8
  %conv.i.i1199 = and i8 %0, 1
  store i8 %conv.i.i1199, ptr undef, align 1
  %shr.i.i = lshr i32 %conv31.i, 1
  %1 = trunc i32 %shr.i.i to i8
  %conv.1.i.i = and i8 %1, 1
  %arrayidx.i.i7.1.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 1
  store i8 %conv.1.i.i, ptr %arrayidx.i.i7.1.i.i, align 1
  %shr.1.i.i = lshr i32 %conv31.i, 2
  %2 = trunc i32 %shr.1.i.i to i8
  %conv.2.i.i = and i8 %2, 1
  %arrayidx.i.i7.2.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 2
  store i8 %conv.2.i.i, ptr %arrayidx.i.i7.2.i.i, align 1
  %shr.2.i.i = lshr i32 %conv31.i, 3
  %3 = trunc i32 %shr.2.i.i to i8
  %conv.3.i.i = and i8 %3, 1
  %arrayidx.i.i7.3.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 3
  store i8 %conv.3.i.i, ptr %arrayidx.i.i7.3.i.i, align 1
  %shr.3.i.i = lshr i32 %conv31.i, 4
  %4 = trunc i32 %shr.3.i.i to i8
  %conv.4.i.i = and i8 %4, 1
  %arrayidx.i.i7.4.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 4
  store i8 %conv.4.i.i, ptr %arrayidx.i.i7.4.i.i, align 1
  %shr.4.i.i = lshr i32 %conv31.i, 5
  %5 = trunc i32 %shr.4.i.i to i8
  %conv.5.i.i = and i8 %5, 1
  %arrayidx.i.i7.5.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 5
  store i8 %conv.5.i.i, ptr %arrayidx.i.i7.5.i.i, align 1
  %shr.5.i.i = lshr i32 %conv31.i, 6
  %6 = trunc i32 %shr.5.i.i to i8
  %conv.6.i.i = and i8 %6, 1
  %arrayidx.i.i7.6.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 6
  store i8 %conv.6.i.i, ptr %arrayidx.i.i7.6.i.i, align 1
  %shr.6.i.i = lshr i32 %conv31.i, 7
  %7 = trunc i32 %shr.6.i.i to i8
  %conv.7.i.i = and i8 %7, 1
  %arrayidx.i.i7.7.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 7
  store i8 %conv.7.i.i, ptr %arrayidx.i.i7.7.i.i, align 1
  %shr.7.i.i = lshr i32 %conv31.i, 8
  %8 = trunc i32 %shr.7.i.i to i8
  %conv.8.i.i = and i8 %8, 1
  %arrayidx.i.i7.8.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 8
  store i8 %conv.8.i.i, ptr %arrayidx.i.i7.8.i.i, align 1
  %shr.8.i.i = lshr i32 %conv31.i, 9
  %9 = trunc i32 %shr.8.i.i to i8
  %conv.9.i.i = and i8 %9, 1
  %arrayidx.i.i7.9.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 9
  store i8 %conv.9.i.i, ptr %arrayidx.i.i7.9.i.i, align 1
  %shr.9.i.i = lshr i32 %conv31.i, 10
  %10 = trunc i32 %shr.9.i.i to i8
  %conv.10.i.i = and i8 %10, 1
  %arrayidx.i.i7.10.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 10
  store i8 %conv.10.i.i, ptr %arrayidx.i.i7.10.i.i, align 1
  %shr.10.i.i = lshr i32 %conv31.i, 11
  %11 = trunc i32 %shr.10.i.i to i8
  %conv.11.i.i = and i8 %11, 1
  %arrayidx.i.i7.11.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 11
  store i8 %conv.11.i.i, ptr %arrayidx.i.i7.11.i.i, align 1
  %shr.11.i.i = lshr i32 %conv31.i, 12
  %12 = trunc i32 %shr.11.i.i to i8
  %conv.12.i.i = and i8 %12, 1
  %arrayidx.i.i7.12.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 12
  store i8 %conv.12.i.i, ptr %arrayidx.i.i7.12.i.i, align 1
  %shr.12.i.i = lshr i32 %conv31.i, 13
  %13 = trunc i32 %shr.12.i.i to i8
  %conv.13.i.i = and i8 %13, 1
  %arrayidx.i.i7.13.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 13
  store i8 %conv.13.i.i, ptr %arrayidx.i.i7.13.i.i, align 1
  %shr.13.i.i = lshr i32 %conv31.i, 14
  %14 = trunc i32 %shr.13.i.i to i8
  %conv.14.i.i = and i8 %14, 1
  %arrayidx.i.i7.14.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 14
  store i8 %conv.14.i.i, ptr %arrayidx.i.i7.14.i.i, align 1
  %shr.14.i.i = lshr i32 %conv31.i, 15
  %15 = trunc i32 %shr.14.i.i to i8
  %conv.15.i.i = and i8 %15, 1
  %arrayidx.i.i7.15.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 15
  store i8 %conv.15.i.i, ptr %arrayidx.i.i7.15.i.i, align 1
  unreachable

if.end50.i: ; preds = %entry
  ret void
}