Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
Alexey Bataev 31eaf86a1e [SLP]Improve minbitwidth analysis.
This improves the overall minbitwidth analysis in SLP. It allows
analyzing trees with store/insertelement root nodes. Also, instead of
using a single minbitwidth detected during the very first analysis stage,
it tries to detect the best one for each trunc/ext subtree in the graph
and uses it for that subtree.
This results in better code and less vector register pressure.

Metric: size..text

Program                                                                                                                                                size..text
                                                                                                                                                       results     results0    diff
                                                                      test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test    92549.00    92609.00  0.1%
                                                                                  test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   663381.00   663493.00  0.0%
                                                                                   test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   663381.00   663493.00  0.0%
                                                                                               test-suite :: MultiSource/Benchmarks/Bullet/bullet.test   307182.00   307214.00  0.0%
                                                                             test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test  1394420.00  1394484.00  0.0%
                                                                              test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test  1394420.00  1394484.00  0.0%
                                                                                test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test  2040257.00  2040273.00  0.0%

                                                                              test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0%
                                                                                         test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test   909944.00   909768.00 -0.0%

SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16
x i16> instead of <16 x i32>, and the zext/trunc are removed. In other
places the last vector zext/sext is removed and replaced by an
extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by
reduce or <4 x i8>
Spec2017/imagick - Removed extra zext from 2 packs of the operations.
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar
zext
Spec2017/blender - a whole bunch of vector zext/sext are replaced by
extractelement + scalar zext/sext, and some extra code is vectorized in
smaller types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.

Original Pull Request: https://github.com/llvm/llvm-project/pull/84334

This patch has the same functionality as the original patch (no test
changes, no changes in benchmarks); it only adds some compile-time
improvements and fixes for the xxhash unittest failure that was
discovered in the previous version of the patch.

Reviewers:

Pull Request: https://github.com/llvm/llvm-project/pull/84536
2024-03-19 08:19:45 -07:00

127 lines
7.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
%"struct.std::array" = type { [32 x i8] }
; Function Attrs: nounwind uwtable
define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() unnamed_addr #0 align 2 {
; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]]
; CHECK: if.then22.i:
; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1
; CHECK-NEXT: unreachable
; CHECK: if.end50.i:
; CHECK-NEXT: ret void
;
entry:
br i1 undef, label %if.end50.i, label %if.then22.i
if.then22.i: ; preds = %entry
%sub.i = add nsw i32 undef, -1
%conv31.i = and i32 undef, %sub.i
%0 = trunc i32 %sub.i to i8
%conv.i.i1199 = and i8 %0, 1
store i8 %conv.i.i1199, ptr undef, align 1
%shr.i.i = lshr i32 %conv31.i, 1
%1 = trunc i32 %shr.i.i to i8
%conv.1.i.i = and i8 %1, 1
%arrayidx.i.i7.1.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 1
store i8 %conv.1.i.i, ptr %arrayidx.i.i7.1.i.i, align 1
%shr.1.i.i = lshr i32 %conv31.i, 2
%2 = trunc i32 %shr.1.i.i to i8
%conv.2.i.i = and i8 %2, 1
%arrayidx.i.i7.2.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 2
store i8 %conv.2.i.i, ptr %arrayidx.i.i7.2.i.i, align 1
%shr.2.i.i = lshr i32 %conv31.i, 3
%3 = trunc i32 %shr.2.i.i to i8
%conv.3.i.i = and i8 %3, 1
%arrayidx.i.i7.3.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 3
store i8 %conv.3.i.i, ptr %arrayidx.i.i7.3.i.i, align 1
%shr.3.i.i = lshr i32 %conv31.i, 4
%4 = trunc i32 %shr.3.i.i to i8
%conv.4.i.i = and i8 %4, 1
%arrayidx.i.i7.4.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 4
store i8 %conv.4.i.i, ptr %arrayidx.i.i7.4.i.i, align 1
%shr.4.i.i = lshr i32 %conv31.i, 5
%5 = trunc i32 %shr.4.i.i to i8
%conv.5.i.i = and i8 %5, 1
%arrayidx.i.i7.5.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 5
store i8 %conv.5.i.i, ptr %arrayidx.i.i7.5.i.i, align 1
%shr.5.i.i = lshr i32 %conv31.i, 6
%6 = trunc i32 %shr.5.i.i to i8
%conv.6.i.i = and i8 %6, 1
%arrayidx.i.i7.6.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 6
store i8 %conv.6.i.i, ptr %arrayidx.i.i7.6.i.i, align 1
%shr.6.i.i = lshr i32 %conv31.i, 7
%7 = trunc i32 %shr.6.i.i to i8
%conv.7.i.i = and i8 %7, 1
%arrayidx.i.i7.7.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 7
store i8 %conv.7.i.i, ptr %arrayidx.i.i7.7.i.i, align 1
%shr.7.i.i = lshr i32 %conv31.i, 8
%8 = trunc i32 %shr.7.i.i to i8
%conv.8.i.i = and i8 %8, 1
%arrayidx.i.i7.8.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 8
store i8 %conv.8.i.i, ptr %arrayidx.i.i7.8.i.i, align 1
%shr.8.i.i = lshr i32 %conv31.i, 9
%9 = trunc i32 %shr.8.i.i to i8
%conv.9.i.i = and i8 %9, 1
%arrayidx.i.i7.9.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 9
store i8 %conv.9.i.i, ptr %arrayidx.i.i7.9.i.i, align 1
%shr.9.i.i = lshr i32 %conv31.i, 10
%10 = trunc i32 %shr.9.i.i to i8
%conv.10.i.i = and i8 %10, 1
%arrayidx.i.i7.10.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 10
store i8 %conv.10.i.i, ptr %arrayidx.i.i7.10.i.i, align 1
%shr.10.i.i = lshr i32 %conv31.i, 11
%11 = trunc i32 %shr.10.i.i to i8
%conv.11.i.i = and i8 %11, 1
%arrayidx.i.i7.11.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 11
store i8 %conv.11.i.i, ptr %arrayidx.i.i7.11.i.i, align 1
%shr.11.i.i = lshr i32 %conv31.i, 12
%12 = trunc i32 %shr.11.i.i to i8
%conv.12.i.i = and i8 %12, 1
%arrayidx.i.i7.12.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 12
store i8 %conv.12.i.i, ptr %arrayidx.i.i7.12.i.i, align 1
%shr.12.i.i = lshr i32 %conv31.i, 13
%13 = trunc i32 %shr.12.i.i to i8
%conv.13.i.i = and i8 %13, 1
%arrayidx.i.i7.13.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 13
store i8 %conv.13.i.i, ptr %arrayidx.i.i7.13.i.i, align 1
%shr.13.i.i = lshr i32 %conv31.i, 14
%14 = trunc i32 %shr.13.i.i to i8
%conv.14.i.i = and i8 %14, 1
%arrayidx.i.i7.14.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 14
store i8 %conv.14.i.i, ptr %arrayidx.i.i7.14.i.i, align 1
%shr.14.i.i = lshr i32 %conv31.i, 15
%15 = trunc i32 %shr.14.i.i to i8
%conv.15.i.i = and i8 %15, 1
%arrayidx.i.i7.15.i.i = getelementptr inbounds %"struct.std::array", ptr undef, i64 0, i32 0, i64 15
store i8 %conv.15.i.i, ptr %arrayidx.i.i7.15.i.i, align 1
unreachable
if.end50.i: ; preds = %entry
ret void
}