This improves the overall minbitwidth analysis in SLP. It allows
analyzing trees with store/insertelement root nodes. Also, instead of
using a single minbitwidth, detected at the very first analysis stage,
it tries to detect the best one for each trunc/ext subtree in the graph
and uses it for that subtree.
This results in better code and less vector register pressure.
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0%
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0%
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16
x i16> instead of <16 x i32>, and the zext/trunc are removed. In other
places the last vector zext/sext is removed and replaced by an
extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by
reduce or <4 x i8>
Spec2017/imagick - Removed extra zext from 2 packs of the operations.
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar
zext
Spec2017/blender - a whole bunch of vector zext/sext are replaced by
extractelement+scalar zext/sext, and some extra code is vectorized in
smaller types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.
Original Pull Request: https://github.com/llvm/llvm-project/pull/84334
The patch has the same functionality (no test changes, no changes in
benchmarks) as the original patch, just has some compile time
improvements + fixes for xxhash unittest, discovered earlier in the
previous version of the patch.
Reviewers:
Pull Request: https://github.com/llvm/llvm-project/pull/84536
50 lines
2.3 KiB
LLVM
50 lines
2.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s

; Four consecutive i16 values are loaded from %block. Each lane selects between
; the loaded value truncated to i8 and a sign-extended i1 (the constants
; true/false, or the %b argument in lane 1), and stores the i8 result to
; consecutive bytes of %pixels. The assertions below expect the four scalar
; lanes to become one <4 x i16> load, a single sext from <4 x i1> to <4 x i8>
; that feeds the select, and one <4 x i8> store.

define void @test(ptr %block, ptr noalias %pixels, i1 %b) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ptr [[BLOCK:%.*]], ptr noalias [[PIXELS:%.*]], i1 [[B:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> <i1 true, i1 poison, i1 false, i1 false>, i1 [[B]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[BLOCK]], align 2
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i16> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i16> [[TMP2]] to <4 x i8>
; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i8>
; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i8> [[TMP4]], <4 x i8> [[TMP1]]
; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr [[PIXELS]], align 1
; CHECK-NEXT:    ret void
;
entry:
  ; Lane 0: i16 at %block + 0, alternative value is sext of constant true.
  %0 = load i16, ptr %block, align 2
  %tobool.not.i78 = icmp ult i16 %0, 0
  %conv.i80 = sext i1 true to i8
  %conv1.i81 = trunc i16 %0 to i8
  %retval.0.i82 = select i1 %tobool.not.i78, i8 %conv1.i81, i8 %conv.i80
  store i8 %retval.0.i82, ptr %pixels, align 1

  ; Lane 1: i16 at %block + 2, alternative value is sext of the %b argument
  ; (the only non-constant i1 operand among the four lanes).
  %arrayidx2 = getelementptr i8, ptr %block, i64 2
  %1 = load i16, ptr %arrayidx2, align 2
  %tobool.not.i73 = icmp ult i16 %1, 0
  %conv.i75 = sext i1 %b to i8
  %conv1.i76 = trunc i16 %1 to i8
  %retval.0.i77 = select i1 %tobool.not.i73, i8 %conv1.i76, i8 %conv.i75
  %arrayidx5 = getelementptr i8, ptr %pixels, i64 1
  store i8 %retval.0.i77, ptr %arrayidx5, align 1

  ; Lane 2: i16 at %block + 4, alternative value is sext of constant false.
  %arrayidx6 = getelementptr i8, ptr %block, i64 4
  %2 = load i16, ptr %arrayidx6, align 2
  %tobool.not.i68 = icmp ult i16 %2, 0
  %conv.i70 = sext i1 false to i8
  %conv1.i71 = trunc i16 %2 to i8
  %retval.0.i72 = select i1 %tobool.not.i68, i8 %conv1.i71, i8 %conv.i70
  %arrayidx9 = getelementptr i8, ptr %pixels, i64 2
  store i8 %retval.0.i72, ptr %arrayidx9, align 1

  ; Lane 3: i16 at %block + 6, alternative value is sext of constant false.
  %arrayidx10 = getelementptr i8, ptr %block, i64 6
  %3 = load i16, ptr %arrayidx10, align 2
  %tobool.not.i63 = icmp ult i16 %3, 0
  %conv.i65 = sext i1 false to i8
  %conv1.i66 = trunc i16 %3 to i8
  %retval.0.i67 = select i1 %tobool.not.i63, i8 %conv1.i66, i8 %conv.i65
  %arrayidx13 = getelementptr i8, ptr %pixels, i64 3
  store i8 %retval.0.i67, ptr %arrayidx13, align 1
  ret void
}