This improves the overall minbitwidth analysis in SLP. It allows
analyzing trees with store/insertelement root nodes. Also, instead of
using a single minbitwidth detected at the very first analysis stage,
it tries to detect the best one for each trunc/ext subtree in the graph
and uses it for that subtree.
This results in better code and less vector register pressure.
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0%
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0%
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16
x i16> instead of <16 x i32>, and the zext/trunc are removed. In other
places the last vector zext/sext is removed and replaced by an
extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by
reduce or <4 x i8>
Spec2017/imagick - Removed extra zext from 2 packs of the operations.
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar
zext
Spec2017/blender - a whole bunch of vector zext/sext are replaced by
extractelement + scalar zext/sext, and some extra code is vectorized in
smaller types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.
Original Pull Request: https://github.com/llvm/llvm-project/pull/84334
The patch has the same functionality (no test changes, no changes in
benchmarks) as the original patch; it just adds some compile-time
improvements and fixes for the xxhash unittest, discovered earlier in
the previous version of the patch.
Reviewers:
Pull Request: https://github.com/llvm/llvm-project/pull/84536
229 lines
8.7 KiB
LLVM
229 lines
8.7 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-1000 -mtriple=x86_64 -S | FileCheck %s
|
|
|
|
; The inputs to vector phi should remain undef.
|
|
|
|
; Scalar pattern: four i8 phis are zero-extended to i32 and combined with a
; chain of `or`s. On the %entry edge %phi0 is 0 and %phi1..%phi3 are undef.
; The expected output is a single <4 x i8> phi whose %entry operand is
; <0, undef, undef, undef> (the three undef lanes stay undef), followed by an
; i8 or-reduction whose result is zero-extended to i32 once.
define i32 @phi3UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
|
|
; CHECK-LABEL: @phi3UndefInput(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
|
|
; CHECK: bb2:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[ARG0:%.*]], i32 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[TMP0]], i8 [[ARG1:%.*]], i32 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[ARG2:%.*]], i32 2
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 undef, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
|
|
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
|
|
; CHECK-NEXT: ret i32 [[TMP6]]
|
|
;
|
|
entry:
|
|
br i1 %cond, label %bb2, label %bb3
|
|
|
|
bb2:
|
|
br label %bb3
|
|
|
|
bb3:
|
|
%phi0 = phi i8 [ %arg0, %bb2 ], [ 0, %entry ]
|
|
%phi1 = phi i8 [ %arg1, %bb2 ], [ undef, %entry ]
|
|
%phi2 = phi i8 [ %arg2, %bb2 ], [ undef, %entry ]
|
|
%phi3 = phi i8 [ %arg3, %bb2 ], [ undef, %entry ]
|
|
; Each lane is widened to i32 before the or-chain; the expected output above
; instead reduces in i8 and widens only the final result.
%zext0 = zext i8 %phi0 to i32
|
|
%zext1 = zext i8 %phi1 to i32
|
|
%zext2 = zext i8 %phi2 to i32
|
|
%zext3 = zext i8 %phi3 to i32
|
|
%or1 = or i32 %zext0, %zext1
|
|
%or2 = or i32 %or1, %zext2
|
|
%or3 = or i32 %or2, %zext3
|
|
ret i32 %or3
|
|
}
|
|
|
|
; Scalar pattern: four i8 phis are zero-extended to i32 and combined with a
; chain of `or`s. On the %entry edge %phi0 and %phi1 are 0 while %phi2 and
; %phi3 are undef. The expected output is a single <4 x i8> phi whose %entry
; operand is <0, 0, undef, undef> (the two undef lanes stay undef), followed
; by an i8 or-reduction whose result is zero-extended to i32 once.
define i32 @phi2UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
|
|
; CHECK-LABEL: @phi2UndefInput(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
|
|
; CHECK: bb2:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[ARG0:%.*]], i32 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[TMP0]], i8 [[ARG1:%.*]], i32 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[ARG2:%.*]], i32 2
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
|
|
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
|
|
; CHECK-NEXT: ret i32 [[TMP6]]
|
|
;
|
|
entry:
|
|
br i1 %cond, label %bb2, label %bb3
|
|
|
|
bb2:
|
|
br label %bb3
|
|
|
|
bb3:
|
|
%phi0 = phi i8 [ %arg0, %bb2 ], [ 0, %entry ]
|
|
%phi1 = phi i8 [ %arg1, %bb2 ], [ 0, %entry ]
|
|
%phi2 = phi i8 [ %arg2, %bb2 ], [ undef, %entry ]
|
|
%phi3 = phi i8 [ %arg3, %bb2 ], [ undef, %entry ]
|
|
; Each lane is widened to i32 before the or-chain; the expected output above
; instead reduces in i8 and widens only the final result.
%zext0 = zext i8 %phi0 to i32
|
|
%zext1 = zext i8 %phi1 to i32
|
|
%zext2 = zext i8 %phi2 to i32
|
|
%zext3 = zext i8 %phi3 to i32
|
|
%or1 = or i32 %zext0, %zext1
|
|
%or2 = or i32 %or1, %zext2
|
|
%or3 = or i32 %or2, %zext3
|
|
ret i32 %or3
|
|
}
|
|
|
|
; Scalar pattern: four i8 phis are zero-extended to i32 and combined with a
; chain of `or`s. On the %entry edge %phi0..%phi2 are 0 and only %phi3 is
; undef. The expected output is a single <4 x i8> phi whose %entry operand is
; <0, 0, 0, undef> (the undef lane stays undef), followed by an i8
; or-reduction whose result is zero-extended to i32 once.
define i32 @phi1UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
|
|
; CHECK-LABEL: @phi1UndefInput(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
|
|
; CHECK: bb2:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[ARG0:%.*]], i32 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[TMP0]], i8 [[ARG1:%.*]], i32 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[ARG2:%.*]], i32 2
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 0, i8 undef>, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
|
|
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
|
|
; CHECK-NEXT: ret i32 [[TMP6]]
|
|
;
|
|
entry:
|
|
br i1 %cond, label %bb2, label %bb3
|
|
|
|
bb2:
|
|
br label %bb3
|
|
|
|
bb3:
|
|
%phi0 = phi i8 [ %arg0, %bb2 ], [ 0, %entry ]
|
|
%phi1 = phi i8 [ %arg1, %bb2 ], [ 0, %entry ]
|
|
%phi2 = phi i8 [ %arg2, %bb2 ], [ 0, %entry ]
|
|
%phi3 = phi i8 [ %arg3, %bb2 ], [ undef, %entry ]
|
|
; Each lane is widened to i32 before the or-chain; the expected output above
; instead reduces in i8 and widens only the final result.
%zext0 = zext i8 %phi0 to i32
|
|
%zext1 = zext i8 %phi1 to i32
|
|
%zext2 = zext i8 %phi2 to i32
|
|
%zext3 = zext i8 %phi3 to i32
|
|
%or1 = or i32 %zext0, %zext1
|
|
%or2 = or i32 %or1, %zext2
|
|
%or3 = or i32 %or2, %zext3
|
|
ret i32 %or3
|
|
}
|
|
|
|
|
|
; Scalar pattern: four i8 phis are zero-extended to i32 and combined with a
; chain of `or`s. On the %entry edge %phi0 and %phi1 are 0, %phi2 is poison
; and %phi3 is undef. The expected output is a single <4 x i8> phi whose
; %entry operand is <0, 0, poison, undef> (poison and undef lanes are kept
; distinct), followed by an i8 or-reduction whose result is zero-extended to
; i32 once.
define i32 @phi1Undef1PoisonInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
|
|
; CHECK-LABEL: @phi1Undef1PoisonInput(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
|
|
; CHECK: bb2:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[ARG0:%.*]], i32 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[TMP0]], i8 [[ARG1:%.*]], i32 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[ARG2:%.*]], i32 2
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
|
|
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
|
|
; CHECK-NEXT: ret i32 [[TMP6]]
|
|
;
|
|
entry:
|
|
br i1 %cond, label %bb2, label %bb3
|
|
|
|
bb2:
|
|
br label %bb3
|
|
|
|
bb3:
|
|
%phi0 = phi i8 [ %arg0, %bb2 ], [ 0, %entry ]
|
|
%phi1 = phi i8 [ %arg1, %bb2 ], [ 0, %entry ]
|
|
%phi2 = phi i8 [ %arg2, %bb2 ], [ poison, %entry ]
|
|
%phi3 = phi i8 [ %arg3, %bb2 ], [ undef, %entry ]
|
|
; Each lane is widened to i32 before the or-chain; the expected output above
; instead reduces in i8 and widens only the final result.
%zext0 = zext i8 %phi0 to i32
|
|
%zext1 = zext i8 %phi1 to i32
|
|
%zext2 = zext i8 %phi2 to i32
|
|
%zext3 = zext i8 %phi3 to i32
|
|
%or1 = or i32 %zext0, %zext1
|
|
%or2 = or i32 %or1, %zext2
|
|
%or3 = or i32 %or2, %zext3
|
|
ret i32 %or3
|
|
}
|
|
|
|
|
|
; Scalar pattern: four i8 phis are zero-extended to i32 and combined with a
; chain of `or`s. On the %entry edge %phi0 and %phi2 are poison, %phi1 is 0
; and %phi3 is undef. In the expected output the lanes are reordered: the
; vector is built as (arg1, arg0, arg2, arg3), so the matching %entry operand
; of the <4 x i8> phi is <0, poison, poison, undef>. The undef lane stays
; undef. An i8 or-reduction follows and its result is zero-extended to i32.
define i32 @phi1Undef2PoisonInputs(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
|
|
; CHECK-LABEL: @phi1Undef2PoisonInputs(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
|
|
; CHECK: bb2:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[ARG1:%.*]], i32 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[TMP0]], i8 [[ARG0:%.*]], i32 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[ARG2:%.*]], i32 2
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG3:%.*]], i32 3
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 poison, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
|
|
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
|
|
; CHECK-NEXT: ret i32 [[TMP6]]
|
|
;
|
|
entry:
|
|
br i1 %cond, label %bb2, label %bb3
|
|
|
|
bb2:
|
|
br label %bb3
|
|
|
|
bb3:
|
|
%phi0 = phi i8 [ %arg0, %bb2 ], [ poison, %entry ]
|
|
%phi1 = phi i8 [ %arg1, %bb2 ], [ 0, %entry ]
|
|
%phi2 = phi i8 [ %arg2, %bb2 ], [ poison, %entry ]
|
|
%phi3 = phi i8 [ %arg3, %bb2 ], [ undef, %entry ]
|
|
; Each lane is widened to i32 before the or-chain; the expected output above
; instead reduces in i8 and widens only the final result.
%zext0 = zext i8 %phi0 to i32
|
|
%zext1 = zext i8 %phi1 to i32
|
|
%zext2 = zext i8 %phi2 to i32
|
|
%zext3 = zext i8 %phi3 to i32
|
|
%or1 = or i32 %zext0, %zext1
|
|
%or2 = or i32 %or1, %zext2
|
|
%or3 = or i32 %or2, %zext3
|
|
ret i32 %or3
|
|
}
|
|
|
|
; Scalar pattern: four i8 phis are zero-extended to i32 and combined with a
; chain of `or`s. On the %entry edge %phi0 is poison, %phi1 and %phi3 are 0
; and %phi2 is undef, so the two constant-zero inputs are not adjacent. In
; the expected output the lanes are reordered: the vector is built as
; (arg1, arg3, arg0, arg2), so the matching %entry operand of the <4 x i8>
; phi is <0, 0, poison, undef>. The undef lane stays undef. An i8
; or-reduction follows and its result is zero-extended to i32.
define i32 @phi1Undef1PoisonGapInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
|
|
; CHECK-LABEL: @phi1Undef1PoisonGapInput(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
|
|
; CHECK: bb2:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[ARG1:%.*]], i32 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[TMP0]], i8 [[ARG3:%.*]], i32 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[ARG0:%.*]], i32 2
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[ARG2:%.*]], i32 3
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
|
|
; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
|
|
; CHECK-NEXT: ret i32 [[TMP6]]
|
|
;
|
|
entry:
|
|
br i1 %cond, label %bb2, label %bb3
|
|
|
|
bb2:
|
|
br label %bb3
|
|
|
|
bb3:
|
|
%phi0 = phi i8 [ %arg0, %bb2 ], [ poison, %entry ]
|
|
%phi1 = phi i8 [ %arg1, %bb2 ], [ 0, %entry ]
|
|
%phi2 = phi i8 [ %arg2, %bb2 ], [ undef, %entry ]
|
|
%phi3 = phi i8 [ %arg3, %bb2 ], [ 0, %entry ]
|
|
; Each lane is widened to i32 before the or-chain; the expected output above
; instead reduces in i8 and widens only the final result.
%zext0 = zext i8 %phi0 to i32
|
|
%zext1 = zext i8 %phi1 to i32
|
|
%zext2 = zext i8 %phi2 to i32
|
|
%zext3 = zext i8 %phi3 to i32
|
|
%or1 = or i32 %zext0, %zext1
|
|
%or2 = or i32 %or1, %zext2
|
|
%or3 = or i32 %or2, %zext3
|
|
ret i32 %or3
|
|
}
|