This improves overall analysis for minbitwidth in SLP. It allows to
analyze the trees with store/insertelement root nodes. Also, instead of
using single minbitwidth, detected from the very first analysis stage,
it tries to detect the best one for each trunc/ext subtree in the graph
and use it for the subtree.
Results in better code and less vector register pressure.
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0%
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0%
SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar
instructions remain scalar (good).
Spec2017/x264 - the whole function idct4x4dc is vectorized using <16
x i16> instead of <16 x i32>, also zext/trunc are removed. In other
places last vector zext/sext removed and replaced by
extractelement + scalar zext/sext pair.
MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by
reduce or <4 x i8>
Spec2017/imagick - Removed extra zext from 2 packs of the operations.
Spec2017/parest - Removed extra zext, replaced by extractelement+scalar
zext
Spec2017/blender - the whole bunch of vector zext/sext replaced by
extractelement+scalar zext/sext, some extra code vectorized in smaller
types.
Spec2006/gobmk - fixed cost estimation, some small code remains scalar.
Original Pull Request: https://github.com/llvm/llvm-project/pull/84334
The patch has the same functionality (no test changes, no changes in
benchmarks) as the original patch, just has some compile time
improvements + fixes for xxhash unittest, discovered earlier in the
previous version of the patch.
Reviewers:
Pull Request: https://github.com/llvm/llvm-project/pull/84536
47 lines
2.6 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=verify,slp-vectorizer -o - -S -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s

; Two parallel scalar chains (load, fmul by %arg, fadd, load, fadd, fptosi,
; sext) over @global are SLP-vectorized into <2 x double> operations. The
; fptosi result stays as <2 x i32>; each lane is then extracted and
; sign-extended to i64 as a scalar before the insertvalue uses, rather than
; performing a vector sext.

@global = local_unnamed_addr global [6 x double] zeroinitializer, align 16

define { i64, i64 } @patatino(double %arg) {
; CHECK-LABEL: @patatino(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr @global, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 2), align 16
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[ARG:%.*]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
; CHECK-NEXT:    [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; CHECK-NEXT:    [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1
; CHECK-NEXT:    ret { i64, i64 } [[T17]]
;
bb:
  %t = load double, ptr @global, align 16
  %t1 = load double, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 2), align 16
  %t2 = fmul double %t1, %arg
  %t3 = fadd double %t, %t2
  %t4 = load double, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
  %t5 = fadd double %t4, %t3
  %t6 = fptosi double %t5 to i32
  %t7 = sext i32 %t6 to i64
  %t8 = load double, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 1), align 8
  %t9 = load double, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 3), align 8
  %t10 = fmul double %t9, %arg
  %t11 = fadd double %t8, %t10
  %t12 = load double, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 5), align 8
  %t13 = fadd double %t12, %t11
  %t14 = fptosi double %t13 to i32
  %t15 = sext i32 %t14 to i64
  %t16 = insertvalue { i64, i64 } undef, i64 %t7, 0
  %t17 = insertvalue { i64, i64 } %t16, i64 %t15, 1
  ret { i64, i64 } %t17
}