No need to handle extra arguments during the reductions anymore, the compiler now can handle all reduced values and reduction operands correctly, even if they are from different basic blocks. Simplifies analysis, reduces compiler size, improves overall vectorization. Metric: size..text test-suite :: SingleSource/Benchmarks/Misc-C++/stepanov_container.test 16668.00 17148.00 2.9% test-suite :: External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk.test 2389675.00 2418683.00 1.2% test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test 253517.00 253645.00 0.1% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 309678.00 309806.00 0.0% test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 389203.00 389363.00 0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test 111120.00 111152.00 0.0% test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1039103.00 1039215.00 0.0% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1155883.00 1155963.00 0.0% test-suite :: MicroBenchmarks/LoopVectorization/LoopInterleavingBenchmarks.test 276646.00 276662.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 848691.00 848739.00 0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1138604.00 1138636.00 0.0% test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 910201.00 910217.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12385484.00 12385628.00 0.0% test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 9667580.00 9667676.00 0.0% test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 9667580.00 9667676.00 0.0% test-suite :: External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test 2856182.00 2856198.00 0.0% test-suite :: External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test 2856182.00 2856198.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 773224.00 773192.00 -0.0% 
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1035148.00 1035084.00 -0.0% test-suite :: External/SPEC/CINT2017speed/631.deepsjeng_s/631.deepsjeng_s.test 98126.00 98094.00 -0.0% test-suite :: External/SPEC/CINT2017rate/531.deepsjeng_r/531.deepsjeng_r.test 97966.00 97934.00 -0.0% test-suite :: MultiSource/Benchmarks/MallocBench/gs/gs.test 167391.00 167215.00 -0.1% test-suite :: MultiSource/Applications/ALAC/encode/alacconvert-encode.test 56685.00 56605.00 -0.1% test-suite :: MultiSource/Applications/ALAC/decode/alacconvert-decode.test 56685.00 56605.00 -0.1% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-20050826-2.test 1302.00 1294.00 -0.6% Misc-C++/stepanov_container - better code due to cost fixes. 483.xalancbmk - better code due to cost fixes. ASCI_Purple/SMG2000 - better code due to cost fixes. Benchmarks/Bullet - better vector code because of the cost. JM/ldecod - extra code remains scalar, extra reduction vectorized. consumer-jpeg - extra code remains scalar because of the cost. tramp3d-v4 - better vectorization because of cost fixes. 511.povray_r - better vectorization because of cost fixes. LoopInterleavingBenchmarks - extra reductions are vectorized. JM/lencod - small changes in vector code because of extract cost fixes. 453.povray - small changes in vector code because of extract cost fixes. 445.gobmk - extra small reduction vectorized. 526.blender_r - extra reduced scalars, better small reduction, small changes in the vectorization because of the fixes for extracts cost. 602.gcc_s, 502.gcc_r - small changes in reductions vectorization because of the fixes in the extract cost. 631.deepsjeng_s, 623.xalancbmk_s - small changes in reductions vectorization because of the fixes in the extract cost.
MallocBench/gs - extra code remains scalar because of extracts cost. alacconvert-encode - extra code remains scalar because of extracts cost. alacconvert-decode - extra code remains scalar because of extracts cost. GCC-C-execute-20050826-2 - extra reduction gets vectorized. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/99923
68 lines
1.9 KiB
LLVM
68 lines
1.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="+avx512f,+avx512bw" -slp-threshold=-100 -slp-min-tree-size=0 < %s | FileCheck %s
|
|
|
|
; @foo(i32 %a) -> i32: input for the slp-vectorizer invocation at the top of
; this file. It contains two straight-line `add` chains (in bb1 and bb4) that
; repeatedly fold in %0 and %local, both of which are defined in the entry
; block rather than in the block that holds the chain.
; NOTE(review): the lone `|` lines throughout look like page-extraction residue
; rather than IR - confirm and strip them before feeding this file to opt.
define i32 @foo(i32 %a) {
|
|
; Expected optimizer output follows. The file header says these assertions are
; autogenerated by utils/update_test_checks.py, so regenerate them rather than
; editing by hand.
; CHECK-LABEL: @foo(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]]
|
|
; CHECK-NEXT: [[LOCAL:%.*]] = sub nsw i32 0, 0
|
|
; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
|
|
; CHECK: bb1:
|
|
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[LOCAL]], 3
|
|
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP1]], [[TMP0]]
|
|
; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], 0
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
; CHECK: bb2:
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
; CHECK: bb3:
|
|
; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX3]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
|
|
; CHECK-NEXT: ret i32 0
|
|
; CHECK: bb4:
|
|
; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[LOCAL]], 8
|
|
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP0]]
|
|
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], 0
|
|
; CHECK-NEXT: ret i32 [[OP_RDX1]]
|
|
; CHECK: bb5:
|
|
; CHECK-NEXT: br label [[BB4:%.*]]
|
|
;
|
|
entry:
|
|
; %0 = 0 - %a. Used as the starting operand of both add chains below.
%0 = sub nsw i32 0, %a
|
|
; %local = 0 - 0. The value that is added repeatedly in bb1 and bb4.
%local = sub nsw i32 0, 0
|
|
; The condition is the constant false, so the second target (%bb1) is the one
; taken; %bb5 sits on the true edge.
br i1 false, label %bb5, label %bb1
|
|
|
|
bb1:
|
|
; Add chain: %0 plus three uses of %local, interleaved with two `add ..., 0`.
; The assertions above expect this as `mul %local, 3` followed by two adds.
%1 = add i32 %0, %local
|
|
%2 = add i32 %1, 0
|
|
%3 = add i32 %2, %local
|
|
%4 = add i32 %3, 0
|
|
%5 = add i32 %4, %local
|
|
br label %bb3
|
|
|
|
bb2:
|
|
; No branch in this function targets %bb2; it only supplies the second
; incoming value of the phi in bb3.
br label %bb3
|
|
|
|
bb3:
|
|
; %p1 merges the bb1 chain result with 0 from bb2, but it is not used: the
; block returns the constant 0.
%p1 = phi i32 [ %5, %bb1 ], [ 0, %bb2 ]
|
|
ret i32 0
|
|
|
|
bb4:
|
|
; Longer add chain: %0 plus eight uses of %local, interleaved with six
; `add ..., 0`; the final sum is returned. The assertions above expect this as
; `mul %local, 8` followed by two adds.
%6 = add i32 %0, %local
|
|
%7 = add i32 %6, %local
|
|
%8 = add i32 %7, 0
|
|
%9 = add i32 %8, %local
|
|
%10 = add i32 %9, 0
|
|
%11 = add i32 %10, %local
|
|
%12 = add i32 %11, 0
|
|
%13 = add i32 %12, %local
|
|
%14 = add i32 %13, 0
|
|
%15 = add i32 %14, %local
|
|
%16 = add i32 %15, 0
|
|
%17 = add i32 %16, %local
|
|
%18 = add i32 %17, 0
|
|
%19 = add i32 %18, %local
|
|
ret i32 %19
|
|
|
|
bb5:
|
|
; Entered only from the true edge of the entry branch; forwards to bb4.
br label %bb4
|
|
}
|