Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
Alexey Bataev 197f4a9051 [SLP]Remove ExtraArgs from reductions.
No need to handle extra arguments during the reductions anymore, the
compiler now can handle all reduced values and reduction operands
correctly, even if they are from different basic blocks.

Simplifies analysis, reduces compiler size, improves overall
vectorization.

Metric: size..text
test-suite :: SingleSource/Benchmarks/Misc-C++/stepanov_container.test    16668.00    17148.00  2.9%
test-suite :: External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk.test  2389675.00  2418683.00  1.2%
test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test   253517.00   253645.00  0.1%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test   309678.00   309806.00  0.0%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test   389203.00   389363.00  0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test   111120.00   111152.00  0.0%
test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test  1039103.00  1039215.00  0.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test  1155883.00  1155963.00  0.0%
test-suite :: MicroBenchmarks/LoopVectorization/LoopInterleavingBenchmarks.test   276646.00   276662.00  0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test   848691.00   848739.00  0.0%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test  1138604.00  1138636.00  0.0%
test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test   910201.00   910217.00  0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12385484.00 12385628.00  0.0%
test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test  9667580.00  9667676.00  0.0%
test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test  9667580.00  9667676.00  0.0%
test-suite :: External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test  2856182.00  2856198.00  0.0%
test-suite :: External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test  2856182.00  2856198.00  0.0%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test   773224.00   773192.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  1035148.00  1035084.00 -0.0%
test-suite :: External/SPEC/CINT2017speed/631.deepsjeng_s/631.deepsjeng_s.test    98126.00    98094.00 -0.0%
test-suite :: External/SPEC/CINT2017rate/531.deepsjeng_r/531.deepsjeng_r.test    97966.00    97934.00 -0.0%
test-suite :: MultiSource/Benchmarks/MallocBench/gs/gs.test   167391.00   167215.00 -0.1%
test-suite :: MultiSource/Applications/ALAC/encode/alacconvert-encode.test    56685.00    56605.00 -0.1%
test-suite :: MultiSource/Applications/ALAC/decode/alacconvert-decode.test    56685.00    56605.00 -0.1%
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-20050826-2.test     1302.00     1294.00 -0.6%

Misc-C++/stepanov_container - better code due to cost fixes.
483.xalancbmk - better code due to cost fixes.
ASCI_Purple/SMG2000 - better code due to cost fixes.
Benchmarks/Bullet - better vector code because of the cost.
JM/ldecod - extra code remain scalar, extra reduction vectorized
consumer-jpeg - extra code remain scalar because of the cost.
tramp3d-v4 - better vectorization because of cost fixes.
511.povray_r - better vectorization because of cost fixes.
LoopInterleavingBenchmarks - extra reductions are vectorized
JM/lencod - small changes in vector code because of extract cost fixes.
453.povray - small changes in vector code because of extract cost fixes.
445.gobmk - extra small reduction vectorized
526.blender_r - extra reduced scalars, better small reduction, small
changes in the vetorization because of the fixes for extracts cost
602.gcc_s
502.gcc_r - small changes in reductions vectorization because of the
fixes in the extract cost.
631.deepsjeng_s
623.xalancbmk_s - small changes in reductions vectorization because of
the fixes in the extract cost.
MallocBench/gs - extra code remain scalar because of extracts cost
alacconvert-encode - extra code remain scalar because of extracts cost
alacconvert-decode - extra code remain scalar because of extracts cost
GCC-C-execute-20050826-2 - extra reduction gets vectorized

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/99923
2024-07-29 13:23:56 -04:00

68 lines
1.9 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="+avx512f,+avx512bw" -slp-threshold=-100 -slp-min-tree-size=0 < %s | FileCheck %s
define i32 @foo(i32 %a) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]]
; CHECK-NEXT: [[LOCAL:%.*]] = sub nsw i32 0, 0
; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[LOCAL]], 3
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], 0
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX3]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
; CHECK-NEXT: ret i32 0
; CHECK: bb4:
; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[LOCAL]], 8
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP0]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], 0
; CHECK-NEXT: ret i32 [[OP_RDX1]]
; CHECK: bb5:
; CHECK-NEXT: br label [[BB4:%.*]]
;
entry:
%0 = sub nsw i32 0, %a
%local = sub nsw i32 0, 0
br i1 false, label %bb5, label %bb1
bb1:
%1 = add i32 %0, %local
%2 = add i32 %1, 0
%3 = add i32 %2, %local
%4 = add i32 %3, 0
%5 = add i32 %4, %local
br label %bb3
bb2:
br label %bb3
bb3:
%p1 = phi i32 [ %5, %bb1 ], [ 0, %bb2 ]
ret i32 0
bb4:
%6 = add i32 %0, %local
%7 = add i32 %6, %local
%8 = add i32 %7, 0
%9 = add i32 %8, %local
%10 = add i32 %9, 0
%11 = add i32 %10, %local
%12 = add i32 %11, 0
%13 = add i32 %12, %local
%14 = add i32 %13, 0
%15 = add i32 %14, %local
%16 = add i32 %15, 0
%17 = add i32 %16, %local
%18 = add i32 %17, 0
%19 = add i32 %18, %local
ret i32 %19
bb5:
br label %bb4
}