Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
Alexey Bataev 4a45efa431 [SLP] Preserve IR flags when vectorizing horizontal reductions.
Summary:
The SLP vectorizer should propagate IR-level optimization hints/flags
(nsw, nuw, exact, fast-math) when converting scalar horizontal
reductions instructions into vectors, just like for other vectorized
instructions.
It doe not include IR propagation for extra arguments, we need to handle
original scalar operations for extra args to propagate correct flags.

Reviewers: mkuper, mzolotukhin, hfinkel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D30418

llvm-svn: 296614
2017-03-01 12:43:39 +00:00

79 lines
3.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
define i32 @foo(i32* nocapture readonly %diff) #0 {
; CHECK-LABEL: @foo(
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: [[S1:%.+]] = add nsw <4 x i32>
; CHECK: store <4 x i32> [[S1]],
; CHECK: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[S1]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
; CHECK: [[ADD52:%.*]] = add nsw i32 [[TMP15]],
; CHECK: ret i32 [[ADD52]]
;
entry:
%m2 = alloca [8 x [8 x i32]], align 16
%0 = bitcast [8 x [8 x i32]]* %m2 to i8*
br label %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
%1 = shl i64 %indvars.iv, 3
%arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
%2 = load i32, i32* %arrayidx, align 4
%3 = or i64 %1, 4
%arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
%4 = load i32, i32* %arrayidx2, align 4
%add3 = add nsw i32 %4, %2
%arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
store i32 %add3, i32* %arrayidx6, align 16
%add10 = add nsw i32 %add3, %a.088
%5 = or i64 %1, 1
%arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
%6 = load i32, i32* %arrayidx13, align 4
%7 = or i64 %1, 5
%arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
%8 = load i32, i32* %arrayidx16, align 4
%add17 = add nsw i32 %8, %6
%arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
store i32 %add17, i32* %arrayidx20, align 4
%add24 = add nsw i32 %add10, %add17
%9 = or i64 %1, 2
%arrayidx27 = getelementptr inbounds i32, i32* %diff, i64 %9
%10 = load i32, i32* %arrayidx27, align 4
%11 = or i64 %1, 6
%arrayidx30 = getelementptr inbounds i32, i32* %diff, i64 %11
%12 = load i32, i32* %arrayidx30, align 4
%add31 = add nsw i32 %12, %10
%arrayidx34 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
store i32 %add31, i32* %arrayidx34, align 8
%add38 = add nsw i32 %add24, %add31
%13 = or i64 %1, 3
%arrayidx41 = getelementptr inbounds i32, i32* %diff, i64 %13
%14 = load i32, i32* %arrayidx41, align 4
%15 = or i64 %1, 7
%arrayidx44 = getelementptr inbounds i32, i32* %diff, i64 %15
%16 = load i32, i32* %arrayidx44, align 4
%add45 = add nsw i32 %16, %14
%arrayidx48 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
store i32 %add45, i32* %arrayidx48, align 4
%add52 = add nsw i32 %add38, %add45
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 8
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
%arraydecay = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 0
call void @ff([8 x i32]* %arraydecay) #1
ret i32 %add52
}
declare void @ff([8 x i32]*) #2