Files
clang-p2996/polly/test/ScheduleOptimizer/prevectorization.ll
Tobias Grosser 07c1c2fcc9 Make prevectorization width configurable
Polly uses 'prevectorization' to enable outer loop vectorization. When
vectorizing an outer loop, we strip-mine <number-of-prevec-dims> loop
iterations which are then interchanged to the innermost level such that LLVM's
inner loop vectorizer (or Polly's simple vectorizer) can easily vectorize this
loop. The number of loop iterations to strip-mine is now configurable with the
option -polly-prevect-width=<number-of-prevec-dims>.

This is mostly a debugging option. We should probably add a heuristic that
derives the number of prevectorization dimensions from the target data and
the data types used.

llvm-svn: 245424
2015-08-19 08:46:11 +00:00

107 lines
4.9 KiB
LLVM

; Test input for Polly's prevectorization in the isl schedule optimizer.
; It is run three times: twice without an explicit prevectorization width
; (once with -polly-vectorizer=polly, once with -polly-vectorizer=stripmine),
; both matched against the default FileCheck prefix, and once with
; -polly-prevect-width=16, matched against the VEC16 prefix.
; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=polly -polly-ast -analyze < %s | FileCheck %s
; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl -polly-vectorizer=stripmine -polly-ast -analyze < %s | FileCheck %s
; RUN: opt -S %loadPolly -polly-detect-unprofitable -basicaa -polly-opt-isl \
; RUN: -polly-vectorizer=polly -polly-ast -analyze \
; RUN: -polly-prevect-width=16 < %s | \
; RUN: FileCheck %s -check-prefix=VEC16
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Three zero-initialized 1536 x 1536 float matrices, each 16-byte aligned.
; @foo stores into C and only loads from A and B.
@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
@A = common global [1536 x [1536 x float]] zeroinitializer, align 16
@B = common global [1536 x [1536 x float]] zeroinitializer, align 16
; foo: 1536 x 1536 single-precision matrix multiply, C = A * B, expressed as
; a three-deep loop nest. In pseudo-C:
;
;   for (i = 0; i < 1536; i++)         // %indvar4, header for.cond1.preheader
;     for (j = 0; j < 1536; j++) {     // %indvar6, header for.body3
;       C[i][j] = 0;
;       for (k = 0; k < 1536; k++)     // %indvar,  body   for.body8
;         C[i][j] += A[i][k] * B[k][j];
;     }
;
; Takes no arguments and returns nothing; it only touches the globals.
; NOTE(review): the expected output names statements Stmt_for_body3 and
; Stmt_for_body8, which appear to be derived from the block labels
; for.body3 and for.body8 -- keep those labels stable when editing.
; Function Attrs: nounwind uwtable
define void @foo() #0 {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.cond1.preheader
; Outermost loop header: i = %indvar4, starting at 0.
for.cond1.preheader: ; preds = %entry.split, %for.inc28
%indvar4 = phi i64 [ 0, %entry.split ], [ %indvar.next5, %for.inc28 ]
br label %for.body3
; Middle loop header: j = %indvar6, starting at 0.
for.body3: ; preds = %for.cond1.preheader, %for.inc25
%indvar6 = phi i64 [ 0, %for.cond1.preheader ], [ %indvar.next7, %for.inc25 ]
; %arrayidx24 = &C[i][j]; the inner loop reuses this pointer as accumulator.
%arrayidx24 = getelementptr [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvar4, i64 %indvar6
; C[i][j] = 0 before the reduction over k.
store float 0.000000e+00, float* %arrayidx24, align 4
br label %for.body8
; Innermost loop (single-block, branches back to itself): k = %indvar.
for.body8: ; preds = %for.body3, %for.body8
%indvar = phi i64 [ 0, %for.body3 ], [ %indvar.next, %for.body8 ]
; %arrayidx16 = &A[i][k], %arrayidx20 = &B[k][j].
%arrayidx16 = getelementptr [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvar4, i64 %indvar
%arrayidx20 = getelementptr [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvar, i64 %indvar6
; C[i][j] = C[i][j] + A[i][k] * B[k][j], going through memory on every
; iteration (load C, multiply, add, store C).
%0 = load float, float* %arrayidx24, align 4
%1 = load float, float* %arrayidx16, align 4
%2 = load float, float* %arrayidx20, align 4
%mul = fmul float %1, %2
%add = fadd float %0, %mul
store float %add, float* %arrayidx24, align 4
; k++; repeat while k != 1536.
%indvar.next = add i64 %indvar, 1
%exitcond = icmp ne i64 %indvar.next, 1536
br i1 %exitcond, label %for.body8, label %for.inc25
; Latch of the middle loop: j++; repeat while j != 1536.
for.inc25: ; preds = %for.body8
%indvar.next7 = add i64 %indvar6, 1
%exitcond8 = icmp ne i64 %indvar.next7, 1536
br i1 %exitcond8, label %for.body3, label %for.inc28
; Latch of the outermost loop: i++; repeat while i != 1536.
for.inc28: ; preds = %for.inc25
%indvar.next5 = add i64 %indvar4, 1
%exitcond9 = icmp ne i64 %indvar.next5, 1536
br i1 %exitcond9, label %for.cond1.preheader, label %for.end30
for.end30: ; preds = %for.inc28
ret void
}
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Expected AST when no -polly-prevect-width is given: in both loop nests the
; innermost loop carries "#pragma simd" and runs 4 iterations (0..3), and the
; loop that supplies the 4 * c multiplier runs 8 iterations (0..7), so
; together they span 32 iterations of the second statement dimension.
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 47; c1 += 1)
; CHECK: for (int c2 = 0; c2 <= 31; c2 += 1)
; CHECK: for (int c3 = 0; c3 <= 7; c3 += 1)
; CHECK: #pragma simd
; CHECK: for (int c4 = 0; c4 <= 3; c4 += 1)
; CHECK: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 4 * c3 + c4);
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 47; c1 += 1)
; CHECK: for (int c2 = 0; c2 <= 47; c2 += 1)
; CHECK: for (int c3 = 0; c3 <= 31; c3 += 1)
; CHECK: for (int c4 = 0; c4 <= 7; c4 += 1)
; CHECK: for (int c5 = 0; c5 <= 31; c5 += 1)
; CHECK: #pragma simd
; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
; Expected AST with -polly-prevect-width=16: the "#pragma simd" loop now runs
; 16 iterations (0..15) and the loop supplying the 16 * c multiplier shrinks
; to 2 iterations (0..1), so the pair still spans 32 iterations. All other
; loop bounds are the same as in the expectations above.
; VEC16: {
; VEC16: #pragma known-parallel
; VEC16: for (int c0 = 0; c0 <= 47; c0 += 1)
; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1)
; VEC16: for (int c2 = 0; c2 <= 31; c2 += 1)
; VEC16: for (int c3 = 0; c3 <= 1; c3 += 1)
; VEC16: #pragma simd
; VEC16: for (int c4 = 0; c4 <= 15; c4 += 1)
; VEC16: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4);
; VEC16: #pragma known-parallel
; VEC16: for (int c0 = 0; c0 <= 47; c0 += 1)
; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1)
; VEC16: for (int c2 = 0; c2 <= 47; c2 += 1)
; VEC16: for (int c3 = 0; c3 <= 31; c3 += 1)
; VEC16: for (int c4 = 0; c4 <= 1; c4 += 1)
; VEC16: for (int c5 = 0; c5 <= 31; c5 += 1)
; VEC16: #pragma simd
; VEC16: for (int c6 = 0; c6 <= 15; c6 += 1)
; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5);
; VEC16: }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.5.0 "}