In the following loop:
void foo(int *a, int *b, int N) {
for (int i=0; i<N; ++i)
a[i + 4] = a[i] + b[i];
}
The loop dependence constrains the VF to a maximum of (4, fixed), which
would mean using <4 x i32> as the vector type in vectorization.
Extending this to scalable vectorization, a VF of (4, scalable) implies
a vector type of <vscale x 4 x i32>. To determine whether this is legal,
the maximum value of vscale must be taken into account. For this example,
unless max(vscale)=1, it's unsafe to vectorize.
For SVE, the number of bits in an SVE register is architecturally
defined to be a multiple of 128 bits with a maximum of 2048 bits, thus
the maximum vscale is 16. In the loop above it is therefore unfeasible
to vectorize with SVE. However, in this loop:
void foo(int *a, int *b, int N) {
#pragma clang loop vectorize_width(X, scalable)
for (int i=0; i<N; ++i)
a[i + 32] = a[i] + b[i];
}
As long as max(vscale) multiplied by the number of lanes 'X' doesn't
exceed the dependence distance, it is safe to vectorize. For SVE a VF of
(2, scalable) is within this constraint: with max(vscale)=16, a
<vscale x 2 x i32> vector holds at most 16 x 2 = 32 elements, which does
not exceed the dependence distance of 32, so there are no dependencies
between lanes. For any number of lanes larger than this it would be
unsafe to vectorize.
This patch extends 'computeFeasibleMaxVF' to legalize scalable VFs
specified as loop hints, implementing the following behaviour:
* If the backend does not support scalable vectors, ignore the hint.
* If scalable vectorization is unfeasible given the loop
dependence, like in the first example above for SVE, then use a
fixed VF.
* Accept scalable VFs if it's safe to do so.
* Otherwise, clamp scalable VFs that exceed the maximum safe VF.
Reviewed By: sdesmalen, fhahn, david-arm
Differential Revision: https://reviews.llvm.org/D91718
102 lines
4.1 KiB
LLVM
102 lines
4.1 KiB
LLVM
; REQUIRES: asserts
; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s

; The run line above forces a vector width of 2 for both the main loop and the
; epilogue, and merges the loop-vectorize debug output (stderr) into the stream
; that FileCheck reads, so the directives below match debug messages.

target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"

; Currently we cannot handle reduction loops.
;
; @f1 sums the zero-extended bytes A[0 .. n) into an i32 accumulator. The phi
; %sum.02 carries the running sum around the loop (the reduction), and the
; final value leaves the loop through %add.lcssa. When n <= 0 the loop is
; skipped and 0 is returned.
; CHECK: LV: Checking a loop in "f1"
; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.

define signext i32 @f1(i8* noalias %A, i32 signext %n) {
entry:
  ; Only enter the loop when n > 0.
  %cmp1 = icmp sgt i32 %n, 0
  br i1 %cmp1, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  ; Trip count widened to the i64 induction type.
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; Running sum: starts at 0 and is updated by %add each iteration.
  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %sum.02, %conv
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.body, label %for.end.loopexit

for.end.loopexit:                                 ; preds = %for.body
  ; Single-entry phi that carries the final sum out of the loop.
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret i32 %sum.0.lcssa
}

; Currently we cannot handle live-out variables that are recurrences.
;
; @f2 increments every byte A[0 .. n) in place and returns the final value of
; the induction variable truncated to i32. That value (%indvars.iv.next) is
; used after the loop via %inc.lcssa.wide, which makes the induction a
; live-out. When n <= 0 the loop is skipped and 0 is returned.
; CHECK: LV: Checking a loop in "f2"
; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.

define signext i32 @f2(i8* noalias %A, i32 signext %n) {
entry:
  ; Only enter the loop when n > 0.
  %cmp1 = icmp sgt i32 %n, 0
  br i1 %cmp1, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  ; Trip count widened to the i64 induction type.
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; A[i] = A[i] + 1 (load and store use the same address).
  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx, align 1
  %add = add i8 %0, 1
  %arrayidx3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
  store i8 %add, i8* %arrayidx3, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.body, label %for.end.loopexit

for.end.loopexit:                                 ; preds = %for.body
  ; The incremented induction value is consumed outside the loop here.
  %inc.lcssa.wide = phi i64 [ %indvars.iv.next, %for.body ]
  %1 = trunc i64 %inc.lcssa.wide to i32
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.end.loopexit ]
  ret i32 %i.0.lcssa
}

; Currently we cannot handle widened/truncated inductions.
;
; @f3 stores the low 8 bits of the loop index into A[i] for i in [0, n). The
; stored value is the i64 induction variable truncated first to i32 and then to
; i8, i.e. a truncated induction. When n <= 0 the loop is skipped.
; CHECK: LV: Checking a loop in "f3"
; CHECK: LEV: Unable to vectorize epilogue because the loop is not a supported candidate.

define void @f3(i8* noalias %A, i32 signext %n) {
entry:
  ; Only enter the loop when n > 0.
  %cmp1 = icmp sgt i32 %n, 0
  br i1 %cmp1, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  ; Trip count widened to the i64 induction type.
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; Truncation chain on the induction variable: i64 -> i32 -> i8.
  %0 = trunc i64 %indvars.iv to i32
  %conv = trunc i32 %0 to i8
  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
  store i8 %conv, i8* %arrayidx, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.body, label %for.end.loopexit

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}