Files
clang-p2996/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
Alina Sbirlea 3f8f7840bf [LoadStoreVectorizer] Change VectorSet to Vector to match head and tail positions. Resolves PR29148.
Summary:
LSV was using two vector sets (heads and tails) to track pairs of adjacent positions to vectorize.
A recent optimization is trying to obtain the longest chain to vectorize and assumes the positions
in heads(H) and tails(T) match, which is not the case if there are multiple tails for the same head.

e.g.:
i1: store a[0]
i2: store a[1]
i3: store a[1]
Leads to:
H: i1
T: i2 i3
Instead of:
H: i1 i1
T: i2 i3
So the positions for instructions that follow i3 will have different indexes in H/T.
This patch resolves PR29148.

This issue also surfaced the fact that if the chain is too long, and TLI
returns a "not-fast" answer, the whole chain will be abandoned for
vectorization, even though a smaller one would be beneficial.
Added a testcase and FIXME for this.

Reviewers: tstellarAMD, arsenm, jlebar

Subscribers: mzolotukhin, wdng, llvm-commits

Differential Revision: https://reviews.llvm.org/D24057

llvm-svn: 280179
2016-08-30 23:53:59 +00:00

118 lines
3.5 KiB
LLVM

; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Vectorized subsets of the load/store chains in the presence of
; interleaved loads/stores
; CHECK-LABEL: @interleave_2L_2S(
; CHECK: load <2 x i32>
; CHECK: load i32
; CHECK: store <2 x i32>
; CHECK: load i32
define void @interleave_2L_2S(i32* noalias %ptr) {
  ; Addresses of ptr[0], ptr[1] and ptr[2].
  %p0 = getelementptr i32, i32* %ptr, i64 0
  %p1 = getelementptr i32, i32* %ptr, i64 1
  %p2 = getelementptr i32, i32* %ptr, i64 2

  ; Two loads ahead of the stores, issued as ptr[1] then ptr[0].
  %before1 = load i32, i32* %p1, align 4
  %before0 = load i32, i32* %p0, align 4

  ; Two stores of zero, also issued as ptr[1] then ptr[0].
  store i32 0, i32* %p1, align 4
  store i32 0, i32* %p0, align 4

  ; Two loads after the stores: ptr[1] and ptr[2].
  %after1 = load i32, i32* %p1, align 4
  %after2 = load i32, i32* %p2, align 4
  ret void
}
; CHECK-LABEL: @interleave_3L_2S_1L(
; CHECK: load <3 x i32>
; CHECK: store <2 x i32>
; CHECK: load i32
define void @interleave_3L_2S_1L(i32* noalias %ptr) {
  ; Addresses of ptr[0], ptr[1] and ptr[2].
  %p0 = getelementptr i32, i32* %ptr, i64 0
  %p1 = getelementptr i32, i32* %ptr, i64 1
  %p2 = getelementptr i32, i32* %ptr, i64 2

  ; Two loads ahead of the stores, issued in address order: ptr[0], ptr[1].
  %before0 = load i32, i32* %p0, align 4
  %before1 = load i32, i32* %p1, align 4

  ; Two stores of zero, issued as ptr[1] then ptr[0].
  store i32 0, i32* %p1, align 4
  store i32 0, i32* %p0, align 4

  ; Two loads after the stores: ptr[1] and ptr[2].
  %after1 = load i32, i32* %p1, align 4
  %after2 = load i32, i32* %p2, align 4
  ret void
}
; CHECK-LABEL: @chain_suffix(
; CHECK: load i32
; CHECK: store <2 x i32>
; CHECK: load <2 x i32>
define void @chain_suffix(i32* noalias %ptr) {
  ; Addresses of ptr[0], ptr[1] and ptr[2].
  %p0 = getelementptr i32, i32* %ptr, i64 0
  %p1 = getelementptr i32, i32* %ptr, i64 1
  %p2 = getelementptr i32, i32* %ptr, i64 2

  ; A single load of ptr[0] ahead of the stores.
  %before0 = load i32, i32* %p0, align 4

  ; Two stores of zero, issued as ptr[1] then ptr[0].
  store i32 0, i32* %p1, align 4
  store i32 0, i32* %p0, align 4

  ; Two loads after the stores: ptr[1] and ptr[2].
  %after1 = load i32, i32* %p1, align 4
  %after2 = load i32, i32* %p2, align 4
  ret void
}
; CHECK-LABEL: @chain_prefix_suffix(
; CHECK: load <2 x i32>
; CHECK: store <2 x i32>
; CHECK: load <3 x i32>
define void @chain_prefix_suffix(i32* noalias %ptr) {
  ; Addresses of ptr[0] through ptr[3].
  %p0 = getelementptr i32, i32* %ptr, i64 0
  %p1 = getelementptr i32, i32* %ptr, i64 1
  %p2 = getelementptr i32, i32* %ptr, i64 2
  %p3 = getelementptr i32, i32* %ptr, i64 3

  ; Two loads ahead of the stores: ptr[0] and ptr[1].
  %before0 = load i32, i32* %p0, align 4
  %before1 = load i32, i32* %p1, align 4

  ; Two stores of zero: ptr[1] and ptr[2].
  store i32 0, i32* %p1, align 4
  store i32 0, i32* %p2, align 4

  ; Three loads after the stores: ptr[1], ptr[2] and ptr[3].
  %after1 = load i32, i32* %p1, align 4
  %after2 = load i32, i32* %p2, align 4
  %after3 = load i32, i32* %p3, align 4
  ret void
}
; FIXME: If the chain is too long and TLI says misaligned is not fast,
; then LSV fails to vectorize anything in that chain.
; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
; CHECK-LABEL: @interleave_get_longest(
; CHECK: load <3 x i32>
; CHECK: load i32
; CHECK: store <2 x i32> zeroinitializer
; CHECK: load i32
; CHECK: load i32
; CHECK: load i32
define void @interleave_get_longest(i32* noalias %ptr) {
; Addresses of ptr[0] through ptr[3].
%tmp1 = getelementptr i32, i32* %ptr, i64 0
%tmp2 = getelementptr i32, i32* %ptr, i64 1
%tmp3 = getelementptr i32, i32* %ptr, i64 2
%tmp4 = getelementptr i32, i32* %ptr, i64 3
; Two loads ahead of the stores, issued as ptr[1] then ptr[0].
%l1 = load i32, i32* %tmp2, align 4
%l2 = load i32, i32* %tmp1, align 4
; Two stores of zero, issued as ptr[1] then ptr[0].
store i32 0, i32* %tmp2, align 4
store i32 0, i32* %tmp1, align 4
; Loads after the stores: ptr[1], ptr[2], then ptr[3] three times over,
; so the last address is read by several distinct load instructions.
%l3 = load i32, i32* %tmp2, align 4
%l4 = load i32, i32* %tmp3, align 4
%l5 = load i32, i32* %tmp4, align 4
%l6 = load i32, i32* %tmp4, align 4
%l7 = load i32, i32* %tmp4, align 4
ret void
}