clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
Alexey Bataev 428e9d9d87 [SLP] Fix PR36481: vectorize reassociated instructions.
Summary:
If load/extractelement/extractvalue instructions are not originally
consecutive, the SLP vectorizer is unable to vectorize them. This patch
allows such instructions to be reordered.

The patch does not support reordering of repeated instructions; that must
be handled in a separate patch.

Reviewers: RKSimon, spatel, hfinkel, mkuper, Ayal, ashahid

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D43776

llvm-svn: 329085
2018-04-03 17:14:47 +00:00
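
In effect (a minimal illustrative sketch, not taken from the commit or from the test file below; the function and value names are invented), the change lets SLP replace scalar loads whose offsets are consumed out of order with a single contiguous wide load followed by a shufflevector whose mask restores the use order:

    define void @sketch(i32* %p) {
      ; Before: scalar loads whose offsets (1, then 0) do not match memory order.
      %g1 = getelementptr inbounds i32, i32* %p, i64 1
      %x  = load i32, i32* %g1, align 4
      %y  = load i32, i32* %p, align 4
      ; After SLP (conceptually): one contiguous load, then a reordering shuffle.
      %vp = bitcast i32* %p to <2 x i32>*
      %v  = load <2 x i32>, <2 x i32>* %vp, align 4
      %r  = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
      ret void
    }

The two tests below check exactly this shape for four i32 elements, with the reorder masks <1, 3, 2, 0> and <0, 1, 3, 2>.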

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
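; Both functions below load four i32 elements through GEPs whose indices are
; permuted relative to the order in which the multiplies consume the values.
; The CHECK lines verify that SLP emits one <4 x i32> load plus a reordering
; shufflevector instead of four scalar loads.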
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT: ret i32 undef
;
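; The reorder masks above recover the jumbled use order of the scalar code below:
; the wide load of %in yields <load.1, load.3, load.4, load.2>, and the multiplies
; consume those values (in store order) as load.3, load.2, load.4, load.1, i.e.
; lanes <1, 3, 2, 0>; the same reasoning on %inn gives <0, 1, 3, 2>.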
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
%load.1 = load i32, i32* %in.addr, align 4
%gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
%load.2 = load i32, i32* %gep.1, align 4
%gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
%load.3 = load i32, i32* %gep.2, align 4
%gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
%load.4 = load i32, i32* %gep.3, align 4
%inn.addr = getelementptr inbounds i32, i32* %inn, i64 0
%load.5 = load i32, i32* %inn.addr, align 4
%gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 2
%load.6 = load i32, i32* %gep.4, align 4
%gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 3
%load.7 = load i32, i32* %gep.5, align 4
%gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 1
%load.8 = load i32, i32* %gep.6, align 4
%mul.1 = mul i32 %load.3, %load.5
%mul.2 = mul i32 %load.2, %load.8
%mul.3 = mul i32 %load.4, %load.7
%mul.4 = mul i32 %load.1, %load.6
%gep.7 = getelementptr inbounds i32, i32* %out, i64 0
store i32 %mul.1, i32* %gep.7, align 4
%gep.8 = getelementptr inbounds i32, i32* %out, i64 1
store i32 %mul.2, i32* %gep.8, align 4
%gep.9 = getelementptr inbounds i32, i32* %out, i64 2
store i32 %mul.3, i32* %gep.9, align 4
%gep.10 = getelementptr inbounds i32, i32* %out, i64 3
store i32 %mul.4, i32* %gep.10, align 4
ret i32 undef
}
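; Same pattern, but each load below is used twice by the multiplies, so the
; second operand bundle cannot simply be the reordered wide load; see the
; comment after the CHECK block.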
define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load-multiuses(
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[TMP10]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
; CHECK-NEXT: ret i32 undef
;
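; Only the first mul operand is the reordered load itself; the second operand
; vector <load.4, load.2, load.1, load.3> is gathered lane by lane from the
; REORDER_SHUFFLE value with extractelement/insertelement rather than being
; loaded and shuffled a second time.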
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
%load.1 = load i32, i32* %in.addr, align 4
%gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
%load.2 = load i32, i32* %gep.1, align 4
%gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
%load.3 = load i32, i32* %gep.2, align 4
%gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
%load.4 = load i32, i32* %gep.3, align 4
%mul.1 = mul i32 %load.3, %load.4
%mul.2 = mul i32 %load.2, %load.2
%mul.3 = mul i32 %load.4, %load.1
%mul.4 = mul i32 %load.1, %load.3
%gep.7 = getelementptr inbounds i32, i32* %out, i64 0
store i32 %mul.1, i32* %gep.7, align 4
%gep.8 = getelementptr inbounds i32, i32* %out, i64 1
store i32 %mul.2, i32* %gep.8, align 4
%gep.9 = getelementptr inbounds i32, i32* %out, i64 2
store i32 %mul.3, i32* %gep.9, align 4
%gep.10 = getelementptr inbounds i32, i32* %out, i64 3
store i32 %mul.4, i32* %gep.10, align 4
ret i32 undef
}