Summary: If the load/extractelement/extractvalue instructions are not originally consecutive, the SLP vectorizer is unable to vectorize them. This patch allows such instructions to be reordered. It does not support reordering of repeated instructions; that must be handled in a separate patch. Reviewers: RKSimon, spatel, hfinkel, mkuper, Ayal, ashahid Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D43776 llvm-svn: 329085
113 lines
6.4 KiB
LLVM
113 lines
6.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
|
|
|
|
|
|
|
|
; Test input: two groups of four scalar i32 loads whose addresses are
; consecutive in memory but whose instructions are not in address order,
; multiplied lane by lane and stored to out[0..3]:
;   out[0] = in[1] * inn[0]
;   out[1] = in[3] * inn[1]
;   out[2] = in[2] * inn[3]
;   out[3] = in[0] * inn[2]
; The autogenerated assertions below expect, for each of %in and %inn, a single
; <4 x i32> load followed by a shufflevector that puts the lanes into the order
; the multiplies use them (<1,3,2,0> for %in, <0,1,3,2> for %inn), then one
; vector mul and one vector store.
; NOTE(review): the bare `|` lines inside this function are not LLVM IR; they
; look like residue from the tool this file was extracted with -- confirm and
; strip them before feeding the file to opt.
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
|
|
; CHECK-LABEL: @jumbled-load(
|
|
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
|
|
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
|
|
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
|
|
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
|
|
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
|
|
; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
|
|
; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
|
|
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
|
|
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
|
|
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
|
|
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
|
|
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
|
|
; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
|
|
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
|
|
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
|
|
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
|
|
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
|
|
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
|
|
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
|
|
; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
|
|
; CHECK-NEXT: ret i32 undef
|
|
;
|
|
; Scalar loads from %in, issued at element offsets 0, 3, 1, 2 (not address order).
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
|
|
%load.1 = load i32, i32* %in.addr, align 4
|
|
%gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
|
|
%load.2 = load i32, i32* %gep.1, align 4
|
|
%gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
|
|
%load.3 = load i32, i32* %gep.2, align 4
|
|
%gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
|
|
%load.4 = load i32, i32* %gep.3, align 4
|
|
; Scalar loads from %inn, issued at element offsets 0, 2, 3, 1 (not address order).
%inn.addr = getelementptr inbounds i32, i32* %inn, i64 0
|
|
%load.5 = load i32, i32* %inn.addr, align 4
|
|
%gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 2
|
|
%load.6 = load i32, i32* %gep.4, align 4
|
|
%gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 3
|
|
%load.7 = load i32, i32* %gep.5, align 4
|
|
%gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 1
|
|
%load.8 = load i32, i32* %gep.6, align 4
|
|
; %mul.N is the value stored to out[N-1]; the operand choice here is what fixes
; the shuffle masks expected above.
%mul.1 = mul i32 %load.3, %load.5
|
|
%mul.2 = mul i32 %load.2, %load.8
|
|
%mul.3 = mul i32 %load.4, %load.7
|
|
%mul.4 = mul i32 %load.1, %load.6
|
|
; The four stores hit out[0], out[1], out[2], out[3] in that order.
%gep.7 = getelementptr inbounds i32, i32* %out, i64 0
|
|
store i32 %mul.1, i32* %gep.7, align 4
|
|
%gep.8 = getelementptr inbounds i32, i32* %out, i64 1
|
|
store i32 %mul.2, i32* %gep.8, align 4
|
|
%gep.9 = getelementptr inbounds i32, i32* %out, i64 2
|
|
store i32 %mul.3, i32* %gep.9, align 4
|
|
%gep.10 = getelementptr inbounds i32, i32* %out, i64 3
|
|
store i32 %mul.4, i32* %gep.10, align 4
|
|
|
|
ret i32 undef
|
|
}
|
|
|
|
|
|
; Test input: four scalar i32 loads from %in, issued at element offsets
; 0, 3, 1, 2, where every loaded value feeds two multiply operands:
;   out[0] = in[1] * in[2]
;   out[1] = in[3] * in[3]
;   out[2] = in[2] * in[0]
;   out[3] = in[0] * in[1]
; The autogenerated assertions below expect a single <4 x i32> load plus a
; shufflevector with mask <1,3,2,0> serving as the first multiply operand, and
; the second operand rebuilt from that shuffled vector with
; extractelement/insertelement pairs (source lanes 2, 1, 3, 0), followed by one
; vector mul and one vector store.
; NOTE(review): the bare `|` lines inside this function are not LLVM IR; they
; look like residue from the tool this file was extracted with -- confirm and
; strip them before feeding the file to opt.
define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias nocapture %out) {
|
|
; CHECK-LABEL: @jumbled-load-multiuses(
|
|
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
|
|
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
|
|
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
|
|
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
|
|
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
|
|
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
|
|
; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
|
|
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
|
|
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
|
|
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
|
|
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
|
|
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2
|
|
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
|
|
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
|
|
; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[TMP10]]
|
|
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
|
|
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
|
|
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
|
|
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
|
|
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
|
|
; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
|
|
; CHECK-NEXT: ret i32 undef
|
|
;
|
|
; Scalar loads from %in, issued at element offsets 0, 3, 1, 2 (not address order).
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
|
|
%load.1 = load i32, i32* %in.addr, align 4
|
|
%gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
|
|
%load.2 = load i32, i32* %gep.1, align 4
|
|
%gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
|
|
%load.3 = load i32, i32* %gep.2, align 4
|
|
%gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
|
|
%load.4 = load i32, i32* %gep.3, align 4
|
|
; Both operands of every multiply come from the same four loads, so each load
; has two uses (%load.2 is used twice by %mul.2 alone).
%mul.1 = mul i32 %load.3, %load.4
|
|
%mul.2 = mul i32 %load.2, %load.2
|
|
%mul.3 = mul i32 %load.4, %load.1
|
|
%mul.4 = mul i32 %load.1, %load.3
|
|
; The four stores hit out[0], out[1], out[2], out[3] in that order.
%gep.7 = getelementptr inbounds i32, i32* %out, i64 0
|
|
store i32 %mul.1, i32* %gep.7, align 4
|
|
%gep.8 = getelementptr inbounds i32, i32* %out, i64 1
|
|
store i32 %mul.2, i32* %gep.8, align 4
|
|
%gep.9 = getelementptr inbounds i32, i32* %out, i64 2
|
|
store i32 %mul.3, i32* %gep.9, align 4
|
|
%gep.10 = getelementptr inbounds i32, i32* %out, i64 3
|
|
store i32 %mul.4, i32* %gep.10, align 4
|
|
|
|
ret i32 undef
|
|
}
|