This patch generalizes the analysis of scalars. The main part is outlined into a lambda, which can be used to find reused inserted scalars and emit a shuffle for them instead of multiple insertelement instructions if the permutation has already been found. That is, some scalars are produced by permuting previously vectorized nodes, and some are inserted directly. Reworked part of D110978. Differential Revision: https://reviews.llvm.org/D146564
37 lines
1.9 KiB
LLVM
37 lines
1.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse4.2 | FileCheck %s
|
|
|
|
; Test data: two zero-initialized arrays of four i32 values.
; @fn1 stores its results into @a and loads its inputs from @b.
@a = common local_unnamed_addr global [4 x i32] zeroinitializer, align 4
|
|
@b = common local_unnamed_addr global [4 x i32] zeroinitializer, align 4
|
|
|
|
; @fn1: scalar input for the SLP vectorizer.
;
; Loads the four i32 elements of @b one at a time, compares each against zero
; (signed greater-than) and stores a select result into @a at a rotated index:
;   a[3] = b[0] > 0 ? 8                  : 0
;   a[0] = b[1] > 0 ? b[1]               : 6
;   a[1] = b[2] > 0 ? ptrtoint(@fn1)     : 0
;   a[2] = b[3] > 0 ? ptrtoint(@fn1)     : 0
;
; The autogenerated assertions that follow expect the vectorized form:
;   - one <4 x i32> load of @b and one vector icmp against zero,
;   - a shufflevector blending the loaded vector with a constant vector
;     (mask <4, 1, 6, 6>: lane 1 keeps b[1], the other lanes take constants),
;   - a vector select whose false operand is <0, 6, 0, 0>,
;   - a shufflevector with mask <1, 2, 3, 0> that rotates the lanes into the
;     order in which they are stored to @a,
;   - a single <4 x i32> store to @a.
; The function always returns 0.
define i32 @fn1() {
|
|
; CHECK-LABEL: @fn1(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer
|
|
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 8, i32 poison, i32 ptrtoint (ptr @fn1 to i32), i32 poison>, <4 x i32> <i32 4, i32 1, i32 6, i32 6>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 0, i32 0>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
|
|
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @a, align 4
|
|
; CHECK-NEXT: ret i32 0
|
|
;
|
|
entry:
|
|
; Element 0 of @b selects between the constants 8 and 0; result goes to a[3].
%0 = load i32, ptr @b, align 4
|
|
%cmp = icmp sgt i32 %0, 0
|
|
%cond = select i1 %cmp, i32 8, i32 0
|
|
store i32 %cond, ptr getelementptr inbounds ([4 x i32], ptr @a, i64 0, i32 3), align 4
|
|
; Element 1 of @b is the only lane that reuses the loaded value itself
; (falling back to 6); result goes to a[0].
%1 = load i32, ptr getelementptr ([4 x i32], ptr @b, i64 0, i32 1), align 4
|
|
%cmp1 = icmp sgt i32 %1, 0
|
|
%. = select i1 %cmp1, i32 %1, i32 6
|
|
store i32 %., ptr @a, align 4
|
|
; Element 2 of @b selects between the constant expression ptrtoint(@fn1) and 0;
; result goes to a[1].
%2 = load i32, ptr getelementptr ([4 x i32], ptr @b, i64 0, i32 2), align 4
|
|
%cmp4 = icmp sgt i32 %2, 0
|
|
%3 = select i1 %cmp4, i32 ptrtoint (ptr @fn1 to i32), i32 0
|
|
store i32 %3, ptr getelementptr inbounds ([4 x i32], ptr @a, i64 0, i32 1), align 4
|
|
; Element 3 of @b uses the same constant expression as element 2;
; result goes to a[2].
%4 = load i32, ptr getelementptr ([4 x i32], ptr @b, i64 0, i32 3), align 4
|
|
%cmp6 = icmp sgt i32 %4, 0
|
|
%5 = select i1 %cmp6, i32 ptrtoint (ptr @fn1 to i32), i32 0
|
|
store i32 %5, ptr getelementptr inbounds ([4 x i32], ptr @a, i64 0, i32 2), align 4
|
|
ret i32 0
|
|
}
|