Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
Anton Afanasyev ab2c499d3a [SLP] Add insertelement instructions to vectorizable tree
Add a new type of tree node for an `InsertElementInst` chain that forms a vector.
These instructions can be either removed or replaced by shuffles during
vectorization, so adding this node to the cost model lets us estimate their
cost naturally, get rid of the `CompensateCost` tricks, and reduce further work
for InstCombine. This fixes PR40522 and PR35732 in a natural way. This patch
is also the first step towards revectorization of partially vectorized code
(to fix PR42022 completely). After adding inserts to the tree, the next step is
to add vector instructions there (for instance, to merge `store <2 x float>`
and `store <2 x float>` into `store <4 x float>`).

Fixes PR40522 and PR35732.

Differential Revision: https://reviews.llvm.org/D98714
2021-05-13 07:41:45 +03:00

108 lines
5.0 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -instcombine -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s
; test1: four scalar float -> i32 conversions written by four scalar stores.
; Input: %a..%d each go through a scalar fptosi; the results are stored to
; %p and to the i32 elements at offsets 1, 2 and 3 from %p, every store using
; align 4 and the same !tbaa tag.
; Expected output (per the assertions below): an insertelement chain builds a
; <4 x float> starting from poison, a single vector fptosi converts it, and a
; single <4 x i32> store (align 4, tbaa kept) writes through a bitcast of %p.
; The TBAA0 capture is bound in this function's store assertion; the
; functions that follow reference it without rebinding.
define void @test1(float %a, float %b, float %c, float %d, i32* nocapture %p) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0:![0-9]+]]
; CHECK-NEXT: ret void
;
entry:
; Scalar conversions, one per output element.
%conv = fptosi float %a to i32
%conv1 = fptosi float %b to i32
%conv3 = fptosi float %c to i32
%conv5 = fptosi float %d to i32
; Address computations are interleaved with the stores; each store targets
; the next consecutive i32 element starting at %p.
%incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
store i32 %conv, i32* %p, align 4, !tbaa !2
%incdec.ptr8 = getelementptr inbounds i32, i32* %p, i64 2
store i32 %conv1, i32* %incdec.ptr, align 4, !tbaa !2
%incdec.ptr10 = getelementptr inbounds i32, i32* %p, i64 3
store i32 %conv3, i32* %incdec.ptr8, align 4, !tbaa !2
store i32 %conv5, i32* %incdec.ptr10, align 4, !tbaa !2
ret void
}
; test1_vec: four scalar float -> i32 conversions gathered into a vector and
; written by one vector store.
; Input: each fptosi result is placed into a <4 x i32> by an insertelement
; chain that starts from undef (lanes 0..3), and the finished vector is stored
; to %p with align 16.
; Expected output (per the assertions below): the insertelement chain is built
; on the float side instead (<4 x float>, starting from poison), followed by a
; single vector fptosi and a direct <4 x i32> store to %p (align 16, tbaa
; kept); no bitcast is expected because %p is already a <4 x i32>*.
define void @test1_vec(float %a, float %b, float %c, float %d, <4 x i32>* nocapture %p) {
; CHECK-LABEL: @test1_vec(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[P:%.*]], align 16, !tbaa [[TBAA0]]
; CHECK-NEXT: ret void
;
entry:
; Each conversion is immediately inserted into the next lane of the vector
; being built.
%conv = fptosi float %a to i32
%vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
%conv1 = fptosi float %b to i32
%vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1
%conv3 = fptosi float %c to i32
%vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2
%conv5 = fptosi float %d to i32
%vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3
; Single vector store of the fully populated <4 x i32>.
store <4 x i32> %vecinit6, <4 x i32>* %p, align 16, !tbaa !2
ret void
}
; test2: integer counterpart of test1.
; Input: %a..%d are each incremented with a scalar `add nsw ..., 1`; the
; results are stored to %p and to the i32 elements at offsets 1, 2 and 3 from
; %p, every store using align 4 and the same !tbaa tag.
; Expected output (per the assertions below): an insertelement chain builds a
; <4 x i32> starting from poison, a single vector `add nsw` with an all-ones
; constant vector performs the increment, and a single <4 x i32> store
; (align 4, tbaa kept) writes through a bitcast of %p.
define void @test2(i32 %a, i32 %b, i32 %c, i32 %d, i32* nocapture %p) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
; CHECK-NEXT: ret void
;
entry:
; Scalar increments, one per output element.
%add = add nsw i32 %a, 1
%add1 = add nsw i32 %b, 1
%add3 = add nsw i32 %c, 1
%add5 = add nsw i32 %d, 1
; Address computations are interleaved with the stores; each store targets
; the next consecutive i32 element starting at %p.
%incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
store i32 %add, i32* %p, align 4, !tbaa !2
%incdec.ptr8 = getelementptr inbounds i32, i32* %p, i64 2
store i32 %add1, i32* %incdec.ptr, align 4, !tbaa !2
%incdec.ptr10 = getelementptr inbounds i32, i32* %p, i64 3
store i32 %add3, i32* %incdec.ptr8, align 4, !tbaa !2
store i32 %add5, i32* %incdec.ptr10, align 4, !tbaa !2
ret void
}
; test2_vec: integer counterpart of test1_vec, written with unnamed values.
; Input: each `add nsw ..., 1` result is placed into a <4 x i32> by an
; insertelement chain that starts from undef (lanes 0..3), and the finished
; vector is stored to the pointer argument %4 with align 16.
; Expected output (per the assertions below): the insertelement chain is built
; from the raw arguments (starting from poison), a single vector `add nsw`
; with an all-ones constant vector follows, and the result is stored directly
; to the pointer argument (align 16, tbaa kept).
; This function has no explicit entry label and its arguments and values are
; numbered (%0..%13), so the assertions begin right after the label line and
; the result captures are named TMP6..TMP10.
define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, <4 x i32>* nocapture %4) {
; CHECK-LABEL: @test2_vec(
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP4:%.*]], align 16, !tbaa [[TBAA0]]
; CHECK-NEXT: ret void
;
; Each increment is immediately inserted into the next lane of the vector
; being built.
%6 = add nsw i32 %0, 1
%7 = insertelement <4 x i32> undef, i32 %6, i32 0
%8 = add nsw i32 %1, 1
%9 = insertelement <4 x i32> %7, i32 %8, i32 1
%10 = add nsw i32 %2, 1
%11 = insertelement <4 x i32> %9, i32 %10, i32 2
%12 = add nsw i32 %3, 1
%13 = insertelement <4 x i32> %11, i32 %12, i32 3
; Single vector store of the fully populated <4 x i32>.
store <4 x i32> %13, <4 x i32>* %4, align 16, !tbaa !2
ret void
}
; Metadata attached to the stores in this file through `!tbaa !2`.
; !2 is the tag used on the stores: it lists !3 twice, followed by offset 0.
; !3 ("int") names !4 ("omnipotent char") as its parent, which in turn names
; !5 ("Simple C++ TBAA"), the node with no parent.
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}