Add a new type of tree node for an `InsertElementInst` chain that forms a vector. These instructions can be either removed or replaced by shuffles during vectorization, so we can add this node to the cost model and estimate their cost naturally, getting rid of the `CompensateCost` tricks and reducing further work for InstCombine. This fixes PR40522 and PR35732 in a natural way. This patch is also the first step towards revectorization of partially vectorized code (to fix PR42022 completely). After adding inserts to the tree, the next step is to add vector instructions there (for instance, to merge `store <2 x float>` and `store <2 x float>` into `store <4 x float>`). Fixes PR40522 and PR35732. Differential Revision: https://reviews.llvm.org/D98714
108 lines
5.0 KiB
LLVM
108 lines
5.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -instcombine -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s

; test1: four scalar fptosi results are stored to the consecutive i32 slots
; p[0]..p[3]. The assertions below expect an insertelement chain that builds a
; <4 x float> from %a..%d, one vector fptosi, and a single <4 x i32> store
; through a bitcast of %p that keeps the scalar stores' 4-byte alignment and
; their TBAA tag.
define void @test1(float %a, float %b, float %c, float %d, i32* nocapture %p) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0:![0-9]+]]
; CHECK-NEXT: ret void
;
entry:
  ; One scalar float -> i32 conversion per argument.
  %conv = fptosi float %a to i32
  %conv1 = fptosi float %b to i32
  %conv3 = fptosi float %c to i32
  %conv5 = fptosi float %d to i32
  ; Stores to p[0]..p[3]; each address computation is interleaved with the
  ; store to the previous slot.
  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
  store i32 %conv, i32* %p, align 4, !tbaa !2
  %incdec.ptr8 = getelementptr inbounds i32, i32* %p, i64 2
  store i32 %conv1, i32* %incdec.ptr, align 4, !tbaa !2
  %incdec.ptr10 = getelementptr inbounds i32, i32* %p, i64 3
  store i32 %conv3, i32* %incdec.ptr8, align 4, !tbaa !2
  store i32 %conv5, i32* %incdec.ptr10, align 4, !tbaa !2
  ret void
}
; test1_vec: the same four fptosi conversions as a scalar-store variant would
; use, but the i32 results are assembled into a <4 x i32> with an
; insertelement chain (starting from undef) and written with one vector store.
; The assertions below expect the chain to be rebuilt on the float inputs,
; followed by a single vector fptosi and the original 16-byte-aligned store.
define void @test1_vec(float %a, float %b, float %c, float %d, <4 x i32>* nocapture %p) {
; CHECK-LABEL: @test1_vec(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[P:%.*]], align 16, !tbaa [[TBAA0]]
; CHECK-NEXT: ret void
;
entry:
  ; Each lane: convert one float argument, then insert it at the next index.
  %conv = fptosi float %a to i32
  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
  %conv1 = fptosi float %b to i32
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %conv1, i32 1
  %conv3 = fptosi float %c to i32
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %conv3, i32 2
  %conv5 = fptosi float %d to i32
  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %conv5, i32 3
  ; The fully built vector is stored once.
  store <4 x i32> %vecinit6, <4 x i32>* %p, align 16, !tbaa !2
  ret void
}
; test2: integer counterpart of the scalar-store case. Each of %a..%d is
; incremented with `add nsw ..., 1` and stored to the consecutive i32 slots
; p[0]..p[3]. The assertions below expect an insertelement chain building a
; <4 x i32> from the arguments, one vector `add nsw` with an all-ones-valued
; (i32 1 in every lane) constant, and a single <4 x i32> store through a
; bitcast of %p with the scalar stores' 4-byte alignment.
define void @test2(i32 %a, i32 %b, i32 %c, i32 %d, i32* nocapture %p) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
; CHECK-NEXT: ret void
;
entry:
  ; One scalar increment per argument.
  %add = add nsw i32 %a, 1
  %add1 = add nsw i32 %b, 1
  %add3 = add nsw i32 %c, 1
  %add5 = add nsw i32 %d, 1
  ; Stores to p[0]..p[3]; each address computation is interleaved with the
  ; store to the previous slot.
  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
  store i32 %add, i32* %p, align 4, !tbaa !2
  %incdec.ptr8 = getelementptr inbounds i32, i32* %p, i64 2
  store i32 %add1, i32* %incdec.ptr, align 4, !tbaa !2
  %incdec.ptr10 = getelementptr inbounds i32, i32* %p, i64 3
  store i32 %add3, i32* %incdec.ptr8, align 4, !tbaa !2
  store i32 %add5, i32* %incdec.ptr10, align 4, !tbaa !2
  ret void
}
; test2_vec: integer counterpart of the insertelement-chain case. Each
; argument is incremented with `add nsw ..., 1`, the results are inserted into
; a <4 x i32> (starting from undef), and the vector is stored once. The
; assertions below expect the chain to be rebuilt on the raw arguments,
; followed by one vector `add nsw` and the original 16-byte-aligned store.
;
; All values here are unnamed: the arguments are %0-%4, the unlabeled entry
; block takes the implicit number %5, and so the first instruction is %6.
define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, <4 x i32>* nocapture %4) {
; CHECK-LABEL: @test2_vec(
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP4:%.*]], align 16, !tbaa [[TBAA0]]
; CHECK-NEXT: ret void
;
  ; Each lane: increment one argument, then insert it at the next index.
  %6 = add nsw i32 %0, 1
  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  %8 = add nsw i32 %1, 1
  %9 = insertelement <4 x i32> %7, i32 %8, i32 1
  %10 = add nsw i32 %2, 1
  %11 = insertelement <4 x i32> %9, i32 %10, i32 2
  %12 = add nsw i32 %3, 1
  %13 = insertelement <4 x i32> %11, i32 %12, i32 3
  ; The fully built vector is stored once.
  store <4 x i32> %13, <4 x i32>* %4, align 16, !tbaa !2
  ret void
}
; TBAA metadata attached to the stores above as `!tbaa !2`.
;   !2 - access tag: base type !3, access type !3, offset 0.
;   !3 - scalar type node "int", parent !4.
;   !4 - scalar type node "omnipotent char", parent !5.
;   !5 - root node of the type tree, "Simple C++ TBAA".
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
|