SLP vectorizer has an estimation for gather/buildvector nodes, which contain some scalar loads. SLP vectorizer performs pretty similar (but large in SLOCs) estimation, which not always correct. Instead, this patch implements clustering analysis and actual node allocation with the full analysis for the vectorized clustered scalars (not only loads, but also some other instructions) with the correct cost estimation and vector insert instructions. Improves overall vectorization quality and simplifies analysis/estimations. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/104144
47 lines
2.8 KiB
LLVM
47 lines
2.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -passes=slp-vectorizer -S < %s | FileCheck %s
|
|
|
|
@bar = external global [4 x [4 x i32]], align 4
|
|
@dct_luma = external global [4 x [4 x i32]], align 4
|
|
|
|
define void @foo() local_unnamed_addr {
|
|
; CHECK-LABEL: @foo(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef
|
|
; CHECK-NEXT: store i32 [[ADD277]], ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 1), align 4
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 0), align 4
|
|
; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 undef, i32 poison, i32 poison>, i32 [[TMP0]], i32 0
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP2]], <2 x i32> [[TMP1]], i64 2)
|
|
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 poison, i32 undef, i32 undef>, i32 [[ADD277]], i32 1
|
|
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], <i32 6, i32 6, i32 6, i32 6>
|
|
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[ARRAYIDX372]], align 4
|
|
; CHECK-NEXT: unreachable
|
|
;
|
|
entry:
|
|
%add277 = add nsw i32 undef, undef
|
|
store i32 %add277, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 1), align 4
|
|
%0 = load i32, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 0), align 4
|
|
%sub355 = add nsw i32 undef, %0
|
|
%shr.i = ashr i32 %sub355, 6
|
|
%arrayidx372 = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0
|
|
store i32 %shr.i, ptr %arrayidx372, align 4
|
|
%sub355.1 = add nsw i32 undef, %add277
|
|
%shr.i.1 = ashr i32 %sub355.1, 6
|
|
%arrayidx372.1 = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 1
|
|
store i32 %shr.i.1, ptr %arrayidx372.1, align 4
|
|
%1 = load i32, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4
|
|
%sub355.2 = add nsw i32 undef, %1
|
|
%shr.i.2 = ashr i32 %sub355.2, 6
|
|
%arrayidx372.2 = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 2
|
|
store i32 %shr.i.2, ptr %arrayidx372.2, align 4
|
|
%2 = load i32, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 3), align 4
|
|
%sub355.3 = add nsw i32 undef, %2
|
|
%shr.i.3 = ashr i32 %sub355.3, 6
|
|
%arrayidx372.3 = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 3
|
|
store i32 %shr.i.3, ptr %arrayidx372.3, align 4
|
|
unreachable
|
|
}
|