The SLP vectorizer has an estimation for gather/buildvector nodes that contain some scalar loads. The SLP vectorizer performs a pretty similar (but large in SLOCs) estimation, which is not always correct. Instead, this patch implements clustering analysis and actual node allocation with the full analysis for the vectorized clustered scalars (not only loads, but also some other instructions) with the correct cost estimation and vector insert instructions. This improves overall vectorization quality and simplifies analysis/estimations. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/104144
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-14 | FileCheck %s
define void @test(i1 %c, ptr %arg) {
; CHECK-LABEL: @test(
; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
; CHECK: if:
; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2)
; CHECK-NEXT: br label [[JOIN:%.*]]
; CHECK: else:
; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0)
; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2)
; CHECK-NEXT: br label [[JOIN]]
; CHECK: join:
; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ]
; CHECK-NEXT: ret void
;
  br i1 %c, label %if, label %else

; Each branch gathers four scalar i64 loads at byte offsets 0, 8, 24, 32 —
; two contiguous pairs (0/8 and 24/32) separated by a gap at offset 16.
; The CHECK lines above expect each pair to be clustered into a <2 x i64>
; load and combined into a <4 x i64> via @llvm.vector.insert, instead of
; being cost-modeled (and emitted) as a scalar gather/buildvector.
if:
  %i2.0 = load i64, ptr %arg, align 8
  %arg2.1 = getelementptr inbounds i8, ptr %arg, i64 8
  %i2.1 = load i64, ptr %arg2.1, align 8
  %arg2.2 = getelementptr inbounds i8, ptr %arg, i64 24
  %i2.2 = load i64, ptr %arg2.2, align 8
  %arg2.3 = getelementptr inbounds i8, ptr %arg, i64 32
  %i2.3 = load i64, ptr %arg2.3, align 8
  br label %join

; Mirror of %if: same addresses and load pattern, distinct SSA values, so
; the vectorized gathers from both predecessors meet in a <4 x i64> phi.
else:
  %i.0 = load i64, ptr %arg, align 8
  %arg.1 = getelementptr inbounds i8, ptr %arg, i64 8
  %i.1 = load i64, ptr %arg.1, align 8
  %arg.2 = getelementptr inbounds i8, ptr %arg, i64 24
  %i.2 = load i64, ptr %arg.2, align 8
  %arg.3 = getelementptr inbounds i8, ptr %arg, i64 32
  %i.3 = load i64, ptr %arg.3, align 8
  br label %join

; The phis are the only users of the loads; they are listed in reverse
; offset order (.3 first), which the CHECK lines show is absorbed into the
; shuffle masks (<i32 1, i32 0>) rather than blocking vectorization.
join:
  %phi.3 = phi i64 [ %i2.3, %if ], [ %i.3, %else ]
  %phi.2 = phi i64 [ %i2.2, %if ], [ %i.2, %else ]
  %phi.1 = phi i64 [ %i2.1, %if ], [ %i.1, %else ]
  %phi.0 = phi i64 [ %i2.0, %if ], [ %i.0, %else ]
  ret void
}