Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
Alexey Bataev 44eca64224 [SLP]Check scalars before trying scheduling.
Need to check the scalars if they can be vectorized before trying to
schedule them. It may save compile time and improve vectorization on
large functions/basic blocks.

Differential Revision: https://reviews.llvm.org/D154891
2023-07-24 09:25:19 -07:00

107 lines
3.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-threshold=-999 < %s | FileCheck %s
; bcast_vals: four consecutive i64 stores to %S. Every stored value is an add
; of the same two scalars, with the operand order swapped in lanes 1 and 2:
; S[0] = %v1 + %v2
; S[1] = %v2 + %v1
; S[2] = %v2 + %v1
; S[3] = %v1 + %v2
;
; We broadcast %v1 and %v2: the expected output splats each scalar into a
; <4 x i64> (insertelement into lane 0 followed by a zero-mask shufflevector),
; performs a single vector add and writes the result with one <4 x i64> store.
;
define void @bcast_vals(ptr %A, ptr %B, ptr %S) {
; CHECK-LABEL: @bcast_vals(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A0:%.*]] = load i64, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[B0:%.*]] = load i64, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[V1:%.*]] = sub i64 [[A0]], 1
; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]]
; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[S:%.*]], align 8
; CHECK-NEXT: ret void
;
entry:
; The two scalar inputs; the loads and subs remain scalar in the expected
; output and feed the two broadcasts.
%A0 = load i64, ptr %A, align 8
%B0 = load i64, ptr %B, align 8
%v1 = sub i64 %A0, 1
%v2 = sub i64 %B0, 1
; Four adds of the same pair of values; lanes 1 and 2 have the operands
; commuted relative to lanes 0 and 3.
%Add0 = add i64 %v1, %v2
%Add1 = add i64 %v2, %v1
%Add2 = add i64 %v2, %v1
%Add3 = add i64 %v1, %v2
; Addresses of S[1], S[2] and S[3]; S[0] is %S itself.
%idxS1 = getelementptr inbounds i64, ptr %S, i64 1
%idxS2 = getelementptr inbounds i64, ptr %S, i64 2
%idxS3 = getelementptr inbounds i64, ptr %S, i64 3
; Consecutive stores S[0..3]; expected to be replaced by one vector store.
store i64 %Add0, ptr %S, align 8
store i64 %Add1, ptr %idxS1, align 8
store i64 %Add2, ptr %idxS2, align 8
store i64 %Add3, ptr %idxS3, align 8
ret void
}
; bcast_vals2: four consecutive i32 stores to %S. Every stored value is an add
; with %v1 as one operand (in varying operand position) and a different
; sign-extended i16 load as the other:
; S[0] = %v1 + %v2
; S[1] = %v3 + %v1
; S[2] = %v5 + %v1
; S[3] = %v1 + %v4
;
; We broadcast %v1. In the expected output the remaining operands are gathered
; as i16 values in lane order (%B0, %C0, %E0, %D0) and widened by a single
; <4 x i16> -> <4 x i32> sext, while %v1 stays a scalar sext that is splatted
; and used as the first operand of the vector add.
;
define void @bcast_vals2(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E, ptr %S) {
; CHECK-LABEL: @bcast_vals2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A0:%.*]] = load i16, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[B0:%.*]] = load i16, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[C0:%.*]] = load i16, ptr [[C:%.*]], align 8
; CHECK-NEXT: [[D0:%.*]] = load i16, ptr [[D:%.*]], align 8
; CHECK-NEXT: [[E0:%.*]] = load i16, ptr [[E:%.*]], align 8
; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP4]]
; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[S:%.*]], align 8
; CHECK-NEXT: ret void
;
entry:
; Five independent i16 loads from five separate pointers.
%A0 = load i16, ptr %A, align 8
%B0 = load i16, ptr %B, align 8
%C0 = load i16, ptr %C, align 8
%D0 = load i16, ptr %D, align 8
%E0 = load i16, ptr %E, align 8
; Each load is widened to i32. %v1 is the value shared by all four adds; the
; other four are each used by exactly one add.
%v1 = sext i16 %A0 to i32
%v2 = sext i16 %B0 to i32
%v3 = sext i16 %C0 to i32
%v4 = sext i16 %D0 to i32
%v5 = sext i16 %E0 to i32
; %v1 is the left operand in lanes 0 and 3 and the right operand in lanes 1
; and 2. Lane 2 uses %v5 and lane 3 uses %v4, so the non-broadcast operands
; are not in load order.
%Add0 = add i32 %v1, %v2
%Add1 = add i32 %v3, %v1
%Add2 = add i32 %v5, %v1
%Add3 = add i32 %v1, %v4
; Addresses of S[1], S[2] and S[3]; S[0] is %S itself.
%idxS1 = getelementptr inbounds i32, ptr %S, i64 1
%idxS2 = getelementptr inbounds i32, ptr %S, i64 2
%idxS3 = getelementptr inbounds i32, ptr %S, i64 3
; Consecutive stores S[0..3]; expected to be replaced by one vector store.
; NOTE(review): %idxS1 and %idxS3 are 4 and 12 bytes past %S, yet every store
; below claims align 8, which cannot hold for them if it holds for %S.
; Presumably carried over from the i64 variant above - confirm whether align 4
; was intended for these i32 stores.
store i32 %Add0, ptr %S, align 8
store i32 %Add1, ptr %idxS1, align 8
store i32 %Add2, ptr %idxS2, align 8
store i32 %Add3, ptr %idxS3, align 8
ret void
}