We can still try to vectorize a bundle of instructions even if the number of repeated instructions in it is not a power of 2. In that case the cost must be adjusted (calculated only for the unique scalar instructions), as must the cost of the extracts. Also, when scheduling the bundle, only the unique scalars must be scheduled, to avoid a compiler crash caused by the multiple dependencies. This can be safely applied only if all of the scalars' users are also vectorized and do not require memory accesses (this is a temporary requirement that can be relaxed later).

---------

Co-authored-by: Alexey Bataev <a.bataev@outlook.com>
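
A minimal sketch of the enabled pattern (hypothetical IR, not part of the test file below): the build vector repeats %d0, so the four-lane bundle contains only three unique scalar instructions, a non-power-of-2 count, and only those unique scalars should be costed and scheduled.

define <4 x i32> @repeated_scalar_sketch(i32 %a, i32 %b) {
  ; %d0 feeds lanes 0 and 3, so the 4-wide bundle has only three
  ; unique scalar instructions (a non-power-of-2 count).
  %d0 = sub i32 %a, %b
  %d1 = add i32 %a, %b
  %d2 = xor i32 %a, %b
  %v0 = insertelement <4 x i32> poison, i32 %d0, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %d1, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %d2, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %d0, i32 3
  ret <4 x i32> %v3
}
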
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
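
; Pairwise horizontal adds of <2 x i64> add/sub results rebuilt into a
; vector; the CHECK lines expect two vector binops plus transposing
; shuffles instead of scalar code.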
define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-LABEL: @build_vec_v2i64(
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <2 x i64> [[TMP5]]
;
  %v0.0 = extractelement <2 x i64> %v0, i32 0
  %v0.1 = extractelement <2 x i64> %v0, i32 1
  %v1.0 = extractelement <2 x i64> %v1, i32 0
  %v1.1 = extractelement <2 x i64> %v1, i32 1
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  %tmp3.0 = insertelement <2 x i64> undef, i64 %tmp2.0, i32 0
  %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
  ret <2 x i64> %tmp3.1
}
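
; The same transpose pattern, but with operands loaded from memory and the
; results stored back; vector loads and a vector store are expected.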
define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @store_chain_v2i64(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]]
; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8
; CHECK-NEXT: ret void
;
  %a.1 = getelementptr i64, ptr %a, i64 1
  %b.1 = getelementptr i64, ptr %b, i64 1
  %c.1 = getelementptr i64, ptr %c, i64 1
  %v0.0 = load i64, ptr %a, align 8
  %v0.1 = load i64, ptr %a.1, align 8
  %v1.0 = load i64, ptr %b, align 8
  %v1.1 = load i64, ptr %b.1, align 8
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  store i64 %tmp2.0, ptr %c, align 8
  store i64 %tmp2.1, ptr %c.1, align 8
  ret void
}
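
; 4 x i32 variant of the same add/sub transpose pattern.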
define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
;
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}
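
; The two horizontal results are each inserted twice, so the final
; <4 x i32> should become a lane-reusing shuffle of a <2 x i32> computation.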
define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_0(
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: ret <4 x i32> [[TMP6]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
  ret <4 x i32> %tmp3.3
}
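
; %tmp1.0 and %tmp1.1 are identical sub instructions, leaving only three
; unique scalars in the 4-wide sub bundle, the non-power-of-2 case the
; patch description above is about.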
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
; CHECK-NEXT: ret <4 x i32> [[TMP11]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
  %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
  %tmp2.0 = insertelement <4 x i32> undef, i32 %tmp1.0, i32 0
  %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
  %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
  %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
  ret <4 x i32> %tmp2.3
}
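
; Three different binops (add, mul, xor) feed the horizontal adds, and the
; xor scalars are duplicated between the two groups.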
define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_3_binops(
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[TMP3_31]]
;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  %tmp1.0 = mul i32 %v0.0, %v1.0
  %tmp1.1 = mul i32 %v0.1, %v1.1
  %tmp1.2 = xor i32 %v0.0, %v1.0
  %tmp1.3 = xor i32 %v0.1, %v1.1
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}
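
; The transposed add/sub results feed lshr/and/mul/add/xor chains and a
; final scalar add reduction, expected to become @llvm.vector.reduce.add.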
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]
;
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  %tmp3.0 = lshr i32 %tmp2.0, 15
  %tmp3.1 = lshr i32 %tmp2.1, 15
  %tmp3.2 = lshr i32 %tmp2.2, 15
  %tmp3.3 = lshr i32 %tmp2.3, 15
  %tmp4.0 = and i32 %tmp3.0, 65537
  %tmp4.1 = and i32 %tmp3.1, 65537
  %tmp4.2 = and i32 %tmp3.2, 65537
  %tmp4.3 = and i32 %tmp3.3, 65537
  %tmp5.0 = mul nuw i32 %tmp4.0, 65535
  %tmp5.1 = mul nuw i32 %tmp4.1, 65535
  %tmp5.2 = mul nuw i32 %tmp4.2, 65535
  %tmp5.3 = mul nuw i32 %tmp4.3, 65535
  %tmp6.0 = add i32 %tmp5.0, %tmp2.0
  %tmp6.1 = add i32 %tmp5.1, %tmp2.1
  %tmp6.2 = add i32 %tmp5.2, %tmp2.2
  %tmp6.3 = add i32 %tmp5.3, %tmp2.3
  %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
  %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
  %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
  %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
  %reduce.0 = add i32 %tmp7.1, %tmp7.0
  %reduce.1 = add i32 %reduce.0, %tmp7.2
  %reduce.2 = add i32 %reduce.1, %tmp7.3
  ret i32 %reduce.2
}