clang-p2996/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
Philip Reames 7f6bbb3c4f [RISCV][TTI] Reduce cost of a build_vector pattern (#108419)
This change is actually two related changes, but they're very hard to
meaningfully separate, as the second balances the first and yet doesn't
do much good on its own.

First, we can reduce the cost of a build_vector pattern. Our current
costing for this defers to generic insertelement costing, which isn't
unreasonable, but also isn't correct. While inserting N elements
requires N-1 slides and N vmv.s.x instructions, doing the full
build_vector only requires N vslide1down. (Note that there are other
cases our build vector lowering can handle more cheaply; this is simply
the easiest upper bound, which appears to be "good enough" for SLP
costing purposes.)
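
For a concrete example (illustrative IR, not taken from the test
below), consider a 4-element build_vector written as an insertelement
chain:

  %v0 = insertelement <4 x i32> poison, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %d, i32 3

This was previously costed as four independent inserts (N vmv.s.x plus
N-1 slides), while the whole pattern can instead be lowered as N
vslide1down.vx instructions, one per inserted element.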

Second, we need to tell SLP that calls don't preserve vector registers.
Without this, SLP will vectorize scalar code which performs e.g. 4 x
float @exp calls as two <2 x float> @exp intrinsic calls. Oddly, the
costing works out such that this is in fact the optimal choice - except
that we don't actually have a <2 x float> @exp, and we unroll it during
DAG lowering. This would be fine (or at least cost neutral) except that
the libcall for the scalar @exp clobbers all vector registers. So the
net effect is that we added a bunch of spills that SLP had no idea
about. Thankfully, AArch64 has a similar problem, and has already
taught SLP how to reason about spill cost once the right TTI hook is
implemented.
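
As a minimal sketch of the problematic pattern (the IR below is
illustrative, not taken from this test):

  %e0 = call float @llvm.exp.f32(float %a0)
  %e1 = call float @llvm.exp.f32(float %a1)
  %e2 = call float @llvm.exp.f32(float %a2)
  %e3 = call float @llvm.exp.f32(float %a3)

SLP would rewrite these as two <2 x float> @llvm.exp.v2f32 calls; the
DAG then unrolls each back into scalar expf libcalls, and since those
libcalls clobber every vector register, the surrounding vector values
must be spilled around them - spills the SLP cost model never accounted
for.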

Now, for some implications...

The SLP solution for spill costing has some inaccuracies. In particular,
it basically just guesses whether an intrinsic will be lowered to a call
or not, and can be wrong in both directions. It also has no mechanism to
differentiate by calling convention.

This has the effect of making partial vectorization (i.e. starting in
scalar) more profitable. In practice, the major effect is to make it
more likely that SLP will vectorize part of a tree in an intersecting
forest, and then vectorize the remaining tree once those uses have been
removed.

This also biases us slightly away from strided or indexed loads during
vectorization: because the scalar cost is more accurately modeled, these
instructions look relatively less profitable.
2024-09-20 08:34:36 -07:00

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], <i32 10, i32 10, i32 10>
; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 10, i32 10>
; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
%l.src.0 = load i32, ptr %gep.src.0, align 4
%mul.0 = mul nsw i32 %l.src.0, 10
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
%l.src.1 = load i32, ptr %gep.src.1, align 4
%mul.1 = mul nsw i32 %l.src.1, 10
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
%l.src.2 = load i32, ptr %gep.src.2, align 4
%mul.2 = mul nsw i32 %l.src.2, 10
store i32 %mul.0, ptr %dst
%dst.1 = getelementptr i32, ptr %dst, i32 1
store i32 %mul.1, ptr %dst.1
%dst.2 = getelementptr i32, ptr %dst, i32 2
store i32 %mul.2, ptr %dst.2
ret void
}
; Should not be vectorized with an undef/poison element as padding, as
; division by undef/poison may cause UB. Must use VL predication or
; masking instead, where RISCV wins.
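; For illustration (not part of the checked output), padding to a power
; of two would produce e.g.
;   udiv <4 x i32> <i32 10, i32 10, i32 10, i32 10>,
;        <i32 %a, i32 %b, i32 %c, i32 poison>
; and a poison divisor lane may be zero, making the division UB.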
define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_udiv_by_constant_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = udiv <3 x i32> <i32 10, i32 10, i32 10>, [[TMP0]]
; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_udiv_by_constant_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
; POW2-ONLY-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
; POW2-ONLY-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
%l.src.0 = load i32, ptr %gep.src.0, align 4
%mul.0 = udiv i32 10, %l.src.0
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
%l.src.1 = load i32, ptr %gep.src.1, align 4
%mul.1 = udiv i32 10, %l.src.1
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
%l.src.2 = load i32, ptr %gep.src.2, align 4
%mul.2 = udiv i32 10, %l.src.2
store i32 %mul.0, ptr %dst
%dst.1 = getelementptr i32, ptr %dst, i32 1
store i32 %mul.1, ptr %dst.1
%dst.2 = getelementptr i32, ptr %dst, i32 2
store i32 %mul.2, ptr %dst.2
ret void
}
define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
%l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
%gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
%l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
%mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
%gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
%l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
%gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
%l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
%mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
%gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
%l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
%gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
%l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
%mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
store i32 %mul.0, ptr %dst
%dst.1 = getelementptr i32, ptr %dst, i32 1
store i32 %mul.1, ptr %dst.1
%dst.2 = getelementptr i32, ptr %dst, i32 2
store i32 %mul.2, ptr %dst.2
ret void
}
define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
; NON-POW2-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP2]], <i32 9, i32 9, i32 9>
; NON-POW2-NEXT: store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
; POW2-ONLY-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
; POW2-ONLY-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], <i32 9, i32 9>
; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
%l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
%gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
%l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
%mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
%add.0 = add i32 %mul.0, 9
%gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
%l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
%gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
%l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
%mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
%add.1 = add i32 %mul.1, 9
%gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
%l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
%gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
%l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
%mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
%add.2 = add i32 %mul.2, 9
store i32 %add.0, ptr %dst
%dst.1 = getelementptr i32, ptr %dst, i32 1
store i32 %add.1, ptr %dst.1
%dst.2 = getelementptr i32, ptr %dst, i32 2
store i32 %add.2, ptr %dst.2
ret void
}
define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>
; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
%l.src.0 = load float, ptr %gep.src.0, align 4
%fadd.0 = fadd float %l.src.0, 10.0
%gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
%l.src.1 = load float, ptr %gep.src.1, align 4
%fadd.1 = fadd float %l.src.1, 10.0
%gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
%l.src.2 = load float, ptr %gep.src.2, align 4
%fadd.2 = fadd float %l.src.2, 10.0
store float %fadd.0, ptr %dst
%dst.1 = getelementptr float, ptr %dst, i32 1
store float %fadd.1, ptr %dst.1
%dst.2 = getelementptr float, ptr %dst, i32 2
store float %fadd.2, ptr %dst.2
ret void
}
define void @phi_store3(ptr %dst) {
; NON-POW2-LABEL: @phi_store3(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: br label [[EXIT:%.*]]
; NON-POW2: invoke.cont8.loopexit:
; NON-POW2-NEXT: br label [[EXIT]]
; NON-POW2: exit:
; NON-POW2-NEXT: [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; NON-POW2-NEXT: store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @phi_store3(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: br label [[EXIT:%.*]]
; POW2-ONLY: invoke.cont8.loopexit:
; POW2-ONLY-NEXT: br label [[EXIT]]
; POW2-ONLY: exit:
; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
; POW2-ONLY-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
; POW2-ONLY-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
; POW2-ONLY-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
br label %exit
invoke.cont8.loopexit: ; No predecessors!
br label %exit
exit:
%p.0 = phi i32 [ 1, %entry ], [ 0, %invoke.cont8.loopexit ]
%p.1 = phi i32 [ 2, %entry ], [ 0, %invoke.cont8.loopexit ]
%p.2 = phi i32 [ 3, %entry ], [ 0, %invoke.cont8.loopexit ]
%dst.1 = getelementptr i32, ptr %dst, i32 1
%dst.2 = getelementptr i32, ptr %dst, i32 2
store i32 %p.0, ptr %dst, align 4
store i32 %p.1, ptr %dst.1, align 4
store i32 %p.2, ptr %dst.2, align 4
ret void
}
define void @store_try_reorder(ptr %dst) {
; NON-POW2-LABEL: @store_try_reorder(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @store_try_reorder(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0
; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%add = add i32 0, 0
store i32 %add, ptr %dst, align 4
%add207 = sub i32 0, 0
%arrayidx.i1887 = getelementptr i32, ptr %dst, i64 1
store i32 %add207, ptr %arrayidx.i1887, align 4
%add216 = sub i32 0, 0
%arrayidx.i1891 = getelementptr i32, ptr %dst, i64 2
store i32 %add216, ptr %arrayidx.i1891, align 4
ret void
}
define void @vec3_fpext_cost(ptr %Colour, float %0) {
; NON-POW2-LABEL: @vec3_fpext_cost(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 0
; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
; NON-POW2-NEXT: [[TMP3:%.*]] = fpext <3 x float> [[TMP2]] to <3 x double>
; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> [[TMP3]], <3 x double> zeroinitializer, <3 x double> zeroinitializer)
; NON-POW2-NEXT: [[TMP5:%.*]] = fptrunc <3 x double> [[TMP4]] to <3 x float>
; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR:%.*]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @vec3_fpext_cost(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0:%.*]], i32 0
; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
; POW2-ONLY-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
; POW2-ONLY-NEXT: [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COLOUR]], align 4
; POW2-ONLY-NEXT: [[CONV78:%.*]] = fpext float [[TMP0]] to double
; POW2-ONLY-NEXT: [[TMP6:%.*]] = call double @llvm.fmuladd.f64(double [[CONV78]], double 0.000000e+00, double 0.000000e+00)
; POW2-ONLY-NEXT: [[CONV82:%.*]] = fptrunc double [[TMP6]] to float
; POW2-ONLY-NEXT: store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
; POW2-ONLY-NEXT: ret void
;
entry:
%arrayidx72 = getelementptr float, ptr %Colour, i64 1
%arrayidx80 = getelementptr float, ptr %Colour, i64 2
%conv62 = fpext float %0 to double
%1 = call double @llvm.fmuladd.f64(double %conv62, double 0.000000e+00, double 0.000000e+00)
%conv66 = fptrunc double %1 to float
store float %conv66, ptr %Colour, align 4
%conv70 = fpext float %0 to double
%2 = call double @llvm.fmuladd.f64(double %conv70, double 0.000000e+00, double 0.000000e+00)
%conv74 = fptrunc double %2 to float
store float %conv74, ptr %arrayidx72, align 4
%conv78 = fpext float %0 to double
%3 = call double @llvm.fmuladd.f64(double %conv78, double 0.000000e+00, double 0.000000e+00)
%conv82 = fptrunc double %3 to float
store float %conv82, ptr %arrayidx80, align 4
ret void
}
define void @fpext_scatter(ptr %dst, double %conv) {
; CHECK-LABEL: @fpext_scatter(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
; CHECK-NEXT: store float [[CONV25]], ptr [[LENGTHS]], align 4
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
; CHECK-NEXT: store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
; CHECK-NEXT: store float [[CONV25]], ptr [[ARRAYIDX37]], align 4
; CHECK-NEXT: ret void
;
entry:
%conv25 = fptrunc double %conv to float
%Lengths = getelementptr float, ptr %dst, i64 0
store float %conv25, ptr %Lengths, align 4
%arrayidx32 = getelementptr float, ptr %dst, i64 1
store float %conv25, ptr %arrayidx32, align 4
%arrayidx37 = getelementptr float, ptr %dst, i64 2
store float %conv25, ptr %arrayidx37, align 4
ret void
}
define i32 @reduce_add(ptr %src) {
; CHECK-LABEL: @reduce_add(
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; CHECK-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; CHECK-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[L_SRC_0]], [[L_SRC_1]]
; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[L_SRC_2]]
; CHECK-NEXT: ret i32 [[ADD_1]]
;
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
%l.src.0 = load i32, ptr %gep.src.0, align 4
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
%l.src.1 = load i32, ptr %gep.src.1, align 4
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
%l.src.2 = load i32, ptr %gep.src.2, align 4
%add.0 = add i32 %l.src.0, %l.src.1
%add.1 = add i32 %add.0, %l.src.2
ret i32 %add.1
}
define float @reduce_fadd(ptr %src) {
; NON-POW2-LABEL: @reduce_fadd(
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP1]])
; NON-POW2-NEXT: ret float [[TMP2]]
;
; POW2-ONLY-LABEL: @reduce_fadd(
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
%l.src.0 = load float, ptr %gep.src.0, align 4
%gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
%l.src.1 = load float, ptr %gep.src.1, align 4
%gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
%l.src.2 = load float, ptr %gep.src.2, align 4
%add.0 = fadd fast float %l.src.0, %l.src.1
%add.1 = fadd fast float %add.0, %l.src.2
ret float %add.1
}
define i32 @reduce_add_after_mul(ptr %src) {
; NON-POW2-LABEL: @reduce_add_after_mul(
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP1]], <i32 10, i32 10, i32 10>
; NON-POW2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP2]])
; NON-POW2-NEXT: ret i32 [[TMP3]]
;
; POW2-ONLY-LABEL: @reduce_add_after_mul(
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
;
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
%l.src.0 = load i32, ptr %gep.src.0, align 4
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
%l.src.1 = load i32, ptr %gep.src.1, align 4
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
%l.src.2 = load i32, ptr %gep.src.2, align 4
%mul.0 = mul nsw i32 %l.src.0, 10
%mul.1 = mul nsw i32 %l.src.1, 10
%mul.2 = mul nsw i32 %l.src.2, 10
%add.0 = add i32 %mul.0, %mul.1
%add.1 = add i32 %add.0, %mul.2
ret i32 %add.1
}
define i32 @dot_product_i32(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_i32(
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
; NON-POW2-NEXT: ret i32 [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_i32(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
%l.a.0 = load i32, ptr %gep.a.0, align 4
%gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
%l.a.1 = load i32, ptr %gep.a.1, align 4
%gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
%l.a.2 = load i32, ptr %gep.a.2, align 4
%gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
%l.b.0 = load i32, ptr %gep.b.0, align 4
%gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
%l.b.1 = load i32, ptr %gep.b.1, align 4
%gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
%l.b.2 = load i32, ptr %gep.b.2, align 4
%mul.0 = mul nsw i32 %l.a.0, %l.b.0
%mul.1 = mul nsw i32 %l.a.1, %l.b.1
%mul.2 = mul nsw i32 %l.a.2, %l.b.2
%add.0 = add i32 %mul.0, %mul.1
%add.1 = add i32 %add.0, %mul.2
ret i32 %add.1
}
; Same as above, except the reduction order has been perturbed. This
; is checking for our ability to reorder.
define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_i32_reorder(
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
; NON-POW2-NEXT: ret i32 [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_i32_reorder(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
%l.a.0 = load i32, ptr %gep.a.0, align 4
%gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
%l.a.1 = load i32, ptr %gep.a.1, align 4
%gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
%l.a.2 = load i32, ptr %gep.a.2, align 4
%gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
%l.b.0 = load i32, ptr %gep.b.0, align 4
%gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
%l.b.1 = load i32, ptr %gep.b.1, align 4
%gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
%l.b.2 = load i32, ptr %gep.b.2, align 4
%mul.0 = mul nsw i32 %l.a.0, %l.b.0
%mul.1 = mul nsw i32 %l.a.1, %l.b.1
%mul.2 = mul nsw i32 %l.a.2, %l.b.2
%add.0 = add i32 %mul.1, %mul.0
%add.1 = add i32 %add.0, %mul.2
ret i32 %add.1
}
define float @dot_product_fp32(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32(
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT: ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
%l.a.0 = load float, ptr %gep.a.0, align 4
%gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
%l.a.1 = load float, ptr %gep.a.1, align 4
%gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
%l.a.2 = load float, ptr %gep.a.2, align 4
%gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
%l.b.0 = load float, ptr %gep.b.0, align 4
%gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
%l.b.1 = load float, ptr %gep.b.1, align 4
%gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
%l.b.2 = load float, ptr %gep.b.2, align 4
%mul.0 = fmul fast float %l.a.0, %l.b.0
%mul.1 = fmul fast float %l.a.1, %l.b.1
%mul.2 = fmul fast float %l.a.2, %l.b.2
%add.0 = fadd fast float %mul.0, %mul.1
%add.1 = fadd fast float %add.0, %mul.2
ret float %add.1
}
; Same as above, except the reduction order has been perturbed. This
; is checking for our ability to reorder.
define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32_reorder(
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT: ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
%l.a.0 = load float, ptr %gep.a.0, align 4
%gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
%l.a.1 = load float, ptr %gep.a.1, align 4
%gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
%l.a.2 = load float, ptr %gep.a.2, align 4
%gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
%l.b.0 = load float, ptr %gep.b.0, align 4
%gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
%l.b.1 = load float, ptr %gep.b.1, align 4
%gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
%l.b.2 = load float, ptr %gep.b.2, align 4
%mul.0 = fmul fast float %l.a.0, %l.b.0
%mul.1 = fmul fast float %l.a.1, %l.b.1
%mul.2 = fmul fast float %l.a.2, %l.b.2
%add.0 = fadd fast float %mul.1, %mul.0
%add.1 = fadd fast float %add.0, %mul.2
ret float %add.1
}
define double @dot_product_fp64(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp64(
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
; NON-POW2-NEXT: ret double [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp64(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret double [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
%l.a.0 = load double, ptr %gep.a.0, align 4
%gep.a.1 = getelementptr inbounds double, ptr %a, i32 1
%l.a.1 = load double, ptr %gep.a.1, align 4
%gep.a.2 = getelementptr inbounds double, ptr %a, i32 2
%l.a.2 = load double, ptr %gep.a.2, align 4
%gep.b.0 = getelementptr inbounds double, ptr %b, i32 0
%l.b.0 = load double, ptr %gep.b.0, align 4
%gep.b.1 = getelementptr inbounds double, ptr %b, i32 1
%l.b.1 = load double, ptr %gep.b.1, align 4
%gep.b.2 = getelementptr inbounds double, ptr %b, i32 2
%l.b.2 = load double, ptr %gep.b.2, align 4
%mul.0 = fmul fast double %l.a.0, %l.b.0
%mul.1 = fmul fast double %l.a.1, %l.b.1
%mul.2 = fmul fast double %l.a.2, %l.b.2
%add.0 = fadd fast double %mul.0, %mul.1
%add.1 = fadd fast double %add.0, %mul.2
ret double %add.1
}
;; Covers a case where SLP would previously crash due to a
;; missing bailout in TryToFindDuplicates for the case
;; where a VL=3 list was vectorized directly (without
;; a root instruction such as a store or reduce).
define double @no_root_reshuffle(ptr %ptr) {
; CHECK-LABEL: @no_root_reshuffle(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8
; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
; CHECK-NEXT: [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]]
; CHECK-NEXT: ret double [[ADD]]
;
entry:
%0 = load double, ptr %ptr, align 8
%mul = fmul fast double %0, %0
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8
%1 = load double, ptr %arrayidx2, align 8
%arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16
%2 = load double, ptr %arrayidx3, align 8
%3 = fmul fast double %2, %2
%mul6 = fmul fast double %3, %1
%add = fadd fast double %mul6, %mul
ret double %add
}
define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>
; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
; NON-POW2-NEXT: ret float [[TMP5]]
;
; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
%mul.0 = fmul fast float %a, 10.0
%mul.1 = fmul fast float %b, 10.0
%mul.2 = fmul fast float %c, 10.0
%add.0 = fadd fast float %mul.0, %mul.1
%add.1 = fadd fast float %add.0, %mul.2
ret float %add.1
}
declare float @llvm.fmuladd.f32(float, float, float)
declare double @llvm.fmuladd.f64(double, double, double)