For all of the following reductions:

  vector.reduce.or, vector.reduce.and, vector.reduce.xor,
  vector.reduce.add, vector.reduce.mul,
  vector.reduce.umin, vector.reduce.umax,
  vector.reduce.smin, vector.reduce.smax,
  vector.reduce.fmin, vector.reduce.fmax

if the input operand is the result of a vector.reverse, we can perform the reduction directly on the vector.reverse's input instead, since the result is the same. When reassociation is permitted, the same fold also applies to vector.reduce.fadd and vector.reduce.fmul.
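As a minimal hand-written sketch of the fold (the %v operand name is illustrative, not taken from the tests below):

  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v)
  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %rev)

becomes:

  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)

The integer and min/max reductions are insensitive to lane order, so the reverse can always be dropped. A strictly-ordered floating-point reduction is not: reversing the lanes changes the rounding order, so fadd/fmul only fold when the reassoc flag is present, as the negative tests below check.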
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.nxv4f32(float, <vscale x 4 x float>)
declare float @llvm.vector.reduce.fmin.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmax.f32.nxv4f32(float, <vscale x 4 x float>)
declare void @use_f32(float)

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare void @use_i32(i32)

declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32>)

declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>)

declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)

define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc nsz <4 x float> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]])
; CHECK-NEXT: [[R:%.*]] = fsub reassoc nsz float [[TMP2]], [[A1:%.*]]
; CHECK-NEXT: ret float [[R]]
;
  %r0 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  %r = fsub reassoc nsz float %r0, %r1
  ret float %r
}

define float @reassoc_sum_of_reverse_v4f32(<4 x float> %v0) {
; CHECK-LABEL: @reassoc_sum_of_reverse_v4f32(
; CHECK-NEXT: [[RED:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V0:%.*]])
; CHECK-NEXT: ret float [[RED]]
;
  %rev = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %v0)
  %red = call reassoc float @llvm.vector.reduce.fadd.v4f32(float zeroinitializer, <4 x float> %rev)
  ret float %red
}

define float @reassoc_mul_reduction_of_reverse_nxv4f32(<vscale x 4 x float> %v0) {
; CHECK-LABEL: @reassoc_mul_reduction_of_reverse_nxv4f32(
; CHECK-NEXT: [[RED:%.*]] = call reassoc float @llvm.vector.reduce.fmul.nxv4f32(float 1.000000e+00, <vscale x 4 x float> [[V0:%.*]])
; CHECK-NEXT: ret float [[RED]]
;
  %rev = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %v0)
  %red = call reassoc float @llvm.vector.reduce.fmul.nxv4f32(float 1.0, <vscale x 4 x float> %rev)
  ret float %red
}

define float @fmax_of_reverse_v4f32(<4 x float> %v0) {
; CHECK-LABEL: @fmax_of_reverse_v4f32(
; CHECK-NEXT: [[RED:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[V0:%.*]])
; CHECK-NEXT: ret float [[RED]]
;
  %rev = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %v0)
  %red = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %rev)
  ret float %red
}

define float @fmin_of_reverse_nxv4f32(<vscale x 4 x float> %v0) {
; CHECK-LABEL: @fmin_of_reverse_nxv4f32(
; CHECK-NEXT: [[RED:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[V0:%.*]])
; CHECK-NEXT: ret float [[RED]]
;
  %rev = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %v0)
  %red = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %rev)
  ret float %red
}

; negative test - fadd cannot be folded with reverse due to lack of reassoc
define float @sum_of_reverse_v4f32(<4 x float> %v0) {
; CHECK-LABEL: @sum_of_reverse_v4f32(
; CHECK-NEXT: [[REV:%.*]] = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> [[V0:%.*]])
; CHECK-NEXT: [[RED:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[REV]])
; CHECK-NEXT: ret float [[RED]]
;
  %rev = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %v0)
  %red = call float @llvm.vector.reduce.fadd.v4f32(float zeroinitializer, <4 x float> %rev)
  ret float %red
}

; negative test - fmul cannot be folded with reverse due to lack of reassoc
define float @mul_reduction_of_reverse_nxv4f32(<vscale x 4 x float> %v0) {
; CHECK-LABEL: @mul_reduction_of_reverse_nxv4f32(
; CHECK-NEXT: [[REV:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[V0:%.*]])
; CHECK-NEXT: [[RED:%.*]] = call float @llvm.vector.reduce.fmul.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[REV]])
; CHECK-NEXT: ret float [[RED]]
;
  %rev = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %v0)
  %red = call float @llvm.vector.reduce.fmul.nxv4f32(float zeroinitializer, <vscale x 4 x float> %rev)
  ret float %red
}

; negative test - fsub must allow reassociation

define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_v4f32_fmf(
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
; CHECK-NEXT: [[R:%.*]] = fsub nnan ninf nsz float [[R0]], [[R1]]
; CHECK-NEXT: ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  %r = fsub ninf nnan nsz float %r0, %r1
  ret float %r
}

; negative test - extra uses could create extra instructions

define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_extra_use1(
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT: call void @use_f32(float [[R0]])
; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]]
; CHECK-NEXT: ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  call void @use_f32(float %r0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  %r = fsub fast float %r0, %r1
  ret float %r
}

; negative test - extra uses could create extra instructions

define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_extra_use2(
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
; CHECK-NEXT: call void @use_f32(float [[R1]])
; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]]
; CHECK-NEXT: ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1)
  call void @use_f32(float %r1)
  %r = fsub fast float %r0, %r1
  ret float %r
}

; negative test - can't reassociate different vector types

define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1, <8 x float> %v1) {
; CHECK-LABEL: @diff_of_sums_type_mismatch(
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]])
; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]]
; CHECK-NEXT: ret float [[R]]
;
  %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0)
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a1, <8 x float> %v1)
  %r = fsub fast float %r0, %r1
  ret float %r
}

define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}

define i32 @sum_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @sum_of_reverse_v4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @sum_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @sum_of_reverse_nxv4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

define i32 @mul_reduce_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @mul_reduce_of_reverse_v4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @mul_reduce_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @mul_reduce_of_reverse_nxv4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

define i32 @smin_reduce_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @smin_reduce_of_reverse_v4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @smax_reduce_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @smax_reduce_of_reverse_nxv4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

define i32 @umin_reduce_of_reverse_v4i32(<4 x i32> %v0) {
; CHECK-LABEL: @umin_reduce_of_reverse_v4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %rev)
  ret i32 %red
}

define i32 @umax_reduce_of_reverse_nxv4i32(<vscale x 4 x i32> %v0) {
; CHECK-LABEL: @umax_reduce_of_reverse_nxv4i32(
; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[V0:%.*]])
; CHECK-NEXT: ret i32 [[RED]]
;
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v0)
  %red = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %rev)
  ret i32 %red
}

; negative test - extra uses could create extra instructions

define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_v4i32_extra_use1(
; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT: call void @use_i32(i32 [[R0]])
; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]]
; CHECK-NEXT: ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
  call void @use_i32(i32 %r0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}

; negative test - extra uses could create extra instructions

define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_v4i32_extra_use2(
; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
; CHECK-NEXT: call void @use_i32(i32 [[R1]])
; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]]
; CHECK-NEXT: ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  call void @use_i32(i32 %r1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}

; negative test - can't reassociate different vector types

define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @diff_of_sums_type_mismatch2(
; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]])
; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]]
; CHECK-NEXT: ret i32 [[R]]
;
  %r0 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v0)
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
  %r = sub i32 %r0, %r1
  ret i32 %r
}