This PR adds more realistic cost estimates for these reduction intrinsics: - `llvm.vector.reduce.umax` - `llvm.vector.reduce.umin` - `llvm.vector.reduce.smax` - `llvm.vector.reduce.smin` - `llvm.vector.reduce.fadd` - `llvm.vector.reduce.fmul` - `llvm.vector.reduce.fmax` - `llvm.vector.reduce.fmin` - `llvm.vector.reduce.fmaximum` - `llvm.vector.reduce.fminimum` - `llvm.vector.reduce.mul`. The pre-existing cost estimates for `llvm.vector.reduce.add` are moved to `getArithmeticReductionCosts` to reduce complexity in `getVectorIntrinsicInstrCost` and to enable other passes, like the SLP vectorizer, to benefit from these updated calculations. These changes are not expected to provide noticeable performance improvements; they are provided for the sake of completeness and correctness. This PR is in draft mode pending benchmark confirmation of this. This PR also provides and/or updates cost tests for all of these intrinsics. This PR was co-authored by me and @JonPsson1.
377 lines
25 KiB
LLVM
377 lines
25 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
|
|
; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
|
|
; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
|
|
|
|
; Cost of llvm.vector.reduce.fadd calls that carry no fast-math flags, over
; <4 x float>, <8 x float>, <2 x double>, <4 x double> and <4 x fp128>.
define void @fadd_reductions() {
; Z15-LABEL: 'fadd_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
  %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
  %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
  %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
  %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.fadd calls marked `fast`, over the same vector
; types as the non-fast variant. The %src/%dst parameters are not referenced
; by the body.
define void @fast_fadd_reductions(ptr %src, ptr %dst) {
; Z15-LABEL: 'fast_fadd_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
  %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
  %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
  %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
  %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.fmul calls that carry no fast-math flags, over
; <4 x float>, <8 x float>, <2 x double>, <4 x double> and <4 x fp128>.
define void @fmul_reductions() {
; Z15-LABEL: 'fmul_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
  %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
  %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
  %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
  %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.fmul calls marked `fast`, over the same vector
; types as the non-fast variant. The fp128 result is named %fmul_v4f128 so
; that every value in this function follows the %fmul_* naming scheme.
define void @fast_fmul_reductions() {
; Z15-LABEL: 'fast_fmul_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
  %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
  %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
  %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
  %fmul_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)

  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.fmin over <4 x float>, <8 x float>,
; <2 x double>, <4 x double> and <4 x fp128>.
define void @fmin_reductions() {
; Z15-LABEL: 'fmin_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
  %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
  %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
  %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
  %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.fmax over <4 x float>, <8 x float>,
; <2 x double>, <4 x double> and <4 x fp128>.
define void @fmax_reductions() {
; Z15-LABEL: 'fmax_reductions'
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
  %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
  %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
  %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
  %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.umin over i64, i32, i8 and i128 element vectors.
; NOTE(review): all but the first expectation used to lack the ':' after the
; directive name and were therefore not enforced. Regenerate with
; utils/update_analyze_test_checks.py if any value turns out to differ.
define void @reduceumin() {
; Z15-LABEL: 'reduceumin'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef)

  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.umax over i64, i32, i8 and i128 element vectors.
; NOTE(review): all but the first expectation used to lack the ':' after the
; directive name and were therefore not enforced. Regenerate with
; utils/update_analyze_test_checks.py if any value turns out to differ.
define void @reduceumax() {
; Z15-LABEL: 'reduceumax'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef)

  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.smin over i64, i32, i8 and i128 element vectors.
; NOTE(review): all but the first expectation used to lack the ':' after the
; directive name and were therefore not enforced. Regenerate with
; utils/update_analyze_test_checks.py if any value turns out to differ.
define void @reducesmin() {
; Z15-LABEL: 'reducesmin'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef)

  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.smax over i64, i32, i8 and i128 element vectors.
; NOTE(review): all but the first expectation used to lack the ':' after the
; directive name and were therefore not enforced. Regenerate with
; utils/update_analyze_test_checks.py if any value turns out to differ.
define void @reducesmax() {
; Z15-LABEL: 'reducesmax'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;
  %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
  %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)

  %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
  %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef)

  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.add over i64, i32, i16 and i8 element vectors of
; 2 to 16 lanes, plus two outliers (<128 x i8> and <4 x i256>).
define void @reduceadd() {
; Z15-LABEL: 'reduceadd'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;

  ; REDUCEADD64
  %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
  %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
  %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
  ; REDUCEADD32
  %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
  %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
  %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
  ; REDUCEADD16
  %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
  %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
  %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
  %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
  ; REDUCEADD8
  %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
  %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
  %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
  %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
  ; EXTREME VALUES
  %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
  %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef)

  ret void
}
|
|
|
|
; Cost of llvm.vector.reduce.mul over i64, i32, i16 and i8 element vectors of
; 2 to 16 lanes, plus two outliers (<128 x i8> and <4 x i256>).
; NOTE(review): these expectations used a prefix that the RUN line does not
; enable, so they were never enforced. They now use the enabled prefix;
; regenerate with utils/update_analyze_test_checks.py if any value differs.
define void @reducemul() {
; Z15-LABEL: 'reducemul'
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef)
; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
;

  ; REDUCEMUL64
  %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
  %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
  %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
  %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
  ; REDUCEMUL32
  %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
  %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
  %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
  %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
  ; REDUCEMUL16
  %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
  %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
  %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
  %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
  ; REDUCEMUL8
  %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
  %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
  %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
  %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
  ; EXTREME VALUES
  %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
  %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef)

  ret void
}
|
|
|
|
; Declarations of every reduction intrinsic called by the functions in this
; file, grouped by operation.

; fadd
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)

; fmul
declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)

; fmin
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)

; fmax
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)

; umin
declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.umin.v4i128(<4 x i128>)

; umax
declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.umax.v4i128(<4 x i128>)

; smin
declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.smin.v4i128(<4 x i128>)

; smax
declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
declare i128 @llvm.vector.reduce.smax.v4i128(<4 x i128>)

; add
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>)

; mul
declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)

declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)
declare i256 @llvm.vector.reduce.mul.v4i256(<4 x i256>)
|