This PR adds more realistic cost estimates for these reduction intrinsics:

- `llvm.vector.reduce.umax`
- `llvm.vector.reduce.umin`
- `llvm.vector.reduce.smax`
- `llvm.vector.reduce.smin`
- `llvm.vector.reduce.fadd`
- `llvm.vector.reduce.fmul`
- `llvm.vector.reduce.fmax`
- `llvm.vector.reduce.fmin`
- `llvm.vector.reduce.fmaximum`
- `llvm.vector.reduce.fminimum`
- `llvm.vector.reduce.mul`

The pre-existing cost estimates for `llvm.vector.reduce.add` are moved to `getArithmeticReductionCost` to reduce complexity in `getVectorIntrinsicInstrCost` and to let other passes, such as the SLP vectorizer, benefit from the updated calculations. These changes are not expected to yield noticeable performance improvements; they are provided for the sake of completeness and correctness. The PR remains in draft mode pending benchmark confirmation of that.

It also adds or updates cost tests for all of these intrinsics, and was co-authored by me and @JonPsson1.
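As a rough illustration of the kind of cost-test coverage involved, a minimal cost-model check for one of the listed intrinsics might look like the sketch below. The function name and the matched output are illustrative only, not taken from this PR; the RUN line uses LLVM's existing `print<cost-model>` pass.

```llvm
; Hypothetical sketch of a cost-model test for llvm.vector.reduce.umax.
; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 \
; RUN:   -passes="print<cost-model>" -disable-output %s 2>&1 | FileCheck %s

declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)

define i32 @umax_v4i32(<4 x i32> %v) {
; The concrete cost number is target-dependent, so only the shape of the
; printer output is checked here.
; CHECK: Cost Model: {{.*}} instruction: %r = call i32 @llvm.vector.reduce.umax.v4i32
  %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
  ret i32 %r
}
```

The SLP test reproduced below exercises the `llvm.vector.reduce.fmul` costs directly, by checking that chains of scalar `fmul` operations get vectorized into a reduction call.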
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
; RUN: | FileCheck %s

; Test vectorization and reassociation of fmul operations. If the loads can
; be vectorized, cases of fewer operands are also profitable to vectorize.

define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
; CHECK-LABEL: define double @fmul_double_4_factors_seq(
; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
; CHECK-NEXT: ret double [[TMP1]]
;
entry:
  %0 = load double, ptr %x, align 8
  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
  %1 = load double, ptr %arrayidx1, align 8
  %mul = fmul reassoc nsz arcp contract afn double %1, %0
  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
  %2 = load double, ptr %arrayidx2, align 8
  %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
  %3 = load double, ptr %arrayidx4, align 8
  %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
  ret double %mul5
}

define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
; CHECK-NEXT: ret double [[TMP16]]
;
entry:
  %0 = load double, ptr %x, align 8
  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
  %1 = load double, ptr %arrayidx1, align 8
  %mul = fmul reassoc nsz arcp contract afn double %1, %0
  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
  %2 = load double, ptr %arrayidx2, align 8
  %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
  %3 = load double, ptr %arrayidx4, align 8
  %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
  %4 = load double, ptr %arrayidx6, align 8
  %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
  %5 = load double, ptr %arrayidx8, align 8
  %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
  %6 = load double, ptr %arrayidx10, align 8
  %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
  %7 = load double, ptr %arrayidx12, align 8
  %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
  ret double %mul13
}

define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
; CHECK-NEXT: ret float [[TMP32]]
;
entry:
  %0 = load float, ptr %x, align 4
  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
  %1 = load float, ptr %arrayidx1, align 4
  %mul = fmul reassoc nsz arcp contract afn float %1, %0
  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
  %2 = load float, ptr %arrayidx2, align 4
  %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
  %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
  %3 = load float, ptr %arrayidx4, align 4
  %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
  %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
  %4 = load float, ptr %arrayidx6, align 4
  %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
  %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
  %5 = load float, ptr %arrayidx8, align 4
  %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
  %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
  %6 = load float, ptr %arrayidx10, align 4
  %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
  %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
  %7 = load float, ptr %arrayidx12, align 4
  %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
  %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
  %8 = load float, ptr %arrayidx14, align 4
  %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
  %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
  %9 = load float, ptr %arrayidx16, align 4
  %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
  %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
  %10 = load float, ptr %arrayidx18, align 4
  %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
  %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
  %11 = load float, ptr %arrayidx20, align 4
  %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
  %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
  %12 = load float, ptr %arrayidx22, align 4
  %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
  %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
  %13 = load float, ptr %arrayidx24, align 4
  %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
  %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
  %14 = load float, ptr %arrayidx26, align 4
  %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
  %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
  %15 = load float, ptr %arrayidx28, align 4
  %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
  ret float %mul29
}