[RISCV] Add cost model for fixed broadcast shuffle

This patch adds basic costs for broadcast shuffles of fixed-length vectors in order to enable SLP vectorization.
It also adds `getLMULCost`, which scales a cost by LMUL to account for the lower reciprocal throughput of larger LMUL values.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D137276
This commit is contained in:
ShihPo Hung
2022-11-30 04:58:52 -08:00
parent bb1eca6bbb
commit 0e6f0b7cc3
3 changed files with 177 additions and 0 deletions

View File

@@ -31,6 +31,27 @@ static cl::opt<unsigned> SLPMaxVF(
"SLP vectorizer. Defaults to 1 which disables SLP."),
cl::init(1), cl::Hidden);
// Estimate a relative cost multiplier for operating on \p VT, derived from
// its LMUL. Non-vector types yield an invalid cost; every vector type yields
// a cost of at least 1.
InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
  // There is no LMUL for a non-vector type.
  if (!VT.isVector())
    return InstructionCost::getInvalid();

  // TODO: Here assume reciprocal throughput is 1 for LMUL_1, it is
  // implementation-defined.
  unsigned LMulCost;
  if (!VT.isScalableVector()) {
    // Fixed-length vector: divide the vector width in bits by the
    // subtarget's real minimum VLEN.
    LMulCost = VT.getSizeInBits() / ST->getRealMinVLen();
  } else {
    // Scalable vector: decode the LMUL the target lowering assigns to VT.
    unsigned Multiplier;
    bool IsFractional;
    std::tie(Multiplier, IsFractional) =
        RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
    // A fractional LMUL is costed the same as LMUL_1.
    LMulCost = IsFractional ? 1 : Multiplier;
  }

  // Clamp so that a vector narrower than the minimum VLEN (integer division
  // result of 0) is still charged a cost of 1.
  return std::max<unsigned>(LMulCost, 1);
}
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
@@ -255,6 +276,44 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
}
}
if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
Instruction::InsertElement);
if (LT.second.getScalarSizeInBits() == 1) {
if (HasScalar) {
// Example sequence:
// andi a0, a0, 1
// vsetivli zero, 2, e8, mf8, ta, ma (ignored)
// vmv.v.x v8, a0
// vmsne.vi v0, v8, 0
return LT.first * getLMULCost(LT.second) * 3;
}
// Example sequence:
// vsetivli zero, 2, e8, mf8, ta, mu (ignored)
// vmv.v.i v8, 0
// vmerge.vim v8, v8, 1, v0
// vmv.x.s a0, v8
// andi a0, a0, 1
// vmv.v.x v8, a0
// vmsne.vi v0, v8, 0
return LT.first * getLMULCost(LT.second) * 6;
}
if (HasScalar) {
// Example sequence:
// vmv.v.x v8, a0
return LT.first * getLMULCost(LT.second);
}
// Example sequence:
// vrgather.vi v9, v8, 0
// TODO: vrgather could be slower than vmv.v.x. It is
// implementation-dependent.
return LT.first * getLMULCost(LT.second);
}
return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

View File

@@ -46,6 +46,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
/// the true cost significantly if getVScaleForTuning is wildly off for the
/// actual target hardware.
unsigned getEstimatedVLFor(VectorType *Ty);
/// Return the cost of LMUL. The larger the LMUL, the higher the cost.
InstructionCost getLMULCost(MVT VT);
public:
explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),

View File

@@ -0,0 +1,114 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+experimental-zvfh | FileCheck %s
; Cost-model coverage for broadcast (splat) shuffles of fixed-length vectors:
; every shufflevector below uses an all-zero mask. Element types covered are
; half, float, double, i8, i16, i32, i64 and i1, each at several vector
; lengths. The final two splats take their first operand from an
; insertelement instead of from undef.
define void @broadcast_fixed() #0{
; CHECK-LABEL: 'broadcast_fixed'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; Splats of an undef source vector, grouped by element type.
%zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
%1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
%2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
%3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
%4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer
%5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer
%6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
%7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
%8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
%9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer
%10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer
%11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
%12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
%13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer
%14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer
%15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
%16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
%17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
%18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
%19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
%20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer
%21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer
%22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
%23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
%24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
%25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
%26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
%27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer
%28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
%29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
%30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
%31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer
%32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer
%33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
%34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
%35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer
%36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer
%37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer
%38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer
%39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer
%40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer
%41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer
%42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer
%43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer
; Splats whose first operand is an insertelement into lane 0. For <128 x i1>
; the expected cost is 3 here, versus 6 for the undef-source splat above.
%ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
%44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer
%ins2 = insertelement <2 x i8> poison, i8 3, i32 0
%45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer
ret void
}