[RISCV] Tune flag for fast vrgather.vv (#124664)

Add tune knob for N*Log2(N) vrgather.vv cost.
2025-03-03 16:04:49 -08:00
parent ee09df8e73
commit b44fbdee00
7 changed files with 76 additions and 2 deletions
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -111,6 +111,7 @@ Changes to the RISC-V Backend
  extension.
 * Adds experimental assembler support for the Qualcomm 'Xqccmp' extension, which
  is a frame-pointer convention compatible version of Zcmp.
+* Added non-quadratic ``log-vrgather`` cost model for ``vrgather.vv`` instruction

 Changes to the WebAssembly Backend
 ----------------------------------
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1445,6 +1445,10 @@ def FeatureUnalignedVectorMem
                      "true", "Has reasonably performant unaligned vector "
                      "loads and stores">;

+def TuneNLogNVRGather
+   : SubtargetFeature<"log-vrgather", "RISCVVRGatherCostModel", "NLog2N",
+                      "Has vrgather.vv with LMUL*log2(LMUL) latency">;
+
 def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
    "UsePostRAScheduler", "true", "Schedule again after register allocation">;

--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2869,10 +2869,19 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const {


 /// Return the cost of a vrgather.vv instruction for the type VT.  vrgather.vv
-/// is generally quadratic in the number of vreg implied by LMUL.  Note that
+/// may be quadratic in the number of vreg implied by LMUL, and is assumed to
+/// be by default.  VRGatherCostModel reflects available options.  Note that
 /// operand (index and possibly mask) are handled separately.
 InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
-  return getLMULCost(VT) * getLMULCost(VT);
+  auto LMULCost = getLMULCost(VT);
+  bool Log2CostModel =
+      Subtarget.getVRGatherCostModel() == llvm::RISCVSubtarget::NLog2N;
+  if (Log2CostModel && LMULCost.isValid()) {
+    unsigned Log = Log2_64(*LMULCost.getValue());
+    if (Log > 0)
+      return LMULCost * Log;
+  }
+  return LMULCost * LMULCost;
 }

 /// Return the cost of a vrgather.vi (or vx) instruction for the type VT.
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -494,6 +494,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
                                                  FeatureUnalignedScalarMem,
                                                  FeatureUnalignedVectorMem]),
                                                 [TuneNoDefaultUnroll,
+                                                  TuneNLogNVRGather,
                                                  TuneOptimizedZeroStrideLoad,
                                                  TunePostRAScheduler]>;

--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -84,11 +84,16 @@ public:
    VentanaVeyron,
    MIPSP8700,
  };
+  enum RISCVVRGatherCostModelEnum : uint8_t {
+    Quadratic,
+    NLog2N,
+  };
  // clang-format on
 private:
  virtual void anchor();

  RISCVProcFamilyEnum RISCVProcFamily = Others;
+  RISCVVRGatherCostModelEnum RISCVVRGatherCostModel = Quadratic;

 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
  bool ATTRIBUTE = DEFAULT;
@@ -155,6 +160,8 @@ public:
  /// initializeProperties().
  RISCVProcFamilyEnum getProcFamily() const { return RISCVProcFamily; }

+  RISCVVRGatherCostModelEnum getVRGatherCostModel() const { return RISCVVRGatherCostModel; }
+
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
  bool GETTER() const { return ATTRIBUTE; }
 #include "RISCVGenSubtargetInfo.inc"
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
@@ -3,6 +3,8 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32

@@ -44,6 +46,24 @@ define void @general_permute_single_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; LOG-VRG-LABEL: 'general_permute_single_source'
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 9, i32 6, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 5, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 11, i32 11, i32 11, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 12, i32 12, i32 12, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
  %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
  %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
@@ -133,6 +153,37 @@ define void @general_permute_two_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; LOG-VRG-LABEL: 'general_permute_two_source'
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
  %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
  %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -35,6 +35,7 @@
 ; CHECK:   h                                - 'H' (Hypervisor).
 ; CHECK:   i                                - 'I' (Base Integer Instruction Set).
 ; CHECK:   ld-add-fusion                    - Enable LD+ADD macrofusion.
+; CHECK:   log-vrgather                     - Has vrgather.vv with LMUL*log2(LMUL) latency
 ; CHECK:   lui-addi-fusion                  - Enable LUI+ADDI macro fusion.
 ; CHECK:   m                                - 'M' (Integer Multiplication and Division).
 ; CHECK:   mips-p8700                       - MIPS p8700 processor.