[llvm-mca] Add command line option -call-latency (#92958)

Currently we assume a constant latency of 100 cycles for call instructions. This commit allows the user to specify a custom value for the same as a command line argument. Default latency is set to 100.
2024-05-22 13:51:55 -07:00
parent 25b65be43d
commit 848bef5d85
6 changed files with 80 additions and 14 deletions
--- a/llvm/include/llvm/MCA/InstrBuilder.h
+++ b/llvm/include/llvm/MCA/InstrBuilder.h
@@ -78,6 +78,7 @@ class InstrBuilder {

  bool FirstCallInst;
  bool FirstReturnInst;
+  unsigned CallLatency;

  using InstRecycleCallback = std::function<Instruction *(const InstrDesc &)>;
  InstRecycleCallback InstRecycleCB;
@@ -98,7 +99,7 @@ class InstrBuilder {
 public:
  InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
               const MCRegisterInfo &RI, const MCInstrAnalysis *IA,
-               const InstrumentManager &IM);
+               const InstrumentManager &IM, unsigned CallLatency);

  void clear() {
    Descriptors.clear();
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -31,9 +31,9 @@ InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
                           const llvm::MCInstrInfo &mcii,
                           const llvm::MCRegisterInfo &mri,
                           const llvm::MCInstrAnalysis *mcia,
-                           const mca::InstrumentManager &im)
+                           const mca::InstrumentManager &im, unsigned cl)
    : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), IM(im), FirstCallInst(true),
-      FirstReturnInst(true) {
+      FirstReturnInst(true), CallLatency(cl) {
  const MCSchedModel &SM = STI.getSchedModel();
  ProcResourceMasks.resize(SM.getNumProcResourceKinds());
  computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
@@ -220,17 +220,19 @@ static void initializeUsedResources(InstrDesc &ID,

 static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
                              const MCSchedClassDesc &SCDesc,
-                              const MCSubtargetInfo &STI) {
+                              const MCSubtargetInfo &STI,
+                              unsigned CallLatency) {
  if (MCDesc.isCall()) {
    // We cannot estimate how long this call will take.
-    // Artificially set an arbitrarily high latency (100cy).
-    ID.MaxLatency = 100U;
+    // Artificially set an arbitrarily high latency.
+    ID.MaxLatency = CallLatency;
    return;
  }

  int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
-  // If latency is unknown, then conservatively assume a MaxLatency of 100cy.
-  ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
+  // If latency is unknown, then conservatively assume the MaxLatency set for
+  // calls.
+  ID.MaxLatency = Latency < 0 ? CallLatency : static_cast<unsigned>(Latency);
 }

 static Error verifyOperands(const MCInstrDesc &MCDesc, const MCInst &MCI) {
@@ -568,7 +570,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
    // We don't correctly model calls.
    WithColor::warning() << "found a call in the input assembly sequence.\n";
    WithColor::note() << "call instructions are not correctly modeled. "
-                      << "Assume a latency of 100cy.\n";
+                      << "Assume a latency of " << CallLatency << "cy.\n";
    FirstCallInst = false;
  }

@@ -580,7 +582,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
  }

  initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
-  computeMaxLatency(*ID, MCDesc, SCDesc, STI);
+  computeMaxLatency(*ID, MCDesc, SCDesc, STI, CallLatency);

  if (Error Err = verifyOperands(MCDesc, MCI))
    return std::move(Err);
--- a/llvm/test/tools/llvm-mca/X86/call-latency.s
+++ b/llvm/test/tools/llvm-mca/X86/call-latency.s
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2                  -iterations=1 %s | FileCheck --check-prefixes=ALL,DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -call-latency=50 -iterations=1 %s | FileCheck --check-prefixes=ALL,CUSTOM %s
+
+callq printf
+
+# ALL:          Iterations:        1
+# ALL-NEXT:     Instructions:      1
+
+# CUSTOM-NEXT:  Total Cycles:      53
+# DEFAULT-NEXT: Total Cycles:      103
+
+# ALL-NEXT:     Total uOps:        1
+
+# ALL:          Dispatch Width:    2
+
+# CUSTOM-NEXT:  uOps Per Cycle:    0.02
+# CUSTOM-NEXT:  IPC:               0.02
+
+# DEFAULT-NEXT: uOps Per Cycle:    0.01
+# DEFAULT-NEXT: IPC:               0.01
+
+# ALL-NEXT:     Block RThroughput: 0.5
+
+# ALL:          Instruction Info:
+# ALL-NEXT:     [1]: #uOps
+# ALL-NEXT:     [2]: Latency
+# ALL-NEXT:     [3]: RThroughput
+# ALL-NEXT:     [4]: MayLoad
+# ALL-NEXT:     [5]: MayStore
+# ALL-NEXT:     [6]: HasSideEffects (U)
+
+# ALL:          [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# ALL-NEXT:      1      1     0.50                        callq	printf
+
+# ALL:          Resources:
+# ALL-NEXT:     [0]   - JALU0
+# ALL-NEXT:     [1]   - JALU1
+# ALL-NEXT:     [2]   - JDiv
+# ALL-NEXT:     [3]   - JFPA
+# ALL-NEXT:     [4]   - JFPM
+# ALL-NEXT:     [5]   - JFPU0
+# ALL-NEXT:     [6]   - JFPU1
+# ALL-NEXT:     [7]   - JLAGU
+# ALL-NEXT:     [8]   - JMul
+# ALL-NEXT:     [9]   - JSAGU
+# ALL-NEXT:     [10]  - JSTC
+# ALL-NEXT:     [11]  - JVALU0
+# ALL-NEXT:     [12]  - JVALU1
+# ALL-NEXT:     [13]  - JVIMUL
+
+# ALL:          Resource pressure per iteration:
+# ALL-NEXT:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# ALL-NEXT:      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -
+
+# ALL:          Resource pressure by instruction:
+# ALL-NEXT:     [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# ALL-NEXT:      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     callq	printf
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -135,6 +135,11 @@ static cl::opt<unsigned>
                               "(instructions per cycle)"),
                      cl::cat(ToolOptions), cl::init(0));

+static cl::opt<unsigned>
+    CallLatency("call-latency", cl::Hidden,
+                cl::desc("Number of cycles to assume for a call instruction"),
+                cl::cat(ToolOptions), cl::init(100U));
+
 enum class SkipType { NONE, LACK_SCHED, PARSE_FAILURE, ANY_FAILURE };

 static cl::opt<enum SkipType> SkipUnsupportedInstructions(
@@ -568,7 +573,7 @@ int main(int argc, char **argv) {
  }

  // Create an instruction builder.
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, CallLatency);

  // Create a context to control ownership of the pipeline hardware.
  mca::Context MCA(*MRI, *STI);
--- a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
+++ b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
@@ -66,7 +66,7 @@ Error MCATestBase::runBaselineMCA(json::Object &Result, ArrayRef<MCInst> Insts,

  // Default InstrumentManager
  auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);

  const SmallVector<mca::Instrument *> Instruments;
  SmallVector<std::unique_ptr<mca::Instruction>> LoweredInsts;
--- a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
+++ b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
@@ -33,7 +33,7 @@ TEST_F(X86TestBase, TestResumablePipeline) {
  P->addEventListener(SV.get());

  auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);

  const SmallVector<mca::Instrument *> Instruments;
  // Tile size = 7
@@ -124,7 +124,7 @@ TEST_F(X86TestBase, TestInstructionRecycling) {
  // Default InstrumentManager
  auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);

-  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
  IB.setInstRecycleCallback(GetRecycledInst);

  const SmallVector<mca::Instrument *> Instruments;