From ce2b3ce3b6f707b8941dcdbf2c3a9be9fe5fa01b Mon Sep 17 00:00:00 2001 From: ShatianWang <38512325+ShatianWang@users.noreply.github.com> Date: Tue, 22 Apr 2025 15:42:47 -0400 Subject: [PATCH] [BOLT] Improve profile quality reporting (#130810) Improve profile quality reporting by 1) fixing a format issue for small binaries, 2) adding new stats for exception handling usage, 3) excluding selected blocks when computing the CFG flow conservation score. More specifically, for 3) we exclude any block that satisfies at least one of the following: a) it is a landing pad, b) it has at least one landing pad with a non-zero execution count, c) it ends with a recursive call. The reason for a) and b) is that the thrower --> landing pad edges are not explicitly represented in the CFG. The reason for c) is that the call-continuation fallthrough edge count is not important in the case of recursive calls. Modified test `bolt/test/X86/profile-quality-reporting.test`. Added test `bolt/test/X86/profile-quality-reporting-small-binary.s`. 
--- bolt/lib/Passes/ProfileQualityStats.cpp | 281 ++++++++++++------ .../profile-quality-reporting-small-binary.s | 35 +++ bolt/test/X86/profile-quality-reporting.test | 2 +- 3 files changed, 230 insertions(+), 88 deletions(-) create mode 100644 bolt/test/X86/profile-quality-reporting-small-binary.s diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp index 332c78da8a1e..dfd74d3dd571 100644 --- a/bolt/lib/Passes/ProfileQualityStats.cpp +++ b/bolt/lib/Passes/ProfileQualityStats.cpp @@ -52,6 +52,16 @@ struct FlowInfo { FunctionFlowMapTy CallGraphIncomingFlows; }; +// When reporting exception handling stats, we only consider functions with at +// least MinLPECSum counts in landing pads to avoid false positives due to +// sampling noise +const uint16_t MinLPECSum = 50; + +// When reporting CFG flow conservation stats, we only consider blocks with +// execution counts > MinBlockCount when reporting the distribution of worst +// gaps. +const uint16_t MinBlockCount = 500; + template void printDistribution(raw_ostream &OS, std::vector &values, bool Fraction = false) { @@ -91,8 +101,12 @@ void printCFGContinuityStats(raw_ostream &OS, std::vector FractionECUnreachables; for (const BinaryFunction *Function : Functions) { - if (Function->size() <= 1) + if (Function->size() <= 1) { + NumUnreachables.push_back(0); + SumECUnreachables.push_back(0); + FractionECUnreachables.push_back(0.0); continue; + } // Compute the sum of all BB execution counts (ECs). 
size_t NumPosECBBs = 0; @@ -142,8 +156,10 @@ void printCFGContinuityStats(raw_ostream &OS, const size_t NumPosECBBsUnreachableFromEntry = NumPosECBBs - NumReachableBBs; const size_t SumUnreachableBBEC = SumAllBBEC - SumReachableBBEC; - const double FractionECUnreachable = - (double)SumUnreachableBBEC / SumAllBBEC; + + double FractionECUnreachable = 0.0; + if (SumAllBBEC > 0) + FractionECUnreachable = (double)SumUnreachableBBEC / SumAllBBEC; if (opts::Verbosity >= 2 && FractionECUnreachable >= 0.05) { OS << "Non-trivial CFG discontinuity observed in function " @@ -157,9 +173,6 @@ void printCFGContinuityStats(raw_ostream &OS, FractionECUnreachables.push_back(FractionECUnreachable); } - if (FractionECUnreachables.empty()) - return; - llvm::sort(FractionECUnreachables); const int Rank = int(FractionECUnreachables.size() * opts::PercentileForProfileQualityCheck / 100); @@ -187,8 +200,10 @@ void printCallGraphFlowConservationStats( std::vector CallGraphGaps; for (const BinaryFunction *Function : Functions) { - if (Function->size() <= 1 || !Function->isSimple()) + if (Function->size() <= 1 || !Function->isSimple()) { + CallGraphGaps.push_back(0.0); continue; + } const uint64_t FunctionNum = Function->getFunctionNumber(); std::vector &IncomingFlows = @@ -199,61 +214,64 @@ void printCallGraphFlowConservationStats( TotalFlowMap.CallGraphIncomingFlows; // Only consider functions that are not a program entry. 
- if (CallGraphIncomingFlows.find(FunctionNum) != + if (CallGraphIncomingFlows.find(FunctionNum) == CallGraphIncomingFlows.end()) { - uint64_t EntryInflow = 0; - uint64_t EntryOutflow = 0; - uint32_t NumConsideredEntryBlocks = 0; + CallGraphGaps.push_back(0.0); + continue; + } - Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) { - const BinaryBasicBlock *EntryBB = - Function->getBasicBlockAtOffset(Offset); - if (!EntryBB || EntryBB->succ_size() == 0) - return true; - NumConsideredEntryBlocks++; - EntryInflow += IncomingFlows[EntryBB->getLayoutIndex()]; - EntryOutflow += OutgoingFlows[EntryBB->getLayoutIndex()]; + uint64_t EntryInflow = 0; + uint64_t EntryOutflow = 0; + uint32_t NumConsideredEntryBlocks = 0; + + Function->forEachEntryPoint([&](uint64_t Offset, const MCSymbol *Label) { + const BinaryBasicBlock *EntryBB = Function->getBasicBlockAtOffset(Offset); + if (!EntryBB || EntryBB->succ_size() == 0) return true; - }); + NumConsideredEntryBlocks++; + EntryInflow += IncomingFlows[EntryBB->getLayoutIndex()]; + EntryOutflow += OutgoingFlows[EntryBB->getLayoutIndex()]; + return true; + }); - uint64_t NetEntryOutflow = 0; - if (EntryOutflow < EntryInflow) { - if (opts::Verbosity >= 2) { - // We expect entry blocks' CFG outflow >= inflow, i.e., it has a - // non-negative net outflow. If this is not the case, then raise a - // warning if requested. - OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow " - "in function " - << Function->getPrintName() << "\n"; - if (opts::Verbosity >= 3) - Function->dump(); - } - } else { - NetEntryOutflow = EntryOutflow - EntryInflow; + uint64_t NetEntryOutflow = 0; + if (EntryOutflow < EntryInflow) { + if (opts::Verbosity >= 2) { + // We expect entry blocks' CFG outflow >= inflow, i.e., it has a + // non-negative net outflow. If this is not the case, then raise a + // warning if requested. 
+ OS << "BOLT WARNING: unexpected entry block CFG outflow < inflow " + "in function " + << Function->getPrintName() << "\n"; + if (opts::Verbosity >= 3) + Function->dump(); } - if (NumConsideredEntryBlocks > 0) { - const uint64_t CallGraphInflow = - TotalFlowMap.CallGraphIncomingFlows[Function->getFunctionNumber()]; - const uint64_t Min = std::min(NetEntryOutflow, CallGraphInflow); - const uint64_t Max = std::max(NetEntryOutflow, CallGraphInflow); - const double CallGraphGap = 1 - (double)Min / Max; + } else { + NetEntryOutflow = EntryOutflow - EntryInflow; + } + if (NumConsideredEntryBlocks > 0) { + const uint64_t CallGraphInflow = + TotalFlowMap.CallGraphIncomingFlows[Function->getFunctionNumber()]; + const uint64_t Min = std::min(NetEntryOutflow, CallGraphInflow); + const uint64_t Max = std::max(NetEntryOutflow, CallGraphInflow); + double CallGraphGap = 0.0; + if (Max > 0) + CallGraphGap = 1 - (double)Min / Max; - if (opts::Verbosity >= 2 && CallGraphGap >= 0.5) { - OS << "Nontrivial call graph gap of size " - << formatv("{0:P}", CallGraphGap) << " observed in function " - << Function->getPrintName() << "\n"; - if (opts::Verbosity >= 3) - Function->dump(); - } - - CallGraphGaps.push_back(CallGraphGap); + if (opts::Verbosity >= 2 && CallGraphGap >= 0.5) { + OS << "Non-trivial call graph gap of size " + << formatv("{0:P}", CallGraphGap) << " observed in function " + << Function->getPrintName() << "\n"; + if (opts::Verbosity >= 3) + Function->dump(); } + + CallGraphGaps.push_back(CallGraphGap); + } else { + CallGraphGaps.push_back(0.0); } } - if (CallGraphGaps.empty()) - return; - llvm::sort(CallGraphGaps); const int Rank = int(CallGraphGaps.size() * opts::PercentileForProfileQualityCheck / 100); @@ -265,18 +283,19 @@ void printCallGraphFlowConservationStats( } } -void printCFGFlowConservationStats(raw_ostream &OS, +void printCFGFlowConservationStats(const BinaryContext &BC, raw_ostream &OS, iterator_range &Functions, FlowInfo &TotalFlowMap) { std::vector 
CFGGapsWeightedAvg; std::vector CFGGapsWorst; std::vector CFGGapsWorstAbs; - // We only consider blocks with execution counts > MinBlockCount when - // reporting the distribution of worst gaps. - const uint16_t MinBlockCount = 500; for (const BinaryFunction *Function : Functions) { - if (Function->size() <= 1 || !Function->isSimple()) + if (Function->size() <= 1 || !Function->isSimple()) { + CFGGapsWeightedAvg.push_back(0.0); + CFGGapsWorst.push_back(0.0); + CFGGapsWorstAbs.push_back(0); continue; + } const uint64_t FunctionNum = Function->getFunctionNumber(); std::vector &MaxCountMaps = @@ -295,12 +314,34 @@ void printCFGFlowConservationStats(raw_ostream &OS, if (BB.isEntryPoint() || BB.succ_size() == 0) continue; + if (BB.getKnownExecutionCount() == 0 || BB.getNumNonPseudos() == 0) + continue; + + // We don't consider blocks that is a landing pad or has a + // positive-execution-count landing pad + if (BB.isLandingPad()) + continue; + + if (llvm::any_of(BB.landing_pads(), + std::mem_fn(&BinaryBasicBlock::getKnownExecutionCount))) + continue; + + // We don't consider blocks that end with a recursive call instruction + const MCInst *Inst = BB.getLastNonPseudoInstr(); + if (BC.MIB->isCall(*Inst)) { + const MCSymbol *DstSym = BC.MIB->getTargetSymbol(*Inst); + const BinaryFunction *DstFunc = + DstSym ? 
BC.getFunctionForSymbol(DstSym) : nullptr; + if (DstFunc == Function) + continue; + } + const uint64_t Max = MaxCountMaps[BB.getLayoutIndex()]; const uint64_t Min = MinCountMaps[BB.getLayoutIndex()]; - const double Gap = 1 - (double)Min / Max; + double Gap = 0.0; + if (Max > 0) + Gap = 1 - (double)Min / Max; double Weight = BB.getKnownExecutionCount() * BB.getNumNonPseudos(); - if (Weight == 0) - continue; // We use log to prevent the stats from being dominated by extremely hot // blocks Weight = log(Weight); @@ -316,39 +357,36 @@ void printCFGFlowConservationStats(raw_ostream &OS, BBWorstGapAbs = &BB; } } - if (WeightSum > 0) { - const double WeightedGap = WeightedGapSum / WeightSum; - if (opts::Verbosity >= 2 && (WeightedGap >= 0.1 || WorstGap >= 0.9)) { - OS << "Nontrivial CFG gap observed in function " - << Function->getPrintName() << "\n" - << "Weighted gap: " << formatv("{0:P}", WeightedGap) << "\n"; - if (BBWorstGap) - OS << "Worst gap: " << formatv("{0:P}", WorstGap) - << " at BB with input offset: 0x" - << Twine::utohexstr(BBWorstGap->getInputOffset()) << "\n"; - if (BBWorstGapAbs) - OS << "Worst gap (absolute value): " << WorstGapAbs << " at BB with " - << "input offset 0x" - << Twine::utohexstr(BBWorstGapAbs->getInputOffset()) << "\n"; - if (opts::Verbosity >= 3) - Function->dump(); - } - - CFGGapsWeightedAvg.push_back(WeightedGap); - CFGGapsWorst.push_back(WorstGap); - CFGGapsWorstAbs.push_back(WorstGapAbs); + double WeightedGap = WeightedGapSum; + if (WeightSum > 0) + WeightedGap /= WeightSum; + if (opts::Verbosity >= 2 && WorstGap >= 0.9) { + OS << "Non-trivial CFG gap observed in function " + << Function->getPrintName() << "\n" + << "Weighted gap: " << formatv("{0:P}", WeightedGap) << "\n"; + if (BBWorstGap) + OS << "Worst gap: " << formatv("{0:P}", WorstGap) + << " at BB with input offset: 0x" + << Twine::utohexstr(BBWorstGap->getInputOffset()) << "\n"; + if (BBWorstGapAbs) + OS << "Worst gap (absolute value): " << WorstGapAbs << " at BB with " + << 
"input offset 0x" + << Twine::utohexstr(BBWorstGapAbs->getInputOffset()) << "\n"; + if (opts::Verbosity >= 3) + Function->dump(); } + CFGGapsWeightedAvg.push_back(WeightedGap); + CFGGapsWorst.push_back(WorstGap); + CFGGapsWorstAbs.push_back(WorstGapAbs); } - if (CFGGapsWeightedAvg.empty()) - return; llvm::sort(CFGGapsWeightedAvg); const int RankWA = int(CFGGapsWeightedAvg.size() * opts::PercentileForProfileQualityCheck / 100); llvm::sort(CFGGapsWorst); const int RankW = int(CFGGapsWorst.size() * opts::PercentileForProfileQualityCheck / 100); - OS << formatv("CFG flow conservation gap {0:P} (weighted) {1:P} (worst)\n", + OS << formatv("CFG flow conservation gap {0:P} (weighted) {1:P} (worst); ", CFGGapsWeightedAvg[RankWA], CFGGapsWorst[RankW]); if (opts::Verbosity >= 1) { OS << "distribution of weighted CFG flow conservation gaps\n"; @@ -365,6 +403,74 @@ void printCFGFlowConservationStats(raw_ostream &OS, } } +void printExceptionHandlingStats(const BinaryContext &BC, raw_ostream &OS, + iterator_range &Functions) { + std::vector LPCountFractionsOfTotalBBEC; + std::vector LPCountFractionsOfTotalInvokeEC; + for (const BinaryFunction *Function : Functions) { + size_t LPECSum = 0; + size_t BBECSum = 0; + size_t InvokeECSum = 0; + for (BinaryBasicBlock &BB : *Function) { + const size_t BBEC = BB.getKnownExecutionCount(); + BBECSum += BBEC; + if (BB.isLandingPad()) + LPECSum += BBEC; + for (const MCInst &Inst : BB) { + if (!BC.MIB->isInvoke(Inst)) + continue; + const std::optional EHInfo = + BC.MIB->getEHInfo(Inst); + if (EHInfo->first) + InvokeECSum += BBEC; + } + } + + if (LPECSum <= MinLPECSum) { + LPCountFractionsOfTotalBBEC.push_back(0.0); + LPCountFractionsOfTotalInvokeEC.push_back(0.0); + continue; + } + double FracTotalBBEC = 0.0; + if (BBECSum > 0) + FracTotalBBEC = (double)LPECSum / BBECSum; + double FracTotalInvokeEC = 0.0; + if (InvokeECSum > 0) + FracTotalInvokeEC = (double)LPECSum / InvokeECSum; + LPCountFractionsOfTotalBBEC.push_back(FracTotalBBEC); + 
LPCountFractionsOfTotalInvokeEC.push_back(FracTotalInvokeEC); + + if (opts::Verbosity >= 2 && FracTotalInvokeEC >= 0.05) { + OS << "Non-trivial usage of exception handling observed in function " + << Function->getPrintName() << "\n" + << formatv( + "Fraction of total InvokeEC that goes to landing pads: {0:P}\n", + FracTotalInvokeEC); + if (opts::Verbosity >= 3) + Function->dump(); + } + } + + llvm::sort(LPCountFractionsOfTotalBBEC); + const int RankBBEC = int(LPCountFractionsOfTotalBBEC.size() * + opts::PercentileForProfileQualityCheck / 100); + llvm::sort(LPCountFractionsOfTotalInvokeEC); + const int RankInvoke = int(LPCountFractionsOfTotalInvokeEC.size() * + opts::PercentileForProfileQualityCheck / 100); + OS << formatv("exception handling usage {0:P} (of total BBEC) {1:P} (of " + "total InvokeEC)\n", + LPCountFractionsOfTotalBBEC[RankBBEC], + LPCountFractionsOfTotalInvokeEC[RankInvoke]); + if (opts::Verbosity >= 1) { + OS << "distribution of exception handling usage as a fraction of total " + "BBEC of each function\n"; + printDistribution(OS, LPCountFractionsOfTotalBBEC, /*Fraction=*/true); + OS << "distribution of exception handling usage as a fraction of total " + "InvokeEC of each function\n"; + printDistribution(OS, LPCountFractionsOfTotalInvokeEC, /*Fraction=*/true); + } +} + void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { // Increment block inflow and outflow with CFG jump counts. 
TotalFlowMapTy &TotalIncomingFlows = TotalFlowMap.TotalIncomingFlows; @@ -519,8 +625,8 @@ void printAll(BinaryContext &BC, FunctionListType &ValidFunctions, 100 - opts::PercentileForProfileQualityCheck); printCFGContinuityStats(BC.outs(), Functions); printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap); - printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap); - + printCFGFlowConservationStats(BC, BC.outs(), Functions, TotalFlowMap); + printExceptionHandlingStats(BC, BC.outs(), Functions); // Print more detailed bucketed stats if requested. if (opts::Verbosity >= 1 && RealNumTopFunctions >= 5) { const size_t PerBucketSize = RealNumTopFunctions / 5; @@ -550,7 +656,8 @@ void printAll(BinaryContext &BC, FunctionListType &ValidFunctions, MaxFunctionExecutionCount); printCFGContinuityStats(BC.outs(), Functions); printCallGraphFlowConservationStats(BC.outs(), Functions, TotalFlowMap); - printCFGFlowConservationStats(BC.outs(), Functions, TotalFlowMap); + printCFGFlowConservationStats(BC, BC.outs(), Functions, TotalFlowMap); + printExceptionHandlingStats(BC, BC.outs(), Functions); } } } diff --git a/bolt/test/X86/profile-quality-reporting-small-binary.s b/bolt/test/X86/profile-quality-reporting-small-binary.s new file mode 100644 index 000000000000..2b147c5eca81 --- /dev/null +++ b/bolt/test/X86/profile-quality-reporting-small-binary.s @@ -0,0 +1,35 @@ +## Test that BOLT-INFO is correctly formatted after profile quality reporting for +## a small binary. 
+ +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --data=%t.fdata \ +# RUN: 2>&1 | FileCheck %s + +# CHECK: BOLT-INFO: profile quality metrics for the hottest 2 functions (reporting top 5% values): function CFG discontinuity 0.00%; call graph flow conservation gap 0.00%; CFG flow conservation gap 0.00% (weighted) 0.00% (worst); exception handling usage 0.00% (of total BBEC) 0.00% (of total InvokeEC) +# CHECK-NEXT: BOLT-INFO: + + .text + .globl func + .type func, @function +func: + pushq %rbp + ret +LLfunc_end: + .size func, LLfunc_end-func + + + .globl main + .type main, @function +main: + pushq %rbp + movq %rsp, %rbp +LLmain_func: + call func +# FDATA: 1 main #LLmain_func# 1 func 0 0 500 + movl $4, %edi + retq +.Lmain_end: + .size main, .Lmain_end-main diff --git a/bolt/test/X86/profile-quality-reporting.test b/bolt/test/X86/profile-quality-reporting.test index 2e15a6b245af..210d3e10a389 100644 --- a/bolt/test/X86/profile-quality-reporting.test +++ b/bolt/test/X86/profile-quality-reporting.test @@ -1,4 +1,4 @@ ## Check profile quality stats reporting RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s -CHECK: profile quality metrics for the hottest 5 functions (reporting top 5% values): function CFG discontinuity 100.00%; call graph flow conservation gap 60.00%; CFG flow conservation gap 45.53% (weighted) 96.87% (worst) +CHECK: profile quality metrics for the hottest 5 functions (reporting top 5% values): function CFG discontinuity 100.00%; call graph flow conservation gap 60.00%; CFG flow conservation gap 45.53% (weighted) 96.87% (worst); exception handling usage 0.00% (of total BBEC) 0.00% (of total InvokeEC)