[MC][NFC] Statically allocate storage for decoded pseudo probes and function records

Use #102774 to allocate storage for decoded probes (`PseudoProbeVec`)
and function records (`InlineTreeVec`).

Leverage that to also shrink sizes of `MCDecodedPseudoProbe`:
- Drop Guid since it's accessible via `InlineTree`.

`MCDecodedPseudoProbeInlineTree`:
- Keep track of probes and inlinees using `ArrayRef`s now that probes
  and function records belonging to the same function are allocated
  contiguously.

This reduces peak RSS from 13.7 GiB to 9.7 GiB and pseudo probe parsing
time (as part of perf2bolt) from 15.3s to 9.6s for a large binary with
a 400 MiB .pseudo_probe section containing 43M probes and 25M function
records.

Depends on:
#102774
#102787
#102788

Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm

Reviewed By: wlei-llvm

Pull Request: https://github.com/llvm/llvm-project/pull/102789
This commit is contained in:
Amir Ayupov
2024-08-26 09:09:13 -07:00
committed by GitHub
parent 121ed07975
commit 04ebd1907c
5 changed files with 165 additions and 77 deletions

View File

@@ -200,7 +200,9 @@ void PseudoProbeRewriter::updatePseudoProbes() {
}
unsigned ProbeTrack = AP.second.size();
std::list<MCDecodedPseudoProbe>::iterator Probe = AP.second.begin();
auto Probe = llvm::map_iterator(
AP.second.begin(),
[](auto RW) -> MCDecodedPseudoProbe & { return RW.get(); });
while (ProbeTrack != 0) {
if (Probe->isBlock()) {
Probe->setAddress(BlkOutputAddress);
@@ -218,9 +220,7 @@ void PseudoProbeRewriter::updatePseudoProbes() {
}
while (CallOutputAddress != CallOutputAddresses.second) {
AP.second.push_back(*Probe);
AP.second.back().setAddress(CallOutputAddress->second);
Probe->getInlineTreeNode()->addProbes(&(AP.second.back()));
ProbeDecoder.addInjectedProbe(*Probe, CallOutputAddress->second);
CallOutputAddress = std::next(CallOutputAddress);
}
}
@@ -332,7 +332,7 @@ void PseudoProbeRewriter::encodePseudoProbes() {
ProbeDecoder.getDummyInlineRoot();
for (auto Child = Root.getChildren().begin();
Child != Root.getChildren().end(); ++Child)
Inlinees[Child->first] = Child->second.get();
Inlinees[Child->getInlineSite()] = &*Child;
for (auto Inlinee : Inlinees)
// INT64_MAX is "placeholder" of unused callsite index field in the pair
@@ -358,25 +358,37 @@ void PseudoProbeRewriter::encodePseudoProbes() {
EmitInt(Cur->Guid, 8);
// Emit number of probes in this node
uint64_t Deleted = 0;
for (MCDecodedPseudoProbe *&Probe : Cur->getProbes())
for (MCDecodedPseudoProbe *&Probe :
llvm::make_pointer_range(Cur->getProbes()))
if (Probe->getAddress() == INT64_MAX)
Deleted++;
LLVM_DEBUG(dbgs() << "Deleted Probes:" << Deleted << "\n");
uint64_t ProbesSize = Cur->getProbes().size() - Deleted;
size_t InjectedProbes = ProbeDecoder.getNumInjectedProbes(Cur);
uint64_t ProbesSize = Cur->getProbes().size() - Deleted + InjectedProbes;
EmitULEB128IntValue(ProbesSize);
// Emit number of direct inlinees
EmitULEB128IntValue(Cur->getChildren().size());
// Emit probes in this group
for (MCDecodedPseudoProbe *&Probe : Cur->getProbes()) {
for (MCDecodedPseudoProbe *&Probe :
llvm::make_pointer_range(Cur->getProbes())) {
if (Probe->getAddress() == INT64_MAX)
continue;
EmitDecodedPseudoProbe(Probe);
LastProbe = Probe;
}
if (InjectedProbes) {
for (MCDecodedPseudoProbe *&Probe :
llvm::make_pointer_range(ProbeDecoder.getInjectedProbes(Cur))) {
if (Probe->getAddress() == INT64_MAX)
continue;
EmitDecodedPseudoProbe(Probe);
LastProbe = Probe;
}
}
for (auto Child = Cur->getChildren().begin();
Child != Cur->getChildren().end(); ++Child)
Inlinees[Child->first] = Child->second.get();
Inlinees[Child->getInlineSite()] = &*Child;
for (const auto &Inlinee : Inlinees) {
assert(Cur->Guid != 0 && "non root tree node must have nonzero Guid");
NextNodes.push_back({std::get<1>(Inlinee.first), Inlinee.second});

View File

@@ -54,20 +54,21 @@
#ifndef LLVM_MC_MCPSEUDOPROBE_H
#define LLVM_MC_MCPSEUDOPROBE_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/IR/PseudoProbe.h"
#include "llvm/Support/ErrorOr.h"
#include <list>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace llvm {
@@ -103,14 +104,15 @@ using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>;
using GUIDProbeFunctionMap =
std::unordered_map<uint64_t, MCPseudoProbeFuncDesc>;
// Address to pseudo probes map.
using AddressProbesMap = std::map<uint64_t, std::list<MCDecodedPseudoProbe>>;
using AddressProbesMap =
std::map<uint64_t,
std::vector<std::reference_wrapper<MCDecodedPseudoProbe>>>;
class MCDecodedPseudoProbeInlineTree;
class MCPseudoProbeBase {
protected:
uint64_t Guid;
uint64_t Index;
uint32_t Index;
uint32_t Discriminator;
uint8_t Attributes;
uint8_t Type;
@@ -120,14 +122,12 @@ protected:
const static uint32_t PseudoProbeFirstId = 1;
public:
MCPseudoProbeBase(uint64_t G, uint64_t I, uint64_t At, uint8_t T, uint32_t D)
: Guid(G), Index(I), Discriminator(D), Attributes(At), Type(T) {}
MCPseudoProbeBase(uint64_t I, uint64_t At, uint8_t T, uint32_t D)
: Index(I), Discriminator(D), Attributes(At), Type(T) {}
bool isEntry() const { return Index == PseudoProbeFirstId; }
uint64_t getGuid() const { return Guid; }
uint64_t getIndex() const { return Index; }
uint32_t getIndex() const { return Index; }
uint32_t getDiscriminator() const { return Discriminator; }
@@ -157,18 +157,20 @@ public:
/// uses an address from a temporary label created at the current address in the
/// current section.
class MCPseudoProbe : public MCPseudoProbeBase {
uint64_t Guid;
MCSymbol *Label;
public:
MCPseudoProbe(MCSymbol *Label, uint64_t Guid, uint64_t Index, uint64_t Type,
uint64_t Attributes, uint32_t Discriminator)
: MCPseudoProbeBase(Guid, Index, Attributes, Type, Discriminator),
: MCPseudoProbeBase(Index, Attributes, Type, Discriminator), Guid(Guid),
Label(Label) {
assert(Type <= 0xFF && "Probe type too big to encode, exceeding 2^8");
assert(Attributes <= 0xFF &&
"Probe attributes too big to encode, exceeding 2^16");
}
uint64_t getGuid() const { return Guid; };
MCSymbol *getLabel() const { return Label; }
void emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const;
};
@@ -181,11 +183,11 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase {
MCDecodedPseudoProbeInlineTree *InlineTree;
public:
MCDecodedPseudoProbe(uint64_t Ad, uint64_t G, uint32_t I, PseudoProbeType K,
uint8_t At, uint32_t D,
MCDecodedPseudoProbeInlineTree *Tree)
: MCPseudoProbeBase(G, I, At, static_cast<uint8_t>(K), D), Address(Ad),
MCDecodedPseudoProbe(uint64_t Ad, uint32_t I, PseudoProbeType K, uint8_t At,
uint32_t D, MCDecodedPseudoProbeInlineTree *Tree)
: MCPseudoProbeBase(I, At, static_cast<uint8_t>(K), D), Address(Ad),
InlineTree(Tree){};
uint64_t getGuid() const;
uint64_t getAddress() const { return Address; }
@@ -211,21 +213,14 @@ public:
bool ShowName) const;
};
template <typename ProbeType, typename DerivedProbeInlineTreeType>
template <typename ProbesType, typename DerivedProbeInlineTreeType,
typename InlinedProbeTreeMap>
class MCPseudoProbeInlineTreeBase {
struct InlineSiteHash {
uint64_t operator()(const InlineSite &Site) const {
return std::get<0>(Site) ^ std::get<1>(Site);
}
};
protected:
// Track children (e.g. inlinees) of current context
using InlinedProbeTreeMap = std::unordered_map<
InlineSite, std::unique_ptr<DerivedProbeInlineTreeType>, InlineSiteHash>;
InlinedProbeTreeMap Children;
// Set of probes that come with the function.
std::vector<ProbeType> Probes;
ProbesType Probes;
MCPseudoProbeInlineTreeBase() {
static_assert(std::is_base_of<MCPseudoProbeInlineTreeBase,
DerivedProbeInlineTreeType>::value,
@@ -240,12 +235,10 @@ public:
bool isRoot() const { return Guid == 0; }
InlinedProbeTreeMap &getChildren() { return Children; }
const InlinedProbeTreeMap &getChildren() const { return Children; }
std::vector<ProbeType> &getProbes() { return Probes; }
const std::vector<ProbeType> &getProbes() const { return Probes; }
void addProbes(ProbeType Probe) { Probes.push_back(Probe); }
const ProbesType &getProbes() const { return Probes; }
// Caller node of the inline site
MCPseudoProbeInlineTreeBase<ProbeType, DerivedProbeInlineTreeType> *Parent =
nullptr;
MCPseudoProbeInlineTreeBase<ProbesType, DerivedProbeInlineTreeType,
InlinedProbeTreeMap> *Parent = nullptr;
DerivedProbeInlineTreeType *getOrAddNode(const InlineSite &Site) {
auto Ret = Children.emplace(
Site, std::make_unique<DerivedProbeInlineTreeType>(Site));
@@ -259,9 +252,17 @@ public:
// instance is created as the root of a tree.
// A real instance of this class is created for each function, either a
// not inlined function that has code in .text section or an inlined function.
// Hash functor for InlineSite keys: combines the two tuple elements with XOR.
struct InlineSiteHash {
  uint64_t operator()(const InlineSite &Site) const {
    const auto &First = std::get<0>(Site);
    const auto &Second = std::get<1>(Site);
    return First ^ Second;
  }
};
class MCPseudoProbeInlineTree
: public MCPseudoProbeInlineTreeBase<MCPseudoProbe,
MCPseudoProbeInlineTree> {
: public MCPseudoProbeInlineTreeBase<
std::vector<MCPseudoProbe>, MCPseudoProbeInlineTree,
std::unordered_map<InlineSite,
std::unique_ptr<MCPseudoProbeInlineTree>,
InlineSiteHash>> {
public:
MCPseudoProbeInlineTree() = default;
MCPseudoProbeInlineTree(uint64_t Guid) { this->Guid = Guid; }
@@ -277,16 +278,31 @@ public:
// inline tree node for the decoded pseudo probe
class MCDecodedPseudoProbeInlineTree
: public MCPseudoProbeInlineTreeBase<MCDecodedPseudoProbe *,
MCDecodedPseudoProbeInlineTree> {
public:
InlineSite ISite;
: public MCPseudoProbeInlineTreeBase<
MCDecodedPseudoProbe *, MCDecodedPseudoProbeInlineTree,
MutableArrayRef<MCDecodedPseudoProbeInlineTree>> {
uint32_t NumProbes = 0;
uint32_t ProbeId = 0;
public:
MCDecodedPseudoProbeInlineTree() = default;
MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){};
MCDecodedPseudoProbeInlineTree(const InlineSite &Site,
MCDecodedPseudoProbeInlineTree *Parent)
: ProbeId(std::get<1>(Site)) {
this->Guid = std::get<0>(Site);
this->Parent = Parent;
}
// Return false if it's a dummy inline site
bool hasInlineSite() const { return !isRoot() && !Parent->isRoot(); }
InlineSite getInlineSite() const { return InlineSite(Guid, ProbeId); }
void setProbes(MutableArrayRef<MCDecodedPseudoProbe> ProbesRef) {
Probes = ProbesRef.data();
NumProbes = ProbesRef.size();
}
auto getProbes() const {
return MutableArrayRef<MCDecodedPseudoProbe>(Probes, NumProbes);
}
};
/// Instances of this class represent the pseudo probes inserted into a compile
@@ -336,6 +352,20 @@ public:
};
class MCPseudoProbeDecoder {
// Decoded pseudo probes vector.
std::vector<MCDecodedPseudoProbe> PseudoProbeVec;
// Injected pseudo probes, identified by the containing inline tree node.
// Need to keep injected probes separately for two reasons:
// 1) Probes cannot be added to the PseudoProbeVec: appending may cause
// reallocation so that pointers to its elements will become invalid.
// 2) Probes belonging to a function record must be contiguous in
// PseudoProbeVec, as the owning InlineTree references them with an ArrayRef
// to save space.
std::unordered_map<const MCDecodedPseudoProbeInlineTree *,
std::vector<MCDecodedPseudoProbe>>
InjectedProbeMap;
// Decoded inline records vector.
std::vector<MCDecodedPseudoProbeInlineTree> InlineTreeVec;
// GUID to PseudoProbeFuncDesc map.
GUIDProbeFunctionMap GUID2FuncDescMap;
@@ -382,10 +412,6 @@ public:
const Uint64Set &GuildFilter,
const Uint64Map &FuncStartAddrs);
bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur,
uint64_t &LastAddr, const Uint64Set &GuildFilter,
const Uint64Map &FuncStartAddrs);
// Print pseudo_probe_desc section info
void printGUID2FuncDescMap(raw_ostream &OS);
@@ -428,6 +454,34 @@ public:
// Read-only access to the decoder's DummyInlineRoot node.
const MCDecodedPseudoProbeInlineTree &getDummyInlineRoot() const {
return DummyInlineRoot;
}
void addInjectedProbe(const MCDecodedPseudoProbe &Probe, uint64_t Address) {
const MCDecodedPseudoProbeInlineTree *Parent = Probe.getInlineTreeNode();
InjectedProbeMap[Parent].emplace_back(Probe).setAddress(Address);
}
// Number of probes injected under Parent; 0 if Parent has no entry in
// InjectedProbeMap.
size_t
getNumInjectedProbes(const MCDecodedPseudoProbeInlineTree *Parent) const {
  auto Entry = InjectedProbeMap.find(Parent);
  return Entry != InjectedProbeMap.end() ? Entry->second.size() : 0;
}
// Range over the probes injected under Parent. Parent must already have an
// entry in InjectedProbeMap (asserted).
auto getInjectedProbes(MCDecodedPseudoProbeInlineTree *Parent) {
  auto Entry = InjectedProbeMap.find(Parent);
  assert(Entry != InjectedProbeMap.end());
  auto &Injected = Entry->second;
  return iterator_range(Injected);
}
private:
// Recursively parse an inlining tree encoded in pseudo_probe section. Returns
// whether the top-level node was kept (true) rather than skipped (false).
template <bool IsTopLevelFunc>
bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur,
uint64_t &LastAddr, const Uint64Set &GuildFilter,
const Uint64Map &FuncStartAddrs,
const uint32_t CurChildIndex);
};
} // end namespace llvm

View File

@@ -49,6 +49,8 @@ static const MCExpr *buildSymbolDiff(MCObjectStreamer *MCOS, const MCSymbol *A,
return AddrDelta;
}
// The GUID is read from the inline tree node this probe points to.
uint64_t MCDecodedPseudoProbe::getGuid() const {
  return InlineTree->Guid;
}
void MCPseudoProbe::emit(MCObjectStreamer *MCOS,
const MCPseudoProbe *LastProbe) const {
bool IsSentinel = isSentinelProbe(getAttributes());
@@ -289,8 +291,8 @@ void MCDecodedPseudoProbe::getInlineContext(
// Note that it won't include the function the probe belongs to (leaf location)
while (Cur->hasInlineSite()) {
StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Cur->Parent->Guid);
ContextStack.emplace_back(
MCPseudoProbeFrameLocation(FuncName, std::get<1>(Cur->ISite)));
ContextStack.emplace_back(MCPseudoProbeFrameLocation(
FuncName, std::get<1>(Cur->getInlineSite())));
Cur = static_cast<MCDecodedPseudoProbeInlineTree *>(Cur->Parent);
}
// Make the ContextStack in caller-callee order
@@ -318,10 +320,10 @@ void MCDecodedPseudoProbe::print(raw_ostream &OS,
bool ShowName) const {
OS << "FUNC: ";
if (ShowName) {
StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Guid);
StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, getGuid());
OS << FuncName.str() << " ";
} else {
OS << Guid << " ";
OS << getGuid() << " ";
}
OS << "Index: " << Index << " ";
if (Discriminator)
@@ -417,17 +419,18 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
return true;
}
template <bool IsTopLevelFunc>
bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
MCDecodedPseudoProbeInlineTree *Cur, uint64_t &LastAddr,
const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs) {
const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs,
const uint32_t CurChildIndex) {
// The pseudo_probe section encodes an inline forest and each tree has a
// format defined in MCPseudoProbe.h
uint32_t Index = 0;
bool IsTopLevelFunc = Cur == &DummyInlineRoot;
if (IsTopLevelFunc) {
// Use a sequential id for top level inliner.
Index = Cur->getChildren().size();
Index = CurChildIndex;
} else {
// Read inline site for inlinees
Index = cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
@@ -443,8 +446,9 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
// If the incoming node is null, all its children nodes should be discarded.
if (Cur) {
// Switch/add to a new tree node(inlinee)
Cur = Cur->getOrAddNode(std::make_tuple(Guid, Index));
Cur->Guid = Guid;
Cur->getChildren()[CurChildIndex] =
MCDecodedPseudoProbeInlineTree(InlineSite(Guid, Index), Cur);
Cur = &Cur->getChildren()[CurChildIndex];
if (IsTopLevelFunc && !EncodingIsAddrBased) {
if (auto V = FuncStartAddrs.lookup(Guid))
LastAddr = V;
@@ -454,6 +458,7 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
// Read number of probes in the current node.
uint32_t NodeCount =
cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
uint32_t CurrentProbeCount = 0;
// Read number of direct inlinees
uint32_t ChildrenToProcess =
cantFail(errorOrToExpected(readUnsignedNumber<uint32_t>()));
@@ -494,19 +499,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
}
if (Cur && !isSentinelProbe(Attr)) {
// Populate Address2ProbesMap
auto &Probes = Address2ProbesMap[Addr];
Probes.emplace_back(Addr, Cur->Guid, Index, PseudoProbeType(Kind), Attr,
Discriminator, Cur);
Cur->addProbes(&Probes.back());
PseudoProbeVec.emplace_back(Addr, Index, PseudoProbeType(Kind), Attr,
Discriminator, Cur);
Address2ProbesMap[Addr].emplace_back(PseudoProbeVec.back());
++CurrentProbeCount;
}
LastAddr = Addr;
}
for (uint32_t I = 0; I < ChildrenToProcess; I++) {
buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs);
if (Cur) {
Cur->setProbes(
MutableArrayRef(PseudoProbeVec).take_back(CurrentProbeCount));
InlineTreeVec.resize(InlineTreeVec.size() + ChildrenToProcess);
Cur->getChildren() =
MutableArrayRef(InlineTreeVec).take_back(ChildrenToProcess);
}
return true;
for (uint32_t I = 0; I < ChildrenToProcess; I++) {
buildAddress2ProbeMap<false>(Cur, LastAddr, GuidFilter, FuncStartAddrs, I);
}
return Cur;
}
template <bool IsTopLevelFunc>
@@ -605,14 +616,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
TopLevelFuncs += !Discard;
}
assert(Data == End && "Have unprocessed data in pseudo_probe section");
PseudoProbeVec.reserve(ProbeCount);
InlineTreeVec.reserve(InlinedCount);
// Allocate top-level function records as children of DummyInlineRoot.
InlineTreeVec.resize(TopLevelFuncs);
DummyInlineRoot.getChildren() = MutableArrayRef(InlineTreeVec);
Data = Start;
End = Data + Size;
uint64_t LastAddr = 0;
uint32_t CurChildIndex = 0;
while (Data < End)
buildAddress2ProbeMap(&DummyInlineRoot, LastAddr, GuidFilter,
FuncStartAddrs);
CurChildIndex += buildAddress2ProbeMap<true>(
&DummyInlineRoot, LastAddr, GuidFilter, FuncStartAddrs, CurChildIndex);
assert(Data == End && "Have unprocessed data in pseudo_probe section");
assert(PseudoProbeVec.size() == ProbeCount &&
"Mismatching probe count pre- and post-parsing");
assert(InlineTreeVec.size() == InlinedCount &&
"Mismatching function records count pre- and post-parsing");
return true;
}

View File

@@ -1293,9 +1293,9 @@ void CSProfileGenerator::populateBodySamplesWithProbes(
// and will be inferred by the compiler.
for (auto &I : FrameSamples) {
for (auto *FunctionProfile : I.second) {
for (auto *Probe : I.first->getProbes()) {
FunctionProfile->addBodySamples(Probe->getIndex(),
Probe->getDiscriminator(), 0);
for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) {
FunctionProfile->addBodySamples(Probe.getIndex(),
Probe.getDiscriminator(), 0);
}
}
}

View File

@@ -132,7 +132,7 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway(
MCPseudoProbeDecoder &ProbeDecoder) {
ProbeFrameStack ProbeContext;
for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren())
trackInlineesOptimizedAway(ProbeDecoder, *Child.second, ProbeContext);
trackInlineesOptimizedAway(ProbeDecoder, Child, ProbeContext);
}
void BinarySizeContextTracker::trackInlineesOptimizedAway(
@@ -160,9 +160,9 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway(
// DFS down the probe inline tree
for (const auto &ChildNode : ProbeNode.getChildren()) {
InlineSite Location = ChildNode.first;
InlineSite Location = ChildNode.getInlineSite();
ProbeContext.back().second = std::get<1>(Location);
trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second, ProbeContext);
trackInlineesOptimizedAway(ProbeDecoder, ChildNode, ProbeContext);
}
ProbeContext.pop_back();
@@ -454,8 +454,8 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
// Build TopLevelProbeFrameMap to track size for optimized inlinees when probe
// is available
if (TrackFuncContextSize) {
for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) {
auto *Frame = Child.second.get();
for (auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) {
auto *Frame = &Child;
StringRef FuncName =
ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName;
TopLevelProbeFrameMap[FuncName] = Frame;