Revert "[MemProf] Use radix tree for alloc contexts in bitcode summaries" (#117395)

Reverts llvm/llvm-project#117066

This is causing some build bot failures that need investigation.
This commit is contained in:
Teresa Johnson
2024-11-22 14:57:58 -08:00
committed by GitHub
parent ccb4702038
commit fdb050a502
7 changed files with 30 additions and 243 deletions

View File

@@ -307,12 +307,12 @@ enum GlobalValueSummarySymtabCodes {
// [valueid, n x stackidindex]
FS_PERMODULE_CALLSITE_INFO = 26,
// Summary of per-module allocation memprof metadata.
// [nummib, nummib x (alloc type, context radix tree index),
// [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex),
// [nummib x (numcontext x total size)]?]
FS_PERMODULE_ALLOC_INFO = 27,
// Summary of combined index memprof callsite metadata.
// [valueid, context radix tree index, numver,
// numver x version]
// [valueid, numstackindices, numver,
// numstackindices x stackidindex, numver x version]
FS_COMBINED_CALLSITE_INFO = 28,
// Summary of combined index allocation memprof metadata.
// [nummib, numver,
@@ -331,10 +331,6 @@ enum GlobalValueSummarySymtabCodes {
// the entries must be in the exact same order as the corresponding sizes.
// [nummib x (numcontext x full stack id)]
FS_ALLOC_CONTEXT_IDS = 31,
// Linearized radix tree of allocation contexts. See the description above the
// CallStackRadixTreeBuilder class in ProfileData/MemProf.h for format.
// [n x entry]
FS_CONTEXT_RADIX_TREE_ARRAY = 32,
};
enum MetadataCodes {

View File

@@ -329,7 +329,6 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
STRINGIFY_CODE(FS, STACK_IDS)
STRINGIFY_CODE(FS, ALLOC_CONTEXT_IDS)
STRINGIFY_CODE(FS, CONTEXT_RADIX_TREE_ARRAY)
}
case bitc::METADATA_ATTACHMENT_ID:
switch (CodeID) {

View File

@@ -987,10 +987,6 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
/// ids from the lists in the callsite and alloc entries to the index.
std::vector<uint64_t> StackIds;
/// Linearized radix tree of allocation contexts. See the description above
/// the CallStackRadixTreeBuilder class in ProfileData/MemProf.h for format.
std::vector<uint64_t> RadixArray;
public:
ModuleSummaryIndexBitcodeReader(
BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex,
@@ -1017,8 +1013,6 @@ private:
TypeIdCompatibleVtableInfo &TypeId);
std::vector<FunctionSummary::ParamAccess>
parseParamAccesses(ArrayRef<uint64_t> Record);
SmallVector<unsigned> parseAllocInfoContext(ArrayRef<uint64_t> Record,
unsigned &I);
template <bool AllowNullValueInfo = false>
std::pair<ValueInfo, GlobalValue::GUID>
@@ -7550,48 +7544,6 @@ void ModuleSummaryIndexBitcodeReader::parseTypeIdCompatibleVtableSummaryRecord(
parseTypeIdCompatibleVtableInfo(Record, Slot, TypeId);
}
/// Decode the stack id list of a single allocation context from \p Record.
///
/// Reading starts at position \p I, which is advanced past every record entry
/// consumed here. Two encodings are supported, selected by whether a radix
/// tree array record has been seen (i.e. whether RadixArray is non-empty):
///  - Old format: [numstackids, numstackids x stackidindex] inline in Record.
///  - Radix format: one index into RadixArray, from which the frames are
///    recovered.
///
/// In both cases each stack id index is translated through StackIds and
/// TheIndex.addOrGetStackIdIndex(), and the translated values are returned in
/// the order they were decoded.
SmallVector<unsigned> ModuleSummaryIndexBitcodeReader::parseAllocInfoContext(
    ArrayRef<uint64_t> Record, unsigned &I) {
  SmallVector<unsigned> StackIdList;
  // For backwards compatibility with old format before radix tree was
  // used, simply see if we found a radix tree array record (and thus if
  // the RadixArray is non-empty).
  if (RadixArray.empty()) {
    unsigned NumStackEntries = Record[I++];
    assert(Record.size() - I >= NumStackEntries);
    StackIdList.reserve(NumStackEntries);
    for (unsigned J = 0; J < NumStackEntries; J++) {
      assert(Record[I] < StackIds.size());
      StackIdList.push_back(
          TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
    }
  } else {
    unsigned RadixIndex = Record[I++];
    // See the comments above CallStackRadixTreeBuilder in ProfileData/MemProf.h
    // for a detailed description of the radix tree array format. Briefly, the
    // first entry will be the number of frames, any negative values are the
    // negative of the offset of the next frame, and otherwise the frames are in
    // increasing linear order.
    assert(RadixIndex < RadixArray.size());
    unsigned NumStackIds = RadixArray[RadixIndex++];
    StackIdList.reserve(NumStackIds);
    while (NumStackIds--) {
      assert(RadixIndex < RadixArray.size());
      unsigned Elem = RadixArray[RadixIndex];
      if (static_cast<std::make_signed_t<unsigned>>(Elem) < 0) {
        // Elem holds a negated offset, so the unsigned subtraction below
        // moves RadixIndex forward by that offset.
        RadixIndex = RadixIndex - Elem;
        assert(RadixIndex < RadixArray.size());
        Elem = RadixArray[RadixIndex];
        // We shouldn't encounter a second offset in a row.
        assert(static_cast<std::make_signed_t<unsigned>>(Elem) >= 0);
      }
      RadixIndex++;
      // Mirror the bounds check done on the old-format path before indexing
      // into StackIds.
      assert(Elem < StackIds.size());
      StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[Elem]));
    }
  }
  return StackIdList;
}
static void setSpecialRefs(SmallVectorImpl<ValueInfo> &Refs, unsigned ROCnt,
unsigned WOCnt) {
// Readonly and writeonly refs are in the end of the refs list.
@@ -8058,11 +8010,6 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
break;
}
case bitc::FS_CONTEXT_RADIX_TREE_ARRAY: { // [n x entry]
RadixArray = ArrayRef<uint64_t>(Record);
break;
}
case bitc::FS_PERMODULE_CALLSITE_INFO: {
unsigned ValueID = Record[0];
SmallVector<unsigned> StackIdList;
@@ -8118,7 +8065,14 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
(Version < 10 && I < Record.size())) {
assert(Record.size() - I >= 2);
AllocationType AllocType = (AllocationType)Record[I++];
auto StackIdList = parseAllocInfoContext(Record, I);
unsigned NumStackEntries = Record[I++];
assert(Record.size() - I >= NumStackEntries);
SmallVector<unsigned> StackIdList;
for (unsigned J = 0; J < NumStackEntries; J++) {
assert(Record[I] < StackIds.size());
StackIdList.push_back(
TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
}
MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
}
// We either have nothing left or at least NumMIBs context size info
@@ -8169,7 +8123,14 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
while (MIBsRead++ < NumMIBs) {
assert(Record.size() - I >= 2);
AllocationType AllocType = (AllocationType)Record[I++];
auto StackIdList = parseAllocInfoContext(Record, I);
unsigned NumStackEntries = Record[I++];
assert(Record.size() - I >= NumStackEntries);
SmallVector<unsigned> StackIdList;
for (unsigned J = 0; J < NumStackEntries; J++) {
assert(Record[I] < StackIds.size());
StackIdList.push_back(
TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
}
MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
}
assert(Record.size() - I >= NumVersions);

View File

@@ -60,7 +60,6 @@
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/IRSymtab.h"
#include "llvm/ProfileData/MemProf.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -84,7 +83,6 @@
#include <vector>
using namespace llvm;
using namespace llvm::memprof;
static cl::opt<unsigned>
IndexThreshold("bitcode-mdindex-threshold", cl::Hidden, cl::init(25),
@@ -233,8 +231,7 @@ private:
SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
unsigned CallsiteAbbrev, unsigned AllocAbbrev, unsigned ContextIdAbbvId,
const Function &F, DenseMap<CallStackId, LinearCallStackId> &CallStackPos,
CallStackId &CallStackCount);
const Function &F);
void writeModuleLevelReferences(const GlobalVariable &V,
SmallVector<uint64_t, 64> &NameVals,
unsigned FSModRefsAbbrev,
@@ -4198,58 +4195,12 @@ static void writeTypeIdCompatibleVtableSummaryRecord(
}
}
// Adds the allocation contexts of FS to the CallStacks map. We simply use the
// size at the time the context was added as the CallStackId. This works because
// when we look up the call stacks later on we process the function summaries
// and their allocation records in the same exact order.
//
// GetStackIndex is applied to every stack id index of every MIB before the
// context is recorded, so the stored frames are whatever indices it returns.
static void collectMemProfCallStacks(
    FunctionSummary *FS, std::function<LinearFrameId(unsigned)> GetStackIndex,
    MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> &CallStacks) {
  // The interfaces in ProfileData/MemProf.h use a type alias for a stack frame
  // id offset into the index of the full stack frames. The ModuleSummaryIndex
  // currently uses unsigned. Make sure these stay in sync.
  static_assert(std::is_same_v<LinearFrameId, unsigned>);
  for (auto &AI : FS->allocs()) {
    for (auto &MIB : AI.MIBs) {
      SmallVector<unsigned> StackIdIndices;
      StackIdIndices.reserve(MIB.StackIdIndices.size());
      for (auto Id : MIB.StackIdIndices)
        StackIdIndices.push_back(GetStackIndex(Id));
      // The CallStackId is the size at the time this context was inserted.
      // The local vector is dead after this point, so hand it over to the map
      // instead of copying it.
      CallStacks.insert({CallStacks.size(), std::move(StackIdIndices)});
    }
  }
}
// Turns the accumulated CallStacks into a radix tree, emits the linearized
// tree array as an FS_CONTEXT_RADIX_TREE_ARRAY record using RadixAbbrev, and
// hands back the map from CallStackId to position within that array. The
// allocation records written afterwards use this map; its CallStackId keys are
// implicitly determined by the order in which function summaries and their
// allocation infos are written. CallStacks must be non-empty.
static DenseMap<CallStackId, LinearCallStackId> writeMemoryProfileRadixTree(
    MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> &&CallStacks,
    BitstreamWriter &Stream, unsigned RadixAbbrev) {
  assert(!CallStacks.empty());

  // The histogram has to be computed while CallStacks is still intact, i.e.
  // before it is moved into the builder below.
  DenseMap<unsigned, FrameStat> Histogram =
      computeFrameHistogram<LinearFrameId>(CallStacks);

  // No MemProfFrameIndexes map is required: the full stack id hashes have
  // already been converted to linear offsets into the StackIds array.
  CallStackRadixTreeBuilder<LinearFrameId> TreeBuilder;
  TreeBuilder.build(std::move(CallStacks),
                    /*MemProfFrameIndexes=*/std::nullopt, Histogram);

  const auto &LinearizedTree = TreeBuilder.getRadixArray();
  Stream.EmitRecord(bitc::FS_CONTEXT_RADIX_TREE_ARRAY, LinearizedTree,
                    RadixAbbrev);

  return TreeBuilder.takeCallStackPos();
}
static void writeFunctionHeapProfileRecords(
BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
unsigned AllocAbbrev, unsigned ContextIdAbbvId, bool PerModule,
std::function<unsigned(const ValueInfo &VI)> GetValueID,
std::function<unsigned(unsigned)> GetStackIndex,
bool WriteContextSizeInfoIndex,
DenseMap<CallStackId, LinearCallStackId> &CallStackPos,
CallStackId &CallStackCount) {
bool WriteContextSizeInfoIndex) {
SmallVector<uint64_t> Record;
for (auto &CI : FS->callsites()) {
@@ -4283,9 +4234,9 @@ static void writeFunctionHeapProfileRecords(
Record.push_back(AI.Versions.size());
for (auto &MIB : AI.MIBs) {
Record.push_back((uint8_t)MIB.AllocType);
// Record the index into the radix tree array for this context.
assert(CallStackCount <= CallStackPos.size());
Record.push_back(CallStackPos[CallStackCount++]);
Record.push_back(MIB.StackIdIndices.size());
for (auto Id : MIB.StackIdIndices)
Record.push_back(GetStackIndex(Id));
}
if (!PerModule) {
for (auto V : AI.Versions)
@@ -4331,9 +4282,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
unsigned ValueID, unsigned FSCallsRelBFAbbrev,
unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev,
unsigned AllocAbbrev, unsigned ContextIdAbbvId, const Function &F,
DenseMap<CallStackId, LinearCallStackId> &CallStackPos,
CallStackId &CallStackCount) {
unsigned AllocAbbrev, unsigned ContextIdAbbvId, const Function &F) {
NameVals.push_back(ValueID);
FunctionSummary *FS = cast<FunctionSummary>(Summary);
@@ -4348,7 +4297,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
/*PerModule*/ true,
/*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
/*GetStackIndex*/ [&](unsigned I) { return I; },
/*WriteContextSizeInfoIndex*/ true, CallStackPos, CallStackCount);
/*WriteContextSizeInfoIndex*/ true);
auto SpecialRefCnts = FS->specialRefCounts();
NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
@@ -4581,54 +4530,12 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
// n x (alloc type, context radix tree index)
// n x (alloc type, numstackids, numstackids x stackidindex)
// optional: nummib x (numcontext x total size)
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_RADIX_TREE_ARRAY));
// n x entry
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned RadixAbbrev = Stream.EmitAbbrev(std::move(Abbv));
// First walk through all the functions and collect the allocation contexts in
// their associated summaries, for use in constructing a radix tree of
// contexts. Note that we need to do this in the same order as the functions
// are processed further below since the call stack positions in the resulting
// radix tree array are identified based on this order.
MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> CallStacks;
for (const Function &F : M) {
// Summary emission does not support anonymous functions, they have to be
// renamed using the anonymous function renaming pass.
if (!F.hasName())
report_fatal_error("Unexpected anonymous function when writing summary");
ValueInfo VI = Index->getValueInfo(F.getGUID());
if (!VI || VI.getSummaryList().empty()) {
// Only declarations should not have a summary (a declaration might
// however have a summary if the def was in module level asm).
assert(F.isDeclaration());
continue;
}
auto *Summary = VI.getSummaryList()[0].get();
FunctionSummary *FS = cast<FunctionSummary>(Summary);
collectMemProfCallStacks(
FS, /*GetStackIndex*/ [](unsigned I) { return I; }, CallStacks);
}
// Finalize the radix tree, write it out, and get the map of positions in the
// linearized tree array.
DenseMap<CallStackId, LinearCallStackId> CallStackPos;
if (!CallStacks.empty()) {
CallStackPos =
writeMemoryProfileRadixTree(std::move(CallStacks), Stream, RadixAbbrev);
}
// Keep track of the current index into the CallStackPos map.
CallStackId CallStackCount = 0;
SmallVector<uint64_t, 64> NameVals;
// Iterate over the list of functions instead of the Index to
// ensure the ordering is stable.
@@ -4648,8 +4555,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
auto *Summary = VI.getSummaryList()[0].get();
writePerModuleFunctionSummaryRecord(
NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev,
FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId, F,
CallStackPos, CallStackCount);
FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId, F);
}
// Capture references from GlobalVariable initializers, which are outside
@@ -4786,20 +4692,13 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
// nummib x (alloc type, context radix tree index),
// nummib x (alloc type, numstackids, numstackids x stackidindex),
// numver x version
// optional: nummib x total size
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_RADIX_TREE_ARRAY));
// n x entry
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned RadixAbbrev = Stream.EmitAbbrev(std::move(Abbv));
auto shouldImportValueAsDecl = [&](GlobalValueSummary *GVS) -> bool {
if (DecSummaries == nullptr)
return false;
@@ -4836,41 +4735,6 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.clear();
};
// First walk through all the functions and collect the allocation contexts in
// their associated summaries, for use in constructing a radix tree of
// contexts. Note that we need to do this in the same order as the functions
// are processed further below since the call stack positions in the resulting
// radix tree array are identified based on this order.
MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> CallStacks;
forEachSummary([&](GVInfo I, bool IsAliasee) {
GlobalValueSummary *S = I.second;
assert(S);
auto *FS = dyn_cast<FunctionSummary>(S);
if (!FS)
return;
collectMemProfCallStacks(
FS,
/*GetStackIndex*/
[&](unsigned I) {
// Get the corresponding index into the list of StackIds actually
// being written for this combined index (which may be a subset in
// the case of distributed indexes).
assert(StackIdIndicesToIndex.contains(I));
return StackIdIndicesToIndex[I];
},
CallStacks);
});
// Finalize the radix tree, write it out, and get the map of positions in the
// linearized tree array.
DenseMap<CallStackId, LinearCallStackId> CallStackPos;
if (!CallStacks.empty()) {
CallStackPos =
writeMemoryProfileRadixTree(std::move(CallStacks), Stream, RadixAbbrev);
}
// Keep track of the current index into the CallStackPos map.
CallStackId CallStackCount = 0;
DenseSet<GlobalValue::GUID> DefOrUseGUIDs;
forEachSummary([&](GVInfo I, bool IsAliasee) {
GlobalValueSummary *S = I.second;
@@ -4949,7 +4813,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
assert(StackIdIndicesToIndex.contains(I));
return StackIdIndicesToIndex[I];
},
/*WriteContextSizeInfoIndex*/ false, CallStackPos, CallStackCount);
/*WriteContextSizeInfoIndex*/ false);
NameVals.push_back(*ValueId);
assert(ModuleIdMap.count(FS->modulePath()));

View File

@@ -510,7 +510,6 @@ void CallStackRadixTreeBuilder<FrameIdTy>::build(
// Explicitly instantiate class with the utilized FrameIdTy.
template class CallStackRadixTreeBuilder<FrameId>;
template class CallStackRadixTreeBuilder<LinearFrameId>;
template <typename FrameIdTy>
llvm::DenseMap<FrameIdTy, FrameStat>
@@ -533,10 +532,6 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>(
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
&MemProfCallStackData);
template llvm::DenseMap<LinearFrameId, FrameStat>
computeFrameHistogram<LinearFrameId>(
llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
&MemProfCallStackData);
void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
for (const auto &AS : Record.AllocSites) {

View File

@@ -1,28 +0,0 @@
;; Check that we can read the old *_ALLOC_INFO summary format that placed the
;; stack id indexes directly in the alloc info summary, rather than encoding as
;; a separate radix tree.
;;
;; The old bitcode was generated by the older compiler from `opt -thinlto-bc`
;; on the following LLVM assembly:
;;
;; target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
;; target triple = "x86_64-unknown-linux-gnu"
;;
;; define internal ptr @_Z3barv() #0 {
;; entry:
;; %call = call ptr @_Znam(i64 0), !memprof !1, !callsite !6
;; ret ptr null
;; }
;;
;; declare ptr @_Znam(i64)
;;
;; !1 = !{!2, !4}
;; !2 = !{!3, !"notcold"}
;; !3 = !{i64 9086428284934609951, i64 8632435727821051414}
;; !4 = !{!5, !"cold"}
;; !5 = !{i64 9086428284934609951, i64 2732490490862098848}
;; !6 = !{i64 9086428284934609951}
; RUN: llvm-dis %S/Inputs/memprof-old-alloc-context-summary.bc -o - | FileCheck %s
; CHECK: stackIds: (8632435727821051414)
; CHECK-SAME: stackIds: (2732490490862098848)