[BOLT] Use symbol table info in registerFragment (#89648)

Fragment matching relies on symbol names to identify and register split
function fragments. However, as split fragments are often local symbols,
name aliasing is possible. For such cases, use symbol table to resolve
ambiguities.

This requires the presence of FILE symbols in the input binary. As BOLT
requires non-stripped binary, this is a reasonable assumption. Note that
`strip -g` removes FILE symbols by default, but `--keep-file-symbols`
can be used to preserve them.

Depends on: https://github.com/llvm/llvm-project/pull/89861

Test Plan:
Updated X86/fragment-lite.s
This commit is contained in:
Amir Ayupov
2024-04-29 20:14:31 +02:00
committed by GitHub
parent 7e2eeb5753
commit a1e9608b0f
4 changed files with 157 additions and 39 deletions

View File

@@ -494,6 +494,9 @@ private:
/// Store all non-zero symbols in this map for a quick address lookup.
std::map<uint64_t, llvm::object::SymbolRef> FileSymRefs;
/// FILE symbols used for disambiguating split function parents.
std::vector<ELFSymbolRef> FileSymbols;
std::unique_ptr<DWARFRewriter> DebugInfoRewriter;
std::unique_ptr<BoltAddressTranslation> BAT;

View File

@@ -28,10 +28,23 @@ class NameResolver {
static constexpr char Sep = '/';
public:
/// Return unique version of the \p Name in the form "Name<Sep><Number>".
/// Return the number of uniquified versions of a given \p Name.
uint64_t getUniquifiedNameCount(StringRef Name) const {
if (Counters.contains(Name))
return Counters.at(Name);
return 0;
}
/// Return unique version of the \p Name in the form "Name<Sep><ID>".
std::string getUniqueName(StringRef Name, const uint64_t ID) const {
return (Name + Twine(Sep) + Twine(ID)).str();
}
/// Register new version of \p Name and return unique version in the form
/// "Name<Sep><Number>".
std::string uniquify(StringRef Name) {
const uint64_t ID = ++Counters[Name];
return (Name + Twine(Sep) + Twine(ID)).str();
return getUniqueName(Name, ID);
}
/// For uniquified \p Name, return the original form (that may no longer be

View File

@@ -840,6 +840,7 @@ void RewriteInstance::discoverFileObjects() {
continue;
if (cantFail(Symbol.getType()) == SymbolRef::ST_File) {
FileSymbols.emplace_back(Symbol);
StringRef Name =
cantFail(std::move(NameOrError), "cannot get symbol name for file");
// Ignore Clang LTO artificial FILE symbol as it is not always generated,
@@ -1340,6 +1341,7 @@ void RewriteInstance::discoverFileObjects() {
}
registerFragments();
FileSymbols.clear();
}
Error RewriteInstance::discoverRtFiniAddress() {
@@ -1417,50 +1419,116 @@ void RewriteInstance::registerFragments() {
if (!BC->HasSplitFunctions)
return;
// Process fragments with ambiguous parents separately as they are typically a
// vanishing minority of cases and require expensive symbol table lookups.
std::vector<std::pair<StringRef, BinaryFunction *>> AmbiguousFragments;
for (auto &BFI : BC->getBinaryFunctions()) {
BinaryFunction &Function = BFI.second;
if (!Function.isFragment())
continue;
unsigned ParentsFound = 0;
for (StringRef Name : Function.getNames()) {
StringRef BaseName, Suffix;
std::tie(BaseName, Suffix) = Name.split('/');
StringRef BaseName = NR.restore(Name);
const bool IsGlobal = BaseName == Name;
const size_t ColdSuffixPos = BaseName.find(".cold");
if (ColdSuffixPos == StringRef::npos)
continue;
// For cold function with local (foo.cold/1) symbol, prefer a parent with
// local symbol as well (foo/1) over global symbol (foo).
std::string ParentName = BaseName.substr(0, ColdSuffixPos).str();
StringRef ParentName = BaseName.substr(0, ColdSuffixPos);
const BinaryData *BD = BC->getBinaryDataByName(ParentName);
if (Suffix != "") {
ParentName.append(Twine("/", Suffix).str());
const BinaryData *BDLocal = BC->getBinaryDataByName(ParentName);
if (BDLocal || !BD)
BD = BDLocal;
}
if (!BD) {
if (opts::Verbosity >= 1)
BC->outs() << "BOLT-INFO: parent function not found for " << Name
<< "\n";
const uint64_t NumPossibleLocalParents =
NR.getUniquifiedNameCount(ParentName);
// The most common case: single local parent fragment.
if (!BD && NumPossibleLocalParents == 1) {
BD = BC->getBinaryDataByName(NR.getUniqueName(ParentName, 1));
} else if (BD && (!NumPossibleLocalParents || IsGlobal)) {
// Global parent and either no local candidates (second most common), or
// the fragment is global as well (uncommon).
} else {
// Any other case: need to disambiguate using FILE symbols.
AmbiguousFragments.emplace_back(ParentName, &Function);
continue;
}
const uint64_t Address = BD->getAddress();
BinaryFunction *BF = BC->getBinaryFunctionAtAddress(Address);
if (!BF) {
if (opts::Verbosity >= 1)
BC->outs() << formatv(
"BOLT-INFO: parent function not found at {0:x}\n", Address);
continue;
if (BD) {
BinaryFunction *BF = BC->getFunctionForSymbol(BD->getSymbol());
if (BF) {
BC->registerFragment(Function, *BF);
continue;
}
}
BC->registerFragment(Function, *BF);
++ParentsFound;
}
if (!ParentsFound) {
BC->errs() << "BOLT-ERROR: parent function not found for " << Function
<< '\n';
exit(1);
}
}
if (AmbiguousFragments.empty())
return;
if (!BC->hasSymbolsWithFileName()) {
BC->errs() << "BOLT-ERROR: input file has split functions but does not "
"have FILE symbols. If the binary was stripped, preserve "
"FILE symbols with --keep-file-symbols strip option";
exit(1);
}
// The first global symbol is identified by the symbol table sh_info value.
// Used as local symbol search stopping point.
auto *ELF64LEFile = cast<ELF64LEObjectFile>(InputFile);
const ELFFile<ELF64LE> &Obj = ELF64LEFile->getELFFile();
auto *SymTab = llvm::find_if(cantFail(Obj.sections()), [](const auto &Sec) {
return Sec.sh_type == ELF::SHT_SYMTAB;
});
assert(SymTab);
// Symtab sh_info contains the value one greater than the symbol table index
// of the last local symbol.
ELFSymbolRef LocalSymEnd = ELF64LEFile->toSymbolRef(SymTab, SymTab->sh_info);
for (auto &[ParentName, BF] : AmbiguousFragments) {
const uint64_t Address = BF->getAddress();
// Get fragment's own symbol
const auto SymIt = FileSymRefs.find(Address);
if (SymIt == FileSymRefs.end()) {
BC->errs()
<< "BOLT-ERROR: symbol lookup failed for function at address 0x"
<< Twine::utohexstr(Address) << '\n';
exit(1);
}
// Find containing FILE symbol
ELFSymbolRef Symbol = SymIt->second;
auto FSI = llvm::upper_bound(FileSymbols, Symbol);
if (FSI == FileSymbols.begin()) {
BC->errs() << "BOLT-ERROR: owning FILE symbol not found for symbol "
<< cantFail(Symbol.getName()) << '\n';
exit(1);
}
ELFSymbolRef StopSymbol = LocalSymEnd;
if (FSI != FileSymbols.end())
StopSymbol = *FSI;
uint64_t ParentAddress{0};
// Iterate over local file symbols and check symbol names to match parent.
for (ELFSymbolRef Symbol(FSI[-1]); Symbol < StopSymbol; Symbol.moveNext()) {
if (cantFail(Symbol.getName()) == ParentName) {
ParentAddress = cantFail(Symbol.getAddress());
break;
}
}
// No local parent is found, use global parent function.
if (!ParentAddress)
if (BinaryData *ParentBD = BC->getBinaryDataByName(ParentName))
ParentAddress = ParentBD->getAddress();
if (BinaryFunction *ParentBF =
BC->getBinaryFunctionAtAddress(ParentAddress)) {
BC->registerFragment(*BF, *ParentBF);
continue;
}
BC->errs() << "BOLT-ERROR: parent function not found for " << *BF << '\n';
exit(1);
}
}
void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress,

View File

@@ -3,35 +3,42 @@
# RUN: split-file %s %t
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz.s -o %t.baz.o
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz2.s -o %t.baz2.o
# RUN: link_fdata %s %t.o %t.main.fdata
# RUN: link_fdata %s %t.baz.o %t.baz.fdata
# RUN: merge-fdata %t.main.fdata %t.baz.fdata > %t.fdata
# RUN: %clang %cflags %t.o %t.baz.o -o %t.exe -Wl,-q
# RUN: link_fdata %s %t.baz2.o %t.baz2.fdata
# RUN: merge-fdata %t.main.fdata %t.baz.fdata %t.baz2.fdata > %t.fdata
# RUN: %clang %cflags %t.o %t.baz.o %t.baz2.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe -o %t.out --lite=1 --data %t.fdata -v=1 -print-cfg \
# RUN: 2>&1 | FileCheck %s
# CHECK: BOLT-INFO: processing main.cold.1 as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing foo.cold.1/1 as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing bar.cold.1/1 as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing foo.cold.1/1(*2) as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing bar.cold.1/1(*2) as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing baz.cold.1 as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing baz.cold.1/1 as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing baz.cold.1/1(*2) as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing baz.cold.1/2(*2) as a sibling of non-ignored function
# CHECK: Binary Function "main.cold.1" after building cfg
# CHECK: Parent : main
# CHECK: Binary Function "foo.cold.1/1" after building cfg
# CHECK: Binary Function "foo.cold.1/1(*2)" after building cfg
# CHECK: Parent : foo
# CHECK: Binary Function "bar.cold.1/1" after building cfg
# CHECK: Parent : bar/1
# CHECK: Binary Function "bar.cold.1/1(*2)" after building cfg
# CHECK: Parent : bar/1(*2)
# CHECK: Binary Function "baz.cold.1" after building cfg
# CHECK: Parent : baz{{$}}
# CHECK: Binary Function "baz.cold.1/1" after building cfg
# CHECK: Parent : baz/1
# CHECK: Binary Function "baz.cold.1/1(*2)" after building cfg
# CHECK: Parent : baz/1(*2)
# CHECK: Binary Function "baz.cold.1/2(*2)" after building cfg
# CHECK: Parent : baz/2(*2)
#--- main.s
.file "main.s"
.globl main
.type main, %function
main:
@@ -126,6 +133,7 @@ baz.cold.1:
.size baz.cold.1, .-baz.cold.1
#--- baz.s
.file "baz.s"
.local baz
.type baz, %function
baz:
@@ -149,3 +157,29 @@ baz.cold.1:
retq
.cfi_endproc
.size baz.cold.1, .-baz.cold.1
#--- baz2.s
.file "baz2.s"
.local baz
.type baz, %function
baz:
.cfi_startproc
# FDATA: 0 [unknown] 0 1 baz/2 0 1 0
cmpl $0x0, %eax
je baz.cold.1
retq
.cfi_endproc
.size baz, .-baz
.section .text.cold
.local baz.cold.1
.type baz.cold.1, %function
baz.cold.1:
.cfi_startproc
pushq %rbp
movq %rsp, %rbp
movl $0x0, %eax
popq %rbp
retq
.cfi_endproc
.size baz.cold.1, .-baz.cold.1