[BOLT][AArch64] Add partial support for lite mode (#133014)

In lite mode, we only emit code for a subset of functions while
preserving the original code in .bolt.org.text. This requires updating
code references in non-emitted functions to ensure that:

* Non-optimized versions of the optimized code never execute.
* Function pointer comparison semantics are preserved.

On x86-64, we can update code references in-place using "pending
relocations" added in scanExternalRefs(). However, on AArch64, this is
not always possible due to address range limitations and linker address
"relaxation".

There are two types of code-to-code references: control transfer (e.g.,
calls and branches) and function pointer materialization.
AArch64-specific control transfer instructions are covered by #116964.

For function pointer materialization, simply changing the immediate
field of an instruction is not always sufficient. In some cases, we need
to modify a pair of instructions, such as undoing linker relaxation and
converting a NOP+ADR pair into an ADRP+ADD sequence, as sketched below.
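As a minimal sketch (hypothetical symbol name), undoing the relaxation
turns the linker's short-range sequence back into a long-range one:

  // Before: linker-relaxed pair; ADR spans only +/-1 MiB.
  nop
  adr x1, func
  // After BOLT undoes the relaxation; ADRP+ADD spans +/-4 GiB.
  adrp x1, func
  add x1, x1, :lo12:func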

To achieve this, we use the instruction patch mechanism instead of
pending relocations. Instruction patches are emitted via the regular MC
layer, just like regular functions. However, they have a fixed address
and do not have an associated symbol table entry. This allows us to make
more complex changes to the code, ensuring that function pointers are
correctly updated. Such a mechanism should also be portable to RISC-V and
other architectures.
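Sketched with a hypothetical layout: the patch is emitted at the fixed
address of the original NOP+ADR pair inside .bolt.org.text, while the
target function now lives in the new text section:

  adrp x1, _start            // overwrites the original nop
  add x1, x1, :lo12:_start   // overwrites the original adr x1, _start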

To summarize, for AArch64, we extend the scanExternalRefs() process to
undo linker relaxation and use instruction patches to partially
overwrite unoptimized code.
Author: Maksim Panchenko
Date: 2025-03-27 21:33:25 -07:00
Committed by: GitHub
Commit: 96e5ee23a7 (parent 0ed4bdfe70)
9 changed files with 282 additions and 23 deletions

bolt/include/bolt/Core/BinaryContext.h

@@ -544,9 +544,10 @@ public:
   ///
   /// Optional \p Name can be assigned to the patch. The name will be emitted to
   /// the symbol table at \p Address.
-  BinaryFunction *createInstructionPatch(uint64_t Address,
-                                         InstructionListType &Instructions,
-                                         const Twine &Name = "");
+  BinaryFunction *
+  createInstructionPatch(uint64_t Address,
+                         const InstructionListType &Instructions,
+                         const Twine &Name = "");
 
   std::vector<BinaryFunction *> &getInjectedBinaryFunctions() {
     return InjectedBinaryFunctions;

bolt/include/bolt/Core/BinaryFunction.h

@@ -357,6 +357,12 @@ private:
   /// True if another function body was merged into this one.
   bool HasFunctionsFoldedInto{false};
 
+  /// True if the function is used for patching code at a fixed address.
+  bool IsPatch{false};
+
+  /// True if the function should not have an associated symbol table entry.
+  bool IsAnonymous{false};
+
   /// Name for the section this function code should reside in.
   std::string CodeSectionName;
@@ -1358,6 +1364,12 @@ public:
   /// Return true if other functions were folded into this one.
   bool hasFunctionsFoldedInto() const { return HasFunctionsFoldedInto; }
 
+  /// Return true if this function is used for patching existing code.
+  bool isPatch() const { return IsPatch; }
+
+  /// Return true if the function should not have associated symbol table entry.
+  bool isAnonymous() const { return IsAnonymous; }
+
   /// If this function was folded, return the function it was folded into.
   BinaryFunction *getFoldedIntoFunction() const { return FoldedIntoFunction; }
@@ -1734,6 +1746,18 @@ public:
   /// Indicate that another function body was merged with this function.
   void setHasFunctionsFoldedInto() { HasFunctionsFoldedInto = true; }
 
+  /// Indicate that this function is a patch.
+  void setIsPatch(bool V) {
+    assert(isInjected() && "Only injected functions can be used as patches");
+    IsPatch = V;
+  }
+
+  /// Indicate if the function should have a name in the symbol table.
+  void setAnonymous(bool V) {
+    assert(isInjected() && "Only injected functions could be anonymous");
+    IsAnonymous = V;
+  }
+
   void setHasSDTMarker(bool V) { HasSDTMarker = V; }
 
   /// Mark the function as using ORC format for stack unwinding.

bolt/include/bolt/Core/MCPlusBuilder.h

@@ -1264,9 +1264,12 @@ public:
     return nullptr;
   }
 
-  /// Return MCSymbol extracted from a target expression
+  /// Return MCSymbol extracted from the expression.
   virtual const MCSymbol *getTargetSymbol(const MCExpr *Expr) const {
-    return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
+    if (auto *SymbolRefExpr = dyn_cast<const MCSymbolRefExpr>(Expr))
+      return &SymbolRefExpr->getSymbol();
+    return nullptr;
   }
 
   /// Return addend that represents an offset from MCSymbol target

bolt/lib/Core/BinaryContext.cpp

@@ -2401,8 +2401,10 @@ BinaryContext::createInjectedBinaryFunction(const std::string &Name,
   return BF;
 }
 
-BinaryFunction *BinaryContext::createInstructionPatch(
-    uint64_t Address, InstructionListType &Instructions, const Twine &Name) {
+BinaryFunction *
+BinaryContext::createInstructionPatch(uint64_t Address,
+                                      const InstructionListType &Instructions,
+                                      const Twine &Name) {
   ErrorOr<BinarySection &> Section = getSectionForAddress(Address);
   assert(Section && "cannot get section for patching");
   assert(Section->hasSectionRef() && Section->isText() &&
@@ -2423,6 +2425,11 @@ BinaryFunction *BinaryContext::createInstructionPatch(
   PBF->setFileOffset(FileOffset);
   PBF->setOriginSection(&Section.get());
   PBF->addBasicBlock()->addInstructions(Instructions);
+  PBF->setIsPatch(true);
+
+  // Don't create symbol table entry if the name wasn't specified.
+  if (Name.str().empty())
+    PBF->setAnonymous(true);
 
   return PBF;
 }

bolt/lib/Core/BinaryFunction.cpp

@@ -1583,13 +1583,18 @@ bool BinaryFunction::scanExternalRefs() {
   assert(FunctionData.size() == getMaxSize() &&
          "function size does not match raw data size");
 
-  if (BC.isX86())
-    BC.SymbolicDisAsm->setSymbolizer(
-        BC.MIB->createTargetSymbolizer(*this, /*CreateSymbols*/ false));
+  BC.SymbolicDisAsm->setSymbolizer(
+      BC.MIB->createTargetSymbolizer(*this, /*CreateSymbols*/ false));
+
+  // A list of patches for this function.
+  using PatchTy = std::pair<uint64_t, MCInst>;
+  std::vector<PatchTy> InstructionPatches;
 
   // Disassemble contents of the function. Detect code entry points and create
   // relocations for references to code that will be moved.
   uint64_t Size = 0; // instruction size
+  MCInst Instruction;
+  MCInst PrevInstruction;
   for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) {
     // Check for data inside code and ignore it
     if (const size_t DataInCodeSize = getSizeOfDataInCodeAt(Offset)) {
@@ -1598,7 +1603,7 @@ bool BinaryFunction::scanExternalRefs() {
     }
 
     const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
-    MCInst Instruction;
+    PrevInstruction = Instruction;
     if (!BC.SymbolicDisAsm->getInstruction(Instruction, Size,
                                            FunctionData.slice(Offset),
                                            AbsoluteInstrAddr, nulls())) {
@@ -1673,12 +1678,108 @@ bool BinaryFunction::scanExternalRefs() {
     if (BranchTargetSymbol) {
       BC.MIB->replaceBranchTarget(Instruction, BranchTargetSymbol,
                                   Emitter.LocalCtx.get());
-    } else if (!llvm::any_of(Instruction,
-                             [](const MCOperand &Op) { return Op.isExpr(); })) {
-      // Skip assembly if the instruction may not have any symbolic operands.
-      continue;
+    } else {
+      analyzeInstructionForFuncReference(Instruction);
+      const bool NeedsPatch = llvm::any_of(
+          MCPlus::primeOperands(Instruction), [&](const MCOperand &Op) {
+            return Op.isExpr() &&
+                   !ignoreReference(BC.MIB->getTargetSymbol(Op.getExpr()));
+          });
+      if (!NeedsPatch)
+        continue;
     }
 
+    // For AArch64, we need to undo relaxation done by the linker if the target
+    // of the instruction is a function that we plan to move.
+    //
+    // Linker relaxation is documented at:
+    // https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst
+    // under #relocation-optimization.
+    if (const Relocation *Rel;
+        BC.isAArch64() && (Rel = getRelocationAt(Offset))) {
+      // NOP+ADR sequence can originate from either ADRP+ADD or ADRP+LDR.
+      // In either case, we convert it into ADRP+ADD.
+      if (BC.MIB->isADR(Instruction) &&
+          (Rel->Type == ELF::R_AARCH64_ADD_ABS_LO12_NC ||
+           Rel->Type == ELF::R_AARCH64_LD64_GOT_LO12_NC)) {
+        if (!BC.MIB->isNoop(PrevInstruction)) {
+          // In case of unexpected conversion from the linker, skip target
+          // optimization.
+          const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Instruction);
+          BC.errs() << "BOLT-WARNING: cannot undo linker relaxation for "
+                       "instruction at 0x"
+                    << Twine::utohexstr(AbsoluteInstrAddr) << " referencing "
+                    << Symbol->getName() << '\n';
+          if (BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol))
+            TargetBF->setIgnored();
+          continue;
+        }
+
+        InstructionListType AdrpAdd =
+            BC.MIB->undoAdrpAddRelaxation(Instruction, BC.Ctx.get());
+        assert(AdrpAdd.size() == 2 && "Two instructions expected");
+        LLVM_DEBUG({
+          dbgs() << "BOLT-DEBUG: linker relaxation undone for instruction "
+                    "at 0x"
+                 << Twine::utohexstr(AbsoluteInstrAddr) << '\n';
+        });
+        InstructionPatches.push_back({AbsoluteInstrAddr - 4, AdrpAdd[0]});
+        InstructionPatches.push_back({AbsoluteInstrAddr, AdrpAdd[1]});
+        continue;
+      }
+
+      // If ADR was emitted by the compiler/assembler to reference a nearby
+      // local function, we cannot move away that function due to ADR address
+      // span limitation. Hence, we skip the optimization.
+      if (BC.MIB->isADR(Instruction) &&
+          Rel->Type == ELF::R_AARCH64_ADR_PREL_LO21) {
+        BC.errs() << "BOLT-WARNING: unable to convert ADR that references "
+                  << Rel->Symbol->getName()
+                  << ". Will not optimize the target\n";
+        if (BinaryFunction *TargetBF = BC.getFunctionForSymbol(Rel->Symbol))
+          TargetBF->setIgnored();
+        continue;
+      }
+
+      // In the case of GOT load, ADRP+LDR can also be converted into ADRP+ADD.
+      // When this happens, it's not always possible to properly symbolize ADRP
+      // operand and we might have to adjust the operand based on the next
+      // instruction.
+      if (BC.MIB->isAddXri(Instruction) &&
+          Rel->Type == ELF::R_AARCH64_LD64_GOT_LO12_NC) {
+        if (!BC.MIB->matchAdrpAddPair(PrevInstruction, Instruction)) {
+          BC.errs() << "BOLT-ERROR: cannot find matching ADRP for relaxed LDR "
+                       "instruction at 0x"
+                    << Twine::utohexstr(AbsoluteInstrAddr) << '\n';
+          exit(1);
+        }
+
+        // Check if ADRP was already patched. If not, add a new patch for it.
+        if (InstructionPatches.empty() ||
+            InstructionPatches.back().first != AbsoluteInstrAddr - 4)
+          InstructionPatches.push_back(
+              {AbsoluteInstrAddr - 4, PrevInstruction});
+
+        // Adjust the operand for ADRP from the patch.
+        MCInst &ADRPInst = InstructionPatches.back().second;
+        const MCSymbol *ADRPSymbol = BC.MIB->getTargetSymbol(ADRPInst);
+        const MCSymbol *ADDSymbol = BC.MIB->getTargetSymbol(Instruction);
+        if (ADRPSymbol != ADDSymbol) {
+          const int64_t Addend = BC.MIB->getTargetAddend(Instruction);
+          BC.MIB->setOperandToSymbolRef(ADRPInst, /*OpNum*/ 1, ADDSymbol,
+                                        Addend, BC.Ctx.get(),
+                                        ELF::R_AARCH64_NONE);
+        }
+      }
+    }
+
+    // On AArch64, we use instruction patches for fixing references. We make an
+    // exception for branch instructions since they require optional
+    // relocations.
+    if (BC.isAArch64() && !BranchTargetSymbol) {
+      LLVM_DEBUG(BC.printInstruction(dbgs(), Instruction, AbsoluteInstrAddr));
+      InstructionPatches.push_back({AbsoluteInstrAddr, Instruction});
+      continue;
+    }
+
     // Emit the instruction using temp emitter and generate relocations.
@@ -1720,6 +1821,23 @@ bool BinaryFunction::scanExternalRefs() {
   for (Relocation &Rel : FunctionRelocations)
     getOriginSection()->addPendingRelocation(Rel);
 
+  // Add patches grouping them together.
+  if (!InstructionPatches.empty()) {
+    uint64_t PatchGroupAddress;
+    InstructionListType PatchGroup;
+    for (auto PI = InstructionPatches.begin(), PE = InstructionPatches.end();
+         PI != PE; ++PI) {
+      auto &Patch = *PI;
+      if (PatchGroup.empty())
+        PatchGroupAddress = Patch.first;
+      PatchGroup.push_back(Patch.second);
+      if (std::next(PI) == PE || std::next(PI)->first != Patch.first + 4) {
+        BC.createInstructionPatch(PatchGroupAddress, PatchGroup);
+        PatchGroup.clear();
+      }
+    }
+  }
+
   // Inform BinaryContext that this function symbols will not be defined and
   // relocations should not be created against them.
   if (BC.HasRelocations) {

bolt/lib/Passes/BinaryPasses.cpp

@@ -1269,8 +1269,10 @@ Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
 Error AssignSections::runOnFunctions(BinaryContext &BC) {
   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
-    Function->setCodeSectionName(BC.getInjectedCodeSectionName());
-    Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
+    if (!Function->isPatch()) {
+      Function->setCodeSectionName(BC.getInjectedCodeSectionName());
+      Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
+    }
   }
 
   // In non-relocation mode functions have pre-assigned section names.

bolt/lib/Rewrite/RewriteInstance.cpp

@@ -5078,6 +5078,8 @@ void RewriteInstance::updateELFSymbolTable(
 
   // Add symbols of injected functions
   for (BinaryFunction *Function : BC->getInjectedBinaryFunctions()) {
+    if (Function->isAnonymous())
+      continue;
+
     ELFSymTy NewSymbol;
     BinarySection *OriginSection = Function->getOriginSection();
     NewSymbol.st_shndx =

bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp

@@ -1803,12 +1803,6 @@ public:
     return &SymExpr->getSymbol();
   }
 
-  // This is the same as the base class, but since we are overriding one of
-  // getTargetSymbol's signatures above, we need to override all of them.
-  const MCSymbol *getTargetSymbol(const MCExpr *Expr) const override {
-    return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
-  }
-
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
                      const MCSymbol *&TBB, const MCSymbol *&FBB,
                      MCInst *&CondBranch,

bolt/test/AArch64/lite-mode.s (new file)

@@ -0,0 +1,108 @@
## Check that in lite mode llvm-bolt updates function references in
## non-optimized code.
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
# RUN: llvm-bolt %t.exe -o %t.bolt --data %t.fdata --lite
# RUN: llvm-objdump -d --disassemble-symbols=cold_function %t.exe \
# RUN: | FileCheck %s --check-prefix=CHECK-INPUT
# RUN: llvm-objdump -d --disassemble-symbols=cold_function %t.bolt \
# RUN: | FileCheck %s
## In lite mode, optimized code will be separated from the original .text by
## over 128MB, making it impossible for call/bl instructions in cold functions
## to reach optimized functions directly.
.text
.globl _start
.type _start, %function
_start:
# FDATA: 0 [unknown] 0 1 _start 0 0 100
.cfi_startproc
cmp x0, 1
b.eq .L0
bl cold_function
.L0:
ret x30
.cfi_endproc
.size _start, .-_start
## Cold non-optimized function with a reference to a hot function (_start).
# CHECK: Disassembly of section .bolt.org.text:
# CHECK-LABEL: <cold_function>
.globl cold_function
.type cold_function, %function
cold_function:
.cfi_startproc
## Absolute 64-bit function pointer reference.
## We check for the lower 16 bits of _start to be zeros after update.
movz x0, :abs_g3:_start
movk x0, :abs_g2_nc:_start
movk x0, :abs_g1_nc:_start
# CHECK-INPUT-NOT: movk x0, #0x0{{$}}
# CHECK: movk x0, #0x0{{$}}
movk x0, :abs_g0_nc:_start
## Relaxable address reference.
# CHECK-INPUT: nop
# CHECK-INPUT-NEXT: adr x1
# CHECK-NEXT: adrp x1, [[ADDR:0x[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: add x1
adrp x1, _start
add x1, x1, :lo12:_start
## Non-relaxable address reference.
# CHECK-INPUT-NEXT: adrp x2
# CHECK-INPUT-NEXT: add x2
# CHECK-NEXT: adrp x2, [[ADDR]]
# CHECK-NEXT: add x2
adrp x2, far_func
add x2, x2, :lo12:far_func
## Check that fully-relaxed GOT reference is converted into ADRP+ADD.
adrp x3, :got:_start
ldr x3, [x3, #:got_lo12:_start]
# CHECK-INPUT-NEXT: nop
# CHECK-INPUT-NEXT: adr x3
# CHECK-NEXT: adrp x3, [[ADDR]]
# CHECK-NEXT: add x3
## Check that partially-relaxed GOT reference is converted into ADRP+ADD.
adrp x4, :got:far_func
ldr x4, [x4, #:got_lo12:far_func]
# CHECK-INPUT-NEXT: adrp x4
# CHECK-INPUT-NEXT: add x4
# CHECK-NEXT: adrp x4, [[ADDR]]
# CHECK-NEXT: add x4
## Check that non-relaxable GOT load is left intact.
adrp x5, :got:far_func
nop
ldr x5, [x5, #:got_lo12:far_func]
# CHECK-INPUT-NEXT: adrp x5
# CHECK-INPUT-NEXT: nop
# CHECK-INPUT-NEXT: ldr x5
# CHECK-NEXT: adrp x5
# CHECK-NOT: [[ADDR]]
# CHECK-NEXT: nop
# CHECK-NEXT: ldr x5
.cfi_endproc
.size cold_function, .-cold_function
## Reserve 1MB of space to make functions that follow unreachable by ADRs in
## code that precedes this gap.
.space 0x100000
.globl far_func
.type far_func, %function
far_func:
# FDATA: 0 [unknown] 0 1 far_func 0 0 100
.cfi_startproc
ret x30
.cfi_endproc
.size far_func, .-far_func