[BOLT][AArch64] Add partial support for lite mode (#133014)

In lite mode, we only emit code for a subset of functions while
preserving the original code in .bolt.org.text. This requires updating
code references in non-emitted functions to ensure that:

* Non-optimized versions of the optimized code never execute.
* Function pointer comparison semantics are preserved.

On x86-64, we can update code references in-place using "pending
relocations" added in scanExternalRefs(). However, on AArch64, this is
not always possible due to address range limitations and linker address
"relaxation".

There are two types of code-to-code references: control transfer (e.g.,
calls and branches) and function pointer materialization.
AArch64-specific control transfer instructions are covered by #116964.

For function pointer materialization, simply changing the immediate
field of an instruction is not always sufficient. In some cases, we need
to modify a pair of instructions, such as undoing linker relaxation and
converting a NOP+ADR pair into an ADRP+ADD sequence, as sketched below.
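As a minimal sketch (hypothetical symbol name), undoing the relaxation
turns the linker's short-range sequence back into a long-range one:

  // Before: linker-relaxed pair; ADR spans only +/-1 MiB.
  nop
  adr x1, func
  // After BOLT undoes the relaxation; ADRP+ADD spans +/-4 GiB.
  adrp x1, func
  add x1, x1, :lo12:func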

To achieve this, we use the instruction patch mechanism instead of
pending relocations. Instruction patches are emitted via the regular MC
layer, just like regular functions. However, they have a fixed address
and do not have an associated symbol table entry. This allows us to make
more complex changes to the code, ensuring that function pointers are
correctly updated. Such a mechanism should also be portable to RISC-V and
other architectures.
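Sketched with a hypothetical layout: the patch is emitted at the fixed
address of the original NOP+ADR pair inside .bolt.org.text, while the
target function now lives in the new text section:

  adrp x1, _start            // overwrites the original nop
  add x1, x1, :lo12:_start   // overwrites the original adr x1, _start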

To summarize, for AArch64, we extend the scanExternalRefs() process to
undo linker relaxation and use instruction patches to partially
overwrite unoptimized code.
Author: Maksim Panchenko
Date: 2025-03-27 21:33:25 -07:00
Committed by: GitHub
Commit: 96e5ee23a7 (parent 0ed4bdfe70)
9 changed files with 282 additions and 23 deletions

bolt/include/bolt/Core/BinaryContext.h

@@ -544,9 +544,10 @@ public:
   ///
   /// Optional \p Name can be assigned to the patch. The name will be emitted to
   /// the symbol table at \p Address.
-  BinaryFunction *createInstructionPatch(uint64_t Address,
-                                         InstructionListType &Instructions,
-                                         const Twine &Name = "");
+  BinaryFunction *
+  createInstructionPatch(uint64_t Address,
+                         const InstructionListType &Instructions,
+                         const Twine &Name = "");
 
   std::vector<BinaryFunction *> &getInjectedBinaryFunctions() {
     return InjectedBinaryFunctions;

bolt/include/bolt/Core/BinaryFunction.h

@@ -357,6 +357,12 @@ private:
   /// True if another function body was merged into this one.
   bool HasFunctionsFoldedInto{false};
 
+  /// True if the function is used for patching code at a fixed address.
+  bool IsPatch{false};
+
+  /// True if the function should not have an associated symbol table entry.
+  bool IsAnonymous{false};
+
   /// Name for the section this function code should reside in.
   std::string CodeSectionName;
@@ -1358,6 +1364,12 @@ public:
   /// Return true if other functions were folded into this one.
   bool hasFunctionsFoldedInto() const { return HasFunctionsFoldedInto; }
 
+  /// Return true if this function is used for patching existing code.
+  bool isPatch() const { return IsPatch; }
+
+  /// Return true if the function should not have associated symbol table entry.
+  bool isAnonymous() const { return IsAnonymous; }
+
   /// If this function was folded, return the function it was folded into.
   BinaryFunction *getFoldedIntoFunction() const { return FoldedIntoFunction; }
@@ -1734,6 +1746,18 @@ public:
   /// Indicate that another function body was merged with this function.
   void setHasFunctionsFoldedInto() { HasFunctionsFoldedInto = true; }
 
+  /// Indicate that this function is a patch.
+  void setIsPatch(bool V) {
+    assert(isInjected() && "Only injected functions can be used as patches");
+    IsPatch = V;
+  }
+
+  /// Indicate if the function should have a name in the symbol table.
+  void setAnonymous(bool V) {
+    assert(isInjected() && "Only injected functions could be anonymous");
+    IsAnonymous = V;
+  }
+
   void setHasSDTMarker(bool V) { HasSDTMarker = V; }
 
   /// Mark the function as using ORC format for stack unwinding.

bolt/include/bolt/Core/MCPlusBuilder.h

@@ -1264,9 +1264,12 @@ public:
     return nullptr;
   }
 
-  /// Return MCSymbol extracted from a target expression
+  /// Return MCSymbol extracted from the expression.
   virtual const MCSymbol *getTargetSymbol(const MCExpr *Expr) const {
-    return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
+    if (auto *SymbolRefExpr = dyn_cast<const MCSymbolRefExpr>(Expr))
+      return &SymbolRefExpr->getSymbol();
+    return nullptr;
   }
 
   /// Return addend that represents an offset from MCSymbol target

bolt/lib/Core/BinaryContext.cpp

@@ -2401,8 +2401,10 @@ BinaryContext::createInjectedBinaryFunction(const std::string &Name,
   return BF;
 }
 
-BinaryFunction *BinaryContext::createInstructionPatch(
-    uint64_t Address, InstructionListType &Instructions, const Twine &Name) {
+BinaryFunction *
+BinaryContext::createInstructionPatch(uint64_t Address,
+                                      const InstructionListType &Instructions,
+                                      const Twine &Name) {
   ErrorOr<BinarySection &> Section = getSectionForAddress(Address);
   assert(Section && "cannot get section for patching");
   assert(Section->hasSectionRef() && Section->isText() &&
@@ -2423,6 +2425,11 @@ BinaryFunction *BinaryContext::createInstructionPatch(
   PBF->setFileOffset(FileOffset);
   PBF->setOriginSection(&Section.get());
   PBF->addBasicBlock()->addInstructions(Instructions);
+  PBF->setIsPatch(true);
+
+  // Don't create symbol table entry if the name wasn't specified.
+  if (Name.str().empty())
+    PBF->setAnonymous(true);
 
   return PBF;
 }

bolt/lib/Core/BinaryFunction.cpp

@@ -1583,13 +1583,18 @@ bool BinaryFunction::scanExternalRefs() {
   assert(FunctionData.size() == getMaxSize() &&
          "function size does not match raw data size");
 
-  if (BC.isX86())
-    BC.SymbolicDisAsm->setSymbolizer(
-        BC.MIB->createTargetSymbolizer(*this, /*CreateSymbols*/ false));
+  BC.SymbolicDisAsm->setSymbolizer(
+      BC.MIB->createTargetSymbolizer(*this, /*CreateSymbols*/ false));
+
+  // A list of patches for this function.
+  using PatchTy = std::pair<uint64_t, MCInst>;
+  std::vector<PatchTy> InstructionPatches;
 
   // Disassemble contents of the function. Detect code entry points and create
   // relocations for references to code that will be moved.
   uint64_t Size = 0; // instruction size
+  MCInst Instruction;
+  MCInst PrevInstruction;
   for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) {
     // Check for data inside code and ignore it
     if (const size_t DataInCodeSize = getSizeOfDataInCodeAt(Offset)) {
@@ -1598,7 +1603,7 @@ bool BinaryFunction::scanExternalRefs() {
     }
 
     const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
-    MCInst Instruction;
+    PrevInstruction = Instruction;
     if (!BC.SymbolicDisAsm->getInstruction(Instruction, Size,
                                            FunctionData.slice(Offset),
                                            AbsoluteInstrAddr, nulls())) {
@@ -1673,12 +1678,108 @@ bool BinaryFunction::scanExternalRefs() {
     if (BranchTargetSymbol) {
       BC.MIB->replaceBranchTarget(Instruction, BranchTargetSymbol,
                                   Emitter.LocalCtx.get());
-    } else if (!llvm::any_of(Instruction,
-                             [](const MCOperand &Op) { return Op.isExpr(); })) {
-      // Skip assembly if the instruction may not have any symbolic operands.
-      continue;
+    } else {
+      analyzeInstructionForFuncReference(Instruction);
+      const bool NeedsPatch = llvm::any_of(
+          MCPlus::primeOperands(Instruction), [&](const MCOperand &Op) {
+            return Op.isExpr() &&
+                   !ignoreReference(BC.MIB->getTargetSymbol(Op.getExpr()));
+          });
+      if (!NeedsPatch)
+        continue;
     }
 
+    // For AArch64, we need to undo relaxation done by the linker if the target
+    // of the instruction is a function that we plan to move.
+    //
+    // Linker relaxation is documented at:
+    // https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst
+    // under #relocation-optimization.
+    if (const Relocation *Rel;
+        BC.isAArch64() && (Rel = getRelocationAt(Offset))) {
+      // NOP+ADR sequence can originate from either ADRP+ADD or ADRP+LDR.
+      // In either case, we convert it into ADRP+ADD.
+      if (BC.MIB->isADR(Instruction) &&
+          (Rel->Type == ELF::R_AARCH64_ADD_ABS_LO12_NC ||
+           Rel->Type == ELF::R_AARCH64_LD64_GOT_LO12_NC)) {
+        if (!BC.MIB->isNoop(PrevInstruction)) {
+          // In case of unexpected conversion from the linker, skip target
+          // optimization.
+          const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Instruction);
+          BC.errs() << "BOLT-WARNING: cannot undo linker relaxation for "
+                       "instruction at 0x"
+                    << Twine::utohexstr(AbsoluteInstrAddr) << " referencing "
+                    << Symbol->getName() << '\n';
+          if (BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol))
+            TargetBF->setIgnored();
+          continue;
+        }
+
+        InstructionListType AdrpAdd =
+            BC.MIB->undoAdrpAddRelaxation(Instruction, BC.Ctx.get());
+        assert(AdrpAdd.size() == 2 && "Two instructions expected");
+        LLVM_DEBUG({
+          dbgs() << "BOLT-DEBUG: linker relaxation undone for instruction "
+                    "at 0x"
+                 << Twine::utohexstr(AbsoluteInstrAddr) << '\n';
+        });
+        InstructionPatches.push_back({AbsoluteInstrAddr - 4, AdrpAdd[0]});
+        InstructionPatches.push_back({AbsoluteInstrAddr, AdrpAdd[1]});
+        continue;
+      }
+
+      // If ADR was emitted by the compiler/assembler to reference a nearby
+      // local function, we cannot move away that function due to ADR address
+      // span limitation. Hence, we skip the optimization.
+      if (BC.MIB->isADR(Instruction) &&
+          Rel->Type == ELF::R_AARCH64_ADR_PREL_LO21) {
+        BC.errs() << "BOLT-WARNING: unable to convert ADR that references "
+                  << Rel->Symbol->getName()
+                  << ". Will not optimize the target\n";
+        if (BinaryFunction *TargetBF = BC.getFunctionForSymbol(Rel->Symbol))
+          TargetBF->setIgnored();
+        continue;
+      }
+
+      // In the case of GOT load, ADRP+LDR can also be converted into ADRP+ADD.
+      // When this happens, it's not always possible to properly symbolize ADRP
+      // operand and we might have to adjust the operand based on the next
+      // instruction.
+      if (BC.MIB->isAddXri(Instruction) &&
+          Rel->Type == ELF::R_AARCH64_LD64_GOT_LO12_NC) {
+        if (!BC.MIB->matchAdrpAddPair(PrevInstruction, Instruction)) {
+          BC.errs() << "BOLT-ERROR: cannot find matching ADRP for relaxed LDR "
+                       "instruction at 0x"
+                    << Twine::utohexstr(AbsoluteInstrAddr) << '\n';
+          exit(1);
+        }
+
+        // Check if ADRP was already patched. If not, add a new patch for it.
+        if (InstructionPatches.empty() ||
+            InstructionPatches.back().first != AbsoluteInstrAddr - 4)
+          InstructionPatches.push_back(
+              {AbsoluteInstrAddr - 4, PrevInstruction});
+
+        // Adjust the operand for ADRP from the patch.
+        MCInst &ADRPInst = InstructionPatches.back().second;
+        const MCSymbol *ADRPSymbol = BC.MIB->getTargetSymbol(ADRPInst);
+        const MCSymbol *ADDSymbol = BC.MIB->getTargetSymbol(Instruction);
+        if (ADRPSymbol != ADDSymbol) {
+          const int64_t Addend = BC.MIB->getTargetAddend(Instruction);
+          BC.MIB->setOperandToSymbolRef(ADRPInst, /*OpNum*/ 1, ADDSymbol,
+                                        Addend, BC.Ctx.get(),
+                                        ELF::R_AARCH64_NONE);
+        }
+      }
+    }
+
+    // On AArch64, we use instruction patches for fixing references. We make an
+    // exception for branch instructions since they require optional
+    // relocations.
+    if (BC.isAArch64() && !BranchTargetSymbol) {
+      LLVM_DEBUG(BC.printInstruction(dbgs(), Instruction, AbsoluteInstrAddr));
+      InstructionPatches.push_back({AbsoluteInstrAddr, Instruction});
+      continue;
+    }
+
     // Emit the instruction using temp emitter and generate relocations.
@@ -1720,6 +1821,23 @@ bool BinaryFunction::scanExternalRefs() {
   for (Relocation &Rel : FunctionRelocations)
     getOriginSection()->addPendingRelocation(Rel);
 
+  // Add patches grouping them together.
+  if (!InstructionPatches.empty()) {
+    uint64_t PatchGroupAddress;
+    InstructionListType PatchGroup;
+    for (auto PI = InstructionPatches.begin(), PE = InstructionPatches.end();
+         PI != PE; ++PI) {
+      auto &Patch = *PI;
+      if (PatchGroup.empty())
+        PatchGroupAddress = Patch.first;
+      PatchGroup.push_back(Patch.second);
+      if (std::next(PI) == PE || std::next(PI)->first != Patch.first + 4) {
+        BC.createInstructionPatch(PatchGroupAddress, PatchGroup);
+        PatchGroup.clear();
+      }
+    }
+  }
+
   // Inform BinaryContext that this function symbols will not be defined and
   // relocations should not be created against them.
   if (BC.HasRelocations) {

bolt/lib/Passes/BinaryPasses.cpp

@@ -1269,8 +1269,10 @@ Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
 Error AssignSections::runOnFunctions(BinaryContext &BC) {
   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
-    Function->setCodeSectionName(BC.getInjectedCodeSectionName());
-    Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
+    if (!Function->isPatch()) {
+      Function->setCodeSectionName(BC.getInjectedCodeSectionName());
+      Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
+    }
   }
 
   // In non-relocation mode functions have pre-assigned section names.

bolt/lib/Rewrite/RewriteInstance.cpp

@@ -5078,6 +5078,8 @@ void RewriteInstance::updateELFSymbolTable(
 
   // Add symbols of injected functions
   for (BinaryFunction *Function : BC->getInjectedBinaryFunctions()) {
+    if (Function->isAnonymous())
+      continue;
+
     ELFSymTy NewSymbol;
     BinarySection *OriginSection = Function->getOriginSection();
     NewSymbol.st_shndx =

bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp

@@ -1803,12 +1803,6 @@ public:
     return &SymExpr->getSymbol();
   }
 
-  // This is the same as the base class, but since we are overriding one of
-  // getTargetSymbol's signatures above, we need to override all of them.
-  const MCSymbol *getTargetSymbol(const MCExpr *Expr) const override {
-    return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
-  }
-
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
                      const MCSymbol *&TBB, const MCSymbol *&FBB,
                      MCInst *&CondBranch,

bolt/test/AArch64/lite-mode.s (new file)

@@ -0,0 +1,108 @@
## Check that in lite mode llvm-bolt updates function references in
## non-optimized code.
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
# RUN: llvm-bolt %t.exe -o %t.bolt --data %t.fdata --lite
# RUN: llvm-objdump -d --disassemble-symbols=cold_function %t.exe \
# RUN: | FileCheck %s --check-prefix=CHECK-INPUT
# RUN: llvm-objdump -d --disassemble-symbols=cold_function %t.bolt \
# RUN: | FileCheck %s
## In lite mode, optimized code will be separated from the original .text by
## over 128MB, making it impossible for call/bl instructions in cold functions
## to reach optimized functions directly.
.text
.globl _start
.type _start, %function
_start:
# FDATA: 0 [unknown] 0 1 _start 0 0 100
.cfi_startproc
cmp x0, 1
b.eq .L0
bl cold_function
.L0:
ret x30
.cfi_endproc
.size _start, .-_start
## Cold non-optimized function with a reference to a hot function (_start).
# CHECK: Disassembly of section .bolt.org.text:
# CHECK-LABEL: <cold_function>
.globl cold_function
.type cold_function, %function
cold_function:
.cfi_startproc
## Absolute 64-bit function pointer reference.
## We check for the lower 16 bits of _start to be zeros after update.
movz x0, :abs_g3:_start
movk x0, :abs_g2_nc:_start
movk x0, :abs_g1_nc:_start
# CHECK-INPUT-NOT: movk x0, #0x0{{$}}
# CHECK: movk x0, #0x0{{$}}
movk x0, :abs_g0_nc:_start
## Relaxable address reference.
# CHECK-INPUT: nop
# CHECK-INPUT-NEXT: adr x1
# CHECK-NEXT: adrp x1, [[ADDR:0x[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: add x1
adrp x1, _start
add x1, x1, :lo12:_start
## Non-relaxable address reference.
# CHECK-INPUT-NEXT: adrp x2
# CHECK-INPUT-NEXT: add x2
# CHECK-NEXT: adrp x2, [[ADDR]]
# CHECK-NEXT: add x2
adrp x2, far_func
add x2, x2, :lo12:far_func
## Check that fully-relaxed GOT reference is converted into ADRP+ADD.
adrp x3, :got:_start
ldr x3, [x3, #:got_lo12:_start]
# CHECK-INPUT-NEXT: nop
# CHECK-INPUT-NEXT: adr x3
# CHECK-NEXT: adrp x3, [[ADDR]]
# CHECK-NEXT: add x3
## Check that partially-relaxed GOT reference is converted into ADRP+ADD.
adrp x4, :got:far_func
ldr x4, [x4, #:got_lo12:far_func]
# CHECK-INPUT-NEXT: adrp x4
# CHECK-INPUT-NEXT: add x4
# CHECK-NEXT: adrp x4, [[ADDR]]
# CHECK-NEXT: add x4
## Check that non-relaxable GOT load is left intact.
adrp x5, :got:far_func
nop
ldr x5, [x5, #:got_lo12:far_func]
# CHECK-INPUT-NEXT: adrp x5
# CHECK-INPUT-NEXT: nop
# CHECK-INPUT-NEXT: ldr x5
# CHECK-NEXT: adrp x5
# CHECK-NOT: [[ADDR]]
# CHECK-NEXT: nop
# CHECK-NEXT: ldr x5
.cfi_endproc
.size cold_function, .-cold_function
## Reserve 1MB of space to make functions that follow unreachable by ADRs in
## code that precedes this gap.
.space 0x100000
.globl far_func
.type far_func, %function
far_func:
# FDATA: 0 [unknown] 0 1 far_func 0 0 100
.cfi_startproc
ret x30
.cfi_endproc
.size far_func, .-far_func