[BOLT] Set call to continuation count in pre-aggregated profile

#109683 identified an issue with pre-aggregated profile where a call to
continuation fallthrough edge count is missing (profile discontinuity).

This issue only affects pre-aggregated profile but not perf data since
LBR stack has the necessary information to determine if the trace (fall-
through) starts at call continuation, whereas pre-aggregated fallthrough
lacks this information.

The solution is to look at branch records in pre-aggregated profiles
that correspond to returns and assign counts to call to continuation
fallthrough:
- BranchFrom is in another function or DSO,
- BranchTo may be a call continuation site:
  - not an entry point/landing pad.

Note that we can't directly check if BranchFrom corresponds to a return
instruction if it's in external DSO.

Keep call continuation handling for perf data (`getFallthroughsInTrace`)
[1] as-is due to marginally better performance. The difference is that
return-converted call to continuation fallthrough is slightly more
frequent than other fallthroughs since the former only requires one LBR
address while the latter need two that belong to the profiled binary.
Hence return-converted fallthroughs have larger "weight" which affects
code layout.

[1] `DataAggregator::getFallthroughsInTrace`
fea18afeed/bolt/lib/Profile/DataAggregator.cpp (L906-L915)

Test Plan: added callcont-fallthru.s

Reviewers: maksfb, ayermolo, ShatianWang, dcci

Reviewed By: maksfb, ShatianWang

Pull Request: https://github.com/llvm/llvm-project/pull/109486
This commit is contained in:
Amir Ayupov
2024-11-07 16:20:19 -08:00
committed by GitHub
parent 5b697ef5dd
commit 74e6478f81
4 changed files with 218 additions and 38 deletions

View File

@@ -908,6 +908,10 @@ public:
return BB && BB->getOffset() == Offset ? BB : nullptr;
}
const BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) const {
return const_cast<BinaryFunction *>(this)->getBasicBlockAtOffset(Offset);
}
/// Retrieve the landing pad BB associated with invoke instruction \p Invoke
/// that is in \p BB. Return nullptr if none exists
BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,

View File

@@ -266,7 +266,8 @@ private:
uint64_t Mispreds);
/// Register a \p Branch.
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
bool IsPreagg);
/// Register a trace between two LBR entries supplied in execution order.
bool doTrace(const LBREntry &First, const LBREntry &Second,

View File

@@ -778,42 +778,75 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
}
bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
uint64_t Mispreds) {
bool IsReturn = false;
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) -> BinaryFunction * {
if (BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr)) {
Addr -= Func->getAddress();
if (IsFrom) {
auto checkReturn = [&](auto MaybeInst) {
IsReturn = MaybeInst && BC->MIB->isReturn(*MaybeInst);
};
if (Func->hasInstructions())
checkReturn(Func->getInstructionAtOffset(Addr));
else
checkReturn(Func->disassembleInstructionAtOffset(Addr));
}
if (BAT)
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
if (BinaryFunction *ParentFunc = getBATParentFunction(*Func)) {
Func = ParentFunc;
if (IsFrom)
NumColdSamples += Count;
}
return Func;
}
return nullptr;
uint64_t Mispreds, bool IsPreagg) {
// Returns whether \p Offset in \p Func contains a return instruction.
auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
return Func.hasInstructions()
? isReturn(Func.getInstructionAtOffset(Offset))
: isReturn(Func.disassembleInstructionAtOffset(Offset));
};
BinaryFunction *FromFunc = handleAddress(From, /*IsFrom=*/true);
// Returns whether \p Offset in \p Func may be a call continuation excluding
// entry points and landing pads.
auto checkCallCont = [&](const BinaryFunction &Func, const uint64_t Offset) {
// No call continuation at a function start.
if (!Offset)
return false;
// FIXME: support BAT case where the function might be in empty state
// (split fragments declared non-simple).
if (!Func.hasCFG())
return false;
// The offset should not be an entry point or a landing pad.
const BinaryBasicBlock *ContBB = Func.getBasicBlockAtOffset(Offset);
return ContBB && !ContBB->isEntryPoint() && !ContBB->isLandingPad();
};
// Mutates \p Addr to an offset into the containing function, performing BAT
// offset translation and parent lookup.
//
// Returns the containing function (or BAT parent) and whether the address
// corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
if (!Func)
return std::pair{Func, false};
Addr -= Func->getAddress();
bool IsRetOrCallCont =
IsFrom ? checkReturn(*Func, Addr) : checkCallCont(*Func, Addr);
if (BAT)
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
BinaryFunction *ParentFunc = getBATParentFunction(*Func);
if (!ParentFunc)
return std::pair{Func, IsRetOrCallCont};
if (IsFrom)
NumColdSamples += Count;
return std::pair{ParentFunc, IsRetOrCallCont};
};
uint64_t ToOrig = To;
auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
auto [ToFunc, IsCallCont] = handleAddress(To, /*IsFrom*/ false);
if (!FromFunc && !ToFunc)
return false;
// Record call to continuation trace.
if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) {
LBREntry First{ToOrig - 1, ToOrig - 1, false};
LBREntry Second{ToOrig, ToOrig, false};
return doTrace(First, Second, Count);
}
// Ignore returns.
if (IsReturn)
return true;
BinaryFunction *ToFunc = handleAddress(To, /*IsFrom=*/false);
if (!FromFunc && !ToFunc)
return false;
// Treat recursive control transfers as inter-branches.
if (FromFunc == ToFunc && To != 0) {
@@ -830,10 +863,19 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From);
if (!FromFunc || !ToFunc) {
LLVM_DEBUG({
dbgs() << "Out of range trace starting in " << FromFunc->getPrintName()
<< formatv(" @ {0:x}", First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName()
<< formatv(" @ {0:x}\n", Second.From - ToFunc->getAddress());
dbgs() << "Out of range trace starting in ";
if (FromFunc)
dbgs() << formatv("{0} @ {1:x}", *FromFunc,
First.To - FromFunc->getAddress());
else
dbgs() << Twine::utohexstr(First.To);
dbgs() << " and ending in ";
if (ToFunc)
dbgs() << formatv("{0} @ {1:x}", *ToFunc,
Second.From - ToFunc->getAddress());
else
dbgs() << Twine::utohexstr(Second.From);
dbgs() << '\n';
});
NumLongRangeTraces += Count;
return false;
@@ -1620,7 +1662,8 @@ void DataAggregator::processBranchEvents() {
for (const auto &AggrLBR : BranchLBRs) {
const Trace &Loc = AggrLBR.first;
const TakenBranchInfo &Info = AggrLBR.second;
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount,
/*IsPreagg*/ false);
}
}
@@ -1781,7 +1824,7 @@ void DataAggregator::processPreAggregated() {
switch (AggrEntry.EntryType) {
case AggregatedLBREntry::BRANCH:
doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
AggrEntry.Mispreds);
AggrEntry.Mispreds, /*IsPreagg*/ true);
break;
case AggregatedLBREntry::FT:
case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {

View File

@@ -0,0 +1,132 @@
## Ensures that a call continuation fallthrough count is set when using
## pre-aggregated perf data.
# RUN: %clangxx %cxxflags %s -o %t -Wl,-q -nostdlib
# RUN: link_fdata %s %t %t.pa1 PREAGG
# RUN: link_fdata %s %t %t.pa2 PREAGG2
# RUN: link_fdata %s %t %t.pa3 PREAGG3
# RUN: link_fdata %s %t %t.pa4 PREAGG4
## Check normal case: fallthrough is not LP or secondary entry.
# RUN: llvm-strip --strip-unneeded %t -o %t.exe
# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s
## Check that getFallthroughsInTrace correctly handles a trace starting at plt
## call continuation
# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
## Check that we don't treat secondary entry points as call continuation sites.
# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
## Check fallthrough to a landing pad case.
# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4
.globl foo
.type foo, %function
foo:
pushq %rbp
movq %rsp, %rbp
popq %rbp
Lfoo_ret:
retq
.size foo, .-foo
.globl main
.type main, %function
main:
.Lfunc_begin0:
.cfi_startproc
.cfi_personality 155, DW.ref.__gxx_personality_v0
.cfi_lsda 27, .Lexception0
pushq %rbp
movq %rsp, %rbp
subq $0x20, %rsp
movl $0x0, -0x4(%rbp)
movl %edi, -0x8(%rbp)
movq %rsi, -0x10(%rbp)
callq puts@PLT
## Target is a call continuation
# PREAGG: B X:0 #Ltmp1# 2 0
# CHECK: callq puts@PLT
# CHECK-NEXT: count: 2
Ltmp1:
movq -0x10(%rbp), %rax
movq 0x8(%rax), %rdi
movl %eax, -0x14(%rbp)
Ltmp4:
cmpl $0x0, -0x14(%rbp)
je Ltmp0
# CHECK2: je .Ltmp0
# CHECK2-NEXT: count: 3
movl $0xa, -0x18(%rbp)
callq foo
## Target is a call continuation
# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0
# CHECK: callq foo
# CHECK-NEXT: count: 1
## PLT call continuation fallthrough spanning the call
# PREAGG2: F #Ltmp1# #Ltmp3_br# 3
# CHECK2: callq foo
# CHECK2-NEXT: count: 3
## Target is a secondary entry point
# PREAGG3: B X:0 #Ltmp3# 2 0
# CHECK3: callq foo
# CHECK3-NEXT: count: 0
## Target is a landing pad
# PREAGG4: B X:0 #Ltmp3# 2 0
# CHECK4: callq puts@PLT
# CHECK4-NEXT: count: 0
Ltmp3:
cmpl $0x0, -0x18(%rbp)
Ltmp3_br:
jmp Ltmp2
Ltmp2:
movl -0x18(%rbp), %eax
addl $-0x1, %eax
movl %eax, -0x18(%rbp)
jmp Ltmp3
jmp Ltmp4
jmp Ltmp1
Ltmp0:
xorl %eax, %eax
addq $0x20, %rsp
popq %rbp
retq
.Lfunc_end0:
.cfi_endproc
.size main, .-main
.section .gcc_except_table,"a",@progbits
.p2align 2, 0x0
GCC_except_table0:
.Lexception0:
.byte 255 # @LPStart Encoding = omit
.byte 255 # @TType Encoding = omit
.byte 1 # Call site Encoding = uleb128
.uleb128 .Lcst_end0-.Lcst_begin0
.Lcst_begin0:
.uleb128 .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 <<
.uleb128 .Lfunc_end0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Lfunc_end0
.uleb128 Ltmp3-.Lfunc_begin0 # jumps to Ltmp3
.byte 0 # has no landing pad
.byte 0 # On action: cleanup
.Lcst_end0:
.p2align 2, 0x0
.hidden DW.ref.__gxx_personality_v0
.weak DW.ref.__gxx_personality_v0
.section .data.DW.ref.__gxx_personality_v0,"awG",@progbits,DW.ref.__gxx_personality_v0,comdat
.p2align 3, 0x0
.type DW.ref.__gxx_personality_v0,@object