[BOLT][AArch64] Add support for SPE brstack format (#129231)
As of Linux 6.14, perf can report SPE branch events
using the `brstack` format, which matches the layout used for LBR/BRBE.
This patch reuses the existing LBR parsing logic to support SPE.
Example SPE brstack format:
```bash
perf script -i perf.data -F pid,brstack --itrace=bl
```
```
PID FROM / TO / PREDICTED
16984 0x72e342e5f4/0x72e36192d0/M/-/-/11/RET/-
16984 0x72e7b8b3b4/0x72e7b8b3b8/PN/-/-/11/COND/-
16984 0x72e7b92b48/0x72e7b92b4c/PN/-/-/8/COND/-
16984 0x72eacc6b7c/0x760cc94b00/P/-/-/9/RET/-
16984 0x72e3f210fc/0x72e3f21068/P/-/-/4//-
16984 0x72e39b8c5c/0x72e3627b24/P/-/-/4//-
16984 0x72e7b89d20/0x72e7b92bbc/P/-/-/4/RET/-
```
SPE brstack prediction flags can be up to two characters long (e.g. `PN` or `MN`):
- `P` = predicted branch
- `M` = mispredicted branch
- `N` = optionally appears when the branch is NOT-TAKEN
  - the `N` flag is relevant only to conditional branches
Example of usage with BOLT:
1. Capture SPE branch events:
```bash
perf record -e 'arm_spe_0/branch_filter=1/u' -- binary
```
2. Convert profile for BOLT:
```bash
perf2bolt -p perf.data -o perf.fdata --spe binary
```
3. Run BOLT Optimization:
```bash
llvm-bolt binary -o binary.bolted --data perf.fdata ...
```
A unit test verifies the parsing of the 'SPE brstack format'.
---------
Co-authored-by: Paschalis Mpeis <paschalis.mpeis@arm.com>
This commit is contained in:
@@ -49,6 +49,9 @@ static cl::opt<bool>
|
||||
cl::desc("aggregate basic samples (without LBR info)"),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
cl::opt<bool> ArmSPE("spe", cl::desc("Enable Arm SPE mode."),
|
||||
cl::cat(AggregatorCategory));
|
||||
|
||||
static cl::opt<std::string>
|
||||
ITraceAggregation("itrace",
|
||||
cl::desc("Generate LBR info with perf itrace argument"),
|
||||
@@ -181,11 +184,21 @@ void DataAggregator::start() {
|
||||
|
||||
findPerfExecutable();
|
||||
|
||||
if (opts::ArmSPE) {
|
||||
// pid from_ip to_ip flags
|
||||
// where flags could be:
|
||||
// P/M: whether branch was Predicted or Mispredicted.
|
||||
// N: optionally appears when the branch was Not-Taken (ie fall-through)
|
||||
// 12345 0x123/0x456/PN/-/-/8/RET/-
|
||||
opts::ITraceAggregation = "bl";
|
||||
opts::ParseMemProfile = true;
|
||||
opts::BasicAggregation = false;
|
||||
}
|
||||
|
||||
if (opts::BasicAggregation) {
|
||||
launchPerfProcess("events without LBR",
|
||||
MainEventsPPI,
|
||||
launchPerfProcess("events without LBR", MainEventsPPI,
|
||||
"script -F pid,event,ip",
|
||||
/*Wait = */false);
|
||||
/*Wait = */ false);
|
||||
} else if (!opts::ITraceAggregation.empty()) {
|
||||
// Disable parsing memory profile from trace data, unless requested by user.
|
||||
if (!opts::ParseMemProfile.getNumOccurrences())
|
||||
@@ -994,9 +1007,22 @@ ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
|
||||
if (std::error_code EC = MispredStrRes.getError())
|
||||
return EC;
|
||||
StringRef MispredStr = MispredStrRes.get();
|
||||
if (MispredStr.size() != 1 ||
|
||||
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) {
|
||||
reportError("expected single char for mispred bit");
|
||||
// SPE brstack mispredicted flags might be up to two characters long:
|
||||
// 'PN' or 'MN'. Where 'N' optionally appears.
|
||||
bool ValidStrSize = opts::ArmSPE
|
||||
? MispredStr.size() >= 1 && MispredStr.size() <= 2
|
||||
: MispredStr.size() == 1;
|
||||
bool SpeTakenBitErr =
|
||||
(opts::ArmSPE && MispredStr.size() == 2 && MispredStr[1] != 'N');
|
||||
bool PredictionBitErr =
|
||||
!ValidStrSize ||
|
||||
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-');
|
||||
if (SpeTakenBitErr)
|
||||
reportError("expected 'N' as SPE prediction bit for a not-taken branch");
|
||||
if (PredictionBitErr)
|
||||
reportError("expected 'P', 'M' or '-' char as a prediction bit");
|
||||
|
||||
if (SpeTakenBitErr || PredictionBitErr) {
|
||||
Diag << "Found: " << MispredStr << "\n";
|
||||
return make_error_code(llvm::errc::io_error);
|
||||
}
|
||||
@@ -1497,7 +1523,9 @@ void DataAggregator::printBranchStacksDiagnostics(
|
||||
}
|
||||
|
||||
std::error_code DataAggregator::parseBranchEvents() {
|
||||
outs() << "PERF2BOLT: parse branch events...\n";
|
||||
std::string BranchEventTypeStr =
|
||||
opts::ArmSPE ? "SPE branch events in LBR-format" : "branch events";
|
||||
outs() << "PERF2BOLT: parse " << BranchEventTypeStr << "...\n";
|
||||
NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
|
||||
TimerGroupDesc, opts::TimeAggregator);
|
||||
|
||||
@@ -1525,7 +1553,8 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
}
|
||||
|
||||
NumEntries += Sample.LBR.size();
|
||||
if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) {
|
||||
if (this->BC->isX86() && BAT && Sample.LBR.size() == 32 &&
|
||||
!NeedsSkylakeFix) {
|
||||
errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n";
|
||||
NeedsSkylakeFix = true;
|
||||
}
|
||||
@@ -1548,10 +1577,18 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
if (NumSamples && NumSamplesNoLBR == NumSamples) {
|
||||
// Note: we don't know if perf2bolt is being used to parse memory samples
|
||||
// at this point. In this case, it is OK to parse zero LBRs.
|
||||
errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
|
||||
"LBR. Record profile with perf record -j any or run perf2bolt "
|
||||
"in no-LBR mode with -nl (the performance improvement in -nl "
|
||||
"mode may be limited)\n";
|
||||
if (!opts::ArmSPE)
|
||||
errs()
|
||||
<< "PERF2BOLT-WARNING: all recorded samples for this binary lack "
|
||||
"LBR. Record profile with perf record -j any or run perf2bolt "
|
||||
"in no-LBR mode with -nl (the performance improvement in -nl "
|
||||
"mode may be limited)\n";
|
||||
else
|
||||
errs()
|
||||
<< "PERF2BOLT-WARNING: All recorded samples for this binary lack "
|
||||
"SPE brstack entries. Make sure you are running Linux perf 6.14 "
|
||||
"or later, otherwise you get zero samples. Record the profile "
|
||||
"with: perf record -e 'arm_spe_0/branch_filter=1/'.";
|
||||
} else {
|
||||
printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user