[lld][InstrProf] Profile guided function order (#96268)

Add the lld flags `--irpgo-profile-sort=<profile>` and
`--compression-sort={function,data,both}` to order functions to improve
startup time, and functions or data to improve compressed size,
respectively.
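
For example (the object file, output path, and profile name below are illustrative, mirroring the tests added in this commit), ordering for both startup and compressed size in a single link might look like:

```
ld64.lld -arch arm64 -lSystem -e _main a.o -o a.out \
    --irpgo-profile-sort=a.profdata \
    --compression-sort=both \
    --verbose-bp-section-orderer
```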

We use Balanced Partitioning to determine the best section order using
traces from IRPGO profiles (see
https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068
for details) to improve startup time and using hashes of section
contents to improve compressed size.

In our recent LLVM talk (https://www.youtube.com/watch?v=yd4pbSTjwuA),
we showed that this can reduce page faults during startup by 40% on a
large iOS app and reduce compressed size by 0.8-3%.

More details can be found in https://dl.acm.org/doi/10.1145/3660635

---------

Co-authored-by: Vincent Lee <thevinster@users.noreply.github.com>
Commit: e3b30bc553 (parent b42fe6740e)
Author: Ellis Hoag
Date: 2024-07-23 08:34:40 -07:00, committed via GitHub
10 changed files with 740 additions and 1 deletions


@@ -0,0 +1,413 @@
//===- BPSectionOrderer.cpp--------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "BPSectionOrderer.h"
#include "InputSection.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/xxhash.h"
#define DEBUG_TYPE "bp-section-orderer"
using namespace llvm;
using namespace lld::macho;
/// Symbol names can carry a "(.__uniq.xxxx)?.llvm.yyyy" suffix, where "xxxx"
/// and "yyyy" are numbers that could change between builds. We need to use the
/// root symbol name before this suffix so these symbols can be matched with
/// profiles which may have different suffixes.
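/// For example, both "_foo.__uniq.123.llvm.456" and "_foo.llvm.789" map to the
/// root symbol "_foo".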
static StringRef getRootSymbol(StringRef Name) {
auto [P0, S0] = Name.rsplit(".llvm.");
auto [P1, S1] = P0.rsplit(".__uniq.");
return P1;
}
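// Fingerprint a relocation target by combining its kind, the referent section
// index, an offset, and the addend into a short string and hashing it. This is
// only used for similarity hashing, so collisions are acceptable.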
static uint64_t getRelocHash(StringRef kind, uint64_t sectionIdx,
uint64_t offset, uint64_t addend) {
return xxHash64((kind + ": " + Twine::utohexstr(sectionIdx) + " + " +
Twine::utohexstr(offset) + " + " + Twine::utohexstr(addend))
.str());
}
static uint64_t
getRelocHash(const Reloc &reloc,
const DenseMap<const InputSection *, uint64_t> &sectionToIdx) {
auto *isec = reloc.getReferentInputSection();
std::optional<uint64_t> sectionIdx;
auto sectionIdxIt = sectionToIdx.find(isec);
if (sectionIdxIt != sectionToIdx.end())
sectionIdx = sectionIdxIt->getSecond();
std::string kind;
if (isec)
kind = ("Section " + Twine(isec->kind())).str();
if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
kind += (" Symbol " + Twine(sym->kind())).str();
if (auto *d = dyn_cast<Defined>(sym)) {
if (isa_and_nonnull<CStringInputSection>(isec))
return getRelocHash(kind, 0, isec->getOffset(d->value), reloc.addend);
return getRelocHash(kind, sectionIdx.value_or(0), d->value, reloc.addend);
}
}
return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
}
static void constructNodesForCompression(
const SmallVector<const InputSection *> &sections,
const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
const SmallVector<unsigned> &sectionIdxs,
std::vector<BPFunctionNode> &nodes,
DenseMap<unsigned, SmallVector<unsigned>> &duplicateSectionIdxs,
BPFunctionNode::UtilityNodeT &maxUN) {
TimeTraceScope timeScope("Build nodes for compression");
SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
sectionHashes.reserve(sectionIdxs.size());
SmallVector<uint64_t> hashes;
for (unsigned sectionIdx : sectionIdxs) {
const auto *isec = sections[sectionIdx];
constexpr unsigned windowSize = 4;
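// Hash every windowSize-byte sliding window of the section contents. Sections
// that share many of these hashes have similar byte sequences, and mapping
// shared hashes to shared utility nodes lets BP place them near each other.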
for (size_t i = 0; i < isec->data.size(); i++) {
auto window = isec->data.drop_front(i).take_front(windowSize);
hashes.push_back(xxHash64(window));
}
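// For windows that overlap a relocation, mix in a hash of the relocation
// target, since the raw bytes at a relocated location do not capture what the
// section actually references.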
for (const auto &r : isec->relocs) {
if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
continue;
uint64_t relocHash = getRelocHash(r, sectionToIdx);
uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
for (uint32_t i = start; i < r.offset + r.length; i++) {
auto window = isec->data.drop_front(i).take_front(windowSize);
hashes.push_back(xxHash64(window) + relocHash);
}
}
llvm::sort(hashes);
hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
sectionHashes.emplace_back(sectionIdx, hashes);
hashes.clear();
}
DenseMap<uint64_t, unsigned> hashFrequency;
for (auto &[sectionIdx, hashes] : sectionHashes)
for (auto hash : hashes)
++hashFrequency[hash];
// Merge sections that are nearly identical
SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
for (auto &[sectionIdx, hashes] : sectionHashes) {
uint64_t wholeHash = 0;
for (auto hash : hashes)
if (hashFrequency[hash] > 5)
wholeHash ^= hash;
auto [it, wasInserted] =
wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
if (wasInserted) {
newSectionHashes.emplace_back(sectionIdx, hashes);
} else {
duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx);
}
}
sectionHashes = newSectionHashes;
// Recompute hash frequencies
hashFrequency.clear();
for (auto &[sectionIdx, hashes] : sectionHashes)
for (auto hash : hashes)
++hashFrequency[hash];
// Filter rare and common hashes and assign each a unique utility node that
// doesn't conflict with the trace utility nodes
DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
for (auto &[hash, frequency] : hashFrequency) {
if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size())
continue;
hashToUN[hash] = ++maxUN;
}
std::vector<BPFunctionNode::UtilityNodeT> uns;
for (auto &[sectionIdx, hashes] : sectionHashes) {
for (auto &hash : hashes) {
auto it = hashToUN.find(hash);
if (it != hashToUN.end())
uns.push_back(it->second);
}
nodes.emplace_back(sectionIdx, uns);
uns.clear();
}
}
DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
size_t &highestAvailablePriority, StringRef profilePath,
bool forFunctionCompression, bool forDataCompression, bool verbose) {
SmallVector<const InputSection *> sections;
DenseMap<const InputSection *, uint64_t> sectionToIdx;
StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
for (const auto *file : inputFiles) {
for (auto *sec : file->sections) {
for (auto &subsec : sec->subsections) {
auto *isec = subsec.isec;
if (!isec || isec->data.empty() || !isec->data.data())
continue;
unsigned sectionIdx = sections.size();
sectionToIdx.try_emplace(isec, sectionIdx);
sections.push_back(isec);
for (Symbol *sym : isec->symbols)
if (auto *d = dyn_cast_or_null<Defined>(sym))
symbolToSectionIdxs[d->getName()].insert(sectionIdx);
}
}
}
StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
for (auto &entry : symbolToSectionIdxs) {
StringRef name = entry.getKey();
auto &sectionIdxs = entry.getValue();
name = getRootSymbol(name);
rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
sectionIdxs.end());
// Linkage names can be prefixed with "_" or "l_" on Mach-O. See
// Mangler::getNameWithPrefix() for details.
if (name.consume_front("_") || name.consume_front("l_"))
rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
sectionIdxs.end());
}
std::vector<BPFunctionNode> nodesForStartup;
BPFunctionNode::UtilityNodeT maxUN = 0;
DenseMap<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>
startupSectionIdxUNs;
std::unique_ptr<InstrProfReader> reader;
if (!profilePath.empty()) {
auto fs = vfs::getRealFileSystem();
auto readerOrErr = InstrProfReader::create(profilePath, *fs);
lld::checkError(readerOrErr.takeError());
reader = std::move(readerOrErr.get());
for (auto &entry : *reader) {
// Read all entries
(void)entry;
}
auto &traces = reader->getTemporalProfTraces();
// Used to define the initial order for startup functions.
DenseMap<unsigned, size_t> sectionIdxToTimestamp;
DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
uint64_t currentSize = 0, cutoffSize = 1;
size_t cutoffTimestamp = 1;
auto &trace = traces[traceIdx].FunctionNameRefs;
for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
auto [Filename, ParsedFuncName] = getParsedIRPGOName(
reader->getSymtab().getFuncOrVarName(trace[timestamp]));
ParsedFuncName = getRootSymbol(ParsedFuncName);
auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
continue;
auto &sectionIdxs = sectionIdxsIt->getValue();
// If the same symbol is found in multiple sections, they might be
// identical, so we arbitrarily use the size from the first section.
currentSize += sections[*sectionIdxs.begin()]->getSize();
// Since BalancedPartitioning is sensitive to the initial order, we need
// to explicitly define it to be ordered by earliest timestamp.
for (unsigned sectionIdx : sectionIdxs) {
auto [it, wasInserted] =
sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
if (!wasInserted)
it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
}
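// Start a new utility node whenever the trace has doubled in length or total
// size since the last cutoff. Sections first executed between the same pair
// of cutoffs share that utility node, so BP keeps functions that are first
// executed around the same time close together.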
if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
++maxUN;
cutoffSize = 2 * currentSize;
cutoffTimestamp = 2 * cutoffTimestamp;
}
for (unsigned sectionIdx : sectionIdxs)
sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
}
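// Expand each section's first utility node into the full range up to maxUN,
// so two sections that are both executed early in the trace share more
// utility nodes than sections whose first executions are far apart.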
for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
for (auto un = firstUN; un <= maxUN; ++un)
startupSectionIdxUNs[sectionIdx].push_back(un);
++maxUN;
sectionIdxToFirstUN.clear();
}
// These uns should already be sorted without duplicates.
for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
nodesForStartup.emplace_back(sectionIdx, uns);
llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
});
}
SmallVector<unsigned> sectionIdxsForFunctionCompression,
sectionIdxsForDataCompression;
for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
if (startupSectionIdxUNs.count(sectionIdx))
continue;
const auto *isec = sections[sectionIdx];
if (isCodeSection(isec)) {
if (forFunctionCompression)
sectionIdxsForFunctionCompression.push_back(sectionIdx);
} else {
if (forDataCompression)
sectionIdxsForDataCompression.push_back(sectionIdx);
}
}
std::vector<BPFunctionNode> nodesForFunctionCompression,
nodesForDataCompression;
// Map a section index (to be ordered for compression) to a list of duplicate
// section indices (not ordered for compression).
DenseMap<unsigned, SmallVector<unsigned>> duplicateFunctionSectionIdxs,
duplicateDataSectionIdxs;
constructNodesForCompression(
sections, sectionToIdx, sectionIdxsForFunctionCompression,
nodesForFunctionCompression, duplicateFunctionSectionIdxs, maxUN);
constructNodesForCompression(
sections, sectionToIdx, sectionIdxsForDataCompression,
nodesForDataCompression, duplicateDataSectionIdxs, maxUN);
// Sort nodes by their Id (which is the section index) because the input
// linker order is already a reasonable baseline.
llvm::sort(nodesForFunctionCompression,
[](auto &L, auto &R) { return L.Id < R.Id; });
llvm::sort(nodesForDataCompression,
[](auto &L, auto &R) { return L.Id < R.Id; });
{
TimeTraceScope timeScope("Balanced Partitioning");
BalancedPartitioningConfig config;
BalancedPartitioning bp(config);
bp.run(nodesForStartup);
bp.run(nodesForFunctionCompression);
bp.run(nodesForDataCompression);
}
unsigned numStartupSections = 0;
unsigned numCodeCompressionSections = 0;
unsigned numDuplicateCodeSections = 0;
unsigned numDataCompressionSections = 0;
unsigned numDuplicateDataSections = 0;
SetVector<const InputSection *> orderedSections;
// Order startup functions,
for (auto &node : nodesForStartup) {
const auto *isec = sections[node.Id];
if (orderedSections.insert(isec))
++numStartupSections;
}
// then functions for compression,
for (auto &node : nodesForFunctionCompression) {
const auto *isec = sections[node.Id];
if (orderedSections.insert(isec))
++numCodeCompressionSections;
auto It = duplicateFunctionSectionIdxs.find(node.Id);
if (It == duplicateFunctionSectionIdxs.end())
continue;
for (auto dupSecIdx : It->getSecond()) {
const auto *dupIsec = sections[dupSecIdx];
if (orderedSections.insert(dupIsec))
++numDuplicateCodeSections;
}
}
// then data for compression.
for (auto &node : nodesForDataCompression) {
const auto *isec = sections[node.Id];
if (orderedSections.insert(isec))
++numDataCompressionSections;
auto It = duplicateDataSectionIdxs.find(node.Id);
if (It == duplicateDataSectionIdxs.end())
continue;
for (auto dupSecIdx : It->getSecond()) {
const auto *dupIsec = sections[dupSecIdx];
if (orderedSections.insert(dupIsec))
++numDuplicateDataSections;
}
}
if (verbose) {
unsigned numTotalOrderedSections =
numStartupSections + numCodeCompressionSections +
numDuplicateCodeSections + numDataCompressionSections +
numDuplicateDataSections;
dbgs()
<< "Ordered " << numTotalOrderedSections
<< " sections using balanced partitioning:\n Functions for startup: "
<< numStartupSections
<< "\n Functions for compression: " << numCodeCompressionSections
<< "\n Duplicate functions: " << numDuplicateCodeSections
<< "\n Data for compression: " << numDataCompressionSections
<< "\n Duplicate data: " << numDuplicateDataSections << "\n";
if (!profilePath.empty()) {
// Evaluate this function order for startup
StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
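// Assume 16 KiB pages (1 << 14), the page size on arm64 Apple devices.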
const uint64_t pageSize = (1 << 14);
uint64_t currentAddress = 0;
for (const auto *isec : orderedSections) {
for (Symbol *sym : isec->symbols) {
if (auto *d = dyn_cast_or_null<Defined>(sym)) {
uint64_t startAddress = currentAddress + d->value;
uint64_t endAddress = startAddress + d->size;
uint64_t firstPage = startAddress / pageSize;
// I think the kernel might pull in a few pages when one is touched,
// so it might be more accurate to force lastPage to be aligned by
// 4?
uint64_t lastPage = endAddress / pageSize;
StringRef rootSymbol = d->getName();
rootSymbol = getRootSymbol(rootSymbol);
symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
}
}
currentAddress += isec->getSize();
}
// The area under the curve F where F(t) is the total number of page
// faults at step t.
unsigned area = 0;
for (auto &trace : reader->getTemporalProfTraces()) {
SmallSet<uint64_t, 0> touchedPages;
for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
auto traceId = trace.FunctionNameRefs[step];
auto [Filename, ParsedFuncName] =
getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
ParsedFuncName = getRootSymbol(ParsedFuncName);
auto it = symbolToPageNumbers.find(ParsedFuncName);
if (it != symbolToPageNumbers.end()) {
auto &[firstPage, lastPage] = it->getValue();
for (uint64_t i = firstPage; i <= lastPage; i++)
touchedPages.insert(i);
}
area += touchedPages.size();
}
}
dbgs() << "Total area under the page fault curve: " << (float)area
<< "\n";
}
}
DenseMap<const InputSection *, size_t> sectionPriorities;
for (const auto *isec : orderedSections)
sectionPriorities[isec] = --highestAvailablePriority;
return sectionPriorities;
}


@@ -0,0 +1,37 @@
//===- BPSectionOrderer.h ---------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This file uses Balanced Partitioning to order sections to improve startup
/// time and compressed size.
///
//===----------------------------------------------------------------------===//
#ifndef LLD_MACHO_BPSECTION_ORDERER_H
#define LLD_MACHO_BPSECTION_ORDERER_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
namespace lld::macho {
class InputSection;
/// Run Balanced Partitioning to find the optimal function and data order to
/// improve startup time and compressed size.
///
/// It is important that .subsections_via_symbols is used to ensure functions
/// and data are in their own sections and thus can be reordered.
llvm::DenseMap<const lld::macho::InputSection *, size_t>
runBalancedPartitioning(size_t &highestAvailablePriority,
llvm::StringRef profilePath,
bool forFunctionCompression, bool forDataCompression,
bool verbose);
} // namespace lld::macho
#endif


@@ -25,6 +25,7 @@ add_lld_library(lldMachO
OutputSection.cpp
OutputSegment.cpp
Relocations.cpp
BPSectionOrderer.cpp
SectionPriorities.cpp
Sections.cpp
SymbolTable.cpp
@@ -48,6 +49,7 @@ add_lld_library(lldMachO
Object
Option
Passes
ProfileData
Support
TargetParser
TextAPI


@@ -217,6 +217,11 @@ struct Configuration {
bool callGraphProfileSort = false;
llvm::StringRef printSymbolOrder;
llvm::StringRef irpgoProfileSortProfilePath;
bool functionOrderForCompression = false;
bool dataOrderForCompression = false;
bool verboseBpSectionOrderer = false;
SectionRenameMap sectionRenameMap;
SegmentRenameMap segmentRenameMap;


@@ -1750,6 +1750,34 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
OPT_no_warn_thin_archive_missing_members, true);
config->generateUuid = !args.hasArg(OPT_no_uuid);
auto IncompatWithCGSort = [&](StringRef firstArgStr) {
// Throw an error only if --call-graph-profile-sort is explicitly specified
if (config->callGraphProfileSort)
if (const Arg *arg = args.getLastArgNoClaim(OPT_call_graph_profile_sort))
error(firstArgStr + " is incompatible with " + arg->getSpelling());
};
if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_sort)) {
config->irpgoProfileSortProfilePath = arg->getValue();
IncompatWithCGSort(arg->getSpelling());
}
if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
StringRef compressionSortStr = arg->getValue();
if (compressionSortStr == "function") {
config->functionOrderForCompression = true;
} else if (compressionSortStr == "data") {
config->dataOrderForCompression = true;
} else if (compressionSortStr == "both") {
config->functionOrderForCompression = true;
config->dataOrderForCompression = true;
} else if (compressionSortStr != "none") {
error("unknown value `" + compressionSortStr + "` for " +
arg->getSpelling());
}
if (compressionSortStr != "none")
IncompatWithCGSort(arg->getSpelling());
}
config->verboseBpSectionOrderer = args.hasArg(OPT_verbose_bp_section_orderer);
for (const Arg *arg : args.filtered(OPT_alias)) {
config->aliasedSymbols.push_back(
std::make_pair(arg->getValue(0), arg->getValue(1)));


@@ -126,6 +126,16 @@ def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">,
HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
Group<grp_lld>;
def irpgo_profile_sort: Joined<["--"], "irpgo-profile-sort=">,
MetaVarName<"<profile>">,
HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
Group<grp_lld>;
def compression_sort: Joined<["--"], "compression-sort=">,
MetaVarName<"[none,function,data,both]">,
HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
def verbose_bp_section_orderer: Flag<["--"], "verbose-bp-section-orderer">,
HelpText<"Print information on how many sections were ordered by balanced partitioning and a measure of the expected number of page faults">,
Group<grp_lld>;
def ignore_auto_link_option : Separate<["--"], "ignore-auto-link-option">,
Group<grp_lld>;
def ignore_auto_link_option_eq : Joined<["--"], "ignore-auto-link-option=">,


@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "SectionPriorities.h"
#include "BPSectionOrderer.h"
#include "Config.h"
#include "InputFiles.h"
#include "Symbols.h"
@@ -352,7 +353,14 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
DenseMap<const InputSection *, size_t>
macho::PriorityBuilder::buildInputSectionPriorities() {
DenseMap<const InputSection *, size_t> sectionPriorities;
if (config->callGraphProfileSort) {
if (!config->irpgoProfileSortProfilePath.empty() ||
config->functionOrderForCompression || config->dataOrderForCompression) {
TimeTraceScope timeScope("Balanced Partitioning Section Orderer");
sectionPriorities = runBalancedPartitioning(
highestAvailablePriority, config->irpgoProfileSortProfilePath,
config->functionOrderForCompression, config->dataOrderForCompression,
config->verboseBpSectionOrderer);
} else if (config->callGraphProfileSort) {
// Sort sections by the profile data provided by __LLVM,__cg_profile
// sections.
//


@@ -0,0 +1,8 @@
# RUN: not %lld -o /dev/null --irpgo-profile-sort=%s --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
# IRPGO-ERR: --irpgo-profile-sort= is incompatible with --call-graph-profile-sort
# RUN: not %lld -o /dev/null --compression-sort=function --call-graph-profile-sort %s 2>&1 | FileCheck %s --check-prefix=COMPRESSION-ERR
# COMPRESSION-ERR: --compression-sort= is incompatible with --call-graph-profile-sort
# RUN: not %lld -o /dev/null --compression-sort=malformed 2>&1 | FileCheck %s --check-prefix=COMPRESSION-MALFORM
# COMPRESSION-MALFORM: unknown value `malformed` for --compression-sort=


@@ -0,0 +1,105 @@
# REQUIRES: aarch64
# Generate a large test case and check that the output is deterministic.
# RUN: %python %s %t.s %t.proftext
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t.s -o %t.o
# RUN: llvm-profdata merge %t.proftext -o %t.profdata
# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
# RUN: diff %t.order1.txt %t.order2.txt
import random
import sys
assembly_filepath = sys.argv[1]
proftext_filepath = sys.argv[2]
random.seed(1234)
num_functions = 1000
num_data = 100
num_traces = 10
function_names = [f"f{n}" for n in range(num_functions)]
data_names = [f"d{n}" for n in range(num_data)]
profiled_functions = function_names[: int(num_functions / 2)]
function_contents = [
f"""
{name}:
add w0, w0, #{i % 4096}
add w1, w1, #{i % 10}
add w2, w0, #{i % 20}
adrp x3, {name}@PAGE
ret
"""
for i, name in enumerate(function_names)
]
data_contents = [
f"""
{name}:
.ascii "s{i % 2}-{i % 3}-{i % 5}"
.xword {name}
"""
for i, name in enumerate(data_names)
]
trace_contents = [
f"""
# Weight
1
{", ".join(random.sample(profiled_functions, len(profiled_functions)))}
"""
for i in range(num_traces)
]
profile_contents = [
f"""
{name}
# Func Hash:
{i}
# Num Counters:
1
# Counter Values:
1
"""
for i, name in enumerate(profiled_functions)
]
with open(assembly_filepath, "w") as f:
f.write(
f"""
.text
.globl _main
_main:
ret
{"".join(function_contents)}
.data
{"".join(data_contents)}
.subsections_via_symbols
"""
)
with open(proftext_filepath, "w") as f:
f.write(
f"""
:ir
:temporal_prof_traces
# Num Traces
{num_traces}
# Trace Stream Size:
{num_traces}
{"".join(trace_contents)}
{"".join(profile_contents)}
"""
)


@@ -0,0 +1,123 @@
# REQUIRES: aarch64
# RUN: rm -rf %t && split-file %s %t
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/a.s -o %t/a.o
# RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
# STARTUP: Ordered 3 sections using balanced partitioning
# RUN: %lld -arch arm64 -lSystem -e _main -o - %t/a.o --irpgo-profile-sort=%t/a.profdata -order_file %t/a.orderfile | llvm-nm --numeric-sort --format=just-symbols - | FileCheck %s --check-prefix=ORDERFILE
# ORDERFILE: A
# ORDERFILE: F
# ORDERFILE: E
# ORDERFILE: D
# ORDERFILE-DAG: _main
# ORDERFILE-DAG: _B
# ORDERFILE-DAG: l_C
# ORDERFILE-DAG: s1
# ORDERFILE-DAG: s2
# ORDERFILE-DAG: r1
# ORDERFILE-DAG: r2
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=function 2>&1 | FileCheck %s --check-prefix=COMPRESSION-FUNC
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=data 2>&1 | FileCheck %s --check-prefix=COMPRESSION-DATA
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both --irpgo-profile-sort=%t/a.profdata 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
# COMPRESSION-FUNC: Ordered 7 sections using balanced partitioning
# COMPRESSION-DATA: Ordered 4 sections using balanced partitioning
# COMPRESSION-BOTH: Ordered 11 sections using balanced partitioning
#--- a.s
.text
.globl _main, A, _B, l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
_main:
ret
A:
ret
_B:
add w0, w0, #1
bl A
ret
l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222:
add w0, w0, #2
bl A
ret
D:
add w0, w0, #2
bl _B
ret
E:
add w0, w0, #2
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
ret
F:
add w0, w0, #3
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
ret
.data
s1:
.ascii "hello world"
s2:
.ascii "i am a string"
r1:
.quad s1
r2:
.quad r1
.subsections_via_symbols
#--- a.proftext
:ir
:temporal_prof_traces
# Num Traces
1
# Trace Stream Size:
1
# Weight
1
A, B, C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
A
# Func Hash:
1111
# Num Counters:
1
# Counter Values:
1
B
# Func Hash:
2222
# Num Counters:
1
# Counter Values:
1
C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
# Func Hash:
3333
# Num Counters:
1
# Counter Values:
1
D
# Func Hash:
4444
# Num Counters:
1
# Counter Values:
1
#--- a.orderfile
A
F
E
D