[TypeProf][PGO]Support skipping vtable comparisons for a class and its derived ones (#110575)

Performance-critical core libraries can be highly optimized for
architecture or micro-architecture features. For instance, the absl crc
library specializes different templated classes for different hardware [1].
In a practical setting, it's likely that instrumented profiles are collected
on one type of machine and used to optimize binaries that run on
multiple types of hardware.

While this kind of specialization is rare in terms of lines of code,
the compiler can do a better job by skipping vtable-based ICP for such classes.
* The per-class `Extend` implementation is arch-specific as well. If an
instrumented profile is collected on one arch and applied to another
arch where the `Extend` implementation is different, `Extend` might be
regarded as an unlikely function in the latter case. The `ABSL_ATTRIBUTE_HOT`
annotation alleviates the problem by putting all `Extend` implementations
into the hot text section [2].

This change introduces a comma-separated list to specify the mangled
vtable names, and the ICP pass will skip vtable-based comparison if a vtable
variable definition is shown to be in the class hierarchy of a listed name
(per LLVM type metadata).

[1]
c6b27359c3/absl/crc/internal/crc_x86_arm_combined.cc (L621-L650)
[2]
c6b27359c3/absl/crc/internal/crc_x86_arm_combined.cc (L370C3-L370C21)
This commit is contained in:
Mingming Liu
2024-10-02 10:23:54 -07:00
committed by GitHub
parent 694fd1f297
commit 34f0edd509
2 changed files with 58 additions and 4 deletions

View File

@@ -132,6 +132,15 @@ static cl::opt<int> ICPMaxNumVTableLastCandidate(
"icp-max-num-vtable-last-candidate", cl::init(1), cl::Hidden,
cl::desc("The maximum number of vtable for the last candidate."));
// Type info names (the string literals used in LLVM type metadata) that
// should be excluded from vtable-based indirect-call promotion. The names may
// be given as one comma-separated value or by repeating the flag; each entry
// is later matched verbatim against the type ids attached to a vtable.
static cl::list<std::string> ICPIgnoredBaseTypes(
    "icp-ignored-base-types", cl::Hidden, cl::CommaSeparated,
    cl::desc(
        "A comma-separated list of mangled vtable type info names. Classes "
        "specified by the "
        "type info names and their derived ones will not be vtable-ICP'ed. "
        "Useful when the profiled types and actual types in the optimized "
        "binary could be different due to profiling limitations. Type info "
        "names are those string literals used in LLVM type metadata"));
namespace {
// The key is a vtable global variable, and the value is a map.
@@ -316,6 +325,8 @@ private:
OptimizationRemarkEmitter &ORE;
const DenseSet<StringRef> &IgnoredBaseTypes;
// A struct that records the direct target and its call count.
struct PromotionCandidate {
Function *const TargetFunction;
@@ -366,6 +377,10 @@ private:
bool isProfitableToCompareVTables(const CallBase &CB,
ArrayRef<PromotionCandidate> Candidates);
// Return true if the vtable corresponding to VTableGUID should be skipped
// for vtable-based comparison.
bool shouldSkipVTable(uint64_t VTableGUID);
// Given an indirect callsite and the list of function candidates, compute
// the following vtable information in output parameters and return vtable
// pointer if type profiles exist.
@@ -391,10 +406,12 @@ public:
Function &Func, Module &M, InstrProfSymtab *Symtab, bool SamplePGO,
const VirtualCallSiteTypeInfoMap &VirtualCSInfo,
VTableAddressPointOffsetValMap &VTableAddressPointOffsetVal,
const DenseSet<StringRef> &IgnoredBaseTypes,
OptimizationRemarkEmitter &ORE)
: F(Func), M(M), Symtab(Symtab), SamplePGO(SamplePGO),
VirtualCSInfo(VirtualCSInfo),
VTableAddressPointOffsetVal(VTableAddressPointOffsetVal), ORE(ORE) {}
VTableAddressPointOffsetVal(VTableAddressPointOffsetVal), ORE(ORE),
IgnoredBaseTypes(IgnoredBaseTypes) {}
IndirectCallPromoter(const IndirectCallPromoter &) = delete;
IndirectCallPromoter &operator=(const IndirectCallPromoter &) = delete;
@@ -851,9 +868,14 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
LLVM_DEBUG(dbgs() << "\n");
uint64_t CandidateVTableCount = 0;
for (auto &[GUID, Count] : VTableGUIDAndCounts)
for (auto &[GUID, Count] : VTableGUIDAndCounts) {
CandidateVTableCount += Count;
if (shouldSkipVTable(GUID))
return false;
}
if (CandidateVTableCount < Candidate.Count * ICPVTablePercentageThreshold) {
LLVM_DEBUG(
dbgs() << " function count " << Candidate.Count
@@ -883,6 +905,27 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
return true;
}
// Returns true if the vtable identified by VTableGUID must not take part in
// vtable-based comparison: i.e. one of the type metadata entries attached to
// the vtable variable has a string type id contained in IgnoredBaseTypes.
// Returns false immediately when no ignored types were configured.
bool IndirectCallPromoter::shouldSkipVTable(uint64_t VTableGUID) {
  // Fast path: nothing to filter, so skip the symtab and metadata lookups.
  if (IgnoredBaseTypes.empty())
    return false;

  auto *VTableVar = Symtab->getGlobalVariable(VTableGUID);

  assert(VTableVar && "VTableVar must exist for GUID in VTableGUIDAndCounts");

  // Collect every !type metadata node attached to the vtable definition.
  SmallVector<MDNode *, 2> Types;
  VTableVar->getMetadata(LLVMContext::MD_type, Types);

  for (auto *Type : Types)
    // Only type ids spelled as metadata strings can match a user-supplied
    // name; entries whose second operand is not an MDString are passed over.
    if (auto *TypeId = dyn_cast<MDString>(Type->getOperand(1).get()))
      if (IgnoredBaseTypes.contains(TypeId->getString())) {
        // Terminate the line so later debug output does not run into it.
        LLVM_DEBUG(dbgs() << " vtable profiles should be ignored. Bail "
                             "out of vtable comparison.\n");
        return true;
      }
  return false;
}
// For virtual calls in the module, collect per-callsite information which will
// be used to associate an ICP candidate with a vtable and a specific function
// in the vtable. With type intrinsics (llvm.type.test), we can find virtual
@@ -956,9 +999,15 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
bool Changed = false;
VirtualCallSiteTypeInfoMap VirtualCSInfo;
if (EnableVTableProfileUse)
DenseSet<StringRef> IgnoredBaseTypes;
if (EnableVTableProfileUse) {
computeVirtualCallSiteTypeInfoMap(M, MAM, VirtualCSInfo);
for (StringRef Str : ICPIgnoredBaseTypes)
IgnoredBaseTypes.insert(Str);
}
// VTableAddressPointOffsetVal stores the vtable address points. The vtable
// address point of a given <vtable, address point offset> is static (doesn't
// change after being computed once).
@@ -977,7 +1026,8 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
IndirectCallPromoter CallPromoter(F, M, &Symtab, SamplePGO, VirtualCSInfo,
VTableAddressPointOffsetVal, ORE);
VTableAddressPointOffsetVal,
IgnoredBaseTypes, ORE);
bool FuncChanged = CallPromoter.processFunction(PSI);
if (ICPDUMPAFTER && FuncChanged) {
LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));

View File

@@ -1,7 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; Tests that ICP compares vtables by checking IR.
; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -enable-vtable-profile-use -icp-max-num-vtable-last-candidate=2 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,VTABLE-CMP
; Require exactly one vtable candidate for each function candidate. Tests that ICP compares function by checking IR.
; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -enable-vtable-profile-use -icp-max-num-vtable-last-candidate=1 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,FUNC-CMP
; On top of line 4, ignore 'Base1' and its derived types for vtable-based comparison. Tests that ICP compares functions.
; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -enable-vtable-profile-use -icp-max-num-vtable-last-candidate=2 -icp-ignored-base-types='Base1' -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,FUNC-CMP
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"