Reland [CGData][GMF] Skip No Params (#116548)
This update follows up on change #112671 and is mostly NFC (no functional change), with the following exceptions:
- Introduced `-global-merging-skip-no-params` to bypass merging when no parameters are required.
- Parameter count is now calculated based on the unique hash count.
- Added `-global-merging-inst-overhead` to adjust the instruction overhead, reflecting the machine instruction size.
- Costs and benefits are now computed using the `double` data type. Since the finalization process occurs offline, this should not significantly impact build time.
- Moved a sorting operation outside of the loop.

This is a patch for https://discourse.llvm.org/t/rfc-global-function-merging/82608.
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/CGData/StableFunctionMap.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
@@ -35,21 +36,30 @@ static cl::opt<unsigned> GlobalMergingMaxParams(
|
||||
cl::desc(
|
||||
"The maximum number of parameters allowed when merging functions."),
|
||||
cl::init(std::numeric_limits<unsigned>::max()), cl::Hidden);
|
||||
static cl::opt<unsigned> GlobalMergingParamOverhead(
|
||||
static cl::opt<bool> GlobalMergingSkipNoParams(
|
||||
"global-merging-skip-no-params",
|
||||
cl::desc("Skip merging functions with no parameters."), cl::init(true),
|
||||
cl::Hidden);
|
||||
static cl::opt<double> GlobalMergingInstOverhead(
|
||||
"global-merging-inst-overhead",
|
||||
cl::desc("The overhead cost associated with each instruction when lowering "
|
||||
"to machine instruction."),
|
||||
cl::init(1.2), cl::Hidden);
|
||||
static cl::opt<double> GlobalMergingParamOverhead(
|
||||
"global-merging-param-overhead",
|
||||
cl::desc("The overhead cost associated with each parameter when merging "
|
||||
"functions."),
|
||||
cl::init(2), cl::Hidden);
|
||||
static cl::opt<unsigned>
|
||||
cl::init(2.0), cl::Hidden);
|
||||
static cl::opt<double>
|
||||
GlobalMergingCallOverhead("global-merging-call-overhead",
|
||||
cl::desc("The overhead cost associated with each "
|
||||
"function call when merging functions."),
|
||||
cl::init(1), cl::Hidden);
|
||||
static cl::opt<unsigned> GlobalMergingExtraThreshold(
|
||||
cl::init(1.0), cl::Hidden);
|
||||
static cl::opt<double> GlobalMergingExtraThreshold(
|
||||
"global-merging-extra-threshold",
|
||||
cl::desc("An additional cost threshold that must be exceeded for merging "
|
||||
"to be considered beneficial."),
|
||||
cl::init(0), cl::Hidden);
|
||||
cl::init(0.0), cl::Hidden);
|
||||
|
||||
unsigned StableFunctionMap::getIdOrCreateForName(StringRef Name) {
|
||||
auto It = NameToId.find(Name);
|
||||
@@ -160,21 +170,32 @@ static bool isProfitable(
|
||||
if (InstCount < GlobalMergingMinInstrs)
|
||||
return false;
|
||||
|
||||
unsigned ParamCount = SFS[0]->IndexOperandHashMap->size();
|
||||
if (ParamCount > GlobalMergingMaxParams)
|
||||
return false;
|
||||
|
||||
unsigned Benefit = InstCount * (StableFunctionCount - 1);
|
||||
unsigned Cost =
|
||||
(GlobalMergingParamOverhead * ParamCount + GlobalMergingCallOverhead) *
|
||||
StableFunctionCount +
|
||||
GlobalMergingExtraThreshold;
|
||||
double Cost = 0.0;
|
||||
SmallSet<stable_hash, 8> UniqueHashVals;
|
||||
for (auto &SF : SFS) {
|
||||
UniqueHashVals.clear();
|
||||
for (auto &[IndexPair, Hash] : *SF->IndexOperandHashMap)
|
||||
UniqueHashVals.insert(Hash);
|
||||
unsigned ParamCount = UniqueHashVals.size();
|
||||
if (ParamCount > GlobalMergingMaxParams)
|
||||
return false;
|
||||
// Theoretically, if ParamCount is 0, it results in identical code folding
|
||||
// (ICF), which we can skip merging here since the linker already handles
|
||||
// ICF. This pass would otherwise introduce unnecessary thunks that are
|
||||
// merely direct jumps. However, enabling this could be beneficial depending
|
||||
// on downstream passes, so we provide an option for it.
|
||||
if (GlobalMergingSkipNoParams && ParamCount == 0)
|
||||
return false;
|
||||
Cost += ParamCount * GlobalMergingParamOverhead + GlobalMergingCallOverhead;
|
||||
}
|
||||
Cost += GlobalMergingExtraThreshold;
|
||||
|
||||
double Benefit =
|
||||
InstCount * (StableFunctionCount - 1) * GlobalMergingInstOverhead;
|
||||
bool Result = Benefit > Cost;
|
||||
LLVM_DEBUG(dbgs() << "isProfitable: Hash = " << SFS[0]->Hash << ", "
|
||||
<< "StableFunctionCount = " << StableFunctionCount
|
||||
<< ", InstCount = " << InstCount
|
||||
<< ", ParamCount = " << ParamCount
|
||||
<< ", Benefit = " << Benefit << ", Cost = " << Cost
|
||||
<< ", Result = " << (Result ? "true" : "false") << "\n");
|
||||
return Result;
|
||||
|
||||
@@ -405,12 +405,13 @@ static ParamLocsVecTy computeParamInfo(
|
||||
}
|
||||
|
||||
ParamLocsVecTy ParamLocsVec;
|
||||
for (auto &[HashSeq, Locs] : HashSeqToLocs) {
|
||||
for (auto &[HashSeq, Locs] : HashSeqToLocs)
|
||||
ParamLocsVec.push_back(std::move(Locs));
|
||||
llvm::sort(ParamLocsVec, [&](const ParamLocs &L, const ParamLocs &R) {
|
||||
return L[0] < R[0];
|
||||
});
|
||||
}
|
||||
|
||||
llvm::sort(ParamLocsVec, [&](const ParamLocs &L, const ParamLocs &R) {
|
||||
return L[0] < R[0];
|
||||
});
|
||||
|
||||
return ParamLocsVec;
|
||||
}
|
||||
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
; while parameterizing a difference in their global variables, g1 and g2.
|
||||
; To achieve this, we create two instances of the global merging function, f1.Tgm and f2.Tgm,
|
||||
; which are tail-called from thunks f1 and f2 respectively.
|
||||
; These identical functions, f1.Tgm and f2.Tgm, will be folded by the linker via Identical Code Folding (IFC).
|
||||
; These identical functions, f1.Tgm and f2.Tgm, will be folded by the linker via Identical Code Folding (ICF).
|
||||
|
||||
; RUN: opt -S --passes=global-merge-func %s | FileCheck %s
|
||||
; RUN: opt -mtriple=arm64-apple-darwin -S --passes=global-merge-func %s | FileCheck %s
|
||||
|
||||
; A merging instance is created with additional parameter.
|
||||
; CHECK: define internal i32 @f1.Tgm(i32 %0, ptr %1)
|
||||
@@ -38,8 +38,8 @@
|
||||
; CHECK-NEXT: %1 = tail call i32 @f2.Tgm(i32 %a, ptr @g2)
|
||||
; CHECK-NEXT: ret i32 %1
|
||||
|
||||
; RUN: llc -enable-global-merge-func=true < %s | FileCheck %s --check-prefix=MERGE
|
||||
; RUN: llc -enable-global-merge-func=false < %s | FileCheck %s --check-prefix=NOMERGE
|
||||
; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true < %s | FileCheck %s --check-prefix=MERGE
|
||||
; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=false < %s | FileCheck %s --check-prefix=NOMERGE
|
||||
|
||||
; MERGE: _f1.Tgm
|
||||
; MERGE: _f2.Tgm
|
||||
@@ -47,9 +47,6 @@
|
||||
; NOMERGE-NOT: _f1.Tgm
|
||||
; NOMERGE-NOT: _f2.Tgm
|
||||
|
||||
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "arm64-unknown-ios12.0.0"
|
||||
|
||||
@g = external local_unnamed_addr global [0 x i32], align 4
|
||||
@g1 = external global i32, align 4
|
||||
@g2 = external global i32, align 4
|
||||
39
llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll
Normal file
39
llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll
Normal file
@@ -0,0 +1,39 @@
|
||||
; This test verifies whether two identical functions, f1 and f2, can be merged
|
||||
; locally using the global merge function.
|
||||
; The functions, f1.Tgm and f2.Tgm, will be folded by the linker through
|
||||
; Identical Code Folding (ICF).
|
||||
; While identical functions can already be folded by the linker, creating this
|
||||
; canonical form can be beneficial in downstream passes. This merging process
|
||||
; can be controlled by the -global-merging-skip-no-params option.
|
||||
|
||||
; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true -global-merging-skip-no-params=false < %s | FileCheck %s --check-prefix=MERGE
|
||||
; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true -global-merging-skip-no-params=true < %s | FileCheck %s --implicit-check-not=".Tgm"
|
||||
|
||||
; MERGE: _f1.Tgm
|
||||
; MERGE: _f2.Tgm
|
||||
|
||||
@g = external local_unnamed_addr global [0 x i32], align 4
|
||||
@g1 = external global i32, align 4
|
||||
@g2 = external global i32, align 4
|
||||
|
||||
define i32 @f1(i32 %a) {
|
||||
entry:
|
||||
%idxprom = sext i32 %a to i64
|
||||
%arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%1 = load volatile i32, i32* @g1, align 4
|
||||
%mul = mul nsw i32 %1, %0
|
||||
%add = add nsw i32 %mul, 1
|
||||
ret i32 %add
|
||||
}
|
||||
|
||||
define i32 @f2(i32 %a) {
|
||||
entry:
|
||||
%idxprom = sext i32 %a to i64
|
||||
%arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%1 = load volatile i32, i32* @g1, align 4
|
||||
%mul = mul nsw i32 %1, %0
|
||||
%add = add nsw i32 %mul, 1
|
||||
ret i32 %add
|
||||
}
|
||||
Reference in New Issue
Block a user