The Clang binary (and any binary linking Clang as a library), when built
using PIE, ends up with a pretty shocking number of dynamic relocations
to apply to the executable image: roughly 400k.
Each of these takes up binary space in the executable, and perhaps most
interestingly takes start-up time to apply the relocations.
The largest pattern I identified was the strings used to describe
target builtins. The addresses of these string literals were stored into
huge arrays, each one requiring a dynamic relocation. The way to avoid
this is to design the target builtins to use a single large table of
strings and offsets within the table for the individual strings. This
switches the builtin management to such a scheme.
This saves over 100k dynamic relocations by my measurement, an over 25%
reduction. Just looking at byte size improvements, using the `bloaty`
tool to compare a newly built `clang` binary to an old one:
```
FILE SIZE VM SIZE
-------------- --------------
+1.4% +653Ki +1.4% +653Ki .rodata
+0.0% +960 +0.0% +960 .text
+0.0% +197 +0.0% +197 .dynstr
+0.0% +184 +0.0% +184 .eh_frame
+0.0% +96 +0.0% +96 .dynsym
+0.0% +40 +0.0% +40 .eh_frame_hdr
+114% +32 [ = ] 0 [Unmapped]
+0.0% +20 +0.0% +20 .gnu.hash
+0.0% +8 +0.0% +8 .gnu.version
+0.9% +7 +0.9% +7 [LOAD #2 [R]]
[ = ] 0 -75.4% -3.00Ki .relro_padding
-16.1% -802Ki -16.1% -802Ki .data.rel.ro
-27.3% -2.52Mi -27.3% -2.52Mi .rela.dyn
-1.6% -2.66Mi -1.6% -2.66Mi TOTAL
```
We get a 16% reduction in the `.data.rel.ro` section, and nearly 30%
reduction in `.rela.dyn` where those relocations are stored.
This is also visible in my benchmarking of binary start-up overhead at
least:
```
Benchmark 1: ./old_clang --version
Time (mean ± σ): 17.6 ms ± 1.5 ms [User: 4.1 ms, System: 13.3 ms]
Range (min … max): 14.2 ms … 22.8 ms 162 runs
Benchmark 2: ./new_clang --version
Time (mean ± σ): 15.5 ms ± 1.4 ms [User: 3.6 ms, System: 11.8 ms]
Range (min … max): 12.4 ms … 20.3 ms 216 runs
Summary
'./new_clang --version' ran
1.13 ± 0.14 times faster than './old_clang --version'
```
We get about 2ms faster `--version` runs. While there is a lot of noise
in binary execution time, this delta is pretty consistent, and
represents over 10% improvement. This is particularly interesting to me
because for very short source files, repeatedly starting the `clang`
binary is actually the dominant cost. For example, `configure` scripts
running against the `clang` compiler are slow in large part because of
binary start up time, not the time to process the actual inputs to the
compiler.
----
This PR implements the string tables using `constexpr` code and the
existing macro system. I understand that the builtins are moving towards
a TableGen model, and if complete that would provide more options for
modeling this. Unfortunately, that migration isn't complete, and even
the parts that are migrated still rely on the ability to break out of
the TableGen model and directly expand an X-macro style `BUILTIN(...)`
textually. I looked at trying to complete the move to TableGen, but it
would require both the difficult migration of the remaining targets and
solutions to some tricky problems with how to move away from any
macro-based expansion.
I was also able to find a reasonably clean and effective way of doing
this with the existing macros and some `constexpr` code that I think is
clean enough to be a pretty good intermediate state, and maybe give a
good target for the eventual TableGen solution. I was also able to
factor the macros into a set of consistent patterns that avoids a
significant regression in overall boilerplate.
268 lines
8.2 KiB
C++
268 lines
8.2 KiB
C++
//===--- SystemZ.h - Declare SystemZ target feature support -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares SystemZ TargetInfo objects.
//
//===----------------------------------------------------------------------===//
|
|
#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_SYSTEMZ_H
|
|
#define LLVM_CLANG_LIB_BASIC_TARGETS_SYSTEMZ_H
|
|
|
|
#include "clang/Basic/TargetInfo.h"
|
|
#include "clang/Basic/TargetOptions.h"
|
|
#include "llvm/Support/Compiler.h"
|
|
#include "llvm/TargetParser/Triple.h"
|
|
|
|
namespace clang {
|
|
namespace targets {
|
|
|
|
/// Address-space map installed by SystemZTargetInfo for 64-bit z/OS triples.
///
/// There is one entry per language address space, in the order named by the
/// trailing comments. Every entry maps to target address space 0 except
/// ptr32_uptr, which maps to 1 (the value SystemZTargetInfo names `ptr32`).
// NOTE(review): presumably the entry count and order must stay in sync with
// clang's LangAS enumeration -- confirm when address spaces are added.
static const unsigned ZOSAddressMap[] = {
    0, // Default
    0, // opencl_global
    0, // opencl_local
    0, // opencl_constant
    0, // opencl_private
    0, // opencl_generic
    0, // opencl_global_device
    0, // opencl_global_host
    0, // cuda_device
    0, // cuda_constant
    0, // cuda_shared
    0, // sycl_global
    0, // sycl_global_device
    0, // sycl_global_host
    0, // sycl_local
    0, // sycl_private
    0, // ptr32_sptr
    1, // ptr32_uptr
    0, // ptr64
    0, // hlsl_groupshared
    0  // wasm_funcref
};
|
|
|
|
/// TargetInfo implementation for SystemZ. The constructor configures type
/// widths/alignments and the data layout, with separate settings for z/OS
/// and for every other OS.
class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {

  static const char *const GCCRegNames[];
  /// ISA revision of the selected CPU as reported by getISARevision();
  /// -1 is treated as "unknown CPU" (see isValidCPUName / setCPU).
  int ISARevision;
  // Feature flags; all four are recomputed by handleTargetFeatures().
  bool HasTransactionalExecution;
  bool HasVector;
  bool SoftFloat;
  bool UnalignedSymbols;
  /// Target address space number that getPointerWidthV() reports as 32 bits
  /// wide on 64-bit z/OS.
  enum AddrSpace { ptr32 = 1 };

public:
  /// Sets up the target defaults. The CPU initially defaults to "z10" and all
  /// feature flags start out cleared.
  SystemZTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
      : TargetInfo(Triple), ISARevision(getISARevision("z10")),
        HasTransactionalExecution(false), HasVector(false), SoftFloat(false),
        UnalignedSymbols(false) {
    IntMaxType = SignedLong;
    Int64Type = SignedLong;
    IntWidth = IntAlign = 32;
    LongWidth = LongLongWidth = LongAlign = LongLongAlign = 64;
    Int128Align = 64;
    PointerWidth = PointerAlign = 64;
    LongDoubleWidth = 128;
    LongDoubleAlign = 64;
    LongDoubleFormat = &llvm::APFloat::IEEEquad();
    DefaultAlignForAttributeAligned = 64;
    MinGlobalAlign = 16;
    HasUnalignedAccess = true;
    if (Triple.isOSzOS()) {
      // Only 64-bit z/OS gets the custom address-space map.
      if (Triple.isArch64Bit()) {
        AddrSpaceMap = &ZOSAddressMap;
      }
      TLSSupported = false;
      // All vector types are default aligned on an 8-byte boundary, even if the
      // vector facility is not available. That is different from Linux.
      MaxVectorAlign = 64;
      // Compared to Linux/ELF, the data layout differs only in some details:
      // - name mangling is GOFF.
      // - 32 bit pointers, either as default or special address space
      resetDataLayout("E-m:l-p1:32:32-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-"
                      "a:8:16-n32:64");
    } else {
      TLSSupported = true;
      resetDataLayout("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64"
                      "-v128:64-a:8:16-n32:64");
    }
    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
    HasStrictFP = true;
  }

  unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override;

  void getTargetDefines(const LangOptions &Opts,
                        MacroBuilder &Builder) const override;

  std::pair<const char *, ArrayRef<Builtin::Info>>
  getTargetBuiltinStorage() const override;

  ArrayRef<const char *> getGCCRegNames() const override;

  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
    // No aliases.
    return {};
  }

  ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;

  /// Only "r15" is recognised as the stack pointer register name.
  bool isSPRegName(StringRef RegName) const override {
    return RegName == "r15";
  }

  bool validateAsmConstraint(const char *&Name,
                             TargetInfo::ConstraintInfo &info) const override;

  /// Converts an inline-asm constraint to the string handed on to LLVM.
  ///
  /// 'p' is kept as is. The two-letter constraints ZQ, ZR, ZS and ZT are
  /// returned prefixed with "^", and \p Constraint is advanced by one
  /// character. Anything else is delegated to TargetInfo::convertConstraint.
  std::string convertConstraint(const char *&Constraint) const override {
    switch (Constraint[0]) {
    case 'p': // Keep 'p' constraint.
      return std::string("p");
    case 'Z':
      switch (Constraint[1]) {
      case 'Q': // Address with base and unsigned 12-bit displacement
      case 'R': // Likewise, plus an index
      case 'S': // Address with base and signed 20-bit displacement
      case 'T': // Likewise, plus an index
        // "^" hints llvm that this is a 2 letter constraint.
        // "Constraint++" is used to promote the string iterator
        // to the next constraint.
        return std::string("^") + std::string(Constraint++, 2);
      default:
        break;
      }
      break;
    default:
      break;
    }
    return TargetInfo::convertConstraint(Constraint);
  }

  std::string_view getClobbers() const override {
    // FIXME: Is this really right?
    return "";
  }

  BuiltinVaListKind getBuiltinVaListKind() const override {
    return TargetInfo::SystemZBuiltinVaList;
  }

  int getISARevision(StringRef Name) const;

  /// A CPU name is valid when getISARevision() does not return -1 for it.
  bool isValidCPUName(StringRef Name) const override {
    return getISARevision(Name) != -1;
  }

  void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;

  /// Tuning accepts exactly the same CPU names as -mcpu style selection.
  bool isValidTuneCPUName(StringRef Name) const override {
    return isValidCPUName(Name);
  }

  void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values) const override {
    fillValidCPUList(Values);
  }

  /// Records the ISA revision for \p Name. Returns false when the name is
  /// unknown; ISARevision is left at -1 in that case.
  bool setCPU(const std::string &Name) override {
    ISARevision = getISARevision(Name);
    return ISARevision != -1;
  }

  /// Enables the features implied by the ISA revision of \p CPU, then lets
  /// the base class process \p FeaturesVec.
  bool
  initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
                 StringRef CPU,
                 const std::vector<std::string> &FeaturesVec) const override {
    // Deliberately a distinct name: this is the revision of the CPU passed
    // in, not the ISARevision data member.
    const int CPURevision = getISARevision(CPU);
    if (CPURevision >= 10)
      Features["transactional-execution"] = true;
    if (CPURevision >= 11)
      Features["vector"] = true;
    if (CPURevision >= 12)
      Features["vector-enhancements-1"] = true;
    if (CPURevision >= 13)
      Features["vector-enhancements-2"] = true;
    if (CPURevision >= 14)
      Features["nnp-assist"] = true;
    return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
  }

  /// Recomputes the feature flags from \p Features. Only "+feature" entries
  /// for the four tracked features are inspected. Always returns true.
  bool handleTargetFeatures(std::vector<std::string> &Features,
                            DiagnosticsEngine &Diags) override {
    HasTransactionalExecution = false;
    HasVector = false;
    SoftFloat = false;
    UnalignedSymbols = false;
    for (const auto &Feature : Features) {
      if (Feature == "+transactional-execution")
        HasTransactionalExecution = true;
      else if (Feature == "+vector")
        HasVector = true;
      else if (Feature == "+soft-float")
        SoftFloat = true;
      else if (Feature == "+unaligned-symbols")
        UnalignedSymbols = true;
    }
    // Soft-float overrides the vector feature.
    HasVector &= !SoftFloat;

    // If we use the vector ABI, vector types are 64-bit aligned. The
    // DataLayout string is always set to this alignment as it is not a
    // requirement that it follows the alignment emitted by the front end. It
    // is assumed generally that the Datalayout should reflect only the
    // target triple and not any specific feature.
    if (HasVector && !getTriple().isOSzOS())
      MaxVectorAlign = 64;

    return true;
  }

  bool hasFeature(StringRef Feature) const override;

  /// C, Swift and OpenCL kernel conventions are accepted, SwiftAsync is an
  /// error, and every other convention produces a warning.
  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
    switch (CC) {
    case CC_C:
    case CC_Swift:
    case CC_OpenCLKernel:
      return CCCR_OK;
    case CC_SwiftAsync:
      return CCCR_Error;
    default:
      return CCCR_Warning;
    }
  }

  /// Returns "vector" when the vector feature is active, otherwise "".
  StringRef getABI() const override {
    if (HasVector)
      return "vector";
    return "";
  }

  const char *getLongDoubleMangling() const override { return "g"; }

  bool hasBitIntType() const override { return true; }

  /// Maps EH data register indices 0..3 to 6..9; any other index yields -1.
  int getEHDataRegisterNumber(unsigned RegNo) const override {
    return RegNo < 4 ? 6 + RegNo : -1;
  }

  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
    return std::make_pair(256, 256);
  }

  /// Pointers in the ptr32 target address space are 32 bits wide on 64-bit
  /// z/OS; every other case uses PointerWidth.
  uint64_t getPointerWidthV(LangAS AddrSpace) const override {
    return (getTriple().isOSzOS() && getTriple().isArch64Bit() &&
            getTargetAddressSpace(AddrSpace) == ptr32)
               ? 32
               : PointerWidth;
  }

  /// Pointer alignment always equals the pointer width for that address
  /// space.
  uint64_t getPointerAlignV(LangAS AddrSpace) const override {
    return getPointerWidthV(AddrSpace);
  }
};
|
|
} // namespace targets
|
|
} // namespace clang
|
|
#endif // LLVM_CLANG_LIB_BASIC_TARGETS_SYSTEMZ_H
|