This pull request is the second part of an ongoing effort to extend PGO instrumentation to GPU device code and depends on #76587. This PR makes the following changes: - Introduces `__llvm_write_custom_profile` to the PGO compiler-rt library. This is an external function that can be used to write profiles with custom data to target-specific files. - Adds `__llvm_write_custom_profile` as a weak symbol to libomptarget so that it can write the collected data to a profraw file. - Adds a `PGODump` debug flag and only displays the dump when that flag is set.
This commit is contained in:
committed by
GitHub
parent
84e3c6ff95
commit
9e5c136d5a
@@ -304,6 +304,17 @@ int __llvm_profile_get_padding_sizes_for_counters(
|
|||||||
*/
|
*/
|
||||||
void __llvm_profile_set_dumped(void);
|
void __llvm_profile_set_dumped(void);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Write custom target-specific profiling data to a separate file.
|
||||||
|
* Used by offload PGO.
|
||||||
|
*/
|
||||||
|
int __llvm_write_custom_profile(const char *Target,
|
||||||
|
const __llvm_profile_data *DataBegin,
|
||||||
|
const __llvm_profile_data *DataEnd,
|
||||||
|
const char *CountersBegin,
|
||||||
|
const char *CountersEnd, const char *NamesBegin,
|
||||||
|
const char *NamesEnd);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* This variable is defined in InstrProfilingRuntime.cpp as a hidden
|
* This variable is defined in InstrProfilingRuntime.cpp as a hidden
|
||||||
* symbol. Its main purpose is to enable profile runtime user to
|
* symbol. Its main purpose is to enable profile runtime user to
|
||||||
|
|||||||
@@ -541,6 +541,17 @@ static FILE *getFileObject(const char *OutputName) {
|
|||||||
return fopen(OutputName, "ab");
|
return fopen(OutputName, "ab");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Finish using \c OutputFile after a profile write.
 *
 * A stream that is not the runtime's registered profile file (as reported by
 * getProfileFile()) is simply closed. The registered profile file is only
 * flushed and stays open; when merging is active and continuous mode is off,
 * its file-handle lock is released as well (presumably the lock taken when the
 * file was opened for merging -- confirm against the open path).
 */
static void closeFileObject(FILE *OutputFile) {
  /* Any stream other than the registered profile file is ours to close. */
  if (OutputFile != getProfileFile()) {
    fclose(OutputFile);
    return;
  }

  /* Registered profile file: push buffered data out but keep it open. */
  fflush(OutputFile);

  if (!doMerging())
    return;
  if (__llvm_profile_is_continuous_mode_enabled())
    return;

  lprofUnlockFileHandle(OutputFile);
}
|
||||||
|
|
||||||
/* Write profile data to file \c OutputName. */
|
/* Write profile data to file \c OutputName. */
|
||||||
static int writeFile(const char *OutputName) {
|
static int writeFile(const char *OutputName) {
|
||||||
int RetVal;
|
int RetVal;
|
||||||
@@ -562,15 +573,7 @@ static int writeFile(const char *OutputName) {
|
|||||||
initFileWriter(&fileWriter, OutputFile);
|
initFileWriter(&fileWriter, OutputFile);
|
||||||
RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone);
|
RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone);
|
||||||
|
|
||||||
if (OutputFile == getProfileFile()) {
|
closeFileObject(OutputFile);
|
||||||
fflush(OutputFile);
|
|
||||||
if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) {
|
|
||||||
lprofUnlockFileHandle(OutputFile);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fclose(OutputFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
return RetVal;
|
return RetVal;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1359,4 +1362,107 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int __llvm_write_custom_profile(const char *Target,
|
||||||
|
const __llvm_profile_data *DataBegin,
|
||||||
|
const __llvm_profile_data *DataEnd,
|
||||||
|
const char *CountersBegin,
|
||||||
|
const char *CountersEnd, const char *NamesBegin,
|
||||||
|
const char *NamesEnd) {
|
||||||
|
int ReturnValue = 0, FilenameLength, TargetLength;
|
||||||
|
char *FilenameBuf, *TargetFilename;
|
||||||
|
const char *Filename;
|
||||||
|
|
||||||
|
/* Save old profile data */
|
||||||
|
FILE *oldFile = getProfileFile();
|
||||||
|
|
||||||
|
// Temporarily suspend getting SIGKILL when the parent exits.
|
||||||
|
int PDeathSig = lprofSuspendSigKill();
|
||||||
|
|
||||||
|
if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
|
||||||
|
PROF_NOTE("Profile data not written to file: %s.\n", "already written");
|
||||||
|
if (PDeathSig == 1)
|
||||||
|
lprofRestoreSigKill();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check if there is llvm/runtime version mismatch. */
|
||||||
|
if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
|
||||||
|
PROF_ERR("Runtime and instrumentation version mismatch : "
|
||||||
|
"expected %d, but get %d\n",
|
||||||
|
INSTR_PROF_RAW_VERSION,
|
||||||
|
(int)GET_VERSION(__llvm_profile_get_version()));
|
||||||
|
if (PDeathSig == 1)
|
||||||
|
lprofRestoreSigKill();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get current filename */
|
||||||
|
FilenameLength = getCurFilenameLength();
|
||||||
|
FilenameBuf = (char *)COMPILER_RT_ALLOCA(FilenameLength + 1);
|
||||||
|
Filename = getCurFilename(FilenameBuf, 0);
|
||||||
|
|
||||||
|
/* Check the filename. */
|
||||||
|
if (!Filename) {
|
||||||
|
PROF_ERR("Failed to write file : %s\n", "Filename not set");
|
||||||
|
if (PDeathSig == 1)
|
||||||
|
lprofRestoreSigKill();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Allocate new space for our target-specific PGO filename */
|
||||||
|
TargetLength = strlen(Target);
|
||||||
|
TargetFilename =
|
||||||
|
(char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2);
|
||||||
|
|
||||||
|
/* Find file basename and path sizes */
|
||||||
|
int32_t DirEnd = FilenameLength - 1;
|
||||||
|
while (DirEnd >= 0 && !IS_DIR_SEPARATOR(Filename[DirEnd])) {
|
||||||
|
DirEnd--;
|
||||||
|
}
|
||||||
|
uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize;
|
||||||
|
|
||||||
|
/* Prepend "TARGET." to current filename */
|
||||||
|
if (DirSize > 0) {
|
||||||
|
memcpy(TargetFilename, Filename, DirSize);
|
||||||
|
}
|
||||||
|
memcpy(TargetFilename + DirSize, Target, TargetLength);
|
||||||
|
TargetFilename[TargetLength + DirSize] = '.';
|
||||||
|
memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize,
|
||||||
|
BaseSize);
|
||||||
|
TargetFilename[FilenameLength + 1 + TargetLength] = 0;
|
||||||
|
|
||||||
|
/* Open and truncate target-specific PGO file */
|
||||||
|
FILE *OutputFile = fopen(TargetFilename, "w");
|
||||||
|
setProfileFile(OutputFile);
|
||||||
|
|
||||||
|
if (!OutputFile) {
|
||||||
|
PROF_ERR("Failed to open file : %s\n", TargetFilename);
|
||||||
|
if (PDeathSig == 1)
|
||||||
|
lprofRestoreSigKill();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
FreeHook = &free;
|
||||||
|
setupIOBuffer();
|
||||||
|
|
||||||
|
/* Write custom data */
|
||||||
|
ProfDataWriter fileWriter;
|
||||||
|
initFileWriter(&fileWriter, OutputFile);
|
||||||
|
|
||||||
|
/* Write custom data to the file */
|
||||||
|
ReturnValue = lprofWriteDataImpl(
|
||||||
|
&fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
|
||||||
|
lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0);
|
||||||
|
closeFileObject(OutputFile);
|
||||||
|
|
||||||
|
// Restore SIGKILL.
|
||||||
|
if (PDeathSig == 1)
|
||||||
|
lprofRestoreSigKill();
|
||||||
|
|
||||||
|
/* Restore old profiling file */
|
||||||
|
setProfileFile(oldFile);
|
||||||
|
|
||||||
|
return ReturnValue;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t {
|
|||||||
FunctionTracing = 1U << 1,
|
FunctionTracing = 1U << 1,
|
||||||
CommonIssues = 1U << 2,
|
CommonIssues = 1U << 2,
|
||||||
AllocationTracker = 1U << 3,
|
AllocationTracker = 1U << 3,
|
||||||
|
PGODump = 1U << 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct DeviceEnvironmentTy {
|
struct DeviceEnvironmentTy {
|
||||||
|
|||||||
@@ -63,14 +63,22 @@ struct __llvm_profile_data {
|
|||||||
#include "llvm/ProfileData/InstrProfData.inc"
|
#include "llvm/ProfileData/InstrProfData.inc"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
extern int __attribute__((weak)) __llvm_write_custom_profile(
|
||||||
|
const char *Target, const __llvm_profile_data *DataBegin,
|
||||||
|
const __llvm_profile_data *DataEnd, const char *CountersBegin,
|
||||||
|
const char *CountersEnd, const char *NamesBegin, const char *NamesEnd);
|
||||||
|
}
|
||||||
|
|
||||||
/// PGO profiling data extracted from a GPU device
|
/// PGO profiling data extracted from a GPU device
|
||||||
struct GPUProfGlobals {
|
struct GPUProfGlobals {
|
||||||
SmallVector<uint8_t> NamesData;
|
SmallVector<int64_t> Counts;
|
||||||
SmallVector<SmallVector<int64_t>> Counts;
|
|
||||||
SmallVector<__llvm_profile_data> Data;
|
SmallVector<__llvm_profile_data> Data;
|
||||||
|
SmallVector<uint8_t> NamesData;
|
||||||
Triple TargetTriple;
|
Triple TargetTriple;
|
||||||
|
|
||||||
void dump() const;
|
void dump() const;
|
||||||
|
Error write() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Subclass of GlobalTy that holds the memory for a global of \p Ty.
|
/// Subclass of GlobalTy that holds the memory for a global of \p Ty.
|
||||||
|
|||||||
@@ -206,7 +206,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
|
|||||||
GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
|
GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
|
||||||
if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
|
if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
|
||||||
return Err;
|
return Err;
|
||||||
DeviceProfileData.Counts.push_back(std::move(Counts));
|
DeviceProfileData.Counts.append(std::move(Counts));
|
||||||
} else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
|
} else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
|
||||||
// Read profiling data for this global variable
|
// Read profiling data for this global variable
|
||||||
__llvm_profile_data Data{};
|
__llvm_profile_data Data{};
|
||||||
@@ -224,15 +224,14 @@ void GPUProfGlobals::dump() const {
|
|||||||
<< "\n";
|
<< "\n";
|
||||||
|
|
||||||
outs() << "======== Counters =========\n";
|
outs() << "======== Counters =========\n";
|
||||||
for (const auto &Count : Counts) {
|
for (size_t i = 0; i < Counts.size(); i++) {
|
||||||
outs() << "[";
|
if (i > 0 && i % 10 == 0)
|
||||||
for (size_t i = 0; i < Count.size(); i++) {
|
outs() << "\n";
|
||||||
if (i == 0)
|
else if (i != 0)
|
||||||
outs() << " ";
|
outs() << " ";
|
||||||
outs() << Count[i] << " ";
|
outs() << Counts[i];
|
||||||
}
|
|
||||||
outs() << "]\n";
|
|
||||||
}
|
}
|
||||||
|
outs() << "\n";
|
||||||
|
|
||||||
outs() << "========== Data ===========\n";
|
outs() << "========== Data ===========\n";
|
||||||
for (const auto &ProfData : Data) {
|
for (const auto &ProfData : Data) {
|
||||||
@@ -264,3 +263,43 @@ void GPUProfGlobals::dump() const {
|
|||||||
Symtab.dumpNames(outs());
|
Symtab.dumpNames(outs());
|
||||||
outs() << "===========================\n";
|
outs() << "===========================\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Hand the collected device profile to the compiler-rt profiling runtime.
///
/// Counters, data records and names are copied into one buffer laid out as
/// [counters | data | names], and the section boundaries are passed to
/// __llvm_write_custom_profile together with the target triple string.
/// Returns an error if that (weak) symbol is not linked in or if it reports a
/// non-zero status.
Error GPUProfGlobals::write() const {
  if (!__llvm_write_custom_profile)
    return Plugin::error("Could not find symbol __llvm_write_custom_profile. "
                         "The compiler-rt profiling library must be linked for "
                         "GPU PGO to work.");

  const size_t CountsBytes = Counts.size() * sizeof(int64_t);
  const size_t DataBytes = Data.size() * sizeof(__llvm_profile_data);
  const size_t NamesBytes = NamesData.size();

  // Keep every section in a single allocation so that the PGO library can
  // compute deltas between the sections properly.
  SmallVector<uint8_t> Buffer(NamesBytes + DataBytes + CountsBytes);

  // Section start addresses inside the buffer.
  uint8_t *CountersRegion = Buffer.data();
  uint8_t *DataRegion = CountersRegion + CountsBytes;
  uint8_t *NamesRegion = DataRegion + DataBytes;

  // Fill the three sections.
  memcpy(CountersRegion, Counts.data(), CountsBytes);
  memcpy(DataRegion, Data.data(), DataBytes);
  memcpy(NamesRegion, NamesData.data(), NamesBytes);

  // Invoke the compiler-rt entry point with [begin, end) for each section.
  int Status = __llvm_write_custom_profile(
      TargetTriple.str().c_str(),
      reinterpret_cast<__llvm_profile_data *>(DataRegion),
      reinterpret_cast<__llvm_profile_data *>(DataRegion + DataBytes),
      reinterpret_cast<char *>(CountersRegion),
      reinterpret_cast<char *>(CountersRegion + CountsBytes),
      reinterpret_cast<char *>(NamesRegion),
      reinterpret_cast<char *>(NamesRegion + NamesBytes));

  if (Status != 0)
    return Plugin::error("Error writing GPU PGO data to file");

  return Plugin::success();
}
|
||||||
|
|||||||
@@ -861,8 +861,14 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
|
|||||||
if (!ProfOrErr)
|
if (!ProfOrErr)
|
||||||
return ProfOrErr.takeError();
|
return ProfOrErr.takeError();
|
||||||
|
|
||||||
// TODO: write data to profiling file
|
// Dump out profdata
|
||||||
ProfOrErr->dump();
|
if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
|
||||||
|
uint32_t(DeviceDebugKind::PGODump))
|
||||||
|
ProfOrErr->dump();
|
||||||
|
|
||||||
|
// Write data to profiling file
|
||||||
|
if (auto Err = ProfOrErr->write())
|
||||||
|
return Err;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Delete the memory manager before deinitializing the device. Otherwise,
|
// Delete the memory manager before deinitializing the device. Otherwise,
|
||||||
|
|||||||
@@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target)
|
|||||||
if config.libomptarget_has_libc:
|
if config.libomptarget_has_libc:
|
||||||
config.available_features.add('libc')
|
config.available_features.add('libc')
|
||||||
|
|
||||||
|
profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata")
|
||||||
if config.libomptarget_test_pgo:
|
if config.libomptarget_test_pgo:
|
||||||
config.available_features.add('pgo')
|
config.available_features.add('pgo')
|
||||||
|
config.substitutions.append(("%profdata", profdata_path))
|
||||||
|
|
||||||
# Determine whether the test system supports unified memory.
|
# Determine whether the test system supports unified memory.
|
||||||
# For CUDA, this is the case with compute capability 70 (Volta) or higher.
|
# For CUDA, this is the case with compute capability 70 (Volta) or higher.
|
||||||
@@ -407,6 +409,8 @@ if config.test_fortran_compiler:
|
|||||||
config.available_features.add('flang')
|
config.available_features.add('flang')
|
||||||
config.substitutions.append(("%flang", config.test_fortran_compiler))
|
config.substitutions.append(("%flang", config.test_fortran_compiler))
|
||||||
|
|
||||||
|
config.substitutions.append(("%target_triple", config.libomptarget_current_target))
|
||||||
|
|
||||||
config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
|
config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
|
||||||
if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path:
|
if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path:
|
||||||
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
|
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
@AUTO_GEN_COMMENT@
|
@AUTO_GEN_COMMENT@
|
||||||
|
|
||||||
config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
|
config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@"
|
||||||
config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
|
config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
|
||||||
config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
|
config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
|
||||||
config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@"
|
config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@"
|
||||||
|
|||||||
@@ -1,12 +1,17 @@
|
|||||||
// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
|
|
||||||
// RUN: -Xclang "-fprofile-instrument=clang"
|
|
||||||
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
|
|
||||||
// RUN: --check-prefix="CLANG-PGO"
|
|
||||||
// RUN: %libomptarget-compile-generic -fprofile-generate \
|
// RUN: %libomptarget-compile-generic -fprofile-generate \
|
||||||
// RUN: -Xclang "-fprofile-instrument=llvm"
|
// RUN: -Xclang "-fprofile-instrument=llvm"
|
||||||
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \
|
// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1
|
||||||
|
// RUN: %profdata show --all-functions --counts \
|
||||||
|
// RUN: %target_triple.llvm.profraw | %fcheck-generic \
|
||||||
// RUN: --check-prefix="LLVM-PGO"
|
// RUN: --check-prefix="LLVM-PGO"
|
||||||
|
|
||||||
|
// RUN: %libomptarget-compile-generic -fprofile-instr-generate \
|
||||||
|
// RUN: -Xclang "-fprofile-instrument=clang"
|
||||||
|
// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1
|
||||||
|
// RUN: %profdata show --all-functions --counts \
|
||||||
|
// RUN: %target_triple.clang.profraw | %fcheck-generic \
|
||||||
|
// RUN: --check-prefix="CLANG-PGO"
|
||||||
|
|
||||||
// REQUIRES: gpu
|
// REQUIRES: gpu
|
||||||
// REQUIRES: pgo
|
// REQUIRES: pgo
|
||||||
|
|
||||||
|
|||||||
@@ -1522,3 +1522,4 @@ debugging features are supported.
|
|||||||
* Enable debugging assertions in the device. ``0x01``
|
* Enable debugging assertions in the device. ``0x01``
|
||||||
* Enable diagnosing common problems during offloading . ``0x4``
|
* Enable diagnosing common problems during offloading . ``0x4``
|
||||||
* Enable device malloc statistics (amdgpu only). ``0x8``
|
* Enable device malloc statistics (amdgpu only). ``0x8``
|
||||||
|
* Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``
|
||||||
|
|||||||
Reference in New Issue
Block a user