Revert "[LLVM] Make the GPU loader utilities an LLVM tool (#132096)"
This reverts commit 221b0117fd.
Reverting due to build failures that require linking TargetParser, plus some
warnings that still need to be cleaned up.
libc/utils/gpu/CMakeLists.txt (new file, 1 line)
@@ -0,0 +1 @@
add_subdirectory(loader)
libc/utils/gpu/loader/CMakeLists.txt (new file, 54 lines)
@@ -0,0 +1,54 @@
add_library(gpu_loader OBJECT Main.cpp)

include(FindLibcCommonUtils)
target_link_libraries(gpu_loader PUBLIC llvm-libc-common-utilities)

target_include_directories(gpu_loader PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${LIBC_SOURCE_DIR}/include
  ${LIBC_SOURCE_DIR}
  ${LLVM_MAIN_INCLUDE_DIR}
  ${LLVM_BINARY_DIR}/include
)
if(NOT LLVM_ENABLE_RTTI)
  target_compile_options(gpu_loader PUBLIC -fno-rtti)
endif()

find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
if(hsa-runtime64_FOUND)
  add_subdirectory(amdgpu)
endif()

# The CUDA loader requires LLVM to traverse the ELF image for symbols.
find_package(CUDAToolkit 11.2 QUIET)
if(CUDAToolkit_FOUND)
  add_subdirectory(nvptx)
endif()

if(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
  add_custom_target(libc.utils.gpu.loader)
  add_dependencies(libc.utils.gpu.loader amdhsa-loader)
  set_target_properties(
    libc.utils.gpu.loader
    PROPERTIES
      TARGET amdhsa-loader
      EXECUTABLE "$<TARGET_FILE:amdhsa-loader>"
  )
elseif(TARGET nvptx-loader AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
  add_custom_target(libc.utils.gpu.loader)
  add_dependencies(libc.utils.gpu.loader nvptx-loader)
  set_target_properties(
    libc.utils.gpu.loader
    PROPERTIES
      TARGET nvptx-loader
      EXECUTABLE "$<TARGET_FILE:nvptx-loader>"
  )
endif()
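# Note: TARGET and EXECUTABLE above are plain custom properties rather than
# built-in CMake ones; the libc test machinery reads them to locate the
# launcher. A consumer might query them roughly like this (hypothetical
# variable names, shown only as a sketch):
#   get_target_property(gpu_loader_exe libc.utils.gpu.loader EXECUTABLE)
#   add_custom_command(... COMMAND ${gpu_loader_exe} $<TARGET_FILE:gpu_test> ...)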

foreach(gpu_loader_tgt amdhsa-loader nvptx-loader)
  if(TARGET ${gpu_loader_tgt})
    install(TARGETS ${gpu_loader_tgt}
            DESTINATION ${CMAKE_INSTALL_BINDIR}
            COMPONENT libc)
  endif()
endforeach()
libc/utils/gpu/loader/Loader.h (new file, 198 lines)
@@ -0,0 +1,198 @@
//===-- Generic device loader interface -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H

#include "include/llvm-libc-types/test_rpc_opcodes_t.h"

#include "shared/rpc.h"
#include "shared/rpc_opcodes.h"
#include "shared/rpc_server.h"

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

/// Generic launch parameters for configuring the number of blocks / threads.
struct LaunchParameters {
  uint32_t num_threads_x;
  uint32_t num_threads_y;
  uint32_t num_threads_z;
  uint32_t num_blocks_x;
  uint32_t num_blocks_y;
  uint32_t num_blocks_z;
};

/// The arguments to the '_begin' kernel.
struct begin_args_t {
  int argc;
  void *argv;
  void *envp;
};

/// The arguments to the '_start' kernel.
struct start_args_t {
  int argc;
  void *argv;
  void *envp;
  void *ret;
};

/// The arguments to the '_end' kernel.
struct end_args_t {
  int argc;
};

/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage);

/// Return \p val aligned "upwards" according to \p align.
template <typename V, typename A> inline V align_up(V val, A align) {
  return ((val + V(align) - 1) / V(align)) * V(align);
}
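// For example, align_up(13, 8) rounds up to 16, while align_up(16, 8) is
// already aligned and stays 16.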

/// Copy the system's argument vector to GPU memory allocated using \p alloc.
template <typename Allocator>
void *copy_argument_vector(int argc, const char **argv, Allocator alloc) {
  size_t argv_size = sizeof(char *) * (argc + 1);
  size_t str_size = 0;
  for (int i = 0; i < argc; ++i)
    str_size += strlen(argv[i]) + 1;

  // We allocate enough space for a null terminated array and all the strings.
  void *dev_argv = alloc(argv_size + str_size);
  if (!dev_argv)
    return nullptr;

  // Store the strings linearly in the same memory buffer.
  void *dev_str = reinterpret_cast<uint8_t *>(dev_argv) + argv_size;
  for (int i = 0; i < argc; ++i) {
    size_t size = strlen(argv[i]) + 1;
    std::memcpy(dev_str, argv[i], size);
    static_cast<void **>(dev_argv)[i] = dev_str;
    dev_str = reinterpret_cast<uint8_t *>(dev_str) + size;
  }

  // Ensure the vector is null terminated.
  reinterpret_cast<void **>(dev_argv)[argc] = nullptr;
  return dev_argv;
}
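// Resulting single-buffer layout, pointers first with the string data packed
// directly behind them:
//   [argv[0] ptr, ..., argv[argc - 1] ptr, nullptr, "arg0\0", "arg1\0", ...]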

/// Copy the system's environment to GPU memory allocated using \p alloc.
template <typename Allocator>
void *copy_environment(const char **envp, Allocator alloc) {
  int envc = 0;
  for (const char **env = envp; *env != 0; ++env)
    ++envc;

  return copy_argument_vector(envc, envp, alloc);
}

inline void handle_error_impl(const char *file, int32_t line, const char *msg) {
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg);
  exit(EXIT_FAILURE);
}
#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X)
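// For example, handle_error("Failed to allocate") prints something like
// "Main.cpp:42:0: Error: Failed to allocate" and exits. The per-target loaders
// below add handle_error_impl overloads for hsa_status_t and CUresult, so the
// same macro reports runtime error codes as well.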

template <uint32_t num_lanes, typename Alloc, typename Free>
inline uint32_t handle_server(rpc::Server &server, uint32_t index,
                              Alloc &&alloc, Free &&free) {
  auto port = server.try_open(num_lanes, index);
  if (!port)
    return 0;
  index = port->get_index() + 1;
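  // Returning the next index lets callers resume scanning past this port, so
  // repeated polling services the ports in a round-robin fashion.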

  int status = rpc::RPC_SUCCESS;
  switch (port->get_opcode()) {
  case RPC_TEST_INCREMENT: {
    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
      reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
    });
    break;
  }
  case RPC_TEST_INTERFACE: {
    bool end_with_recv;
    uint64_t cnt;
    port->recv([&](rpc::Buffer *buffer, uint32_t) {
      end_with_recv = buffer->data[0];
    });
    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
    port->send([&](rpc::Buffer *buffer, uint32_t) {
      buffer->data[0] = cnt = cnt + 1;
    });
    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
    port->send([&](rpc::Buffer *buffer, uint32_t) {
      buffer->data[0] = cnt = cnt + 1;
    });
    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
    port->send([&](rpc::Buffer *buffer, uint32_t) {
      buffer->data[0] = cnt = cnt + 1;
    });
    port->send([&](rpc::Buffer *buffer, uint32_t) {
      buffer->data[0] = cnt = cnt + 1;
    });
    if (end_with_recv)
      port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
    else
      port->send([&](rpc::Buffer *buffer, uint32_t) {
        buffer->data[0] = cnt = cnt + 1;
      });

    break;
  }
  case RPC_TEST_STREAM: {
    uint64_t sizes[num_lanes] = {0};
    void *dst[num_lanes] = {nullptr};
    port->recv_n(dst, sizes,
                 [](uint64_t size) -> void * { return new char[size]; });
    port->send_n(dst, sizes);
    for (uint64_t i = 0; i < num_lanes; ++i) {
      if (dst[i])
        delete[] reinterpret_cast<uint8_t *>(dst[i]);
    }
    break;
  }
  case RPC_TEST_NOOP: {
    port->recv([&](rpc::Buffer *, uint32_t) {});
    break;
  }
  case LIBC_MALLOC: {
    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) {
      buffer->data[0] = reinterpret_cast<uintptr_t>(alloc(buffer->data[0]));
    });
    break;
  }
  case LIBC_FREE: {
    port->recv([&](rpc::Buffer *buffer, uint32_t) {
      free(reinterpret_cast<void *>(buffer->data[0]));
    });
    break;
  }
  // Handle all of the `libc` specific opcodes.
  default:
    status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*port, num_lanes);
    break;
  }

  if (status != rpc::RPC_SUCCESS)
    handle_error("Error handling RPC server");

  port->close();

  return index;
}

#endif
libc/utils/gpu/loader/Main.cpp (new file, 142 lines)
@@ -0,0 +1,142 @@
//===-- Main entry into the loader interface ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file opens a device image passed on the command line and passes it to
// one of the loader implementations for launch.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/WithColor.h"

#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <sys/file.h>

using namespace llvm;

static cl::OptionCategory loader_category("loader options");

static cl::opt<bool> help("h", cl::desc("Alias for -help"), cl::Hidden,
                          cl::cat(loader_category));

static cl::opt<unsigned>
    threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"),
              cl::init(1), cl::cat(loader_category));
static cl::opt<unsigned>
    threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"),
              cl::init(1), cl::cat(loader_category));
static cl::opt<unsigned>
    threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"),
              cl::init(1), cl::cat(loader_category));
static cl::alias threads("threads", cl::aliasopt(threads_x),
                         cl::desc("Alias for --threads-x"),
                         cl::cat(loader_category));

static cl::opt<unsigned>
    blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
             cl::init(1), cl::cat(loader_category));
static cl::opt<unsigned>
    blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
             cl::init(1), cl::cat(loader_category));
static cl::opt<unsigned>
    blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
             cl::init(1), cl::cat(loader_category));
static cl::alias blocks("blocks", cl::aliasopt(blocks_x),
                        cl::desc("Alias for --blocks-x"),
                        cl::cat(loader_category));

static cl::opt<bool>
    print_resource_usage("print-resource-usage",
                         cl::desc("Output resource usage of launched kernels"),
                         cl::init(false), cl::cat(loader_category));

static cl::opt<bool>
    no_parallelism("no-parallelism",
                   cl::desc("Allows only a single process to use the GPU at a "
                            "time. Useful to suppress out-of-resource errors"),
                   cl::init(false), cl::cat(loader_category));

static cl::opt<std::string> file(cl::Positional, cl::Required,
                                 cl::desc("<gpu executable>"),
                                 cl::cat(loader_category));
static cl::list<std::string> args(cl::ConsumeAfter,
                                  cl::desc("<program arguments>..."),
                                  cl::cat(loader_category));

[[noreturn]] void report_error(Error E) {
  outs().flush();
  logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader"));
  exit(EXIT_FAILURE);
}

std::string get_main_executable(const char *name) {
  void *ptr = (void *)(intptr_t)&get_main_executable;
  auto cow_path = sys::fs::getMainExecutable(name, ptr);
  return sys::path::parent_path(cow_path).str();
}
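// Note: despite its name, get_main_executable() returns the directory that
// contains the loader binary; main() below takes the --no-parallelism file
// lock on that path.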

int main(int argc, const char **argv, const char **envp) {
  sys::PrintStackTraceOnErrorSignal(argv[0]);
  cl::HideUnrelatedOptions(loader_category);
  cl::ParseCommandLineOptions(
      argc, argv,
      "A utility used to launch unit tests built for a GPU target. This is\n"
      "intended to provide an interface similar to cross-compiling emulators\n");

  if (help) {
    cl::PrintHelpMessage();
    return EXIT_SUCCESS;
  }

  ErrorOr<std::unique_ptr<MemoryBuffer>> image_or_err =
      MemoryBuffer::getFileOrSTDIN(file);
  if (std::error_code ec = image_or_err.getError())
    report_error(errorCodeToError(ec));
  MemoryBufferRef image = **image_or_err;

  SmallVector<const char *> new_argv = {file.c_str()};
  llvm::transform(args, std::back_inserter(new_argv),
                  [](const std::string &arg) { return arg.c_str(); });

  // Claim a file lock on the executable so only a single process can enter
  // this region if requested. This prevents spurious failures in the loader.
  int fd = -1;
  if (no_parallelism) {
    fd = open(get_main_executable(argv[0]).c_str(), O_RDONLY);
    if (flock(fd, LOCK_EX) == -1)
      report_error(createStringError("Failed to lock '%s': %s", argv[0],
                                     strerror(errno)));
  }
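  // Note: flock() locks are advisory and are released by the kernel if the
  // process exits early, so a crashed run cannot leave the GPU locked.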

  // Configure the kernel launch dimensions from the command-line options.
  LaunchParameters params{threads_x, threads_y, threads_z,
                          blocks_x,  blocks_y,  blocks_z};
  int ret = load(new_argv.size(), new_argv.data(), envp,
                 const_cast<char *>(image.getBufferStart()),
                 image.getBufferSize(), params, print_resource_usage);

  if (no_parallelism) {
    if (flock(fd, LOCK_UN) == -1)
      report_error(createStringError("Failed to unlock '%s': %s", argv[0],
                                     strerror(errno)));
  }

  return ret;
}
libc/utils/gpu/loader/amdgpu/CMakeLists.txt (new file, 10 lines)
@@ -0,0 +1,10 @@
set(LLVM_LINK_COMPONENTS
  BinaryFormat
  Object
  Option
  Support
  FrontendOffloading
)

add_llvm_executable(amdhsa-loader amdhsa-loader.cpp)
target_link_libraries(amdhsa-loader PRIVATE hsa-runtime64::hsa-runtime64 gpu_loader)
libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp (new file, 594 lines)
@@ -0,0 +1,594 @@
//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"

#include "llvm/Frontend/Offloading/Utility.h"

#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};
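// The Unused padding places grid_dims at byte offset 64 and pads the struct
// to 256 bytes, which should match the layout of the COV5 hidden-argument
// block so each field lands at the offset the compiler expects.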

/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Printing kernel resources on AMDGPU is not supported yet.\n");
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device. We currently only launch
  // with one thread on the device, forcing the rest of the wavefront to be
  // masked off.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API so we use the LLVM interface.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;
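  // The image may mix wave32 and wave64 kernels; size the RPC buffer for the
  // largest wavefront present (this assumes the image defines at least one
  // kernel).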

  // Set up the RPC server.
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
                     sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}
libc/utils/gpu/loader/nvptx/CMakeLists.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
set(LLVM_LINK_COMPONENTS
  BinaryFormat
  Object
  Option
  Support
)

add_llvm_executable(nvptx-loader nvptx-loader.cpp)
target_link_libraries(nvptx-loader PRIVATE gpu_loader CUDA::cuda_driver)
libc/utils/gpu/loader/nvptx/nvptx-loader.cpp (new file, 366 lines)
@@ -0,0 +1,366 @@
//===-- Loader Implementation for NVPTX devices ---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#include "cuda.h"

#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <vector>

using namespace llvm;
using namespace object;

static void handle_error_impl(const char *file, int32_t line, CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line);
  else
    fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str);
  exit(1);
}

// Gets the names of all the globals that contain functions to initialize or
// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.
template <typename Alloc>
Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
                                     Alloc allocator, CUmodule binary) {
  auto mem_buffer = MemoryBuffer::getMemBuffer(
      StringRef(reinterpret_cast<const char *>(image), size), "image",
      /*RequiresNullTerminator=*/false);
  Expected<ELF64LEObjectFile> elf_or_err =
      ELF64LEObjectFile::create(*mem_buffer);
  if (!elf_or_err)
    handle_error(toString(elf_or_err.takeError()).c_str());

  std::vector<std::pair<const char *, uint16_t>> ctors;
  std::vector<std::pair<const char *, uint16_t>> dtors;
  // CUDA has no way to iterate over all the symbols so we need to inspect the
  // ELF directly using the LLVM libraries.
  for (const auto &symbol : elf_or_err->symbols()) {
    auto name_or_err = symbol.getName();
    if (!name_or_err)
      handle_error(toString(name_or_err.takeError()).c_str());

    // Search for all symbols that contain a constructor or destructor.
    if (!name_or_err->starts_with("__init_array_object_") &&
        !name_or_err->starts_with("__fini_array_object_"))
      continue;

    uint16_t priority;
    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
      handle_error("Invalid priority for constructor or destructor");

    if (name_or_err->starts_with("__init"))
      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
    else
      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  }
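  // For example, a symbol named "__init_array_object_foo_101" (a hypothetical
  // name) parses to priority 101 via its final '_'-separated component.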
  // Lower priority constructors are run before higher ones. The reverse is
  // true for destructors.
  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });

  // Allocate host pinned memory to make these arrays visible to the GPU.
  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
  uint64_t global_size = 0;

  // Get the address of the global and then store the address of the
  // constructor function to call in the constructor array.
  CUdeviceptr *dev_ctors_start = dev_memory;
  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  for (uint64_t i = 0; i < ctors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Get the address of the global and then store the address of the destructor
  // function to call in the destructor array.
  CUdeviceptr *dev_dtors_start = dev_ctors_end;
  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
  for (uint64_t i = 0; i < dtors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Obtain the address of the pointers the startup implementation uses to
  // iterate the constructors and destructors.
  CUdeviceptr init_start;
  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
                                       "__init_array_start"))
    handle_error(err);
  CUdeviceptr init_end;
  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
                                       "__init_array_end"))
    handle_error(err);
  CUdeviceptr fini_start;
  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
                                       "__fini_array_start"))
    handle_error(err);
  CUdeviceptr fini_end;
  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
                                       "__fini_array_end"))
    handle_error(err);

  // Copy the pointers to the newly written array to the symbols so the startup
  // implementation can iterate them.
  if (CUresult err =
          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
    handle_error(err);

  return dev_memory;
}

void print_kernel_resources(CUmodule binary, const char *kernel_name) {
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);
  int num_regs;
  if (CUresult err =
          cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
    handle_error(err);
  printf("Executing kernel %s:\n", kernel_name);
  printf("%6s registers: %d\n", kernel_name, num_regs);
}

template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
                       const LaunchParameters &params, const char *kernel_name,
                       args_t kernel_args, bool print_resource_usage) {
  // Look up the kernel in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);

  // Set up the arguments to the kernel on the GPU.
  uint64_t args_size = sizeof(args_t);
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};
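  // CU_LAUNCH_PARAM_BUFFER_POINTER/SIZE hand the driver one opaque blob of
  // kernel arguments to copy, so the loader does not need to know each
  // parameter's individual type or offset.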
  if (print_resource_usage)
    print_kernel_resources(binary, kernel_name);

  // Initialize a non-blocking CUDA stream to allocate memory if needed.
  // This needs to be done on a separate stream or else it will deadlock
  // with the executing kernel.
  CUstream memory_stream;
  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         CUstream memory_stream) {
        auto malloc_handler = [&](size_t size) -> void * {
          CUdeviceptr dev_ptr;
          if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
            dev_ptr = 0UL;

          // Wait until the memory allocation is complete.
          while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
            ;
          return reinterpret_cast<void *>(dev_ptr);
        };

        auto free_handler = [&](void *ptr) -> void {
          if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
                                            memory_stream))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          index =
              handle_server<32>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, memory_stream);

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  finished = true;
  if (server_thread.joinable())
    server_thread.join();

  return CUDA_SUCCESS;
}

int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  if (CUresult err = cuInit(0))
    handle_error(err);
  // Obtain the first device found on the system.
  uint32_t device_id = 0;
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, device_id))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Increase the stack size per thread.
  // TODO: We should allow this to be passed in so only the tests that require
  // a larger stack can specify it to save on memory usage.
  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
    handle_error(err);

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };

  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
  if (!memory_or_err)
    handle_error(toString(memory_or_err.takeError()).c_str());

  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);

  uint32_t warp_size = 32;
  void *rpc_buffer = nullptr;
  if (CUresult err = cuMemAllocHost(
          &rpc_buffer,
          rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
    handle_error(err);
  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  CUdeviceptr rpc_client_dev = 0;
  uint64_t client_ptr_size = sizeof(void *);
  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
                                       binary, "__llvm_rpc_client"))
    handle_error(err);

  if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (CUresult err =
          launch_kernel(binary, stream, server, single_threaded_params,
                        "_begin", init_args, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp,
                       reinterpret_cast<void *>(dev_ret)};
  if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
                                   args, print_resource_usage))
    handle_error(err);

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  end_args_t fini_args = {host_ret};
  if (CUresult err =
          launch_kernel(binary, stream, server, single_threaded_params, "_end",
                        fini_args, print_resource_usage))
    handle_error(err);

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFreeHost(*memory_or_err))
    handle_error(err);
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(rpc_buffer))
    handle_error(err);

  // Destroy the context and the loaded binary.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);
  return host_ret;
}