This patch reworks the RPC interface to allow more generic memory operations that make better use of the shared buffer. This patch decomposes the entire RPC interface into opening a port and calling `send` or `recv` on it. The `send` function sends a single packet of the length of the buffer. The `recv` function is paired with the `send` call to then use the data. So, any arbitrary combination of sending packets is possible. The only restriction is that the client initiates the exchange with a `send` while the server consumes it with a `recv`. The operation of this is driven by two independent state machines that track the buffer ownership during loads / stores. We keep track of two so that we can transition between a send state and a recv state without an extra wait. State transitions are observed via bit toggling. This interface supports an efficient `send -> ack -> send -> ack -> send` interface and allows for the last send to be ignored without checking the ack. A following patch will add some more comprehensive testing to this interface. I informally made an RPC call that simply incremented an integer and it took roughly 10 microseconds to complete an RPC call. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D148288
174 lines
5.7 KiB
C++
174 lines
5.7 KiB
C++
//===-- Loader Implementation for NVPTX devices --------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements a simple loader to run images supporting the NVPTX
|
|
// architecture. The file launches the '_start' kernel which should be provided
|
|
// by the device application start code and ultimately call the 'main'
|
|
// function.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Loader.h"
|
|
#include "Server.h"
|
|
|
|
#include "cuda.h"
|
|
#include <cstddef>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
|
|
/// The arguments to the '_start' kernel.
///
/// An instance of this struct is zero-filled, populated by load(), and handed
/// to cuLaunchKernel as a raw launch-parameter buffer, so the member order and
/// types define the layout the device-side '_start' receives.
struct kernel_args_t {
  /// Number of command line arguments.
  int argc;
  /// Copy of the host argument vector made through the pinned-host allocator.
  void *argv;
  /// Copy of the host environment made through the pinned-host allocator.
  void *envp;
  /// Device allocation holding the 'int' result that load() reads back.
  void *ret;
  /// Mailbox the device reads from; load() points it at the server's outbox.
  void *inbox;
  /// Mailbox the device writes to; load() points it at the server's inbox.
  void *outbox;
  /// Buffer shared between the RPC client and the RPC server.
  void *buffer;
};
|
|
|
|
/// Check the result of a CUDA driver call.
///
/// Returns immediately when \p err is CUDA_SUCCESS. Otherwise prints the
/// driver's description of the error to stderr (or "Unknown Error" when the
/// driver cannot describe it) and terminates the process with a failure
/// status.
static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  // The lookup can itself fail for unrecognized codes; fall back to a generic
  // message in that case.
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "Unknown Error\n");
  else
    fprintf(stderr, "%s\n", err_str);
  exit(EXIT_FAILURE);
}
|
|
|
|
/// Report a loader failure that has no CUDA error code attached.
///
/// Prints \p msg followed by a newline to stderr and terminates the process
/// with EXIT_FAILURE. This function never returns.
[[noreturn]] static void handle_error(const char *msg) {
  fprintf(stderr, "%s\n", msg);
  exit(EXIT_FAILURE);
}
|
|
|
|
int load(int argc, char **argv, char **envp, void *image, size_t size,
|
|
const LaunchParameters ¶ms) {
|
|
if (CUresult err = cuInit(0))
|
|
handle_error(err);
|
|
|
|
// Obtain the first device found on the system.
|
|
CUdevice device;
|
|
if (CUresult err = cuDeviceGet(&device, 0))
|
|
handle_error(err);
|
|
|
|
// Initialize the CUDA context and claim it for this execution.
|
|
CUcontext context;
|
|
if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
|
|
handle_error(err);
|
|
if (CUresult err = cuCtxSetCurrent(context))
|
|
handle_error(err);
|
|
|
|
// Initialize a non-blocking CUDA stream to execute the kernel.
|
|
CUstream stream;
|
|
if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
|
|
handle_error(err);
|
|
|
|
// Load the image into a CUDA module.
|
|
CUmodule binary;
|
|
if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
|
|
handle_error(err);
|
|
|
|
// look up the '_start' kernel in the loaded module.
|
|
CUfunction function;
|
|
if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
|
|
handle_error(err);
|
|
|
|
// Allocate pinned memory on the host to hold the pointer array for the
|
|
// copied argv and allow the GPU device to access it.
|
|
auto allocator = [&](uint64_t size) -> void * {
|
|
void *dev_ptr;
|
|
if (CUresult err = cuMemAllocHost(&dev_ptr, size))
|
|
handle_error(err);
|
|
return dev_ptr;
|
|
};
|
|
void *dev_argv = copy_argument_vector(argc, argv, allocator);
|
|
if (!dev_argv)
|
|
handle_error("Failed to allocate device argv");
|
|
|
|
// Allocate pinned memory on the host to hold the pointer array for the
|
|
// copied environment array and allow the GPU device to access it.
|
|
void *dev_envp = copy_environment(envp, allocator);
|
|
if (!dev_envp)
|
|
handle_error("Failed to allocate device environment");
|
|
|
|
// Allocate space for the return pointer and initialize it to zero.
|
|
CUdeviceptr dev_ret;
|
|
if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
|
|
handle_error(err);
|
|
if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
|
|
handle_error(err);
|
|
|
|
void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
|
|
void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
|
|
void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
|
|
if (!server_inbox || !server_outbox || !buffer)
|
|
handle_error("Failed to allocate memory the RPC client / server.");
|
|
|
|
// Set up the arguments to the '_start' kernel on the GPU.
|
|
uint64_t args_size = sizeof(kernel_args_t);
|
|
kernel_args_t args;
|
|
std::memset(&args, 0, args_size);
|
|
args.argc = argc;
|
|
args.argv = dev_argv;
|
|
args.envp = dev_envp;
|
|
args.ret = reinterpret_cast<void *>(dev_ret);
|
|
args.inbox = server_outbox;
|
|
args.outbox = server_inbox;
|
|
args.buffer = buffer;
|
|
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
|
|
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
|
CU_LAUNCH_PARAM_END};
|
|
|
|
// Initialize the RPC server's buffer for host-device communication.
|
|
server.reset(&lock, server_inbox, server_outbox, buffer);
|
|
|
|
// Call the kernel with the given arguments.
|
|
if (CUresult err = cuLaunchKernel(
|
|
function, params.num_blocks_x, params.num_blocks_y,
|
|
params.num_blocks_z, params.num_threads_x, params.num_threads_y,
|
|
params.num_threads_z, 0, stream, nullptr, args_config))
|
|
handle_error(err);
|
|
|
|
// Wait until the kernel has completed execution on the device. Periodically
|
|
// check the RPC client for work to be performed on the server.
|
|
while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
|
|
handle_server();
|
|
|
|
// Copy the return value back from the kernel and wait.
|
|
int host_ret = 0;
|
|
if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
|
|
handle_error(err);
|
|
|
|
if (CUresult err = cuStreamSynchronize(stream))
|
|
handle_error(err);
|
|
|
|
// Free the memory allocated for the device.
|
|
if (CUresult err = cuMemFree(dev_ret))
|
|
handle_error(err);
|
|
if (CUresult err = cuMemFreeHost(dev_argv))
|
|
handle_error(err);
|
|
if (CUresult err = cuMemFreeHost(server_inbox))
|
|
handle_error(err);
|
|
if (CUresult err = cuMemFreeHost(server_outbox))
|
|
handle_error(err);
|
|
if (CUresult err = cuMemFreeHost(buffer))
|
|
handle_error(err);
|
|
|
|
// Destroy the context and the loaded binary.
|
|
if (CUresult err = cuModuleUnload(binary))
|
|
handle_error(err);
|
|
if (CUresult err = cuDevicePrimaryCtxRelease(device))
|
|
handle_error(err);
|
|
return host_ret;
|
|
}
|