clang-p2996/libc/utils/gpu/loader/nvptx/Loader.cpp
Joseph Huber d0ff5e4030 [libc] Update RPC interface for system utilities on the GPU
This patch reworks the RPC interface to allow more generic memory
operations that make better use of the shared buffer. It decomposes the
entire RPC interface into opening a port and calling `send` or `recv` on it.

The `send` function sends a single packet of the length of the buffer.
The `recv` function is paired with a `send` call and consumes its data.
So, any arbitrary combination of sending packets is possible. The only
restriction is that the client initiates the exchange with a `send`
while the server consumes it with a `recv`.
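
For illustration, a client-initiated exchange with such a port could look
roughly like the sketch below. The `Port` and `Buffer` types and the
lambda-based `send` / `recv` here are hypothetical stand-ins modeled on this
description, not the actual libc RPC headers:

  #include <cstdint>
  #include <cstdio>

  // Stand-in for the fixed-size packet shared between the client and server.
  struct Buffer {
    uint64_t data[8];
  };

  // Stand-in for an opened port: `send` fills the buffer with one packet and
  // the matching `recv` on the other end consumes it.
  struct Port {
    Buffer buffer{};
    template <typename F> void send(F fill) { fill(&buffer); }
    template <typename F> void recv(F use) { use(&buffer); }
  };

  int main() {
    Port port; // The real interface opens a port before exchanging packets.
    // Client side: initiate the exchange with a `send` carrying an integer.
    port.send([](Buffer *b) { b->data[0] = 41; });
    // Server side: the paired `recv` consumes the packet. It could then `send`
    // a reply for the client to `recv`, and so on for any combination.
    port.recv([](Buffer *b) {
      std::printf("server got %llu\n", (unsigned long long)b->data[0]);
    });
    return 0;
  }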

The operation of this is driven by two independent state machines that
track the buffer ownership during loads / stores. We keep two of them
so that we can transition between a send state and a recv state without
an extra wait. State transitions are observed via bit toggling in the
inbox and outbox (see the sketch after the next paragraph).

This interface supports an efficient `send -> ack -> send -> ack -> send`
pattern and allows the last `send` to complete without checking its ack.
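
For illustration only, here is a minimal single-threaded model of one way such
a bit-toggling ownership scheme can behave. The `Mailbox` type, the
`sender_owns_buffer` rule, and the variable names are assumptions made for
this sketch, not the state machine the patch actually implements:

  #include <atomic>
  #include <cstdint>
  #include <cstdio>

  // Each side publishes one bit (its outbox) and reads the other side's bit
  // (its inbox). Toggling the outbox hands the buffer to the other side.
  struct Mailbox {
    std::atomic<uint32_t> bit{0};
  };

  // Assumed ownership rule for this sketch: the sender may touch the buffer
  // again once the receiver's bit has caught up with its own.
  static bool sender_owns_buffer(const Mailbox &inbox, const Mailbox &outbox) {
    return inbox.bit.load() == outbox.bit.load();
  }

  int main() {
    Mailbox client_out, server_out; // Client's inbox is server_out, and vice versa.

    client_out.bit ^= 1; // send: buffer ownership passes to the server.
    std::printf("client owns buffer: %d\n",
                sender_owns_buffer(server_out, client_out)); // prints 0

    server_out.bit ^= 1; // ack: ownership returns to the client.
    std::printf("client owns buffer: %d\n",
                sender_owns_buffer(server_out, client_out)); // prints 1

    client_out.bit ^= 1; // final send: the sender never re-checks the ack.
    return 0;
  }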

A following patch will add some more comprehensive testing to this
interface. Informally, I made an RPC call that simply incremented an
integer, and it took roughly 10 microseconds to complete.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D148288
2023-04-19 20:02:31 -05:00


//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and will ultimately call the 'main'
// function.
//
//===----------------------------------------------------------------------===//
#include "Loader.h"
#include "Server.h"
#include "cuda.h"
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

/// The arguments to the '_start' kernel.
struct kernel_args_t {
  int argc;
  void *argv;
  void *envp;
  void *ret;
  void *inbox;
  void *outbox;
  void *buffer;
};

static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "Unknown Error\n");
  else
    fprintf(stderr, "%s\n", err_str);
  exit(1);
}

static void handle_error(const char *msg) {
  fprintf(stderr, "%s\n", msg);
  exit(EXIT_FAILURE);
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  if (CUresult err = cuInit(0))
    handle_error(err);

  // Obtain the first device found on the system.
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, 0))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Look up the '_start' kernel in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);
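
  // Allocate pinned host memory for the RPC client / server mailboxes and the
  // shared packet buffer so that both the host and the device can access them.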
  void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
  void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
  void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
  if (!server_inbox || !server_outbox || !buffer)
    handle_error("Failed to allocate memory for the RPC client / server.");

  // Set up the arguments to the '_start' kernel on the GPU.
  uint64_t args_size = sizeof(kernel_args_t);
  kernel_args_t args;
  std::memset(&args, 0, args_size);
  args.argc = argc;
  args.argv = dev_argv;
  args.envp = dev_envp;
  args.ret = reinterpret_cast<void *>(dev_ret);
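  // The mailboxes are crossed: the device's inbox is the host server's outbox
  // and the device's outbox is the host server's inbox.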
  args.inbox = server_outbox;
  args.outbox = server_inbox;
  args.buffer = buffer;
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};

  // Initialize the RPC server's buffer for host-device communication.
  server.reset(&lock, server_inbox, server_outbox, buffer);

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
    handle_server();

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(server_inbox))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(server_outbox))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(buffer))
    handle_error(err);

  // Destroy the context and the loaded binary.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);

  return host_ret;
}