Revert "[LLVM] Make the GPU loader utilities an LLVM tool (#132096)"

This reverts commit 221b0117fd. Some build failures requiring TargetParser and some warnings to clean up.
2025-03-20 14:26:40 -05:00
parent 00fabd21bc
commit df2a56767d
12 changed files with 117 additions and 102 deletions
--- a/libc/utils/gpu/CMakeLists.txt
+++ b/libc/utils/gpu/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(loader)
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -0,0 +1,54 @@
+add_library(gpu_loader OBJECT Main.cpp)
+
+include(FindLibcCommonUtils)
+target_link_libraries(gpu_loader PUBLIC llvm-libc-common-utilities)
+
+target_include_directories(gpu_loader PUBLIC
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${LIBC_SOURCE_DIR}/include
+  ${LIBC_SOURCE_DIR}
+  ${LLVM_MAIN_INCLUDE_DIR}
+  ${LLVM_BINARY_DIR}/include
+)
+if(NOT LLVM_ENABLE_RTTI)
+  target_compile_options(gpu_loader PUBLIC -fno-rtti)
+endif()
+
+find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+if(hsa-runtime64_FOUND)
+  add_subdirectory(amdgpu)
+endif()
+
+# The CUDA loader requires LLVM to traverse the ELF image for symbols.
+find_package(CUDAToolkit 11.2 QUIET)
+if(CUDAToolkit_FOUND)
+  add_subdirectory(nvptx)
+endif()
+
+if(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  add_custom_target(libc.utils.gpu.loader)
+  add_dependencies(libc.utils.gpu.loader amdhsa-loader)
+  set_target_properties(
+    libc.utils.gpu.loader
+    PROPERTIES
+      TARGET amdhsa-loader
+      EXECUTABLE "$<TARGET_FILE:amdhsa-loader>"
+  )
+elseif(TARGET nvptx-loader AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  add_custom_target(libc.utils.gpu.loader)
+  add_dependencies(libc.utils.gpu.loader nvptx-loader)
+  set_target_properties(
+    libc.utils.gpu.loader
+    PROPERTIES
+      TARGET nvptx-loader
+      EXECUTABLE "$<TARGET_FILE:nvptx-loader>"
+  )
+endif()
+
+foreach(gpu_loader_tgt amdhsa-loader nvptx-loader)
+  if(TARGET ${gpu_loader_tgt})
+    install(TARGETS ${gpu_loader_tgt}
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+            COMPONENT libc)
+  endif()
+endforeach()
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -0,0 +1,198 @@
+//===-- Generic device loader interface -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+
+#include "shared/rpc.h"
+#include "shared/rpc_opcodes.h"
+#include "shared/rpc_server.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/// Generic launch parameters for configuration the number of blocks / threads.
+struct LaunchParameters {
+  uint32_t num_threads_x;
+  uint32_t num_threads_y;
+  uint32_t num_threads_z;
+  uint32_t num_blocks_x;
+  uint32_t num_blocks_y;
+  uint32_t num_blocks_z;
+};
+
+/// The arguments to the '_begin' kernel.
+struct begin_args_t {
+  int argc;
+  void *argv;
+  void *envp;
+};
+
+/// The arguments to the '_start' kernel.
+struct start_args_t {
+  int argc;
+  void *argv;
+  void *envp;
+  void *ret;
+};
+
+/// The arguments to the '_end' kernel.
+struct end_args_t {
+  int argc;
+};
+
+/// Generic interface to load the \p image and launch execution of the _start
+/// kernel on the target device. Copies \p argc and \p argv to the device.
+/// Returns the final value of the `main` function on the device.
+int load(int argc, const char **argv, const char **evnp, void *image,
+         size_t size, const LaunchParameters &params,
+         bool print_resource_usage);
+
+/// Return \p V aligned "upwards" according to \p Align.
+template <typename V, typename A> inline V align_up(V val, A align) {
+  return ((val + V(align) - 1) / V(align)) * V(align);
+}
+
+/// Copy the system's argument vector to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_argument_vector(int argc, const char **argv, Allocator alloc) {
+  size_t argv_size = sizeof(char *) * (argc + 1);
+  size_t str_size = 0;
+  for (int i = 0; i < argc; ++i)
+    str_size += strlen(argv[i]) + 1;
+
+  // We allocate enough space for a null terminated array and all the strings.
+  void *dev_argv = alloc(argv_size + str_size);
+  if (!dev_argv)
+    return nullptr;
+
+  // Store the strings linerally in the same memory buffer.
+  void *dev_str = reinterpret_cast<uint8_t *>(dev_argv) + argv_size;
+  for (int i = 0; i < argc; ++i) {
+    size_t size = strlen(argv[i]) + 1;
+    std::memcpy(dev_str, argv[i], size);
+    static_cast<void **>(dev_argv)[i] = dev_str;
+    dev_str = reinterpret_cast<uint8_t *>(dev_str) + size;
+  }
+
+  // Ensure the vector is null terminated.
+  reinterpret_cast<void **>(dev_argv)[argc] = nullptr;
+  return dev_argv;
+}
+
+/// Copy the system's environment to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_environment(const char **envp, Allocator alloc) {
+  int envc = 0;
+  for (const char **env = envp; *env != 0; ++env)
+    ++envc;
+
+  return copy_argument_vector(envc, envp, alloc);
+}
+
+inline void handle_error_impl(const char *file, int32_t line, const char *msg) {
+  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg);
+  exit(EXIT_FAILURE);
+}
+#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X)
+
+template <uint32_t num_lanes, typename Alloc, typename Free>
+inline uint32_t handle_server(rpc::Server &server, uint32_t index,
+                              Alloc &&alloc, Free &&free) {
+  auto port = server.try_open(num_lanes, index);
+  if (!port)
+    return 0;
+  index = port->get_index() + 1;
+
+  int status = rpc::RPC_SUCCESS;
+  switch (port->get_opcode()) {
+  case RPC_TEST_INCREMENT: {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+      reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
+    });
+    break;
+  }
+  case RPC_TEST_INTERFACE: {
+    bool end_with_recv;
+    uint64_t cnt;
+    port->recv([&](rpc::Buffer *buffer, uint32_t) {
+      end_with_recv = buffer->data[0];
+    });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    if (end_with_recv)
+      port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    else
+      port->send([&](rpc::Buffer *buffer, uint32_t) {
+        buffer->data[0] = cnt = cnt + 1;
+      });
+
+    break;
+  }
+  case RPC_TEST_STREAM: {
+    uint64_t sizes[num_lanes] = {0};
+    void *dst[num_lanes] = {nullptr};
+    port->recv_n(dst, sizes,
+                 [](uint64_t size) -> void * { return new char[size]; });
+    port->send_n(dst, sizes);
+    for (uint64_t i = 0; i < num_lanes; ++i) {
+      if (dst[i])
+        delete[] reinterpret_cast<uint8_t *>(dst[i]);
+    }
+    break;
+  }
+  case RPC_TEST_NOOP: {
+    port->recv([&](rpc::Buffer *, uint32_t) {});
+    break;
+  }
+  case LIBC_MALLOC: {
+    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = reinterpret_cast<uintptr_t>(alloc(buffer->data[0]));
+    });
+    break;
+  }
+  case LIBC_FREE: {
+    port->recv([&](rpc::Buffer *buffer, uint32_t) {
+      free(reinterpret_cast<void *>(buffer->data[0]));
+    });
+    break;
+  }
+  default:
+    status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*port, num_lanes);
+    break;
+  }
+
+  // Handle all of the `libc` specific opcodes.
+  if (status != rpc::RPC_SUCCESS)
+    handle_error("Error handling RPC server");
+
+  port->close();
+
+  return index;
+}
+
+#endif
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -0,0 +1,142 @@
+//===-- Main entry into the loader interface ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file opens a device image passed on the command line and passes it to
+// one of the loader implementations for launch.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Loader.h"
+
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/WithColor.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <sys/file.h>
+
+using namespace llvm;
+
+static cl::OptionCategory loader_category("loader options");
+
+static cl::opt<bool> help("h", cl::desc("Alias for -help"), cl::Hidden,
+                          cl::cat(loader_category));
+
+static cl::opt<unsigned>
+    threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"),
+              cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"),
+              cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"),
+              cl::init(1), cl::cat(loader_category));
+static cl::alias threads("threads", cl::aliasopt(threads_x),
+                         cl::desc("Alias for --threads-x"),
+                         cl::cat(loader_category));
+
+static cl::opt<unsigned>
+    blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
+             cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
+             cl::init(1), cl::cat(loader_category));
+static cl::opt<unsigned>
+    blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
+             cl::init(1), cl::cat(loader_category));
+static cl::alias blocks("blocks", cl::aliasopt(blocks_x),
+                        cl::desc("Alias for --blocks-x"),
+                        cl::cat(loader_category));
+
+static cl::opt<bool>
+    print_resource_usage("print-resource-usage",
+                         cl::desc("Output resource usage of launched kernels"),
+                         cl::init(false), cl::cat(loader_category));
+
+static cl::opt<bool>
+    no_parallelism("no-parallelism",
+                   cl::desc("Allows only a single process to use the GPU at a "
+                            "time. Useful to suppress out-of-resource errors"),
+                   cl::init(false), cl::cat(loader_category));
+
+static cl::opt<std::string> file(cl::Positional, cl::Required,
+                                 cl::desc("<gpu executable>"),
+                                 cl::cat(loader_category));
+static cl::list<std::string> args(cl::ConsumeAfter,
+                                  cl::desc("<program arguments>..."),
+                                  cl::cat(loader_category));
+
+[[noreturn]] void report_error(Error E) {
+  outs().flush();
+  logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader"));
+  exit(EXIT_FAILURE);
+}
+
+std::string get_main_executable(const char *name) {
+  void *ptr = (void *)(intptr_t)&get_main_executable;
+  auto cow_path = sys::fs::getMainExecutable(name, ptr);
+  return sys::path::parent_path(cow_path).str();
+}
+
+int main(int argc, const char **argv, const char **envp) {
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  cl::HideUnrelatedOptions(loader_category);
+  cl::ParseCommandLineOptions(
+      argc, argv,
+      "A utility used to launch unit tests built for a GPU target. This is\n"
+      "intended to provide an intrface simular to cross-compiling emulators\n");
+
+  if (help) {
+    cl::PrintHelpMessage();
+    return EXIT_SUCCESS;
+  }
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> image_or_err =
+      MemoryBuffer::getFileOrSTDIN(file);
+  if (std::error_code ec = image_or_err.getError())
+    report_error(errorCodeToError(ec));
+  MemoryBufferRef image = **image_or_err;
+
+  SmallVector<const char *> new_argv = {file.c_str()};
+  llvm::transform(args, std::back_inserter(new_argv),
+                  [](const std::string &arg) { return arg.c_str(); });
+
+  // Claim a file lock on the executable so only a single process can enter this
+  // region if requested. This prevents the loader from spurious failures.
+  int fd = -1;
+  if (no_parallelism) {
+    fd = open(get_main_executable(argv[0]).c_str(), O_RDONLY);
+    if (flock(fd, LOCK_EX) == -1)
+      report_error(createStringError("Failed to lock '%s': %s", argv[0],
+                                     strerror(errno)));
+  }
+
+  // Drop the loader from the program arguments.
+  LaunchParameters params{threads_x, threads_y, threads_z,
+                          blocks_x,  blocks_y,  blocks_z};
+  int ret = load(new_argv.size(), new_argv.data(), envp,
+                 const_cast<char *>(image.getBufferStart()),
+                 image.getBufferSize(), params, print_resource_usage);
+
+  if (no_parallelism) {
+    if (flock(fd, LOCK_UN) == -1)
+      report_error(createStringError("Failed to unlock '%s': %s", argv[0],
+                                     strerror(errno)));
+  }
+
+  return ret;
+}
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  BinaryFormat
+  Object
+  Option
+  Support
+  FrontendOffloading
+  )
+
+add_llvm_executable(amdhsa-loader amdhsa-loader.cpp)
+target_link_libraries(amdhsa-loader PRIVATE hsa-runtime64::hsa-runtime64 gpu_loader)
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -0,0 +1,594 @@
+//===-- Loader Implementation for AMDHSA devices --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file impelements a simple loader to run images supporting the AMDHSA
+// architecture. The file launches the '_start' kernel which should be provided
+// by the device application start code and call ultimately call the 'main'
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Loader.h"
+
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+#include "llvm/Frontend/Offloading/Utility.h"
+
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <thread>
+#include <tuple>
+#include <utility>
+
+// The implicit arguments of COV5 AMDGPU kernels.
+struct implicit_args_t {
+  uint32_t grid_size_x;
+  uint32_t grid_size_y;
+  uint32_t grid_size_z;
+  uint16_t workgroup_size_x;
+  uint16_t workgroup_size_y;
+  uint16_t workgroup_size_z;
+  uint8_t Unused0[46];
+  uint16_t grid_dims;
+  uint8_t Unused1[190];
+};
+
+/// Print the error code and exit if \p code indicates an error.
+static void handle_error_impl(const char *file, int32_t line,
+                              hsa_status_t code) {
+  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
+    return;
+
+  const char *desc;
+  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
+    desc = "Unknown error";
+  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
+  exit(EXIT_FAILURE);
+}
+
+/// Generic interface for iterating using the HSA callbacks.
+template <typename elem_ty, typename func_ty, typename callback_ty>
+hsa_status_t iterate(func_ty func, callback_ty cb) {
+  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
+    callback_ty *unwrapped = static_cast<callback_ty *>(data);
+    return (*unwrapped)(elem);
+  };
+  return func(l, static_cast<void *>(&cb));
+}
+
+/// Generic interface for iterating using the HSA callbacks.
+template <typename elem_ty, typename func_ty, typename func_arg_ty,
+          typename callback_ty>
+hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
+  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
+    callback_ty *unwrapped = static_cast<callback_ty *>(data);
+    return (*unwrapped)(elem);
+  };
+  return func(func_arg, l, static_cast<void *>(&cb));
+}
+
+/// Iterate through all availible agents.
+template <typename callback_ty>
+hsa_status_t iterate_agents(callback_ty callback) {
+  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
+}
+
+/// Iterate through all availible memory pools.
+template <typename callback_ty>
+hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
+  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
+                                        agent, cb);
+}
+
+template <hsa_device_type_t flag>
+hsa_status_t get_agent(hsa_agent_t *output_agent) {
+  // Find the first agent with a matching device type.
+  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
+    hsa_device_type_t type;
+    hsa_status_t status =
+        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
+    if (status != HSA_STATUS_SUCCESS)
+      return status;
+
+    if (type == flag) {
+      // Ensure that a GPU agent supports kernel dispatch packets.
+      if (type == HSA_DEVICE_TYPE_GPU) {
+        hsa_agent_feature_t features;
+        status =
+            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
+        if (status != HSA_STATUS_SUCCESS)
+          return status;
+        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
+          *output_agent = hsa_agent;
+      } else {
+        *output_agent = hsa_agent;
+      }
+      return HSA_STATUS_INFO_BREAK;
+    }
+    return HSA_STATUS_SUCCESS;
+  };
+
+  return iterate_agents(cb);
+}
+
+void print_kernel_resources(const char *kernel_name) {
+  fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n");
+}
+
+/// Retrieve a global memory pool with a \p flag from the agent.
+template <hsa_amd_memory_pool_global_flag_t flag>
+hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
+                                   hsa_amd_memory_pool_t *output_pool) {
+  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
+    uint32_t flags;
+    hsa_amd_segment_t segment;
+    if (auto err = hsa_amd_memory_pool_get_info(
+            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
+      return err;
+    if (auto err = hsa_amd_memory_pool_get_info(
+            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
+      return err;
+
+    if (segment != HSA_AMD_SEGMENT_GLOBAL)
+      return HSA_STATUS_SUCCESS;
+
+    if (flags & flag)
+      *output_pool = memory_pool;
+
+    return HSA_STATUS_SUCCESS;
+  };
+  return iterate_agent_memory_pools(agent, cb);
+}
+
+template <typename args_t>
+hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
+                           hsa_amd_memory_pool_t kernargs_pool,
+                           hsa_amd_memory_pool_t coarsegrained_pool,
+                           hsa_queue_t *queue, rpc::Server &server,
+                           const LaunchParameters &params,
+                           const char *kernel_name, args_t kernel_args,
+                           uint32_t wavefront_size, bool print_resource_usage) {
+  // Look up the kernel in the loaded executable.
+  hsa_executable_symbol_t symbol;
+  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+          executable, kernel_name, &dev_agent, &symbol))
+    return err;
+
+  // Retrieve different properties of the kernel symbol used for launch.
+  uint64_t kernel;
+  uint32_t args_size;
+  uint32_t group_size;
+  uint32_t private_size;
+  bool dynamic_stack;
+
+  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
+      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
+      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
+      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
+      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
+      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
+
+  for (auto &[info, value] : symbol_infos)
+    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
+      return err;
+
+  // Allocate space for the kernel arguments on the host and allow the GPU agent
+  // to access it.
+  void *args;
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
+                                                      /*flags=*/0, &args))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
+
+  // Initialize all the arguments (explicit and implicit) to zero, then set the
+  // explicit arguments to the values created above.
+  std::memset(args, 0, args_size);
+  std::memcpy(args, &kernel_args, sizeof(args_t));
+
+  // Initialize the necessary implicit arguments to the proper values.
+  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
+             (params.num_blocks_z * params.num_threads_z != 1);
+  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
+      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
+  implicit_args->grid_dims = dims;
+  implicit_args->grid_size_x = params.num_blocks_x;
+  implicit_args->grid_size_y = params.num_blocks_y;
+  implicit_args->grid_size_z = params.num_blocks_z;
+  implicit_args->workgroup_size_x = params.num_threads_x;
+  implicit_args->workgroup_size_y = params.num_threads_y;
+  implicit_args->workgroup_size_z = params.num_threads_z;
+
+  // Obtain a packet from the queue.
+  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
+  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
+    ;
+
+  const uint32_t mask = queue->size - 1;
+  hsa_kernel_dispatch_packet_t *packet =
+      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
+      (packet_id & mask);
+
+  // Set up the packet for exeuction on the device. We currently only launch
+  // with one thread on the device, forcing the rest of the wavefront to be
+  // masked off.
+  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  packet->workgroup_size_x = params.num_threads_x;
+  packet->workgroup_size_y = params.num_threads_y;
+  packet->workgroup_size_z = params.num_threads_z;
+  packet->reserved0 = 0;
+  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
+  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
+  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
+  packet->private_segment_size =
+      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
+  packet->group_segment_size = group_size;
+  packet->kernel_object = kernel;
+  packet->kernarg_address = args;
+  packet->reserved2 = 0;
+  // Create a signal to indicate when this packet has been completed.
+  if (hsa_status_t err =
+          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
+    handle_error(err);
+
+  if (print_resource_usage)
+    print_kernel_resources(kernel_name);
+
+  // Initialize the packet header and set the doorbell signal to begin execution
+  // by the HSA runtime.
+  uint16_t header =
+      1u << HSA_PACKET_HEADER_BARRIER |
+      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+  uint32_t header_word = header | (setup << 16u);
+  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
+  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+
+  std::atomic<bool> finished = false;
+  std::thread server_thread(
+      [](std::atomic<bool> *finished, rpc::Server *server,
+         uint32_t wavefront_size, hsa_agent_t dev_agent,
+         hsa_amd_memory_pool_t coarsegrained_pool) {
+        // Register RPC callbacks for the malloc and free functions on HSA.
+        auto malloc_handler = [&](size_t size) -> void * {
+          void *dev_ptr = nullptr;
+          if (hsa_status_t err =
+                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
+                                               /*flags=*/0, &dev_ptr))
+            dev_ptr = nullptr;
+          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+          return dev_ptr;
+        };
+
+        auto free_handler = [](void *ptr) -> void {
+          if (hsa_status_t err =
+                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
+            handle_error(err);
+        };
+
+        uint32_t index = 0;
+        while (!*finished) {
+          if (wavefront_size == 32)
+            index =
+                handle_server<32>(*server, index, malloc_handler, free_handler);
+          else
+            index =
+                handle_server<64>(*server, index, malloc_handler, free_handler);
+        }
+      },
+      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);
+
+  // Wait until the kernel has completed execution on the device. Periodically
+  // check the RPC client for work to be performed on the server.
+  while (hsa_signal_wait_scacquire(packet->completion_signal,
+                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+                                   HSA_WAIT_STATE_BLOCKED) != 0)
+    ;
+
+  finished = true;
+  if (server_thread.joinable())
+    server_thread.join();
+
+  // Destroy the resources acquired to launch the kernel and return.
+  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
+    handle_error(err);
+  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
+    handle_error(err);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/// Copies data from the source agent to the destination agent. The source
+/// memory must first be pinned explicitly or allocated via HSA.
+static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
+                               const void *src, hsa_agent_t src_agent,
+                               uint64_t size) {
+  // Create a memory signal to copy information between the host and device.
+  hsa_signal_t memory_signal;
+  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
+    return err;
+
+  if (hsa_status_t err = hsa_amd_memory_async_copy(
+          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
+    return err;
+
+  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
+    ;
+
+  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
+    return err;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+int load(int argc, const char **argv, const char **envp, void *image,
+         size_t size, const LaunchParameters &params,
+         bool print_resource_usage) {
+  // Initialize the HSA runtime used to communicate with the device.
+  if (hsa_status_t err = hsa_init())
+    handle_error(err);
+
+  // Register a callback when the device encounters a memory fault.
+  if (hsa_status_t err = hsa_amd_register_system_event_handler(
+          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
+            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
+              return HSA_STATUS_ERROR;
+            return HSA_STATUS_SUCCESS;
+          },
+          nullptr))
+    handle_error(err);
+
+  // Obtain a single agent for the device and host to use the HSA memory model.
+  hsa_agent_t dev_agent;
+  hsa_agent_t host_agent;
+  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
+    handle_error(err);
+  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
+    handle_error(err);
+
+  // Load the code object's ISA information and executable data segments.
+  hsa_code_object_reader_t reader;
+  if (hsa_status_t err =
+          hsa_code_object_reader_create_from_memory(image, size, &reader))
+    handle_error(err);
+
+  hsa_executable_t executable;
+  if (hsa_status_t err = hsa_executable_create_alt(
+          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
+          &executable))
+    handle_error(err);
+
+  hsa_loaded_code_object_t object;
+  if (hsa_status_t err = hsa_executable_load_agent_code_object(
+          executable, dev_agent, reader, "", &object))
+    handle_error(err);
+
+  // No modifications to the executable are allowed  after this point.
+  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
+    handle_error(err);
+
+  // Check the validity of the loaded executable. If the agents ISA features do
+  // not match the executable's code object it will fail here.
+  uint32_t result;
+  if (hsa_status_t err = hsa_executable_validate(executable, &result))
+    handle_error(err);
+  if (result)
+    handle_error(HSA_STATUS_ERROR);
+
+  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
+    handle_error(err);
+
+  // Obtain memory pools to exchange data between the host and the device. The
+  // fine-grained pool acts as pinned memory on the host for DMA transfers to
+  // the device, the coarse-grained pool is for allocations directly on the
+  // device, and the kernerl-argument pool is for executing the kernel.
+  hsa_amd_memory_pool_t kernargs_pool;
+  hsa_amd_memory_pool_t finegrained_pool;
+  hsa_amd_memory_pool_t coarsegrained_pool;
+  if (hsa_status_t err =
+          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
+              host_agent, &kernargs_pool))
+    handle_error(err);
+  if (hsa_status_t err =
+          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
+              host_agent, &finegrained_pool))
+    handle_error(err);
+  if (hsa_status_t err =
+          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
+              dev_agent, &coarsegrained_pool))
+    handle_error(err);
+
+  // The AMDGPU target can change its wavefront size. There currently isn't a
+  // good way to look this up through the HSA API so we use the LLVM interface.
+  uint16_t abi_version;
+  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
+  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
+  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
+          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
+    handle_error(llvm::toString(std::move(err)).c_str());
+  }
+
+  // Allocate fine-grained memory on the host to hold the pointer array for the
+  // copied argv and allow the GPU agent to access it.
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr = nullptr;
+    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
+                                                        /*flags=*/0, &dev_ptr))
+      handle_error(err);
+    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+    return dev_ptr;
+  };
+  void *dev_argv = copy_argument_vector(argc, argv, allocator);
+  if (!dev_argv)
+    handle_error("Failed to allocate device argv");
+
+  // Allocate fine-grained memory on the host to hold the pointer array for the
+  // copied environment array and allow the GPU agent to access it.
+  void *dev_envp = copy_environment(envp, allocator);
+  if (!dev_envp)
+    handle_error("Failed to allocate device environment");
+
+  // Allocate space for the return pointer and initialize it to zero.
+  void *dev_ret;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
+                                       /*flags=*/0, &dev_ret))
+    handle_error(err);
+  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
+
+  // Allocate finegrained memory for the RPC server and client to share.
+  uint32_t wavefront_size =
+      llvm::max_element(info_map, [](auto &&x, auto &&y) {
+        return x.second.WavefrontSize < y.second.WavefrontSize;
+      })->second.WavefrontSize;
+
+  // Set up the RPC server.
+  void *rpc_buffer;
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool,
+          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
+          /*flags=*/0, &rpc_buffer))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);
+
+  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
+
+  // Initialize the RPC client on the device by copying the local data to the
+  // device's internal pointer.
+  hsa_executable_symbol_t rpc_client_sym;
+  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
+    handle_error(err);
+
+  void *rpc_client_dev;
+  if (hsa_status_t err = hsa_executable_symbol_get_info(
+          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
+          &rpc_client_dev))
+    handle_error(err);
+
+  void *rpc_client_buffer;
+  if (hsa_status_t err =
+          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
+                              /*agents=*/nullptr, 0, &rpc_client_buffer))
+    handle_error(err);
+
+  // Copy the RPC client buffer to the address pointed to by the symbol.
+  if (hsa_status_t err =
+          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
+                     sizeof(rpc::Client)))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
+    handle_error(err);
+
+  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
+  // If the clock_freq symbol is missing, no work to do.
+  hsa_executable_symbol_t freq_sym;
+  if (HSA_STATUS_SUCCESS ==
+      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
+                                        &dev_agent, &freq_sym)) {
+    void *host_clock_freq;
+    if (hsa_status_t err =
+            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
+                                         /*flags=*/0, &host_clock_freq))
+      handle_error(err);
+    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);
+
+    if (HSA_STATUS_SUCCESS ==
+        hsa_agent_get_info(dev_agent,
+                           static_cast<hsa_agent_info_t>(
+                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
+                           host_clock_freq)) {
+
+      void *freq_addr;
+      if (hsa_status_t err = hsa_executable_symbol_get_info(
+              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
+              &freq_addr))
+        handle_error(err);
+
+      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
+                                        host_agent, sizeof(uint64_t)))
+        handle_error(err);
+    }
+  }
+
+  // Obtain a queue with the maximum (power of two) size, used to send commands
+  // to the HSA runtime and launch execution on the device.
+  uint64_t queue_size;
+  if (hsa_status_t err = hsa_agent_get_info(
+          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
+    handle_error(err);
+  hsa_queue_t *queue = nullptr;
+  if (hsa_status_t err =
+          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
+                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
+    handle_error(err);
+
+  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
+  begin_args_t init_args = {argc, dev_argv, dev_envp};
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          server, single_threaded_params, "_begin.kd", init_args,
+          info_map["_begin"].WavefrontSize, print_resource_usage))
+    handle_error(err);
+
+  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
+          print_resource_usage))
+    handle_error(err);
+
+  void *host_ret;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
+                                       /*flags=*/0, &host_ret))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);
+
+  if (hsa_status_t err =
+          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
+    handle_error(err);
+
+  // Save the return value and perform basic clean-up.
+  int ret = *static_cast<int *>(host_ret);
+
+  end_args_t fini_args = {ret};
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          server, single_threaded_params, "_end.kd", fini_args,
+          info_map["_end"].WavefrontSize, print_resource_usage))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
+    handle_error(err);
+
+  // Free the memory allocated for the device.
+  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_queue_destroy(queue))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_executable_destroy(executable))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_shut_down())
+    handle_error(err);
+
+  return ret;
+}
--- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LLVM_LINK_COMPONENTS
+  BinaryFormat
+  Object
+  Option
+  Support
+  )
+
+add_llvm_executable(nvptx-loader nvptx-loader.cpp)
+target_link_libraries(nvptx-loader PRIVATE gpu_loader CUDA::cuda_driver)
--- a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
@@ -0,0 +1,366 @@
+//===-- Loader Implementation for NVPTX devices --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file impelements a simple loader to run images supporting the NVPTX
+// architecture. The file launches the '_start' kernel which should be provided
+// by the device application start code and call ultimately call the 'main'
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Loader.h"
+
+#include "cuda.h"
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <thread>
+#include <vector>
+
+using namespace llvm;
+using namespace object;
+
+static void handle_error_impl(const char *file, int32_t line, CUresult err) {
+  if (err == CUDA_SUCCESS)
+    return;
+
+  const char *err_str = nullptr;
+  CUresult result = cuGetErrorString(err, &err_str);
+  if (result != CUDA_SUCCESS)
+    fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line);
+  else
+    fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str);
+  exit(1);
+}
+
+// Gets the names of all the globals that contain functions to initialize or
+// deinitialize. We need to do this manually because the NVPTX toolchain does
+// not contain the necessary binary manipulation tools.
+template <typename Alloc>
+Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
+                                     Alloc allocator, CUmodule binary) {
+  auto mem_buffer = MemoryBuffer::getMemBuffer(
+      StringRef(reinterpret_cast<const char *>(image), size), "image",
+      /*RequiresNullTerminator=*/false);
+  Expected<ELF64LEObjectFile> elf_or_err =
+      ELF64LEObjectFile::create(*mem_buffer);
+  if (!elf_or_err)
+    handle_error(toString(elf_or_err.takeError()).c_str());
+
+  std::vector<std::pair<const char *, uint16_t>> ctors;
+  std::vector<std::pair<const char *, uint16_t>> dtors;
+  // CUDA has no way to iterate over all the symbols so we need to inspect the
+  // ELF directly using the LLVM libraries.
+  for (const auto &symbol : elf_or_err->symbols()) {
+    auto name_or_err = symbol.getName();
+    if (!name_or_err)
+      handle_error(toString(name_or_err.takeError()).c_str());
+
+    // Search for all symbols that contain a constructor or destructor.
+    if (!name_or_err->starts_with("__init_array_object_") &&
+        !name_or_err->starts_with("__fini_array_object_"))
+      continue;
+
+    uint16_t priority;
+    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
+      handle_error("Invalid priority for constructor or destructor");
+
+    if (name_or_err->starts_with("__init"))
+      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
+    else
+      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
+  }
+  // Lower priority constructors are run before higher ones. The reverse is true
+  // for destructors.
+  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
+
+  // Allocate host pinned memory to make these arrays visible to the GPU.
+  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
+      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
+  uint64_t global_size = 0;
+
+  // Get the address of the global and then store the address of the constructor
+  // function to call in the constructor array.
+  CUdeviceptr *dev_ctors_start = dev_memory;
+  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
+  for (uint64_t i = 0; i < ctors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Get the address of the global and then store the address of the destructor
+  // function to call in the destructor array.
+  CUdeviceptr *dev_dtors_start = dev_ctors_end;
+  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
+  for (uint64_t i = 0; i < dtors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Obtain the address of the pointers the startup implementation uses to
+  // iterate the constructors and destructors.
+  CUdeviceptr init_start;
+  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
+                                       "__init_array_start"))
+    handle_error(err);
+  CUdeviceptr init_end;
+  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
+                                       "__init_array_end"))
+    handle_error(err);
+  CUdeviceptr fini_start;
+  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
+                                       "__fini_array_start"))
+    handle_error(err);
+  CUdeviceptr fini_end;
+  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
+                                       "__fini_array_end"))
+    handle_error(err);
+
+  // Copy the pointers to the newly written array to the symbols so the startup
+  // implementation can iterate them.
+  if (CUresult err =
+          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
+    handle_error(err);
+
+  return dev_memory;
+}
+
+void print_kernel_resources(CUmodule binary, const char *kernel_name) {
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+    handle_error(err);
+  int num_regs;
+  if (CUresult err =
+          cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
+    handle_error(err);
+  printf("Executing kernel %s:\n", kernel_name);
+  printf("%6s registers: %d\n", kernel_name, num_regs);
+}
+
+template <typename args_t>
+CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
+                       const LaunchParameters &params, const char *kernel_name,
+                       args_t kernel_args, bool print_resource_usage) {
+  // look up the '_start' kernel in the loaded module.
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+    handle_error(err);
+
+  // Set up the arguments to the '_start' kernel on the GPU.
+  uint64_t args_size = sizeof(args_t);
+  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
+                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+                         CU_LAUNCH_PARAM_END};
+  if (print_resource_usage)
+    print_kernel_resources(binary, kernel_name);
+
+  // Initialize a non-blocking CUDA stream to allocate memory if needed.
+  // This needs to be done on a separate stream or else it will deadlock
+  // with the executing kernel.
+  CUstream memory_stream;
+  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  std::atomic<bool> finished = false;
+  std::thread server_thread(
+      [](std::atomic<bool> *finished, rpc::Server *server,
+         CUstream memory_stream) {
+        auto malloc_handler = [&](size_t size) -> void * {
+          CUdeviceptr dev_ptr;
+          if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
+            dev_ptr = 0UL;
+
+          // Wait until the memory allocation is complete.
+          while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
+            ;
+          return reinterpret_cast<void *>(dev_ptr);
+        };
+
+        auto free_handler = [&](void *ptr) -> void {
+          if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
+                                            memory_stream))
+            handle_error(err);
+        };
+
+        uint32_t index = 0;
+        while (!*finished) {
+          index =
+              handle_server<32>(*server, index, malloc_handler, free_handler);
+        }
+      },
+      &finished, &server, memory_stream);
+
+  // Call the kernel with the given arguments.
+  if (CUresult err = cuLaunchKernel(
+          function, params.num_blocks_x, params.num_blocks_y,
+          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
+          params.num_threads_z, 0, stream, nullptr, args_config))
+    handle_error(err);
+
+  if (CUresult err = cuStreamSynchronize(stream))
+    handle_error(err);
+
+  finished = true;
+  if (server_thread.joinable())
+    server_thread.join();
+
+  return CUDA_SUCCESS;
+}
+
+int load(int argc, const char **argv, const char **envp, void *image,
+         size_t size, const LaunchParameters &params,
+         bool print_resource_usage) {
+  if (CUresult err = cuInit(0))
+    handle_error(err);
+  // Obtain the first device found on the system.
+  uint32_t device_id = 0;
+  CUdevice device;
+  if (CUresult err = cuDeviceGet(&device, device_id))
+    handle_error(err);
+
+  // Initialize the CUDA context and claim it for this execution.
+  CUcontext context;
+  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+    handle_error(err);
+  if (CUresult err = cuCtxSetCurrent(context))
+    handle_error(err);
+
+  // Increase the stack size per thread.
+  // TODO: We should allow this to be passed in so only the tests that require a
+  // larger stack can specify it to save on memory usage.
+  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
+    handle_error(err);
+
+  // Initialize a non-blocking CUDA stream to execute the kernel.
+  CUstream stream;
+  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  // Load the image into a CUDA module.
+  CUmodule binary;
+  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+    handle_error(err);
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied argv and allow the GPU device to access it.
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr;
+    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
+      handle_error(err);
+    return dev_ptr;
+  };
+
+  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
+  if (!memory_or_err)
+    handle_error(toString(memory_or_err.takeError()).c_str());
+
+  void *dev_argv = copy_argument_vector(argc, argv, allocator);
+  if (!dev_argv)
+    handle_error("Failed to allocate device argv");
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied environment array and allow the GPU device to access it.
+  void *dev_envp = copy_environment(envp, allocator);
+  if (!dev_envp)
+    handle_error("Failed to allocate device environment");
+
+  // Allocate space for the return pointer and initialize it to zero.
+  CUdeviceptr dev_ret;
+  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
+    handle_error(err);
+  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
+    handle_error(err);
+
+  uint32_t warp_size = 32;
+  void *rpc_buffer = nullptr;
+  if (CUresult err = cuMemAllocHost(
+          &rpc_buffer,
+          rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
+    handle_error(err);
+  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
+
+  // Initialize the RPC client on the device by copying the local data to the
+  // device's internal pointer.
+  CUdeviceptr rpc_client_dev = 0;
+  uint64_t client_ptr_size = sizeof(void *);
+  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
+                                       binary, "__llvm_rpc_client"))
+    handle_error(err);
+
+  if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
+    handle_error(err);
+
+  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
+  begin_args_t init_args = {argc, dev_argv, dev_envp};
+  if (CUresult err =
+          launch_kernel(binary, stream, server, single_threaded_params,
+                        "_begin", init_args, print_resource_usage))
+    handle_error(err);
+
+  start_args_t args = {argc, dev_argv, dev_envp,
+                       reinterpret_cast<void *>(dev_ret)};
+  if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
+                                   args, print_resource_usage))
+    handle_error(err);
+
+  // Copy the return value back from the kernel and wait.
+  int host_ret = 0;
+  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
+    handle_error(err);
+
+  if (CUresult err = cuStreamSynchronize(stream))
+    handle_error(err);
+
+  end_args_t fini_args = {host_ret};
+  if (CUresult err =
+          launch_kernel(binary, stream, server, single_threaded_params, "_end",
+                        fini_args, print_resource_usage))
+    handle_error(err);
+
+  // Free the memory allocated for the device.
+  if (CUresult err = cuMemFreeHost(*memory_or_err))
+    handle_error(err);
+  if (CUresult err = cuMemFree(dev_ret))
+    handle_error(err);
+  if (CUresult err = cuMemFreeHost(dev_argv))
+    handle_error(err);
+  if (CUresult err = cuMemFreeHost(rpc_buffer))
+    handle_error(err);
+
+  // Destroy the context and the loaded binary.
+  if (CUresult err = cuModuleUnload(binary))
+    handle_error(err);
+  if (CUresult err = cuDevicePrimaryCtxRelease(device))
+    handle_error(err);
+  return host_ret;
+}