//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and will ultimately call
// the 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"
#include "Server.h"

#include "cuda.h"

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

/// The arguments to the '_start' kernel.
struct kernel_args_t {
  int argc;
  void *argv;
  void *envp;
  void *ret;
  void *inbox;
  void *outbox;
  void *buffer;
};

static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "Unknown Error\n");
  else
    fprintf(stderr, "%s\n", err_str);
  exit(1);
}

static void handle_error(const char *msg) {
  fprintf(stderr, "%s\n", msg);
  exit(EXIT_FAILURE);
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  if (CUresult err = cuInit(0))
    handle_error(err);

  // Obtain the first device found on the system.
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, 0))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Look up the '_start' kernel in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);

  // Allocate pinned host memory for the RPC mailboxes and the shared buffer
  // used for host-device communication.
  void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
  void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
  void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
  if (!server_inbox || !server_outbox || !buffer)
    handle_error("Failed to allocate memory for the RPC client / server.");

  // Set up the arguments to the '_start' kernel on the GPU.
  uint64_t args_size = sizeof(kernel_args_t);
  kernel_args_t args;
  std::memset(&args, 0, args_size);
  args.argc = argc;
  args.argv = dev_argv;
  args.envp = dev_envp;
  args.ret = reinterpret_cast<void *>(dev_ret);
  args.inbox = server_outbox;
  args.outbox = server_inbox;
  args.buffer = buffer;
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
                         CU_LAUNCH_PARAM_END};

  // Initialize the RPC server's buffer for host-device communication.
  server.reset(&lock, server_inbox, server_outbox, buffer);

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
    handle_server();

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(server_inbox))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(server_outbox))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(buffer))
    handle_error(err);

  // Destroy the context and the loaded binary.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);

  return host_ret;
}