[Offload] Rework compiling device code for unit test suites (#144776)

Summary:
I'll probably want to use this as a more generic utility in the future.
This patch reworks it to make it a top level function. I also tried to
decouple this from the OpenMP utilities to make that easier in the
future. Instead, I just use the `-march=native` functionality, which does the
same thing. This needed a small hack to skip the linker stage when checking
whether that flag works.

This should still create the same output as far as I'm aware.
This commit is contained in:
Joseph Huber
2025-06-20 10:31:54 -05:00
committed by GitHub
parent c734377544
commit 3f1de197b1
2 changed files with 74 additions and 68 deletions

View File

@@ -1,6 +1,72 @@
add_custom_target(OffloadUnitTests)
set_target_properties(OffloadUnitTests PROPERTIES FOLDER "Tests/UnitTests")
# add_offload_test_device_code(<test_filename> <test_name> [<compiler flags>...])
#
# Compiles ${CMAKE_CURRENT_SOURCE_DIR}/<test_filename> into device binaries for
# the unit tests. Any extra arguments are forwarded verbatim to the compiler.
#
# Depending on LIBOMPTARGET_PLUGINS_TO_BUILD and on what the compiler accepts,
# this creates:
#   <test_name>.nvptx64  -> ${CMAKE_CURRENT_BINARY_DIR}/<test_name>.nvptx64.bin
#   <test_name>.amdgpu   -> ${CMAKE_CURRENT_BINARY_DIR}/<test_name>.amdgpu.bin
#   <test_name>.bin      -> umbrella target depending on whichever of the two
#                           targets above exist (always created, possibly empty)
#
# The architecture defaults to "native" when the compiler accepts it, and can
# be overridden with OFFLOAD_TESTS_FORCE_NVPTX_ARCH / OFFLOAD_TESTS_FORCE_AMDGPU_ARCH.
function(add_offload_test_device_code test_filename test_name)
  set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})

  # Have the compiler-flag checks below build a static library so that they do
  # not run a link step for the GPU targets.
  set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

  # Try to build with support for NVPTX devices.
  if("cuda" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
    find_package(CUDAToolkit QUIET)
    if(CUDAToolkit_FOUND)
      # The toolkit root is the parent directory of its bin/ directory.
      get_filename_component(cuda_path "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
    endif()
    check_cxx_compiler_flag(
      "--target=nvptx64-nvidia-cuda -march=native --cuda-path=${cuda_path}" PLATFORM_HAS_NVPTX)

    # An explicitly forced architecture wins over auto-detection.
    if(OFFLOAD_TESTS_FORCE_NVPTX_ARCH)
      set(nvptx_arch "${OFFLOAD_TESTS_FORCE_NVPTX_ARCH}")
    elseif(PLATFORM_HAS_NVPTX)
      set(nvptx_arch "native")
    endif()

    if(nvptx_arch AND CUDAToolkit_FOUND)
      set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin")
      # Use the same CUDA root that was probed above (cuda_path); it is always
      # set here because this branch requires CUDAToolkit_FOUND.
      add_custom_command(
        OUTPUT ${output_file}
        COMMAND ${CMAKE_C_COMPILER}
                --target=nvptx64-nvidia-cuda -march=${nvptx_arch}
                -nogpulib --cuda-path=${cuda_path} -flto ${ARGN}
                -c ${SRC_PATH} -o ${output_file}
        DEPENDS ${SRC_PATH}
        VERBATIM
      )
      add_custom_target(${test_name}.nvptx64 DEPENDS ${output_file})
    endif()
  endif()

  # Try to build with support for AMDGPU devices.
  if("amdgpu" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
    check_cxx_compiler_flag("--target=amdgcn-amd-amdhsa -mcpu=native" PLATFORM_HAS_AMDGPU)

    # An explicitly forced architecture wins over auto-detection.
    if(OFFLOAD_TESTS_FORCE_AMDGPU_ARCH)
      set(amdgpu_arch "${OFFLOAD_TESTS_FORCE_AMDGPU_ARCH}")
    elseif(PLATFORM_HAS_AMDGPU)
      set(amdgpu_arch "native")
    endif()

    if(amdgpu_arch)
      set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin")
      add_custom_command(
        OUTPUT ${output_file}
        COMMAND ${CMAKE_C_COMPILER}
                --target=amdgcn-amd-amdhsa -mcpu=${amdgpu_arch}
                -nogpulib -flto ${ARGN} -c ${SRC_PATH} -o ${output_file}
        DEPENDS ${SRC_PATH}
        VERBATIM
      )
      add_custom_target(${test_name}.amdgpu DEPENDS ${output_file})
    endif()
  endif()

  # Create a single dependency target for the device code.
  add_custom_target(${test_name}.bin)
  if(TARGET ${test_name}.amdgpu)
    add_dependencies(${test_name}.bin ${test_name}.amdgpu)
  endif()
  if(TARGET ${test_name}.nvptx64)
    add_dependencies(${test_name}.bin ${test_name}.nvptx64)
  endif()
endfunction()
function(add_offload_unittest test_dirname)
set(target_name "${test_dirname}.unittests")
@@ -9,10 +75,15 @@ function(add_offload_unittest test_dirname)
add_unittest(OffloadUnitTests "${target_name}"
${CMAKE_CURRENT_SOURCE_DIR}/common/Environment.cpp
${files})
add_dependencies(${target_name} ${PLUGINS_TEST_COMMON} OffloadUnitTestsDeviceBins)
add_dependencies(${target_name} ${PLUGINS_TEST_COMMON} offload_device_binaries)
target_compile_definitions(${target_name} PRIVATE DEVICE_CODE_PATH="${OFFLOAD_TEST_DEVICE_CODE_PATH}")
target_link_libraries(${target_name} PRIVATE ${PLUGINS_TEST_COMMON})
target_include_directories(${target_name} PRIVATE ${PLUGINS_TEST_INCLUDE})
endfunction()
# Cache options that override the device architecture used when compiling the
# unit-test device code. Both default to empty.
set(OFFLOAD_TESTS_FORCE_NVPTX_ARCH
    ""
    CACHE STRING
    "Force building of NVPTX device code for Offload unit tests with the given arch, e.g. sm_61"
)
set(OFFLOAD_TESTS_FORCE_AMDGPU_ARCH
    ""
    CACHE STRING
    "Force building of AMDGPU device code for Offload unit tests with the given arch, e.g. gfx1030"
)

# Descend into the OffloadAPI test suite.
add_subdirectory(OffloadAPI)

View File

@@ -1,72 +1,7 @@
# add_offload_test_device_code(<test_filename> <test_name> [<compiler flags>...])
#
# Adds build rules that compile ${CMAKE_CURRENT_SOURCE_DIR}/<test_filename>
# into <test_name>.nvptx64.bin and/or <test_name>.amdgpu.bin in the current
# binary directory, depending on OFFLOAD_TEST_TARGET_NVIDIA and
# OFFLOAD_TEST_TARGET_AMDGPU. Extra arguments are passed to the compiler.
#
# This is a macro, so SRC_PATH, BIN_PATH and BIN_PATHS are set in the caller's
# scope; each produced binary is appended to BIN_PATHS.
macro(add_offload_test_device_code test_filename test_name)
  set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})

  # NVPTX build rule.
  if(OFFLOAD_TEST_TARGET_NVIDIA)
    set(BIN_PATH ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin)
    add_custom_command(
      OUTPUT ${BIN_PATH}
      COMMAND ${CMAKE_C_COMPILER}
              --target=nvptx64-nvidia-cuda
              ${ARGN}
              -march=${LIBOMPTARGET_DEP_CUDA_ARCH}
              --cuda-path=${CUDA_ROOT}
              ${SRC_PATH}
              -o ${BIN_PATH}
      DEPENDS ${SRC_PATH}
    )
    list(APPEND BIN_PATHS ${BIN_PATH})
  endif()

  # AMDGPU build rule.
  if(OFFLOAD_TEST_TARGET_AMDGPU)
    set(BIN_PATH ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin)
    add_custom_command(
      OUTPUT ${BIN_PATH}
      COMMAND ${CMAKE_C_COMPILER}
              --target=amdgcn-amd-amdhsa
              -nogpulib
              ${ARGN}
              -mcpu=${LIBOMPTARGET_DEP_AMDGPU_ARCH}
              ${SRC_PATH}
              -o ${BIN_PATH}
      DEPENDS ${SRC_PATH}
    )
    list(APPEND BIN_PATHS ${BIN_PATH})
  endif()

  # TODO: Build for host CPU
endmacro()
# Select the device targets to compile the test kernels for. The GPU detection
# results used here come from LibomptargetGetDependencies, which is included at
# the top level before this file is processed.
set(OFFLOAD_TESTS_FORCE_NVIDIA_ARCH
    ""
    CACHE STRING
    "Force building of NVPTX device code for Offload unit tests with the given arch, e.g. sm_61"
)
set(OFFLOAD_TESTS_FORCE_AMDGPU_ARCH
    ""
    CACHE STRING
    "Force building of AMDGPU device code for Offload unit tests with the given arch, e.g. gfx1030"
)

# Derive the CUDA root from the toolkit's bin directory, if a toolkit exists.
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
  get_filename_component(CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
endif()

# NVIDIA: a forced arch enables the target unconditionally; otherwise a GPU
# must have been detected, a CUDA root found and the cuda plugin enabled.
if(OFFLOAD_TESTS_FORCE_NVIDIA_ARCH)
  set(LIBOMPTARGET_DEP_CUDA_ARCH ${OFFLOAD_TESTS_FORCE_NVIDIA_ARCH})
  set(OFFLOAD_TEST_TARGET_NVIDIA ON)
elseif(LIBOMPTARGET_FOUND_NVIDIA_GPU AND CUDA_ROOT AND "cuda" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
  set(OFFLOAD_TEST_TARGET_NVIDIA ON)
endif()

# AMDGPU: a forced arch enables the target unconditionally; otherwise use the
# first detected architecture when the amdgpu plugin is enabled.
if(OFFLOAD_TESTS_FORCE_AMDGPU_ARCH)
  set(LIBOMPTARGET_DEP_AMDGPU_ARCH ${OFFLOAD_TESTS_FORCE_AMDGPU_ARCH})
  set(OFFLOAD_TEST_TARGET_AMDGPU ON)
elseif(LIBOMPTARGET_FOUND_AMDGPU_GPU AND "amdgpu" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
  list(GET LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST 0 LIBOMPTARGET_DEP_AMDGPU_ARCH)
  set(OFFLOAD_TEST_TARGET_AMDGPU ON)
endif()
# Device code compiled for the unit tests.
add_offload_test_device_code(foo.c foo)
add_offload_test_device_code(bar.c bar)

# By default, amdhsa adds a number of "hidden" arguments to the kernel
# definition. -O3 disables this and yields a kernel function that has no
# arguments at all as seen by liboffload, so compile this one with
# optimizations to eliminate the AMDGPU implicit arguments.
add_offload_test_device_code(noargs.c noargs -O3)

# Aggregate targets that the unit tests depend on.
add_custom_target(
  OffloadUnitTestsDeviceBins
  DEPENDS ${BIN_PATHS}
)
add_custom_target(
  offload_device_binaries
  DEPENDS foo.bin bar.bin noargs.bin
)

# Tell the parent directory where the device binaries are written.
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)