Files
clang-p2996/openmp/libomptarget/DeviceRTL/src/Utils.cpp
Joseph Huber 9223315903 [DeviceRTL] Allow IsSPMDMode to be optimized out in LTO mode
A previous patch merged the static and bitcode versions of the
deviceRTL. We previously used the static library's separate compilation
to set a special flag that prevented `IsSPMDMode` from being put in the
used list and preventing it from being optimized out. When they were
merged we could no longer do this separate compilation that allowed
users of LTO to get more optimal code.

This patch rearranges the code. The `IsSPMDMode` global is now
transitively used by its inclusion in the changed `__keep_alive`
function. This allows us to then manually delete the `__keep_alive`
function from the module when building the static library via
`llvm-extract`. The result is that the bitcode library correctly will
maintain the needed shared state, while the static library will be able
to internalize it and optimize it out.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D135280
2022-10-05 14:40:01 -05:00

169 lines
5.0 KiB
C++

//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#include "Utils.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#pragma omp begin declare target device_type(nohost)
using namespace _OMP;
extern "C" __attribute__((weak)) int IsSPMDMode;
/// Helper to keep code alive without introducing a performance penalty.
extern "C" __attribute__((weak, optnone, cold, used, retain)) void
__keep_alive() {
__kmpc_get_hardware_thread_id_in_block();
__kmpc_get_hardware_num_threads_in_block();
__kmpc_get_warp_size();
__kmpc_barrier_simple_spmd(nullptr, IsSPMDMode);
__kmpc_barrier_simple_generic(nullptr, IsSPMDMode);
}
namespace impl {
bool isSharedMemPtr(const void *Ptr) { return false; }
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
*LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
*HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
}
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
#pragma omp end declare variant
///}
/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
uint32_t LowBitsLocal, HighBitsLocal;
asm("mov.b64 {%0,%1}, %2;"
: "=r"(LowBitsLocal), "=r"(HighBitsLocal)
: "l"(Val));
*LowBits = LowBitsLocal;
*HighBits = HighBitsLocal;
}
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
uint64_t Val;
asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
return Val;
}
#pragma omp end declare variant
///}
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
int Width = mapping::getWarpSize();
int Self = mapping::getThreadIdInWarp();
int Index = SrcLane + (Self & ~(Width - 1));
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width) {
int Self = mapping::getThreadIdInWarp();
int Index = Self + LaneDelta;
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
bool isSharedMemPtr(const void * Ptr) {
return __builtin_amdgcn_is_shared((const __attribute__((address_space(0))) void *)Ptr);
}
#pragma omp end declare variant
///}
/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
}
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
#pragma omp end declare variant
///}
} // namespace impl
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
return impl::Pack(LowBits, HighBits);
}
void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
impl::Unpack(Val, &LowBits, &HighBits);
}
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
return impl::shuffle(Mask, Var, SrcLane);
}
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
int32_t Width) {
return impl::shuffleDown(Mask, Var, Delta, Width);
}
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
extern "C" {
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
FunctionTracingRAII();
return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
}
int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
FunctionTracingRAII();
uint32_t lo, hi;
utils::unpack(Val, lo, hi);
hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
lo = impl::shuffleDown(lanes::All, lo, Delta, Width);
return utils::pack(lo, hi);
}
}
#pragma omp end declare target