clang-p2996/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
Joseph Huber 74f91741b6 [OpenMP] Use function tracing RAII for runtime functions.
This patch adds support for using function tracing features to track the
execution of runtime functions in the device runtime library. This is
enabled by first compiling the new runtime with
`-fopenmp-target-debug=3` and running with
`LIBOMPTARGET_DEVICE_RTL_DEBUG=3`. The output only tracks team 0 and
thread 0 so there isn't much output when using a generic region.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D112002
2021-10-29 14:57:11 -04:00

//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace _OMP;
namespace {
#pragma omp declare target
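
// Reduce the values held by a full warp down to lane 0 by repeatedly pulling
// in a value from `mask` lanes away and combining it; e.g. with a 32-lane
// warp the offsets are 16, 8, 4, 2, 1, after which lane 0 holds the
// reduction over all 32 lanes.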
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}
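
// Reduce a contiguous group of `size` lanes, which need not be a power of
// two, onto the lane with `tid` 0. The group is folded in half each round,
// rounding up, so e.g. size == 7 uses shuffle offsets 3, 2, 1.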
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
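// Reduce over an arbitrary, possibly non-contiguous, set of active lanes such
// as the ones left after divergence in an L2 parallel region. Each active
// lane starts with a logical id of twice its rank among the active lanes;
// every round it combines with the next active lane above it, halves its
// logical id, and drops out once that id becomes odd, so the lowest active
// lane ends up holding the result and is the one that returns 1.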
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpId();
  // Volta execution model:
  // For the Generic execution mode a parallel region has either 1 thread or a
  // multiple of 32 threads. For the SPMD execution mode we may have any
  // number of threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When we have more than [mapping::getWarpSize()] threads, a
  // block reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2 parallel region may enter
         // here; return early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] threads, a
  // block reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);
    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}
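
// Round a thread count down to a multiple of the warp size, but never below
// one; e.g. with 32-lane warps, 70 becomes 64 and 20 becomes 1.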
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}
uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
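
// IterCnt and Cnt implement a simple ticketing scheme for the teams-reduction
// buffer below: a team may write its partial result only while
// TeamId < IterCnt + num_of_records, and Cnt counts how many teams have
// committed their data in the current chunk of num_of_records slots.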
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;
} // namespace
extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
  FunctionTracingRAII();
  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                      shflFct, cpyFct, mapping::isSPMDMode(),
                                      false);
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  FunctionTracingRAII();
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We only ever allow a number of teams less than or equal
  // to the number of slots in the buffer.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);
    fence::system(__ATOMIC_SEQ_CST);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
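    // atomic::inc returns the previous value of Cnt and wraps it back to zero
    // once num_of_records teams have gone through, so ChunkTeamCount is this
    // team's zero-based position within the current chunk.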
    ChunkTeamCount =
        atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
  }
  // Synchronize
  if (mapping::isSPMDMode())
    __kmpc_barrier(Loc, TId);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  //   local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  // num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //   1. do reduction within each warp.
  //   2. do reduction across warps.
  //   3. write the final result to the main reduction variable
  //      by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] threads, a
      // block reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }

  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next [num_of_records] teams to proceed writing their
    // intermediate results to the global buffer.
    atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records),
                __ATOMIC_SEQ_CST);
  }

  return 0;
}
void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }
void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
}
#pragma omp end declare target
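
For reference, below is a minimal host-side sketch of the kind of OpenMP construct these entry points serve. The file name and compile line are illustrative assumptions; when clang offloads such a loop to an NVPTX or AMDGPU target, the parallel and teams reductions are expected to end up in __kmpc_nvptx_parallel_reduce_nowait_v2 and __kmpc_nvptx_teams_reduce_nowait_v2, though the exact calls emitted depend on the compiler version and flags.

// reduction_example.cpp (illustrative; not part of Reduction.cpp):
//   clang++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
//           -fopenmp-target-debug=3 reduction_example.cpp
//   LIBOMPTARGET_DEVICE_RTL_DEBUG=3 ./a.out
#include <cstdio>

int main() {
  int Sum = 0;
  // Reduction across all teams and threads on the device.
#pragma omp target teams distribute parallel for reduction(+ : Sum)
  for (int I = 0; I < 1024; ++I)
    Sum += I;
  std::printf("Sum = %d\n", Sum); // 523776 == 1023 * 1024 / 2
  return 0;
}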