clang-p2996/offload/DeviceRTL/src/Workshare.cpp
Johannes Doerfert 330d8983d2 [Offload] Move /openmp/libomptarget to /offload (#75125)
In a nutshell, this moves our libomptarget code to populate the offload
subproject.

With this commit, users need to enable the new LLVM/Offload subproject
as a runtime in their CMake configuration.
No further changes are expected for downstream code.
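
For example, a runtimes build that enables both OpenMP and Offload might be configured roughly like this (illustrative invocation; adjust paths and flags for your setup):

```
cmake -G Ninja ../llvm \
  -DLLVM_ENABLE_PROJECTS="clang" \
  -DLLVM_ENABLE_RUNTIMES="openmp;offload"
```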

Tests and other components still depend on OpenMP and have also not been
renamed. The results below are for a build in which OpenMP and Offload
are enabled runtimes. In addition to the pure `git mv`, we needed to
adjust some CMake files. Nothing is intended to change semantics.

```
ninja check-offload
```
Works with the X86 and AMDGPU offload tests.

```
ninja check-openmp
```
Still works but doesn't build offload tests anymore.

```
ls install/lib
```
Shows all expected libraries, incl.
- `libomptarget.devicertl.a`
- `libomptarget-nvptx-sm_90.bc`
- `libomptarget.rtl.amdgpu.so` -> `libomptarget.rtl.amdgpu.so.18git`
- `libomptarget.so` -> `libomptarget.so.18git`

Fixes: https://github.com/llvm/llvm-project/issues/75124

---------

Co-authored-by: Saiyedul Islam <Saiyedul.Islam@amd.com>
2024-04-22 09:51:33 -07:00

891 lines | 33 KiB | C++

//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
// TODO:
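// Per-thread bookkeeping for a dynamically scheduled loop. Trackers are
// chained via NextDST so nested worksharing regions each keep their own state
// (see pushDST/popDST below).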
struct DynamicScheduleTracker {
int64_t Chunk;
int64_t LoopUpperBound;
int64_t NextLowerBound;
int64_t Stride;
kmp_sched_t ScheduleType;
DynamicScheduleTracker *NextDST;
};
#define ASSERT0(...)
// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1
// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2
#pragma omp begin declare target device_type(nohost)
// TODO: This variable is a hack inherited from the old runtime.
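// Block-shared counter that hands out the next chunk for dynamic/guided
// schedules; reset in dispatch_init and advanced atomically in NextIter().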
static uint64_t SHARED(Cnt);
template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling with chunk
// Generic implementation of OMP loop scheduling with static policy
/*! \brief Calculate initial bounds for static loop and stride
* @param[in] loc location in code of the call (not used here)
* @param[in] global_tid global thread id
* @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
* @param[in] plastiter pointer to last iteration
* @param[in,out] pointer to loop lower bound. It will contain the value of the
* lower bound of the first chunk
* @param[in,out] pointer to loop upper bound. It will contain the value of the
* upper bound of the first chunk
* @param[in,out] pointer to loop stride. It will contain the value of the
* stride between two successive chunks executed by the same thread
* @param[in] loop increment bump
* @param[in] chunk size
*/
// helper function for static chunk
static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
T entityId, T numberOfEntities) {
// each thread executes multiple chunks all of the same size, except
// the last one
// distance between two successive chunks
stride = numberOfEntities * chunk;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
// Say ub' is the beginning of the last chunk. Then whoever has a
// lower bound plus a multiple of the increment equal to ub' is
// the last one.
T beginningLastChunk = inputUb - (inputUb % chunk);
last = ((beginningLastChunk - lb) % stride) == 0;
}
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling without chunk
// helper function for static no chunk
static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
T entityId, T numberOfEntities) {
// No chunk size specified. Each thread or warp gets at most one
// chunk; all chunks are of nearly equal size
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
T *plower, T *pupper, ST *pstride, ST chunk,
bool IsSPMDExecutionMode) {
int32_t gtid = omp_get_thread_num();
int numberOfActiveOMPThreads = omp_get_num_threads();
// All warps that are in excess of the maximum requested do
// not execute the loop
ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
"current thread is not needed here; error");
// copy
int lastiter = 0;
T lb = *plower;
T ub = *pupper;
ST stride = *pstride;
// init
switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
case kmp_sched_static_chunk: {
if (chunk > 0) {
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
[[fallthrough]];
} // note: if chunk <=0, use nochunk
case kmp_sched_static_balanced_chunk: {
if (chunk > 0) {
// round up to make sure the chunk is enough to cover all iterations
T tripCount = ub - lb + 1; // +1 because ub is inclusive
T span = (tripCount + numberOfActiveOMPThreads - 1) /
numberOfActiveOMPThreads;
// perform chunk adjustment
chunk = (span + chunk - 1) & ~(chunk - 1);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
T oldUb = ub;
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
if (ub > oldUb)
ub = oldUb;
break;
}
[[fallthrough]];
} // note: if chunk <=0, use nochunk
case kmp_sched_static_nochunk: {
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_static_chunk: {
if (chunk > 0) {
ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
omp_get_num_teams());
break;
}
[[fallthrough]];
} // note: if chunk <=0, use nochunk
case kmp_sched_distr_static_nochunk: {
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
omp_get_num_teams());
break;
}
case kmp_sched_distr_static_chunk_sched_static_chunkone: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
default: {
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
}
// copy back
*plastiter = lastiter;
*plower = lb;
*pupper = ub;
*pstride = stride;
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch Init
static int OrderedSchedule(kmp_sched_t schedule) {
return schedule >= kmp_sched_ordered_first &&
schedule <= kmp_sched_ordered_last;
}
static void dispatch_init(IdentTy *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
DynamicScheduleTracker *DST) {
int tid = mapping::getThreadIdInBlock();
T tnum = omp_get_num_threads();
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum,
"current thread is not needed here; error");
/* Currently just ignore the monotonic and non-monotonic modifiers
* (the compiler isn't producing them yet anyway).
* When it does, we'll want to look at them somewhere here and use that
* information to inform our schedule choice. We shouldn't need to pass
* them on; they merely affect which schedule we can legally choose for
* various dynamic cases (in particular, whether or not a stealing scheme
* is legal).
*/
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
// Process schedule.
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
if (OrderedSchedule(schedule))
__kmpc_barrier(loc, threadId);
schedule = kmp_sched_static_chunk;
chunk = tripCount; // one thread gets the whole loop
} else if (schedule == kmp_sched_runtime) {
// process runtime
omp_sched_t rtSched;
int ChunkInt;
omp_get_schedule(&rtSched, &ChunkInt);
chunk = ChunkInt;
switch (rtSched) {
case omp_sched_static: {
if (chunk > 0)
schedule = kmp_sched_static_chunk;
else
schedule = kmp_sched_static_nochunk;
break;
}
case omp_sched_auto: {
schedule = kmp_sched_static_chunk;
chunk = 1;
break;
}
case omp_sched_dynamic:
case omp_sched_guided: {
schedule = kmp_sched_dynamic;
break;
}
}
} else if (schedule == kmp_sched_auto) {
schedule = kmp_sched_static_chunk;
chunk = 1;
} else {
// ASSERT(LT_FUSSY,
// schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
// "unknown schedule %d & chunk %lld\n", (int)schedule,
// (long long)chunk);
}
// init schedules
if (schedule == kmp_sched_static_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
// save sched state
DST->ScheduleType = schedule;
// save ub
DST->LoopUpperBound = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
DST->Chunk = chunk;
DST->NextLowerBound = lb;
DST->Stride = stride;
} else if (schedule == kmp_sched_static_balanced_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
// save sched state
DST->ScheduleType = schedule;
// save ub
DST->LoopUpperBound = ub;
// compute static chunk
ST stride;
int lastiter = 0;
// round up to make sure the chunk is enough to cover all iterations
T span = (tripCount + tnum - 1) / tnum;
// perform chunk adjustment
chunk = (span + chunk - 1) & ~(chunk - 1);
T oldUb = ub;
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
if (ub > oldUb)
ub = oldUb;
// save computed params
DST->Chunk = chunk;
DST->NextLowerBound = lb;
DST->Stride = stride;
} else if (schedule == kmp_sched_static_nochunk) {
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
// save sched state
DST->ScheduleType = schedule;
// save ub
DST->LoopUpperBound = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
DST->Chunk = chunk;
DST->NextLowerBound = lb;
DST->Stride = stride;
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
// save data
DST->ScheduleType = schedule;
if (chunk < 1)
chunk = 1;
DST->Chunk = chunk;
DST->LoopUpperBound = ub;
DST->NextLowerBound = lb;
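// Reset the shared chunk counter exactly once; the surrounding barriers and
// the team fence ensure every thread sees Cnt == 0 before any chunk is
// handed out.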
__kmpc_barrier(loc, threadId);
if (tid == 0) {
Cnt = 0;
fence::team(atomic::seq_cst);
}
__kmpc_barrier(loc, threadId);
}
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
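// NextIter hands out one iteration index per active lane while issuing only
// a single atomic per warp: the lowest active lane adds the number of active
// lanes to Cnt, the old value is broadcast from that lane via shuffle, and
// each lane offsets it by its rank within the warp.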
static uint64_t NextIter() {
__kmpc_impl_lanemask_t active = mapping::activemask();
uint32_t leader = utils::ffs(active) - 1;
uint32_t change = utils::popc(active);
__kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
unsigned int rank = utils::popc(active & lane_mask_lt);
uint64_t warp_res = 0;
if (rank == 0) {
warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
}
warp_res = utils::shuffle(active, warp_res, leader);
return warp_res + rank;
}
static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
T loopUpperBound) {
T N = NextIter();
lb = loopLowerBound + N * chunkSize;
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
// a. lb and ub < loopUpperBound --> NOT_FINISHED
// b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
// NOT_FINISHED
// c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
// a.
if (lb <= loopUpperBound && ub < loopUpperBound) {
return NOT_FINISHED;
}
// b.
if (lb <= loopUpperBound) {
ub = loopUpperBound;
return LAST_CHUNK;
}
// c. if we are here, we are in case 'c'
lb = loopUpperBound + 2;
ub = loopUpperBound + 1;
return FINISHED;
}
static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
T *plower, T *pupper, ST *pstride,
DynamicScheduleTracker *DST) {
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule = DST->ScheduleType;
// xxx reduce to one
if (schedule == kmp_sched_static_chunk ||
schedule == kmp_sched_static_nochunk) {
T myLb = DST->NextLowerBound;
T ub = DST->LoopUpperBound;
// finished?
if (myLb > ub) {
return DISPATCH_FINISHED;
}
// not finished, save current bounds
ST chunk = DST->Chunk;
*plower = myLb;
T myUb = myLb + chunk - 1; // Clang uses i <= ub
if (myUb > ub)
myUb = ub;
*pupper = myUb;
*plast = (int32_t)(myUb == ub);
// increment next lower bound by the stride
ST stride = DST->Stride;
DST->NextLowerBound = myLb + stride;
return DISPATCH_NOTFINISHED;
}
ASSERT0(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"bad sched");
T myLb, myUb;
int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
DST->LoopUpperBound);
if (finished == FINISHED)
return DISPATCH_FINISHED;
// not finished (either not finished or last chunk)
*plast = (int32_t)(finished == LAST_CHUNK);
*plower = myLb;
*pupper = myUb;
*pstride = 1;
return DISPATCH_NOTFINISHED;
}
static void dispatch_fini() {
// nothing
}
////////////////////////////////////////////////////////////////////////////////
// end of template class that encapsulate all the helper functions
////////////////////////////////////////////////////////////////////////////////
};
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////
// TODO: This is a stopgap. We probably want to expand the dispatch API to take
// a DST pointer which can then be allocated properly without malloc.
static DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
*NewDST = DynamicScheduleTracker({0});
NewDST->NextDST = ThreadDSTPtr;
ThreadDSTPtr = NewDST;
return ThreadDSTPtr;
}
// Return the current DST.
static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
// Pop the current DST and restore the last one.
static void popDST() {
DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
memory::freeGlobal(ThreadDSTPtr, "remove DST");
ThreadDSTPtr = OldDST;
}
extern "C" {
// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
uint32_t lb, uint32_t ub, int32_t st,
int32_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
uint64_t lb, uint64_t ub, int64_t st,
int64_t chunk) {
DynamicScheduleTracker *DST = pushDST();
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}
// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
DynamicScheduleTracker *DST = peekDST();
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st, DST);
}
// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
popDST();
}
void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
popDST();
}
void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
popDST();
}
void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
popDST();
}
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////
void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr, int32_t chunk) {
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr, int32_t chunk) {
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr, int64_t chunk) {
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr, int64_t chunk) {
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
mapping::isSPMDMode());
}
void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}
namespace ompx {
/// Helper class to hide the generic loop nest and provide the template argument
/// throughout.
template <typename Ty> class StaticLoopChunker {
/// Generic loop nest that handles block and/or thread distribution in the
/// absence of user specified chunk sizes. This implicitly picks a block chunk
/// size equal to the number of threads in the block and a thread chunk size
/// equal to one. In contrast to the chunked version we can get away with a
/// single loop in this case.
static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
Ty NumBlocks, Ty BId, Ty NumThreads,
Ty TId, Ty NumIters,
bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * NumThreads;
// Start index in the normalized space.
Ty IV = BId * NumThreads + TId;
ASSERT(IV >= 0, "Bad index");
// Cover the entire iteration space; assumptions in the caller might allow
// us to simplify this loop to a conditional.
if (IV < NumIters) {
do {
// Execute the loop body.
LoopBody(IV, Arg);
// Every thread executed one block and thread chunk now.
IV += KernelIteration;
if (OneIterationPerThread)
return;
} while (IV < NumIters);
}
}
/// Generic loop nest that handles block and/or thread distribution in the
/// presence of user specified chunk sizes (for at least one of them).
static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
Ty BlockChunk, Ty NumBlocks, Ty BId,
Ty ThreadChunk, Ty NumThreads, Ty TId,
Ty NumIters,
bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * BlockChunk;
// Start index in the chunked space.
Ty IV = BId * BlockChunk + TId;
ASSERT(IV >= 0, "Bad index");
// Cover the entire iteration space; assumptions in the caller might allow
// us to simplify this loop to a conditional.
do {
Ty BlockChunkLeft =
BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
Ty ThreadChunkLeft =
ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
while (ThreadChunkLeft--) {
// Given the blocking it's hard to keep track of what to execute.
if (IV >= NumIters)
return;
// Execute the loop body.
LoopBody(IV, Arg);
if (OneIterationPerThread)
return;
++IV;
}
IV += KernelIteration;
} while (IV < NumIters);
}
public:
/// Worksharing `for`-loop.
static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(ThreadChunk >= 0, "Bad thread count");
// All threads need to participate but we don't know if we are in a
// parallel at all or if the user might have used a `num_threads` clause
// on the parallel and reduced the number compared to the block size.
// Since nested parallels are possible too we need to get the thread id
// from the `omp` getter and not the mapping directly.
Ty TId = omp_get_thread_num();
// There are no blocks involved here.
Ty BlockChunk = 0;
Ty NumBlocks = 1;
Ty BId = 0;
// If the thread chunk is not specified we pick a default now.
if (ThreadChunk == 0)
ThreadChunk = 1;
// If we know we have more threads than iterations we can indicate that to
// avoid an outer loop.
bool OneIterationPerThread = false;
if (config::getAssumeThreadsOversubscription()) {
ASSERT(NumThreads >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}
if (ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
OneIterationPerThread);
else
NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
NumIters, OneIterationPerThread);
}
/// Worksharing `distribute`-loop.
static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
Ty NumIters, Ty BlockChunk) {
ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(BlockChunk >= 0, "Bad block count");
// There are no threads involved here.
Ty ThreadChunk = 0;
Ty NumThreads = 1;
Ty TId = 0;
ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
// All teams need to participate.
Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
Ty BId = mapping::getBlockIdInKernel();
// If the block chunk is not specified we pick a default now.
if (BlockChunk == 0)
BlockChunk = NumThreads;
// If we know we have more blocks than iterations we can indicate that to
// avoid an outer loop.
bool OneIterationPerThread = false;
if (config::getAssumeTeamsOversubscription()) {
ASSERT(NumBlocks >= NumIters, "Broken assumption");
OneIterationPerThread = true;
}
if (BlockChunk != NumThreads)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
OneIterationPerThread);
else
NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
NumIters, OneIterationPerThread);
ASSERT(icv::Level == 0, "Bad distribute");
ASSERT(icv::ActiveLevel == 0, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
}
/// Worksharing `distribute parallel for`-loop.
static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
void *Arg, Ty NumIters, Ty NumThreads,
Ty BlockChunk, Ty ThreadChunk) {
ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
ASSERT(NumIters >= 0, "Bad iteration count");
ASSERT(BlockChunk >= 0, "Bad block count");
ASSERT(ThreadChunk >= 0, "Bad thread count");
// All threads need to participate but the user might have used a
// `num_threads` clause on the parallel and reduced the number compared to
// the block size.
Ty TId = mapping::getThreadIdInBlock();
// All teams need to participate.
Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
Ty BId = mapping::getBlockIdInKernel();
// If the block chunk is not specified we pick a default now.
if (BlockChunk == 0)
BlockChunk = NumThreads;
// If the thread chunk is not specified we pick a default now.
if (ThreadChunk == 0)
ThreadChunk = 1;
// If we know we have more threads (across all blocks) than iterations we
// can indicate that to avoid an outer loop.
bool OneIterationPerThread = false;
if (config::getAssumeTeamsOversubscription() &&
config::getAssumeThreadsOversubscription()) {
OneIterationPerThread = true;
ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
}
if (BlockChunk != NumThreads || ThreadChunk != 1)
NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
ThreadChunk, NumThreads, TId, NumIters,
OneIterationPerThread);
else
NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
NumIters, OneIterationPerThread);
ASSERT(icv::Level == 1, "Bad distribute");
ASSERT(icv::ActiveLevel == 1, "Bad distribute");
ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
}
};
} // namespace ompx
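// Macro that expands to the three generic static-loop entry points
// (__kmpc_for_static_loop*, __kmpc_distribute_static_loop*,
// __kmpc_distribute_for_static_loop*) for a given induction-variable width;
// instantiated below for 32- and 64-bit signed and unsigned types.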
#define OMP_LOOP_ENTRY(BW, TY) \
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
TY num_threads, TY block_chunk, TY thread_chunk) { \
ompx::StaticLoopChunker<TY>::DistributeFor( \
loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \
} \
[[gnu::flatten, clang::always_inline]] void \
__kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
void *arg, TY num_iters, \
TY block_chunk) { \
ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1, \
block_chunk); \
} \
[[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
TY num_threads, TY thread_chunk) { \
ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
thread_chunk); \
}
extern "C" {
OMP_LOOP_ENTRY(_4, int32_t)
OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}
#pragma omp end declare target