[Offload][OMPX] Add the runtime support for multi-dim grid and block (#118042)

This commit is contained in:
Shilei Tian
2024-12-06 09:07:50 -05:00
committed by GitHub
parent 2f4eac6287
commit 92376c3ff5
13 changed files with 168 additions and 95 deletions

View File

@@ -559,15 +559,15 @@ struct AMDGPUKernelTy : public GenericKernelTy {
}
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads,
uint64_t NumBlocks) const override;
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
uint32_t NumBlocks[3]) const override;
/// Get group and private segment kernel size.
uint32_t getGroupSize() const { return GroupSize; }
@@ -719,7 +719,7 @@ struct AMDGPUQueueTy {
/// Push a kernel launch to the queue. The kernel launch requires an output
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t GroupSize, uint64_t StackSize,
AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
@@ -746,14 +746,18 @@ struct AMDGPUQueueTy {
assert(Packet && "Invalid packet");
// The first 32 bits of the packet are written after the other fields
uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
Packet->workgroup_size_x = NumThreads;
Packet->workgroup_size_y = 1;
Packet->workgroup_size_z = 1;
uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1
? 3
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
uint16_t Setup = UINT16_C(Dims)
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
Packet->workgroup_size_x = NumThreads[0];
Packet->workgroup_size_y = NumThreads[1];
Packet->workgroup_size_z = NumThreads[2];
Packet->reserved0 = 0;
Packet->grid_size_x = NumBlocks * NumThreads;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->grid_size_x = NumBlocks[0] * NumThreads[0];
Packet->grid_size_y = NumBlocks[1] * NumThreads[1];
Packet->grid_size_z = NumBlocks[2] * NumThreads[2];
Packet->private_segment_size =
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
Packet->group_segment_size = GroupSize;
@@ -1240,7 +1244,7 @@ public:
/// the kernel finalizes. Once the kernel is finished, the stream will release
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t GroupSize, uint64_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
@@ -2827,10 +2831,10 @@ private:
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
if (auto Err =
AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -3328,7 +3332,7 @@ private:
};
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3385,13 +3389,15 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (ImplArgs &&
getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks;
ImplArgs->BlockCountY = 1;
ImplArgs->BlockCountZ = 1;
ImplArgs->GroupSizeX = NumThreads;
ImplArgs->GroupSizeY = 1;
ImplArgs->GroupSizeZ = 1;
ImplArgs->GridDims = 1;
ImplArgs->BlockCountX = NumBlocks[0];
ImplArgs->BlockCountY = NumBlocks[1];
ImplArgs->BlockCountZ = NumBlocks[2];
ImplArgs->GroupSizeX = NumThreads[0];
ImplArgs->GroupSizeY = NumThreads[1];
ImplArgs->GroupSizeZ = NumThreads[2];
ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
? 3
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
}
@@ -3402,8 +3408,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumThreads,
uint64_t NumBlocks) const {
uint32_t NumThreads[3],
uint32_t NumBlocks[3]) const {
// Only do all this when the output is requested
if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
return Plugin::success();
@@ -3440,12 +3446,13 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
// S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
// Tripcount: loop tripcount for the kernel
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
"#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
"#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS "
"Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
"%lu\n",
ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
LoopTripCount);
ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2],
ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2],
MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
SGPRSpillCount, VGPRSpillCount, LoopTripCount);
return Plugin::success();
}

View File

@@ -269,8 +269,9 @@ struct GenericKernelTy {
Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
@@ -320,15 +321,15 @@ protected:
/// Prints generic kernel launch information.
Error printLaunchInfo(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads,
uint64_t NumBlocks) const;
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
uint32_t NumBlocks[3]) const;
/// Prints plugin-specific kernel launch information after generic kernel
/// launch information
virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumThreads,
uint64_t NumBlocks) const;
uint32_t NumThreads[3],
uint32_t NumBlocks[3]) const;
private:
/// Prepare the arguments before launching the kernel.
@@ -347,7 +348,7 @@ private:
/// The number of threads \p NumThreads can be adjusted by this method.
/// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via
/// thread_limit clause.
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
uint32_t getNumBlocks(GenericDeviceTy &GenericDevice,
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;

View File

@@ -526,20 +526,21 @@ GenericKernelTy::getKernelLaunchEnvironment(
Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumThreads,
uint64_t NumBlocks) const {
uint32_t NumThreads[3],
uint32_t NumBlocks[3]) const {
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
"Launching kernel %s with %" PRIu64
" blocks and %d threads in %s mode\n",
getName(), NumBlocks, NumThreads, getExecutionModeName());
"Launching kernel %s with [%u,%u,%u] blocks and [%u,%u,%u] threads in "
"%s mode\n",
getName(), NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0],
NumThreads[1], NumThreads[2], getExecutionModeName());
return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
NumBlocks);
}
Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumThreads,
uint64_t NumBlocks) const {
uint32_t NumThreads[3],
uint32_t NumBlocks[3]) const {
return Plugin::success();
}
@@ -566,10 +567,16 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Args, Ptrs, *KernelLaunchEnvOrErr);
}
uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
uint64_t NumBlocks =
getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount,
NumThreads, KernelArgs.ThreadLimit[0] > 0);
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
KernelArgs.ThreadLimit[1],
KernelArgs.ThreadLimit[2]};
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
KernelArgs.NumTeams[2]};
if (!IsBareKernel) {
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
}
// Record the kernel description after we modified the argument count and num
// blocks/threads.
@@ -578,7 +585,8 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
RecordReplay.saveImage(getName(), getImage());
RecordReplay.saveKernelInput(getName(), getImage());
RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
NumBlocks, NumThreads, KernelArgs.Tripcount);
NumBlocks[0], NumThreads[0],
KernelArgs.Tripcount);
}
if (auto Err =
@@ -618,11 +626,10 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs(
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
uint32_t ThreadLimitClause[3]) const {
assert(ThreadLimitClause[1] == 0 && ThreadLimitClause[2] == 0 &&
"Multi dimensional launch not supported yet.");
assert(!IsBareKernel && "bare kernel should not call this function");
if (IsBareKernel && ThreadLimitClause[0] > 0)
return ThreadLimitClause[0];
assert(ThreadLimitClause[1] == 1 && ThreadLimitClause[2] == 1 &&
"Multi dimensional launch not supported yet.");
if (ThreadLimitClause[0] > 0 && isGenericMode())
ThreadLimitClause[0] += GenericDevice.getWarpSize();
@@ -632,16 +639,15 @@ uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
: PreferredNumThreads);
}
uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
uint32_t NumTeamsClause[3],
uint64_t LoopTripCount,
uint32_t &NumThreads,
bool IsNumThreadsFromUser) const {
assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
"Multi dimensional launch not supported yet.");
assert(!IsBareKernel && "bare kernel should not call this function");
if (IsBareKernel && NumTeamsClause[0] > 0)
return NumTeamsClause[0];
assert(NumTeamsClause[1] == 1 && NumTeamsClause[2] == 1 &&
"Multi dimensional launch not supported yet.");
if (NumTeamsClause[0] > 0) {
// TODO: We need to honor any value and consequently allow more than the

View File

@@ -149,8 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
}
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
@@ -1228,10 +1228,10 @@ private:
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
if (auto Err =
CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = CUDAKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
@@ -1274,7 +1274,7 @@ private:
};
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -1292,9 +1292,8 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
reinterpret_cast<void *>(&LaunchParams.Size),
CU_LAUNCH_PARAM_END};
CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
/*gridDimZ=*/1, NumThreads,
/*blockDimY=*/1, /*blockDimZ=*/1,
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
NumThreads[0], NumThreads[1], NumThreads[2],
MaxDynCGroupMem, Stream, nullptr, Config);
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}

View File

@@ -89,8 +89,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
}
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.

View File

@@ -284,14 +284,25 @@ static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
LocalKernelArgs.Flags = KernelArgs->Flags;
LocalKernelArgs.DynCGroupMem = 0;
LocalKernelArgs.NumTeams[0] = NumTeams;
LocalKernelArgs.NumTeams[1] = 0;
LocalKernelArgs.NumTeams[2] = 0;
LocalKernelArgs.NumTeams[1] = 1;
LocalKernelArgs.NumTeams[2] = 1;
LocalKernelArgs.ThreadLimit[0] = ThreadLimit;
LocalKernelArgs.ThreadLimit[1] = 0;
LocalKernelArgs.ThreadLimit[2] = 0;
LocalKernelArgs.ThreadLimit[1] = 1;
LocalKernelArgs.ThreadLimit[2] = 1;
return &LocalKernelArgs;
}
// FIXME: This is a WA to "calibrate" the bad work done in the front end.
// Delete this ugly code after the front end emits proper values.
auto CorrectMultiDim = [](uint32_t(&Val)[3]) {
if (Val[1] == 0)
Val[1] = 1;
if (Val[2] == 0)
Val[2] = 1;
};
CorrectMultiDim(KernelArgs->ThreadLimit);
CorrectMultiDim(KernelArgs->NumTeams);
return KernelArgs;
}
@@ -320,12 +331,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
KernelArgs =
upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit);
assert(KernelArgs->NumTeams[0] == static_cast<uint32_t>(NumTeams) &&
!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
"OpenMP interface should not use multiple dimensions");
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
"Runtime: target exe",
"NumTeams=" + std::to_string(NumTeams) +

View File

@@ -1451,8 +1451,6 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
"Multi dimensional launch not supported yet.");
/// RAII to establish tool anchors before and after kernel launch
int32_t NumTeams = KernelArgs.NumTeams[0];
// No need to guard this with OMPT_IF_BUILT

View File

@@ -1,11 +1,12 @@
// RUN: %libomptarget-compile-generic
// RUN: env OMP_NUM_TEAMS=1 OMP_TEAMS_THREAD_LIMIT=1 LIBOMPTARGET_INFO=16 \
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
// REQUIRES: gpu
#define N 256
int main() {
// CHECK: Launching kernel [[KERNEL:.+_main_.+]] with 1 blocks and 1 threads
// CHECK: Launching kernel [[KERNEL:.+_main_.+]] with [1,1,1] blocks and [1,1,1] threads
#pragma omp target teams
#pragma omp parallel
{}

View File

@@ -42,7 +42,7 @@ int main() {
// INFO: info: {{.*}} {{.*}} 256 1 0 A[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}
// INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments:
// INFO: info: firstprivate(val)[4]
// INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode
// INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with [{{[0-9]+}},1,1] blocks and [{{[0-9]+}},1,1] threads in Generic mode
// AMDGPU: AMDGPU device {{[0-9]}} info: #Args: {{[0-9]}} Teams x Thrds: {{[0-9]+}}x {{[0-9]+}} (MaxFlatWorkGroupSize: {{[0-9]+}}) LDS Usage: {{[0-9]+}}B #SGPRs/VGPRs: {{[0-9]+}}/{{[0-9]+}} #SGPR/VGPR Spills: {{[0-9]+}}/{{[0-9]+}} Tripcount: {{[0-9]+}}
// INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}:
// INFO: info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration

View File

@@ -15,7 +15,7 @@ int main(int argc, char *argv[]) {
const int N = num_blocks * block_size;
int *data = (int *)malloc(N * sizeof(int));
// CHECK: "PluginInterface" device 0 info: Launching kernel __omp_offloading_{{.*}} with 64 blocks and 64 threads in SPMD mode
// CHECK: "PluginInterface" device 0 info: Launching kernel __omp_offloading_{{.*}} with [64,1,1] blocks and [64,1,1] threads in SPMD mode
#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(from: data[0:N])
{

View File

@@ -0,0 +1,56 @@
// RUN: %libomptarget-compilexx-generic
// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | %fcheck-generic
// REQUIRES: gpu
#include <ompx.h>
#include <cassert>
#include <vector>
// CHECK: "PluginInterface" device 0 info: Launching kernel __omp_offloading_{{.*}} with [2,4,6] blocks and [32,4,2] threads in SPMD mode
int main(int argc, char *argv[]) {
int bs[3] = {32u, 4u, 2u};
int gs[3] = {2u, 4u, 6u};
int n = bs[0] * bs[1] * bs[2] * gs[0] * gs[1] * gs[2];
std::vector<int> x_buf(n);
std::vector<int> y_buf(n);
std::vector<int> z_buf(n);
auto x = x_buf.data();
auto y = y_buf.data();
auto z = z_buf.data();
for (int i = 0; i < n; ++i) {
x[i] = i;
y[i] = i + 1;
}
#pragma omp target teams ompx_bare num_teams(gs[0], gs[1], gs[2]) \
thread_limit(bs[0], bs[1], bs[2]) map(to : x[ : n], y[ : n]) \
map(from : z[ : n])
{
int tid_x = ompx_thread_id_x();
int tid_y = ompx_thread_id_y();
int tid_z = ompx_thread_id_z();
int gid_x = ompx_block_id_x();
int gid_y = ompx_block_id_y();
int gid_z = ompx_block_id_z();
int bs_x = ompx_block_dim_x();
int bs_y = ompx_block_dim_y();
int bs_z = ompx_block_dim_z();
int bs = bs_x * bs_y * bs_z;
int gs_x = ompx_grid_dim_x();
int gs_y = ompx_grid_dim_y();
int gid = gid_z * gs_y * gs_x + gid_y * gs_x + gid_x;
int tid = tid_z * bs_x * bs_y + tid_y * bs_x + tid_x;
int i = gid * bs + tid;
z[i] = x[i] + y[i];
}
for (int i = 0; i < n; ++i) {
if (z[i] != (2 * i + 1))
return 1;
}
return 0;
}

View File

@@ -12,26 +12,26 @@
__attribute__((optnone)) void optnone() {}
int main() {
// DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
// DEFAULT: Launching kernel {{.+_main_.+}} with [4,1,1] blocks and [32,1,1] threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with [16,1,1] blocks and [8,1,1] threads in SPMD mode
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < N; ++i) {
optnone();
}
// DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
// DEFAULT: Launching kernel {{.+_main_.+}} with [4,1,1] blocks and [32,1,1] threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with [16,1,1] blocks and [8,1,1] threads in SPMD mode
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < N - 1; ++i) {
optnone();
}
// DEFAULT: Launching kernel {{.+_main_.+}} with 5 blocks and 32 threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with 17 blocks and 8 threads in SPMD mode
// DEFAULT: Launching kernel {{.+_main_.+}} with [5,1,1] blocks and [32,1,1] threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with [17,1,1] blocks and [8,1,1] threads in SPMD mode
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < N + 1; ++i) {
optnone();
}
// DEFAULT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
// DEFAULT: Launching kernel {{.+_main_.+}} with [32,1,1] blocks and [4,1,1] threads in SPMD mode
// EIGHT: Launching kernel {{.+_main_.+}} with [32,1,1] blocks and [4,1,1] threads in SPMD mode
#pragma omp target teams distribute parallel for simd thread_limit(4)
for (int i = 0; i < N; ++i) {
optnone();

View File

@@ -25,4 +25,4 @@ int main(int argc, char *argv[]) {
return 0;
}
// CHECK: Launching kernel {{.*}} with 4 blocks and 256 threads in SPMD mode
// CHECK: Launching kernel {{.*}} with [4,1,1] blocks and [256,1,1] threads in SPMD mode