* Serial teams now use a stack (similar to dispatch buffers).
* Serial teams always use `t_task_team[0]` as the task team; the second pointer, `t_task_team[1]`, is a next pointer, so it is interpreted as a stack of task teams where each level is a nested level:

```
inner serial team                     outer serial team
[ t_task_team[0] ] -> (task_team)     [ t_task_team[0] ] -> (task_team)
[ next           ] -----------------> [ next           ] -> ...
```

* Remove the task state memo stack from the thread structure.
* Instead of a thread-private stack, use the team structure to store the th_task_state of the primary thread. When coming out of a parallel region, restore the primary thread's task state. The new field in the team structure does not cause sizeof(team) to change, and it lies in the cache line that is only read/written by the primary thread.

Fixes: #50602
Fixes: #69368
Fixes: #69733
Fixes: #79416
319 lines
9.3 KiB
C++
319 lines
9.3 KiB
C++
// RUN: %libomp-cxx-compile
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
|
|
//
|
|
// RUN: %libomp-cxx-compile -DUSE_HIDDEN_HELPERS=1
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
|
|
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
|
|
|
|
// This test stresses the task team mechanism by running a simple
|
|
// increment task over and over with varying number of threads and nesting.
|
|
// The test covers nested serial teams and mixing serial teams with
|
|
// normal active teams.
|
|
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <omp.h>
|
|
|
|
// The number of times to run each test
#define NTIMES 5
|
|
|
|
// Regular single increment task.
// Creates one OpenMP task whose body atomically increments the integer that
// `a` points to. The atomic is needed because several threads may create
// tasks that target the same counter.
void task_inc_a(int *a) {
#pragma omp task
  {
#pragma omp atomic
    (*a)++;
  }
}
|
|
|
|
// Splitting increment task that binary splits the incrementing task.
// Each call creates a task over the inclusive range [low, high]:
//   - a single-element range (low == high) atomically increments *a;
//   - a larger range is halved and each half is handed to a nested task;
//   - an empty range (low > high) creates a task that does nothing.
// A non-empty range therefore adds (high - low + 1) to *a in total.
void task_inc_split_a(int *a, int low, int high) {
#pragma omp task firstprivate(low, high)
  {
    if (low == high) {
#pragma omp atomic
      (*a)++;
    } else if (low < high) {
      // Written as low + half-width rather than (low + high) / 2.
      int mid = (high - low) / 2 + low;
      task_inc_split_a(a, low, mid);
      task_inc_split_a(a, mid + 1, high);
    }
  }
}
|
|
|
|
#ifdef USE_HIDDEN_HELPERS
|
|
// Hidden helper tasks force serial regions to create task teams.
// Issues an asynchronous (`nowait`) target region that maps a[0] to and from
// the target and atomically increments it there.
void task_inc_a_hidden_helper(int *a) {
#pragma omp target map(tofrom : a[0]) nowait
  {
#pragma omp atomic
    (*a)++;
  }
}
|
|
#else
|
|
// Detached tasks force serial regions to create task teams
|
|
void task_inc_a_detached(int *a, omp_event_handle_t handle) {
|
|
#pragma omp task detach(handle)
|
|
{
|
|
#pragma omp atomic
|
|
(*a)++;
|
|
omp_fulfill_event(handle);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Verifies that the counter `*a` currently equals `expected`.
// On mismatch, prints a diagnostic to stderr and terminates the whole
// process with EXIT_FAILURE; on match, returns normally.
void check_a(int *a, int expected) {
  if (*a != expected) {
    fprintf(stderr,
            "FAIL: a = %d instead of expected = %d. Compile with "
            "-DVERBOSE for more verbose output.\n",
            *a, expected);
    exit(EXIT_FAILURE);
  }
}
|
|
|
|
// Every thread creates a single "increment" task
|
|
void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
|
|
int tid = omp_get_thread_num();
|
|
|
|
task_inc_a(a);
|
|
|
|
#pragma omp barrier
|
|
check_a(a, expected);
|
|
#pragma omp barrier
|
|
check_a(a, expected);
|
|
#pragma omp barrier
|
|
|
|
#ifdef USE_HIDDEN_HELPERS
|
|
task_inc_a_hidden_helper(a);
|
|
#else
|
|
task_inc_a_detached(a, handles[tid]);
|
|
#endif
|
|
|
|
#pragma omp barrier
|
|
check_a(a, 2 * expected);
|
|
#pragma omp barrier
|
|
task_inc_a(a);
|
|
#pragma omp barrier
|
|
check_a(a, 3 * expected);
|
|
}
|
|
|
|
// Testing single level of parallelism with increment tasks
|
|
void test_base(int nthreads) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf(" test_base(%d)\n", nthreads);
|
|
#endif
|
|
int a = 0;
|
|
omp_event_handle_t *handles;
|
|
handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
|
|
#pragma omp parallel num_threads(nthreads) shared(a)
|
|
{ test_tasks(handles, nthreads, &a); }
|
|
free(handles);
|
|
}
|
|
|
|
// Testing nested parallel with increment tasks
|
|
// first = nthreads of outer parallel
|
|
// second = nthreads of nested parallel
|
|
void test_nest(int first, int second) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf(" test_nest(%d, %d)\n", first, second);
|
|
#endif
|
|
#pragma omp parallel num_threads(first)
|
|
{ test_base(second); }
|
|
}
|
|
|
|
// Testing 2-level nested parallels with increment tasks
|
|
// first = nthreads of outer parallel
|
|
// second = nthreads of nested parallel
|
|
// third = nthreads of second nested parallel
|
|
void test_nest2(int first, int second, int third) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf(" test_nest2(%d, %d, %d)\n", first, second, third);
|
|
#endif
|
|
#pragma omp parallel num_threads(first)
|
|
{ test_nest(second, third); }
|
|
}
|
|
|
|
// Testing 3-level nested parallels with increment tasks
|
|
// first = nthreads of outer parallel
|
|
// second = nthreads of nested parallel
|
|
// third = nthreads of second nested parallel
|
|
// fourth = nthreads of third nested parallel
|
|
void test_nest3(int first, int second, int third, int fourth) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf(" test_nest3(%d, %d, %d, %d)\n", first, second, third, fourth);
|
|
#endif
|
|
#pragma omp parallel num_threads(first)
|
|
{ test_nest2(second, third, fourth); }
|
|
}
|
|
|
|
// Testing 4-level nested parallels with increment tasks
|
|
// first = nthreads of outer parallel
|
|
// second = nthreads of nested parallel
|
|
// third = nthreads of second nested parallel
|
|
// fourth = nthreads of third nested parallel
|
|
// fifth = nthreads of fourth nested parallel
|
|
void test_nest4(int first, int second, int third, int fourth, int fifth) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth,
|
|
fifth);
|
|
#endif
|
|
#pragma omp parallel num_threads(first)
|
|
{ test_nest3(second, third, fourth, fifth); }
|
|
}
|
|
|
|
// Single thread starts a binary splitting "increment" task
|
|
// Detached tasks are still single "increment" task
|
|
void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
|
|
int tid = omp_get_thread_num();
|
|
|
|
#pragma omp single
|
|
task_inc_split_a(a, 1, expected); // task team A
|
|
|
|
#pragma omp barrier
|
|
check_a(a, expected);
|
|
#pragma omp barrier
|
|
check_a(a, expected);
|
|
#pragma omp barrier
|
|
|
|
#ifdef USE_HIDDEN_HELPERS
|
|
task_inc_a_hidden_helper(a);
|
|
#else
|
|
task_inc_a_detached(a, handles[tid]);
|
|
#endif
|
|
|
|
#pragma omp barrier
|
|
check_a(a, 2 * expected);
|
|
#pragma omp barrier
|
|
#pragma omp single
|
|
task_inc_split_a(a, 1, expected); // task team B
|
|
#pragma omp barrier
|
|
check_a(a, 3 * expected);
|
|
}
|
|
|
|
// Testing single level of parallelism with splitting incrementing tasks
|
|
void test_base_split(int nthreads) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf(" test_base_split(%d)\n", nthreads);
|
|
#endif
|
|
int a = 0;
|
|
omp_event_handle_t *handles;
|
|
handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
|
|
#pragma omp parallel num_threads(nthreads) shared(a)
|
|
{ test_tasks_split(handles, nthreads, &a); }
|
|
free(handles);
|
|
}
|
|
|
|
// Testing nested parallels with splitting tasks
|
|
// first = nthreads of outer parallel
|
|
// second = nthreads of nested parallel
|
|
void test_nest_split(int first, int second) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf(" test_nest_split(%d, %d)\n", first, second);
|
|
#endif
|
|
#pragma omp parallel num_threads(first)
|
|
{ test_base_split(second); }
|
|
}
|
|
|
|
// Testing doubly nested parallels with splitting tasks
|
|
// first = nthreads of outer parallel
|
|
// second = nthreads of nested parallel
|
|
// third = nthreads of second nested parallel
|
|
void test_nest2_split(int first, int second, int third) {
|
|
#ifdef VERBOSE
|
|
#pragma omp master
|
|
printf("test_nest2_split(%d, %d, %d)\n", first, second, third);
|
|
#endif
|
|
#pragma omp parallel num_threads(first)
|
|
{ test_nest_split(second, third); }
|
|
}
|
|
|
|
// Calls func(args...) `n` times in sequence, passing the same arguments on
// every iteration. Does nothing when n <= 0.
template <typename... Args>
void run_ntimes(int n, void (*func)(Args...), Args... args) {
  for (int i = 0; i < n; ++i) {
    func(args...);
  }
}
|
|
|
|
int main() {
|
|
omp_set_max_active_levels(5);
|
|
|
|
run_ntimes(NTIMES, test_base, 4);
|
|
run_ntimes(NTIMES, test_base, 1);
|
|
run_ntimes(NTIMES, test_base, 8);
|
|
run_ntimes(NTIMES, test_base, 2);
|
|
run_ntimes(NTIMES, test_base, 6);
|
|
run_ntimes(NTIMES, test_nest, 1, 1);
|
|
run_ntimes(NTIMES, test_nest, 1, 5);
|
|
run_ntimes(NTIMES, test_nest, 2, 6);
|
|
run_ntimes(NTIMES, test_nest, 1, 1);
|
|
run_ntimes(NTIMES, test_nest, 4, 3);
|
|
run_ntimes(NTIMES, test_nest, 3, 2);
|
|
run_ntimes(NTIMES, test_nest, 1, 1);
|
|
run_ntimes(NTIMES, test_nest2, 1, 1, 2);
|
|
run_ntimes(NTIMES, test_nest2, 1, 2, 1);
|
|
run_ntimes(NTIMES, test_nest2, 2, 2, 1);
|
|
run_ntimes(NTIMES, test_nest2, 2, 1, 1);
|
|
run_ntimes(NTIMES, test_nest2, 4, 2, 1);
|
|
run_ntimes(NTIMES, test_nest2, 4, 2, 2);
|
|
run_ntimes(NTIMES, test_nest2, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest2, 4, 2, 2);
|
|
run_ntimes(NTIMES, test_nest3, 1, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest3, 1, 2, 1, 1);
|
|
run_ntimes(NTIMES, test_nest3, 1, 1, 2, 1);
|
|
run_ntimes(NTIMES, test_nest3, 1, 1, 1, 2);
|
|
run_ntimes(NTIMES, test_nest3, 2, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest4, 2, 1, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest4, 1, 2, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest4, 1, 1, 2, 1, 1);
|
|
run_ntimes(NTIMES, test_nest4, 1, 1, 1, 2, 1);
|
|
run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 2);
|
|
run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest4, 1, 2, 1, 2, 1);
|
|
|
|
run_ntimes(NTIMES, test_base_split, 4);
|
|
run_ntimes(NTIMES, test_base_split, 2);
|
|
|
|
run_ntimes(NTIMES, test_base_split, 7);
|
|
|
|
run_ntimes(NTIMES, test_base_split, 1);
|
|
run_ntimes(NTIMES, test_nest_split, 4, 2);
|
|
run_ntimes(NTIMES, test_nest_split, 2, 1);
|
|
|
|
run_ntimes(NTIMES, test_nest_split, 7, 2);
|
|
run_ntimes(NTIMES, test_nest_split, 1, 1);
|
|
run_ntimes(NTIMES, test_nest_split, 1, 4);
|
|
|
|
run_ntimes(NTIMES, test_nest2_split, 1, 1, 2);
|
|
run_ntimes(NTIMES, test_nest2_split, 1, 2, 1);
|
|
run_ntimes(NTIMES, test_nest2_split, 2, 2, 1);
|
|
run_ntimes(NTIMES, test_nest2_split, 2, 1, 1);
|
|
run_ntimes(NTIMES, test_nest2_split, 4, 2, 1);
|
|
run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
|
|
run_ntimes(NTIMES, test_nest2_split, 1, 1, 1);
|
|
run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
|
|
|
|
printf("PASS\n");
|
|
return EXIT_SUCCESS;
|
|
}
|