From 77ad061923418ba0f4c8fd4a0710a5ace825bf8e Mon Sep 17 00:00:00 2001 From: Josep Pinot Date: Fri, 14 Mar 2025 08:02:23 +0100 Subject: [PATCH] [OpenMP] Update OpenMP runtime to adopt taskgraph clause from 6.0 Specs (#130751) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updating OpenMP runtime taskgraph support(record/replay mechanism): - Adds a `graph_reset` bit in `kmp_taskgraph_flags_t` to discard existing TDG records. - Switches from a strict index-based TDG ID/IDX to a more flexible integer-based, which can be any integer (e.g. hashed). - Adds helper functions like `__kmp_find_tdg`, `__kmp_alloc_tdg`, and `__kmp_free_tdg` to manage TDGs by their IDs. These changes pave the way for the integration of OpenMP taskgraph (spec 6.0). Taskgraphs are still recorded in an array with a lookup efficiency reduced to O(n), where n ≤ `__kmp_max_tdgs`. This can be optimized by moving the TDGs to a hashtable, making lookups more efficient. The provided helper routines facilitate easier future optimizations. --- openmp/runtime/src/kmp.h | 6 +- openmp/runtime/src/kmp_global.cpp | 3 +- openmp/runtime/src/kmp_tasking.cpp | 130 ++++++++++++------ .../tasking/omp_record_replay_random_id.cpp | 47 +++++++ .../test/tasking/omp_record_replay_reset.cpp | 47 +++++++ 5 files changed, 189 insertions(+), 44 deletions(-) create mode 100644 openmp/runtime/test/tasking/omp_record_replay_random_id.cpp create mode 100644 openmp/runtime/test/tasking/omp_record_replay_reset.cpp diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 9b8c6102dbee..856f14e5f057 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2606,7 +2606,9 @@ typedef struct { typedef struct kmp_taskgraph_flags { /*This needs to be exactly 32 bits */ unsigned nowait : 1; unsigned re_record : 1; - unsigned reserved : 30; + unsigned graph_reset : 1; /* 1==discard taskgraph record, 0==use taskgraph + record */ + unsigned reserved : 29; } kmp_taskgraph_flags_t; /// Represents a TDG node @@ -2650,7 +2652,7 @@ typedef struct kmp_tdg_info { extern int __kmp_tdg_dot; extern kmp_int32 __kmp_max_tdgs; extern kmp_tdg_info_t **__kmp_global_tdgs; -extern kmp_int32 __kmp_curr_tdg_idx; +extern kmp_tdg_info_t *__kmp_curr_tdg; extern kmp_int32 __kmp_successors_size; extern std::atomic __kmp_tdg_task_id; extern kmp_int32 __kmp_num_tdg; diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index 52e0fdbdfb1d..7b6bfff7c54e 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -554,8 +554,7 @@ int *__kmp_nesting_nth_level; int __kmp_tdg_dot = 0; kmp_int32 __kmp_max_tdgs = 100; kmp_tdg_info_t **__kmp_global_tdgs = NULL; -kmp_int32 __kmp_curr_tdg_idx = - 0; // Id of the current TDG being recorded or executed +kmp_tdg_info_t *__kmp_curr_tdg = NULL; // Current TDG being recorded or executed kmp_int32 __kmp_num_tdg = 0; kmp_int32 __kmp_successors_size = 10; // Initial succesor size list for // recording diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 563aa29f6265..90004bfc8afe 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -1651,11 +1651,11 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, } #if OMPX_TASKGRAPH - kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + kmp_tdg_info_t *tdg = __kmp_curr_tdg; if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) && (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) { taskdata->is_taskgraph = 1; - taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; + taskdata->tdg = tdg; taskdata->td_task_id = KMP_GEN_TASK_ID(); taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id); } @@ -2577,14 +2577,11 @@ without help of the runtime library. */ void *__kmpc_task_reduction_init(int gtid, int num, void *data) { #if OMPX_TASKGRAPH - kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + kmp_tdg_info_t *tdg = __kmp_curr_tdg; if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) { - kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; - this_tdg->rec_taskred_data = - __kmp_allocate(sizeof(kmp_task_red_input_t) * num); - this_tdg->rec_num_taskred = num; - KMP_MEMCPY(this_tdg->rec_taskred_data, data, - sizeof(kmp_task_red_input_t) * num); + tdg->rec_taskred_data = __kmp_allocate(sizeof(kmp_task_red_input_t) * num); + tdg->rec_num_taskred = num; + KMP_MEMCPY(tdg->rec_taskred_data, data, sizeof(kmp_task_red_input_t) * num); } #endif return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); @@ -2604,14 +2601,11 @@ has two parameters, pointer to object to be initialized and pointer to omp_orig */ void *__kmpc_taskred_init(int gtid, int num, void *data) { #if OMPX_TASKGRAPH - kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + kmp_tdg_info_t *tdg = __kmp_curr_tdg; if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) { - kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; - this_tdg->rec_taskred_data = - __kmp_allocate(sizeof(kmp_task_red_input_t) * num); - this_tdg->rec_num_taskred = num; - KMP_MEMCPY(this_tdg->rec_taskred_data, data, - sizeof(kmp_task_red_input_t) * num); + tdg->rec_taskred_data = __kmp_allocate(sizeof(kmp_task_red_input_t) * num); + tdg->rec_num_taskred = num; + KMP_MEMCPY(tdg->rec_taskred_data, data, sizeof(kmp_task_red_input_t) * num); } #endif return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); @@ -2662,8 +2656,7 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { #if OMPX_TASKGRAPH if ((thread->th.th_current_task->is_taskgraph) && - (!__kmp_tdg_is_recording( - __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) { + (!__kmp_tdg_is_recording(__kmp_curr_tdg->tdg_status))) { tg = thread->th.th_current_task->td_taskgroup; KMP_ASSERT(tg != NULL); KMP_ASSERT(tg->reduce_data != NULL); @@ -5452,7 +5445,6 @@ bool __kmpc_omp_has_task_team(kmp_int32 gtid) { #if OMPX_TASKGRAPH // __kmp_find_tdg: identify a TDG through its ID -// gtid: Global Thread ID // tdg_id: ID of the TDG // returns: If a TDG corresponding to this ID is found and not // its initial state, return the pointer to it, otherwise nullptr @@ -5465,12 +5457,71 @@ static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) { __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate( sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs); - if ((__kmp_global_tdgs[tdg_id]) && - (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE)) - res = __kmp_global_tdgs[tdg_id]; + for (kmp_int32 tdg_idx = 0; tdg_idx < __kmp_max_tdgs; tdg_idx++) { + if (__kmp_global_tdgs[tdg_idx] && + __kmp_global_tdgs[tdg_idx]->tdg_id == tdg_id) { + if (__kmp_global_tdgs[tdg_idx]->tdg_status != KMP_TDG_NONE) + res = __kmp_global_tdgs[tdg_idx]; + break; + } + } return res; } +// __kmp_alloc_tdg: Allocates a TDG if it doesn't already exist. +// tdg_id: ID of the TDG. +// returns: A pointer to the TDG if it already exists. Otherwise, +// allocates a new TDG if the maximum limit has not been reached. +// Returns nullptr if no TDG can be allocated. +static kmp_tdg_info_t *__kmp_alloc_tdg(kmp_int32 tdg_id) { + kmp_tdg_info_t *res = nullptr; + if ((res = __kmp_find_tdg(tdg_id))) + return res; + + if (__kmp_num_tdg > __kmp_max_tdgs) + return res; + + for (kmp_int32 tdg_idx = 0; tdg_idx < __kmp_max_tdgs; tdg_idx++) { + if (!__kmp_global_tdgs[tdg_idx]) { + kmp_tdg_info_t *tdg = + (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t)); + __kmp_global_tdgs[tdg_idx] = tdg; + __kmp_curr_tdg = tdg; + res = __kmp_global_tdgs[tdg_idx]; + break; + } + } + return res; +} + +// __kmp_free_tdg: Frees a TDG if it exists. +// tdg_id: ID of the TDG to be freed. +// returns: true if a TDG with the given ID was found and successfully freed, +// false if no such TDG exists. +static bool __kmp_free_tdg(kmp_int32 tdg_id) { + kmp_tdg_info_t *tdg = nullptr; + if (__kmp_global_tdgs == NULL) + return false; + + for (kmp_int32 tdg_idx = 0; tdg_idx < __kmp_max_tdgs; tdg_idx++) { + if (__kmp_global_tdgs[tdg_idx] && + __kmp_global_tdgs[tdg_idx]->tdg_id == tdg_id) { + tdg = __kmp_global_tdgs[tdg_idx]; + for (kmp_int map_idx = 0; map_idx < tdg->map_size; map_idx++) { + __kmp_free(tdg->record_map[map_idx].successors); + } + __kmp_free(tdg->record_map); + if (tdg->root_tasks) + __kmp_free(tdg->root_tasks); + + __kmp_free(tdg); + __kmp_global_tdgs[tdg_idx] = NULL; + return true; + } + } + return false; +} + // __kmp_print_tdg_dot: prints the TDG to a dot file // tdg: ID of the TDG // gtid: Global Thread ID @@ -5505,7 +5556,7 @@ void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) { KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id)); } -// __kmp_start_record: launch the execution of a previous +// __kmp_exec_tdg: launch the execution of a previous // recorded TDG // gtid: Global Thread ID // tdg: ID of the TDG @@ -5565,9 +5616,7 @@ void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) { static inline void __kmp_start_record(kmp_int32 gtid, kmp_taskgraph_flags_t *flags, kmp_int32 tdg_id) { - kmp_tdg_info_t *tdg = - (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t)); - __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg; + kmp_tdg_info_t *tdg = __kmp_alloc_tdg(tdg_id); // Initializing the TDG structure tdg->tdg_id = tdg_id; tdg->map_size = INIT_MAPSIZE; @@ -5592,7 +5641,7 @@ static inline void __kmp_start_record(kmp_int32 gtid, KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0); } - __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map; + tdg->record_map = this_record_map; } // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark @@ -5600,34 +5649,34 @@ static inline void __kmp_start_record(kmp_int32 gtid, // loc_ref: Location of TDG, not used yet // gtid: Global Thread ID of the encountering thread // input_flags: Flags associated with the TDG -// tdg_id: ID of the TDG to record, for now, incremental integer +// tdg_id: ID of the TDG to record // returns: 1 if we record, otherwise, 0 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 input_flags, kmp_int32 tdg_id) { - kmp_int32 res; kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags; - KA_TRACE(10, - ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n", - gtid, loc_ref, input_flags, tdg_id)); + KA_TRACE(10, ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d " + "tdg_id=%d\n", + gtid, loc_ref, input_flags, tdg_id)); if (__kmp_max_tdgs == 0) { - KA_TRACE( - 10, - ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, " - "__kmp_max_tdgs = 0\n", - gtid, loc_ref, input_flags, tdg_id)); + KA_TRACE(10, ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d " + "tdg_id = %d, __kmp_max_tdgs = 0\n", + gtid, loc_ref, input_flags, tdg_id)); return 1; } __kmpc_taskgroup(loc_ref, gtid); + if (flags->graph_reset) { + __kmp_free_tdg(tdg_id); + __kmp_num_tdg--; + } if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) { // TODO: use re_record flag __kmp_exec_tdg(gtid, tdg); res = 0; } else { - __kmp_curr_tdg_idx = tdg_id; - KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs); + KMP_DEBUG_ASSERT(__kmp_num_tdg < __kmp_max_tdgs); __kmp_start_record(gtid, flags, tdg_id); __kmp_num_tdg++; res = 1; @@ -5690,10 +5739,11 @@ void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 input_flags, kmp_int32 tdg_id) { kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id); + KMP_DEBUG_ASSERT(tdg != NULL); KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording" " tdg=%d with flags=%d\n", gtid, loc_ref, tdg_id, input_flags)); - if (__kmp_max_tdgs) { + if (__kmp_max_tdgs && tdg) { // TODO: use input_flags->nowait __kmpc_end_taskgroup(loc_ref, gtid); if (__kmp_tdg_is_recording(tdg->tdg_status)) diff --git a/openmp/runtime/test/tasking/omp_record_replay_random_id.cpp b/openmp/runtime/test/tasking/omp_record_replay_random_id.cpp new file mode 100644 index 000000000000..58e90da4d782 --- /dev/null +++ b/openmp/runtime/test/tasking/omp_record_replay_random_id.cpp @@ -0,0 +1,47 @@ +// REQUIRES: ompx_taskgraph +// RUN: %libomp-cxx-compile-and-run +#include +#include +#define NT 10 + +// Compiler-generated code (emulation) +typedef struct ident { + void *dummy; +} ident_t; + +#ifdef __cplusplus +extern "C" { +int __kmpc_global_thread_num(ident_t *); +int __kmpc_start_record_task(ident_t *, int, int, int); +void __kmpc_end_record_task(ident_t *, int, int, int); +} +#endif + +static void func(int *num_exec) { (*num_exec)++; } + +int main() { + int num_exec = 0; + int num_tasks = 0; + int hash_id = 135343854; +#pragma omp parallel +#pragma omp single + for (int iter = 0; iter < NT; ++iter) { + int gtid = __kmpc_global_thread_num(nullptr); + int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, + /* tdg_id */ hash_id); + if (res) { + num_tasks++; +#pragma omp task + func(&num_exec); + } + __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, + /* tdg_id */ hash_id); + } + + assert(num_tasks == 1); + assert(num_exec == NT); + + std::cout << "Passed" << std::endl; + return 0; +} +// CHECK: Passed diff --git a/openmp/runtime/test/tasking/omp_record_replay_reset.cpp b/openmp/runtime/test/tasking/omp_record_replay_reset.cpp new file mode 100644 index 000000000000..123a9fa5a72f --- /dev/null +++ b/openmp/runtime/test/tasking/omp_record_replay_reset.cpp @@ -0,0 +1,47 @@ +// REQUIRES: ompx_taskgraph +// RUN: %libomp-cxx-compile-and-run +#include +#include +#define NT 10 + +// Compiler-generated code (emulation) +typedef struct ident { + void *dummy; +} ident_t; + +#ifdef __cplusplus +extern "C" { +int __kmpc_global_thread_num(ident_t *); +int __kmpc_start_record_task(ident_t *, int, int, int); +void __kmpc_end_record_task(ident_t *, int, int, int); +} +#endif + +static void func(int *num_exec) { (*num_exec)++; } + +int main() { + int num_exec = 0; + int num_tasks = 0; + int flags = 1 << 2; +#pragma omp parallel +#pragma omp single + for (int iter = 0; iter < NT; ++iter) { + int gtid = __kmpc_global_thread_num(nullptr); + int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ flags, + /* tdg_id */ 0); + if (res) { + num_tasks++; +#pragma omp task + func(&num_exec); + } + __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, + /* tdg_id */ 0); + } + + assert(num_tasks == NT); + assert(num_exec == NT); + + std::cout << "Passed" << std::endl; + return 0; +} +// CHECK: Passed