diff --git a/src/Profiling.cpp b/src/Profiling.cpp index e26a076df084..9951b74f0af0 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -133,6 +133,12 @@ class InjectProfiling : public IRMutator { unconditionally_set_current_func(stack.back())}); } + Stmt suspend_thread_but_keep_task_id(const Stmt &s) { + return Block::make({decr_active_threads(profiler_instance), + s, + incr_active_threads(profiler_instance)}); + } + private: using IRMutator::visit; @@ -499,7 +505,11 @@ class InjectProfiling : public IRMutator { Stmt stmt = For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body); if (update_active_threads) { - stmt = suspend_thread(stmt); + if (Internal::is_gpu(op->for_type)) { + stmt = suspend_thread_but_keep_task_id(stmt); + } else { + stmt = suspend_thread(stmt); + } } return stmt; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 4a49da3d6521..83d76ad3166d 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -910,7 +910,8 @@ extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf); * without specifying any buffer to synchronize on. * Calling this with a null device_interface is always illegal. */ -extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface); +extern int halide_device_sync_global(void *user_context, + const struct halide_device_interface_t *device_interface); /** Allocate device memory to back a halide_buffer_t. 
*/ extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf, diff --git a/src/runtime/device_interface.cpp b/src/runtime/device_interface.cpp index 1625a6698ccc..8bd2d4568baf 100644 --- a/src/runtime/device_interface.cpp +++ b/src/runtime/device_interface.cpp @@ -237,11 +237,12 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) { * This variation of the synchronizing is useful when a synchronization is desirable * without specifying any buffer to synchronize on. */ -WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) { +WEAK int halide_device_sync_global(void *user_context, + const struct halide_device_interface_t *device_interface) { if (device_interface == nullptr) { return halide_error_code_no_device_interface; } - // This function calls immediately the device_interface implementation to syncrhonize on + // This function calls immediately the device_interface implementation to synchronize on // "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync. return device_interface->impl->device_sync(user_context, nullptr); } diff --git a/src/runtime/profiler_common.cpp b/src/runtime/profiler_common.cpp index a5633898b8a9..fcf2d4433e4d 100644 --- a/src/runtime/profiler_common.cpp +++ b/src/runtime/profiler_common.cpp @@ -517,9 +517,10 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st for (int i = 0; i < p->num_funcs; i++) { halide_profiler_func_stats *fs = p->funcs + i; - // The first id is always a catch-all overhead - // slot. Only report overhead time if it's non-zero - if (i == 0 && fs->time == 0) { + // The first id is always a catch-all overhead slot (notably containing the asserts). + // The second id is always the "wait for parallel tasks" slot. + // Only report these times if they're non-zero + if ((i == 0 || i == 1) && fs->time == 0) { continue; }