diff --git a/src/Profiling.cpp b/src/Profiling.cpp index d9aa8d4cf8cc..2be058b3c8a6 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -439,16 +439,17 @@ class InjectProfiling : public IRMutator { } else { internal_error << "Expected to find a variable as first argument of the function call " << call->name << ".\n"; } - Expr device_interface; + bool requires_sync = false; if (call->name == "halide_copy_to_host") { int copy_to_host_id = get_func_id(buffer_name + " (copy to host)"); start_profiler = set_current_func(copy_to_host_id); - device_interface = Call::make(type_of(), - Call::buffer_get_device_interface, {call->args.back()}, Call::Extern); + requires_sync = false; } else if (call->name == "halide_copy_to_device") { int copy_to_device_id = get_func_id(buffer_name + " (copy to device)"); start_profiler = set_current_func(copy_to_device_id); - device_interface = call->args.back(); // The last argument to the copy calls is the device_interface. + requires_sync = true; + } else { + internal_error << "Unexpected function name.\n"; } if (start_profiler.defined()) { // The copy functions are followed by an assert, which we will wrap in the timed body. @@ -462,14 +463,17 @@ class InjectProfiling : public IRMutator { } else if (const AssertStmt *assert = op->body.as()) { copy_assert = assert; } - if (copy_assert && device_interface.defined()) { - Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface}); - - std::vector steps{ - AssertStmt::make(copy_assert->condition, copy_assert->message), - sync_and_assert, - set_current_func(stack.back()), - }; + if (copy_assert) { + std::vector steps; + steps.push_back(AssertStmt::make(copy_assert->condition, copy_assert->message)); + if (requires_sync) { + internal_assert(call->name == "halide_copy_to_device"); + Expr device_interface = call->args.back(); // The last argument to the copy_to_device calls is the device_interface. + Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface}); + steps.push_back(sync_and_assert); + } + steps.push_back(set_current_func(stack.back())); + if (other.defined()) { steps.push_back(mutate(other)); } diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 4f4b2f881f37..8fb792cf9243 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -880,8 +880,8 @@ extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf); * Wait for current GPU operations to complete. Calling this explicitly * should rarely be necessary, except maybe for profiling. * This variation of the synchronizing is useful when a synchronization is desirable - * without specifying any buffer to synchronize on. Calling this with a null device_interface - * is always legal and immediately returns with no error. + * without specifying any buffer to synchronize on. + * Calling this with a null device_interface is always illegal. */ extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface); diff --git a/src/runtime/device_interface.cpp b/src/runtime/device_interface.cpp index b21dee75c7f2..710d1259678d 100644 --- a/src/runtime/device_interface.cpp +++ b/src/runtime/device_interface.cpp @@ -239,7 +239,7 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) { */ WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) { if (device_interface == nullptr) { - return halide_error_code_success; + return halide_error_code_no_device_interface; } // This function calls immediately the device_interface implementation to syncrhonize on // "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync. diff --git a/src/runtime/profiler_common.cpp b/src/runtime/profiler_common.cpp index 58aa096724f5..ccbe0bf11ecb 100644 --- a/src/runtime/profiler_common.cpp +++ b/src/runtime/profiler_common.cpp @@ -444,10 +444,6 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st } const auto print_time_and_percentage = [&sstr, p](uint64_t time, size_t &cursor, bool light) { - if (light) { - sstr.erase(2); // Cheat a little. There is 5 characters margin anyway. - sstr << "[ "; - } float ft = time / (p->runs * 1000000.0f); if (ft < 10000) { sstr << " "; @@ -480,12 +476,11 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st } int percent = perthousand / 10; sstr << percent << "." << (perthousand - percent * 10) << "%)"; - if (light) { - sstr << " ]"; - } - cursor += 10; - while (sstr.size() < cursor) { - sstr << " "; + if (!light) { + cursor += 10; + while (sstr.size() < cursor) { + sstr << " "; + } } }; @@ -549,16 +544,18 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st const auto print_section_header = [&](const char *name, uint64_t total_time) { size_t cursor = 0; sstr.clear(); + sstr << " "; if (support_colors) { - sstr << "\033[2m\033[1m"; - cursor += 8; + sstr << "\033[90m\033[3m"; + cursor += 9; } - sstr << " [" << name << "]"; + sstr << "[" << name << " "; cursor += max_func_name_length + 7; while (sstr.size() < cursor) { - sstr << " "; + sstr << ":"; } print_time_and_percentage(total_time, cursor, true); + sstr << " ::::]"; if (support_colors) { sstr << "\033[0m"; } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 8fc403b298bb..62a039c6ea06 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -75,6 +75,7 @@ tests(GROUPS correctness debug_to_file_reorder.cpp deferred_loop_level.cpp deinterleave4.cpp + device_buffer_copies_with_profile.cpp device_buffer_copy.cpp device_copy_at_inner_loop.cpp device_crop.cpp diff --git a/test/correctness/device_buffer_copies_with_profile.cpp b/test/correctness/device_buffer_copies_with_profile.cpp new file mode 100644 index 000000000000..aa6a0475864c --- /dev/null +++ b/test/correctness/device_buffer_copies_with_profile.cpp @@ -0,0 +1,72 @@ +#include "Halide.h" + +using namespace Halide; + +int run_test(Target t) { + // Sliding window with the producer on the GPU and the consumer on + // the CPU. This requires a copy inside the loop over which we are + // sliding. Currently this copies the entire buffer back and + // forth, which is suboptimal in the general case. In this + // specific case we're folded over y, so copying the entire buffer + // is not much more than just copying the part that was modified. + + Func f0{"f0_on_cpu"}, f1{"f1_on_gpu"}, f2{"f2_on_cpu"}; + Var x, y, tx, ty; + + // Produce something on CPU + f0(x, y) = x + y; + f0.compute_root(); + + // Which we use to produce something on GPU, causing a copy_to_device. + f1(x, y) = f0(x, y) + f0(x, y + 1); + f1.compute_root().gpu_tile(x, y, tx, ty, 8, 8); + + // Which in turn we use to produce something on CPU, causing a copy_to_host. + f2(x, y) = f1(x, y) * 2; + f2.compute_root(); + + + // Make the buffer a little bigger so we actually can see the copy time. + Buffer out = f2.realize({2000, 2000}, t); + // Let's only verify a part of it... + for (int y = 0; y < 100; y++) { + for (int x = 0; x < 100; x++) { + int correct = 4 * (x + y) + 2; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + return 1; + } + } + } + return 0; +} + +int main(int argc, char **argv) { + Target t = get_jit_target_from_environment(); + if (!t.has_gpu_feature()) { + printf("[SKIP] no gpu feature enabled\n"); + return 0; + } + printf("Testing without profiler.\n"); + int result = run_test(t); + if (result != 0) { + return 1; + } + + printf("Testing thread based profiler.\n"); + result = run_test(t.with_feature(Target::Profile)); + if (result != 0) { + return 1; + } + if (t.os == Target::Linux) { + printf("Testing timer based profiler.\n"); + result = run_test(t.with_feature(Target::ProfileByTimer)); + if (result != 0) { + return 1; + } + } + + printf("Success!\n"); + return 0; +} diff --git a/test/performance/memory_profiler.cpp b/test/performance/memory_profiler.cpp index 2b758fe1f099..8ca5cf3c2295 100644 --- a/test/performance/memory_profiler.cpp +++ b/test/performance/memory_profiler.cpp @@ -22,7 +22,7 @@ void my_print(JITUserContext *, const char *msg) { int val; // printf("%s", msg); - val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f peak: %d num: %d avg: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f peak: %d num: %d avg: %d", &idx, &this_ms, &this_percentage, &this_threads, &this_heap_peak, &this_num_mallocs, &this_malloc_avg); if (val == 7) { @@ -31,7 +31,7 @@ void my_print(JITUserContext *, const char *msg) { malloc_avg = this_malloc_avg; } - val = sscanf(msg, " g_%d: %fms (%f%%) peak: %d num: %d avg: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) peak: %d num: %d avg: %d", &idx, &this_ms, &this_percentage, &this_heap_peak, &this_num_mallocs, &this_malloc_avg); if (val == 6) { @@ -40,13 +40,13 @@ void my_print(JITUserContext *, const char *msg) { malloc_avg = this_malloc_avg; } - val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f stack: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f stack: %d", &idx, &this_ms, &this_percentage, &this_threads, &this_stack_peak); if (val == 5) { stack_peak = this_stack_peak; } - val = sscanf(msg, " g_%d: %fms (%f%%) stack: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) stack: %d", &idx, &this_ms, &this_percentage, &this_stack_peak); if (val == 4) { stack_peak = this_stack_peak; diff --git a/test/performance/profiler.cpp b/test/performance/profiler.cpp index c71991b1f267..bf5d166c0e81 100644 --- a/test/performance/profiler.cpp +++ b/test/performance/profiler.cpp @@ -8,9 +8,9 @@ float ms = 0; void my_print(JITUserContext *, const char *msg) { float this_ms; float this_percentage; - int val = sscanf(msg, " fn13: %fms (%f", &this_ms, &this_percentage); + int val = sscanf(msg, " fn13: %fms (%f", &this_ms, &this_percentage); if (val != 2) { - val = sscanf(msg, " fn13$1: %fms (%f", &this_ms, &this_percentage); + val = sscanf(msg, " fn13$1: %fms (%f", &this_ms, &this_percentage); } if (val == 2) { ms = this_ms;