Skip to content

Commit

Permalink
Fixed the buffer copy bug. Added a new test that will cause buffer co…
Browse files Browse the repository at this point in the history
…pies in two directions within the compiled pipeline. This will catch this better in the future. Tweaked the profile report section header printing.
  • Loading branch information
mcourteaux committed Dec 1, 2023
1 parent ddb8bbf commit aaa448c
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 35 deletions.
28 changes: 16 additions & 12 deletions src/Profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,16 +439,17 @@ class InjectProfiling : public IRMutator {
} else {
internal_error << "Expected to find a variable as first argument of the function call " << call->name << ".\n";
}
Expr device_interface;
bool requires_sync = false;
if (call->name == "halide_copy_to_host") {
int copy_to_host_id = get_func_id(buffer_name + " (copy to host)");
start_profiler = set_current_func(copy_to_host_id);
device_interface = Call::make(type_of<const halide_device_interface_t *>(),
Call::buffer_get_device_interface, {call->args.back()}, Call::Extern);
requires_sync = false;
} else if (call->name == "halide_copy_to_device") {
int copy_to_device_id = get_func_id(buffer_name + " (copy to device)");
start_profiler = set_current_func(copy_to_device_id);
device_interface = call->args.back(); // The last argument to the copy calls is the device_interface.
requires_sync = true;
} else {
internal_error << "Unexpected function name.\n";
}
if (start_profiler.defined()) {
// The copy functions are followed by an assert, which we will wrap in the timed body.
Expand All @@ -462,14 +463,17 @@ class InjectProfiling : public IRMutator {
} else if (const AssertStmt *assert = op->body.as<AssertStmt>()) {
copy_assert = assert;
}
if (copy_assert && device_interface.defined()) {
Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface});

std::vector<Stmt> steps{
AssertStmt::make(copy_assert->condition, copy_assert->message),
sync_and_assert,
set_current_func(stack.back()),
};
if (copy_assert) {
std::vector<Stmt> steps;
steps.push_back(AssertStmt::make(copy_assert->condition, copy_assert->message));
if (requires_sync) {
internal_assert(call->name == "halide_copy_to_device");
Expr device_interface = call->args.back(); // The last argument to the copy_to_device calls is the device_interface.
Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface});
steps.push_back(sync_and_assert);
}
steps.push_back(set_current_func(stack.back()));

if (other.defined()) {
steps.push_back(mutate(other));
}
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/HalideRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -880,8 +880,8 @@ extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf);
* Wait for current GPU operations to complete. Calling this explicitly
* should rarely be necessary, except maybe for profiling.
* This variation of the synchronizing is useful when a synchronization is desirable
* without specifying any buffer to synchronize on. Calling this with a null device_interface
* is always legal and immediately returns with no error.
* without specifying any buffer to synchronize on.
* Calling this with a null device_interface is always illegal.
*/
extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface);

Expand Down
2 changes: 1 addition & 1 deletion src/runtime/device_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) {
*/
WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) {
if (device_interface == nullptr) {
return halide_error_code_success;
return halide_error_code_no_device_interface;
}
// This function calls immediately the device_interface implementation to syncrhonize on
// "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync.
Expand Down
25 changes: 11 additions & 14 deletions src/runtime/profiler_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -444,10 +444,6 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
}

const auto print_time_and_percentage = [&sstr, p](uint64_t time, size_t &cursor, bool light) {
if (light) {
sstr.erase(2); // Cheat a little. There is 5 characters margin anyway.
sstr << "[ ";
}
float ft = time / (p->runs * 1000000.0f);
if (ft < 10000) {
sstr << " ";
Expand Down Expand Up @@ -480,12 +476,11 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
}
int percent = perthousand / 10;
sstr << percent << "." << (perthousand - percent * 10) << "%)";
if (light) {
sstr << " ]";
}
cursor += 10;
while (sstr.size() < cursor) {
sstr << " ";
if (!light) {
cursor += 10;
while (sstr.size() < cursor) {
sstr << " ";
}
}
};

Expand Down Expand Up @@ -549,16 +544,18 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
const auto print_section_header = [&](const char *name, uint64_t total_time) {
size_t cursor = 0;
sstr.clear();
sstr << " ";
if (support_colors) {
sstr << "\033[2m\033[1m";
cursor += 8;
sstr << "\033[90m\033[3m";
cursor += 9;
}
sstr << " [" << name << "]";
sstr << "[" << name << " ";
cursor += max_func_name_length + 7;
while (sstr.size() < cursor) {
sstr << " ";
sstr << ":";
}
print_time_and_percentage(total_time, cursor, true);
sstr << " ::::]";
if (support_colors) {
sstr << "\033[0m";
}
Expand Down
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ tests(GROUPS correctness
debug_to_file_reorder.cpp
deferred_loop_level.cpp
deinterleave4.cpp
device_buffer_copies_with_profile.cpp
device_buffer_copy.cpp
device_copy_at_inner_loop.cpp
device_crop.cpp
Expand Down
72 changes: 72 additions & 0 deletions test/correctness/device_buffer_copies_with_profile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include "Halide.h"

using namespace Halide;

int run_test(Target t) {
// Sliding window with the producer on the GPU and the consumer on
// the CPU. This requires a copy inside the loop over which we are
// sliding. Currently this copies the entire buffer back and
// forth, which is suboptimal in the general case. In this
// specific case we're folded over y, so copying the entire buffer
// is not much more than just copying the part that was modified.

Func f0{"f0_on_cpu"}, f1{"f1_on_gpu"}, f2{"f2_on_cpu"};
Var x, y, tx, ty;

// Produce something on CPU
f0(x, y) = x + y;
f0.compute_root();

// Which we use to produce something on GPU, causing a copy_to_device.
f1(x, y) = f0(x, y) + f0(x, y + 1);
f1.compute_root().gpu_tile(x, y, tx, ty, 8, 8);

// Which in turn we use to produce something on CPU, causing a copy_to_host.
f2(x, y) = f1(x, y) * 2;
f2.compute_root();


// Make the buffer a little bigger so we actually can see the copy time.
Buffer<int> out = f2.realize({2000, 2000}, t);
// Let's only verify a part of it...
for (int y = 0; y < 100; y++) {
for (int x = 0; x < 100; x++) {
int correct = 4 * (x + y) + 2;
if (out(x, y) != correct) {
printf("out(%d, %d) = %d instead of %d\n",
x, y, out(x, y), correct);
return 1;
}
}
}
return 0;
}

int main(int argc, char **argv) {
Target t = get_jit_target_from_environment();
if (!t.has_gpu_feature()) {
printf("[SKIP] no gpu feature enabled\n");
return 0;
}
printf("Testing without profiler.\n");
int result = run_test(t);
if (result != 0) {
return 1;
}

printf("Testing thread based profiler.\n");
result = run_test(t.with_feature(Target::Profile));
if (result != 0) {
return 1;
}
if (t.os == Target::Linux) {
printf("Testing timer based profiler.\n");
result = run_test(t.with_feature(Target::ProfileByTimer));
if (result != 0) {
return 1;
}
}

printf("Success!\n");
return 0;
}
8 changes: 4 additions & 4 deletions test/performance/memory_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ void my_print(JITUserContext *, const char *msg) {
int val;

// printf("%s", msg);
val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f peak: %d num: %d avg: %d",
val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f peak: %d num: %d avg: %d",
&idx, &this_ms, &this_percentage, &this_threads, &this_heap_peak,
&this_num_mallocs, &this_malloc_avg);
if (val == 7) {
Expand All @@ -31,7 +31,7 @@ void my_print(JITUserContext *, const char *msg) {
malloc_avg = this_malloc_avg;
}

val = sscanf(msg, " g_%d: %fms (%f%%) peak: %d num: %d avg: %d",
val = sscanf(msg, " g_%d: %fms (%f%%) peak: %d num: %d avg: %d",
&idx, &this_ms, &this_percentage, &this_heap_peak,
&this_num_mallocs, &this_malloc_avg);
if (val == 6) {
Expand All @@ -40,13 +40,13 @@ void my_print(JITUserContext *, const char *msg) {
malloc_avg = this_malloc_avg;
}

val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f stack: %d",
val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f stack: %d",
&idx, &this_ms, &this_percentage, &this_threads, &this_stack_peak);
if (val == 5) {
stack_peak = this_stack_peak;
}

val = sscanf(msg, " g_%d: %fms (%f%%) stack: %d",
val = sscanf(msg, " g_%d: %fms (%f%%) stack: %d",
&idx, &this_ms, &this_percentage, &this_stack_peak);
if (val == 4) {
stack_peak = this_stack_peak;
Expand Down
4 changes: 2 additions & 2 deletions test/performance/profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ float ms = 0;
void my_print(JITUserContext *, const char *msg) {
float this_ms;
float this_percentage;
int val = sscanf(msg, " fn13: %fms (%f", &this_ms, &this_percentage);
int val = sscanf(msg, " fn13: %fms (%f", &this_ms, &this_percentage);
if (val != 2) {
val = sscanf(msg, " fn13$1: %fms (%f", &this_ms, &this_percentage);
val = sscanf(msg, " fn13$1: %fms (%f", &this_ms, &this_percentage);
}
if (val == 2) {
ms = this_ms;
Expand Down

0 comments on commit aaa448c

Please sign in to comment.