Skip to content

Commit

Permalink
Disallow async nestings that violate read-after-write dependencies
Browse files Browse the repository at this point in the history
Fixes #7867
  • Loading branch information
abadams committed Sep 27, 2023
1 parent 3926b02 commit 9c6c062
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 7 deletions.
52 changes: 52 additions & 0 deletions src/AsyncProducers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,23 +109,75 @@ class NoOpCollapsingMutator : public IRMutator {
class GenerateProducerBody : public NoOpCollapsingMutator {
const string &func;
vector<Expr> sema;
std::set<string> producers_dropped;
bool found_producer = false;

using NoOpCollapsingMutator::visit;

// Report a user error for an unsupported schedule: a non-async Func that is
// consumed by an async Func while being computed somewhere between that
// async Func's store_at and compute_at locations.
//
// producer:       name of the non-async Func that is being consumed.
// async_consumer: name of the async Func that consumes it.
void bad_producer_nesting_error(const string &producer, const string &async_consumer) {
    user_error
        << "The non-async Func " << producer << " is consumed by async Func " << async_consumer
        << " and is compute_at a location in between the store_at "
        << "location and the compute_at location of " << async_consumer
        << ". This is unsupported. Either schedule " << producer
        // The message previously read "schedule <producer> it outside ...";
        // the stray "it" has been removed so the sentence parses.
        << " outside the store_at location of " << async_consumer
        << " or inside the compute_at location of " << async_consumer << ".";
}

// Preserve produce nodes and add synchronization
Stmt visit(const ProducerConsumer *op) override {
if (op->name == func && op->is_producer) {
found_producer = true;

// Add post-synchronization
internal_assert(!sema.empty()) << "Duplicate produce node: " << op->name << "\n";
Stmt body = op->body;

// We don't currently support waiting on producers to the producer
// half of the fork node. Or rather, if you want to do that you have
// to schedule those Funcs as async too. Check for any consume nodes
// where the producer has gone to the consumer side of the fork
// node.
class FindBadConsumeNodes : public IRVisitor {
const std::set<string> &producers_dropped;
using IRVisitor::visit;

void visit(const ProducerConsumer *op) override {
if (!op->is_producer && producers_dropped.count(op->name)) {
found = op->name;
}
}

public:
string found;
FindBadConsumeNodes(const std::set<string> &p)
: producers_dropped(p) {
}
} finder(producers_dropped);
body.accept(&finder);
if (!finder.found.empty()) {
bad_producer_nesting_error(finder.found, func);
}

while (!sema.empty()) {
Expr release = Call::make(Int(32), "halide_semaphore_release", {sema.back(), 1}, Call::Extern);
body = Block::make(body, Evaluate::make(release));
sema.pop_back();
}
return ProducerConsumer::make_produce(op->name, body);
} else {
if (op->is_producer) {
producers_dropped.insert(op->name);
}
bool found_producer_before = found_producer;
Stmt body = mutate(op->body);
if (!op->is_producer && producers_dropped.count(op->name) &&
found_producer && !found_producer_before) {
// We've found a consume node wrapping our async producer where
// the corresponding producer node was dropped from this half of
// the fork.
bad_producer_nesting_error(op->name, func);
}
if (is_no_op(body) || op->is_producer) {
return body;
} else {
Expand Down
1 change: 1 addition & 0 deletions test/error/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ tests(GROUPS error
auto_schedule_no_parallel.cpp
auto_schedule_no_reorder.cpp
autodiff_unbounded.cpp
bad_async_producer.cpp
bad_bound.cpp
bad_bound_storage.cpp
bad_compute_at.cpp
Expand Down
33 changes: 26 additions & 7 deletions test/performance/async_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Expr expensive(Expr x, int c) {
if (c <= 0) {
return x;
} else {
return expensive(fast_pow(x, x + 1), c - 1);
return expensive(x * (x + 1), c - 1);
}
}

Expand All @@ -31,11 +31,12 @@ int main(int argc, char **argv) {
}

double times[2];
uint32_t correct = 0;
for (int use_async = 0; use_async < 2; use_async++) {
Var x, y, t, xi, yi;

ImageParam in(Float(32), 3);
Func cpu, gpu;
ImageParam in(UInt(32), 3);
Func cpu("cpu"), gpu("gpu");

// We have a two-stage pipeline that processes frames. We want
// to run the first stage on the GPU and the second stage on
Expand All @@ -50,26 +51,44 @@ int main(int argc, char **argv) {

// Assume GPU memory is limited, and compute the GPU stage one
// frame at a time. Hoist the allocation to the top level.
gpu.compute_at(cpu, t).store_root().gpu_tile(x, y, xi, yi, 8, 8);
gpu.compute_at(gpu.in(), Var::outermost()).store_root().gpu_tile(x, y, xi, yi, 8, 8);

// Stage the copy-back of the GPU result into a host-side
// double-buffer.
gpu.in().copy_to_host().compute_at(cpu, t).store_root().fold_storage(t, 2);

if (use_async) {
// gpu.async();
gpu.in().async();
gpu.async();
}

in.set(Buffer<float>(800, 800, 16));
Buffer<float> out(800, 800, 16);
Buffer<uint32_t> in_buf(800, 800, 16);
in_buf.fill(17);
in.set(in_buf);
Buffer<uint32_t> out(800, 800, 16);

cpu.compile_jit();

times[use_async] = benchmark(10, 1, [&]() {
cpu.realize(out);
});

if (!use_async) {
correct = out(0, 0, 0);
} else {
for (int t = 0; t < out.dim(2).extent(); t++) {
for (int y = 0; y < out.dim(1).extent(); y++) {
for (int x = 0; x < out.dim(0).extent(); x++) {
if (out(x, y, t) != correct) {
printf("Async output at (%d, %d, %d) is %u instead of %u\n",
x, y, t, out(x, y, t), correct);
return 1;
}
}
}
}
}

printf("%s: %f\n",
use_async ? "with async" : "without async",
times[use_async]);
Expand Down

0 comments on commit 9c6c062

Please sign in to comment.