diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt
index 19b72d58c09c..2221e61d74ec 100644
--- a/tutorial/CMakeLists.txt
+++ b/tutorial/CMakeLists.txt
@@ -209,3 +209,6 @@ if (TARGET Halide::Mullapudi2016)
     add_test(NAME tutorial_lesson_21_auto_scheduler_run COMMAND lesson_21_auto_scheduler_run)
     set_tests_properties(tutorial_lesson_21_auto_scheduler_run PROPERTIES LABELS "tutorial;multithreaded")
 endif ()
+
+# Lesson 22
+add_tutorial(lesson_22_jit_performance.cpp)
diff --git a/tutorial/lesson_22_jit_performance.cpp b/tutorial/lesson_22_jit_performance.cpp
new file mode 100644
index 000000000000..e1178b95bbcd
--- /dev/null
+++ b/tutorial/lesson_22_jit_performance.cpp
@@ -0,0 +1,259 @@
+// Halide tutorial lesson 22: JIT compilation performance
+
+// This lesson demonstrates the performance implications of the various ways
+// Halide can "Just-In-Time" (JIT) compile and execute a pipeline.
+
+// On linux, you can compile and run it like so:
+// g++ lesson_22*.cpp -g -I <path/to/Halide.h> -I <path/to/tools/halide_benchmark.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_22 -std=c++17
+// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_22
+
+// On os x:
+// g++ lesson_22*.cpp -g -I <path/to/Halide.h> -I <path/to/tools/halide_benchmark.h> -L <path/to/libHalide.dylib> -lHalide -o lesson_22 -std=c++17
+// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_22
+
+// If you have the entire Halide source tree, you can also build it by
+// running:
+//    make tutorial_lesson_22_jit_performance
+// in a shell at the top of the halide source tree.
+
+#include "Halide.h"
+#include "halide_benchmark.h"
+#include <cstdio>
+#include <iostream>
+
+using namespace Halide;
+using namespace Halide::Tools;  // for benchmark()
+
+// Let's define a helper function to construct a simple pipeline that we'll
+// use for our performance tests.
+Pipeline make_pipeline() {
+    // We'll start with a simple transpose operation...
+    Func input("input"), output("output");
+    Var x("x"), y("y");
+
+    // Fill the input with a linear combination of the coordinate values...
+    input(x, y) = cast<uint16_t>(x + y);
+    input.compute_root();
+
+    // Transpose the rows and cols
+    output(x, y) = input(y, x);
+
+    // Schedule it ... there are a number of ways to do an efficient
+    // block-wise transpose.
+    Var xi("xi"), yi("yi");
+
+    // Let's focus on 8x8 subtiles, and then vectorize across x and unroll
+    // across y.
+    output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);
+
+    // For more advanced scheduling:
+    //
+    // We can improve this even more by using the .in() directive (see
+    // Tutorial 19), which allows us to interpose new Funcs in between
+    // input and output.
+    //
+    // Here we can inject a block_transpose function to allow us to do 8
+    // vectorized loads from the input.
+    Func block_transpose("block_transpose"), block("block");
+    block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y);
+    //
+    // And now let's reorder the storage and vectorize in x across the block.
+    block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y);
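+
+    // If you're curious what loop structure this schedule actually produces,
+    // you could temporarily add a call like the one sketched below
+    // (print_loop_nest() prints a pseudocode loop nest, as seen back in
+    // lesson 5). We leave it commented out here because make_pipeline() is
+    // invoked inside the benchmark loops below, and the extra printing would
+    // distort the timings:
+    //
+    //     output.print_loop_nest();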
+
+    // Return the constructed pipeline
+    return Pipeline(output);
+}
+
+int main(int argc, char **argv) {
+    // Since we'll be using the same sample and iteration counts for all of
+    // our benchmarking, let's define them here in the outermost scope.
+    constexpr int samples = 100;
+    constexpr int iterations = 1;
+
+    // Now, let's measure the performance of constructing and executing a
+    // simple pipeline from scratch...
+    {
+        size_t count = 0;
+        double t = benchmark(samples, iterations, [&]() {
+            // First, create an output buffer to hold the results.
+            Buffer<uint16_t> result(1024, 1024);
+
+            // Now, construct our pipeline from scratch.
+            Pipeline pipeline = make_pipeline();
+
+            // And then call realize to execute the pipeline.
+            pipeline.realize(result);
+            ++count;
+        });
+
+        // On a MacBook Pro M1, we should get roughly ~1800 times/sec.
+        std::cout << "Compile & Execute Pipeline (from scratch): " << int(count / t) << " times/sec\n";
+    }
+
+    // This time, let's create the pipeline outside the timing loop and
+    // re-use it for each execution...
+    {
+        // Create our pipeline once, and re-use it in the loop below.
+        Pipeline pipeline = make_pipeline();
+
+        size_t count = 0;
+        double t = benchmark(samples, iterations, [&]() {
+            // Create our output buffer
+            Buffer<uint16_t> result(1024, 1024);
+
+            // Now, call realize
+            pipeline.realize(result);
+            ++count;
+        });
+
+        // On a MacBook Pro M1, we should get around ~175000 times/sec
+        // (roughly 100x faster!).
+        std::cout << "Compile & Execute Pipeline (re-use pipeline): " << int(count / t) << " times/sec\n";
+    }
+
+    // Let's do the same thing as before, but explicitly JIT compile before
+    // we realize...
+    {
+        Pipeline pipeline = make_pipeline();
+
+        // Let's JIT compile for our target before we realize, and see what
+        // happens...
+        const Target target = get_jit_target_from_environment();
+        pipeline.compile_jit(target);
+
+        size_t count = 0;
+        double t = benchmark(samples, iterations, [&]() {
+            Buffer<uint16_t> result(1024, 1024);
+            pipeline.realize(result);
+            ++count;
+        });
+
+        // On a MacBook Pro M1, this should be about the same as the previous
+        // run (about ~175000 times/sec).
+        //
+        // This may seem surprising, since explicitly compiling before
+        // realizing doesn't seem to make much of a difference compared to the
+        // previous case. However, the first call to realize() will implicitly
+        // JIT-compile and cache the generated code associated with the
+        // Pipeline object, which is essentially what we've done here
+        // explicitly. Each subsequent call to realize() uses the cached
+        // native code, so there's no additional overhead, and the compilation
+        // cost is amortized as we re-use the pipeline.
+        std::cout << "Execute Pipeline (compile before realize): " << int(count / t) << " times/sec\n";
+
+        // Another subtlety is the creation of the result buffer ... the
+        // declaration implicitly allocates memory, which adds overhead to
+        // each loop iteration. This time, let's try calling
+        // realize({1024, 1024}), which asks the pipeline to allocate an
+        // output buffer of the requested size and return it...
+        count = 0;
+        t = benchmark(samples, iterations, [&]() {
+            Buffer<uint16_t> result = pipeline.realize({1024, 1024});
+            ++count;
+        });
+
+        // On a MacBook Pro M1, this should be about the same as the previous
+        // run (about ~175000 times/sec).
+        std::cout << "Execute Pipeline (same but with realize({})): " << int(count / t) << " times/sec\n";
+
+        // Or ... we could move the declaration of the result buffer outside
+        // the timing loop, and re-use the allocation (with the caveat that we
+        // will be stomping over its contents on each execution).
+        Buffer<uint16_t> result(1024, 1024);
+
+        count = 0;
+        t = benchmark(samples, iterations, [&]() {
+            pipeline.realize(result);
+            ++count;
+        });
+
+        // On a MacBook Pro M1, this should be a bit more efficient ...
+        // ~200000 times/sec (around 10-12% faster).
+        std::cout << "Execute Pipeline (re-use buffer with realize): " << int(count / t) << " times/sec\n";
+    }
+
+    // Alternatively, we could compile to a Callable object...
+    {
+        Pipeline pipeline = make_pipeline();
+        const Target target = get_jit_target_from_environment();
+
+        // Here, we can ask the pipeline for its argument list (these are
+        // either Params, ImageParams, or Buffers) so that we can construct a
+        // Callable object with the same calling convention.
+        auto arguments = pipeline.infer_arguments();
+
+        // The Callable object acts as a convenient way of invoking the
+        // compiled code like a function call, using an argv-like syntax for
+        // the argument list. It also caches the JIT-compiled code, so there's
+        // no code-generation overhead when invoking the Callable and
+        // executing the pipeline.
+        Callable callable = pipeline.compile_to_callable(arguments, target);
+
+        // Again, we'll pre-allocate and re-use the result buffer.
+        Buffer<uint16_t> result(1024, 1024);
+
+        size_t count = 0;
+        double t = benchmark(samples, iterations, [&]() {
+            callable(result);
+            ++count;
+        });
+
+        // This should be about the same as the previous run
+        // (about ~200000 times/sec).
+        std::cout << "Execute Pipeline (compile to callable): " << int(count / t) << " times/sec\n";
+
+        // Perhaps even more convenient, we can create a std::function object
+        // from the Callable, which gives us cleaner type checking for the
+        // parameters and slightly less overhead when invoking the function.
+        // The list of template parameters needs to match the list of
+        // parameters of the pipeline. Here, we have a single result buffer,
+        // so we specify Buffer<uint16_t> in our call to .make_std_function<>.
+        // If we had other scalar parameters, input buffers or output buffers,
+        // we'd pass them in the template parameter list too, as sketched
+        // below.
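+        //
+        // For example (purely hypothetical; this pipeline has no such
+        // parameters): if make_pipeline() also declared a scalar Param<float>
+        // called "gain" and an input ImageParam, the call might look
+        // something like:
+        //
+        //     auto f = callable.make_std_function<float, Buffer<uint16_t>, Buffer<uint16_t>>();
+        //
+        // with the inferred input arguments listed before the output buffer.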
+        auto function = callable.make_std_function<Buffer<uint16_t>>();
+
+        count = 0;
+        t = benchmark(samples, iterations, [&]() {
+            function(result);
+            ++count;
+        });
+
+        // On a MacBook Pro M1, this should be slightly more efficient than
+        // the Callable (~1% faster).
+        std::cout << "Execute Pipeline (compile to std::function): " << int(count / t) << " times/sec\n";
+    }
+
+    // Let's see how much time is spent on just compiling...
+    {
+        Pipeline pipeline = make_pipeline();
+
+        // Only the first call to compile_jit() is expensive ... after the
+        // code is generated, it gets stored in a cache for later re-use, so
+        // repeatedly calling compile_jit() has very little overhead once the
+        // result has been cached.
+        size_t count = 0;
+        double t = benchmark(samples, iterations, [&]() {
+            pipeline.compile_jit();
+            ++count;
+        });
+
+        // Only the first call does any work, and the rest are essentially
+        // free. On a MacBook Pro M1, we should expect ~2 billion times/sec.
+        std::cout << "Compile JIT (using cache): " << int(count / t) << " times/sec\n";
+
+        // You can invalidate the cache manually, which will destroy all the
+        // compiled state.
+        count = 0;
+        t = benchmark(samples, iterations, [&]() {
+            pipeline.invalidate_cache();
+            pipeline.compile_jit();
+            ++count;
+        });
+
+        // This is an intentionally expensive loop, and very slow!
+        // On a MacBook Pro M1, we should see only ~2000 times/sec.
+        std::cout << "Compile JIT (from scratch): " << int(count / t) << " times/sec\n";
+    }
+
+    // Alternatively, we could compile to a Module...
+    {
+        Pipeline pipeline = make_pipeline();
+        auto args = pipeline.infer_arguments();
+
+        // Compiling to a Module generates a self-contained Module containing
+        // an internal representation of the lowered code suitable for further
+        // compilation. So, it's not directly runnable, but it can be used to
+        // link/combine Modules and to generate object files, static
+        // libraries, bitcode, etc.
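+        //
+        // For example (a sketch, not benchmarked here; the file names are
+        // placeholders): once we have a Module, we could write it out to
+        // build artifacts with Module::compile(), e.g.:
+        //
+        //     Module m = pipeline.compile_to_module(args, "transpose");
+        //     m.compile({{OutputFileType::object, "transpose.o"},
+        //                {OutputFileType::c_header, "transpose.h"}});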
+
+        size_t count = 0;
+        double t = benchmark(samples, iterations, [&]() {
+            Module m = pipeline.compile_to_module(args, "transpose");
+            ++count;
+        });
+
+        // On a MacBook Pro M1, this should be around ~10000 times/sec.
+        std::cout << "Compile to Module: " << int(count / t) << " times/sec\n";
+    }
+
+    printf("DONE!\n");
+    return 0;
+}