diff --git a/examples/README.md b/examples/README.md index caf1c065..4e9a8eeb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,3 +8,5 @@ If you are new to Exo, we recommend going through the examples in the following 2. [Cursor](./cursors/README.md): This example shows how to use Cursors to efficiently write schedules and define a new scheduling operator. 3. [RVM](./rvm_conv1d/README.md): This example illustrates how to use Exo to define and target a new hardware accelerator entirely in the user code. + +4. Quizzes ([quiz1](./quiz1/README.md), [quiz2](./quiz2/README.md), [quiz3](./quiz3/README.md)) contains common scheduling mistakes in Exo and wrong schedules. You are invited to solve the quiz. diff --git a/examples/quiz1/README.md b/examples/quiz1/README.md new file mode 100644 index 00000000..e10bbf4b --- /dev/null +++ b/examples/quiz1/README.md @@ -0,0 +1,59 @@ +# Quiz 1 + +Throughout the quiz, we provide incorrect code and the correct output as a reference. Your goal is to understand the code and fix the bug to match the correct output! + +You can execute `quiz1.py` by running `exocc quiz1.py`. Without modification, it will show the incorrect output. + +## Incorrect Output + +The following output is incorrect because it does not make calls to vector intrinsics. While it matches the structure of SIMD vector code, it is still being executed one element at a time: + +```python +def double(N: size, inp: f32[N] @ DRAM, out: f32[N] @ DRAM): + assert N % 8 == 0 + two_vec: R[8] @ DRAM + for ii in seq(0, 8): + two_vec[ii] = 2.0 + for io in seq(0, N / 8): + out_vec: f32[8] @ DRAM + inp_vec: f32[8] @ DRAM + for i0 in seq(0, 8): + inp_vec[i0] = inp[i0 + 8 * io] + for ii in seq(0, 8): + out_vec[ii] = two_vec[ii] * inp_vec[ii] + for i0 in seq(0, 8): + out[i0 + 8 * io] = out_vec[i0] +``` + +## Correct Output + +The correct output optimizes the function to use vectorized arithmetic operations to compute the result over the entire array: + +```python +def double(N: size, inp: f32[N] @ DRAM, out: f32[N] @ DRAM): + assert N % 8 == 0 + two_vec: R[8] @ AVX2 + vector_assign_two(two_vec[0:8]) + for io in seq(0, N / 8): + out_vec: f32[8] @ AVX2 + inp_vec: f32[8] @ AVX2 + vector_load(inp_vec[0:8], inp[8 * io + 0:8 * io + 8]) + vector_multiply(out_vec[0:8], two_vec[0:8], inp_vec[0:8]) + vector_store(out[8 * io + 0:8 * io + 8], out_vec[0:8]) +``` + +--- + +## Solution + +Before calling `replace_all(p, avx_instrs)`, you need to set the buffer memory types to AVX2, because `replace_all` is memory-aware and will only replace code chunks with instructions that have matching memory annotations. + +Add the following code before the call to `replace_all`: + +```python + # Set the memory types to be AVX2 vectors + for name in ["two", "out", "inp"]: + p = set_memory(p, f"{name}_vec", AVX2) +``` + +This will ensure that the memory types are correctly set to AVX2 before replacing the code with vector intrinsics. diff --git a/examples/quiz1/quiz1.py b/examples/quiz1/quiz1.py new file mode 100644 index 00000000..d8475c60 --- /dev/null +++ b/examples/quiz1/quiz1.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from exo import * +from exo.libs.memories import AVX2 +from exo.stdlib.scheduling import * + + +@instr("{dst_data} = _mm256_loadu_ps(&{src_data});") +def vector_load(dst: [f32][8] @ AVX2, src: [f32][8] @ DRAM): + assert stride(src, 0) == 1 + assert stride(dst, 0) == 1 + + for i in seq(0, 8): + dst[i] = src[i] + + +@instr("_mm256_storeu_ps(&{dst_data}, {src_data});") +def vector_store(dst: [f32][8] @ DRAM, src: [f32][8] @ AVX2): + assert stride(src, 0) == 1 + assert stride(dst, 0) == 1 + + for i in seq(0, 8): + dst[i] = src[i] + + +@instr("{out_data} = _mm256_mul_ps({x_data}, {y_data});") +def vector_multiply(out: [f32][8] @ AVX2, x: [f32][8] @ AVX2, y: [f32][8] @ AVX2): + assert stride(out, 0) == 1 + assert stride(x, 0) == 1 + assert stride(y, 0) == 1 + + for i in seq(0, 8): + out[i] = x[i] * y[i] + + +@instr("{out_data} = _mm256_broadcast_ss(2.0);") +def vector_assign_two(out: [f32][8] @ AVX2): + assert stride(out, 0) == 1 + + for i in seq(0, 8): + out[i] = 2.0 + + +@proc +def vec_double(N: size, inp: f32[N], out: f32[N]): + assert N % 8 == 0 + for i in seq(0, N): + out[i] = 2.0 * inp[i] + + +def wrong_schedule(p): + """ + Forgot to set the memory types to be AVX2 vectors, so replace instruction + does not work as intended. + """ + p = rename(p, "vec_double_optimized") + p = divide_loop(p, "i", 8, ["io", "ii"], perfect=True) + + # Create a vector of twos + p = bind_expr(p, "2.0", "two_vec") + two_alloc = p.find("two_vec: _") + two_assign = p.find("two_vec = _") + p = expand_dim(p, two_alloc, 8, "ii") + + # Hoist the allocation and assignment of two vector + p = lift_alloc(p, two_alloc, 2) + p = fission(p, two_assign.after(), 2) + p = remove_loop(p, two_assign.parent().parent()) + + # Create vectors for the input and output values + innermost_loop = p.find_loop("ii #1") + p = stage_mem(p, innermost_loop, "out[8*io:8*io+8]", "out_vec") + p = stage_mem(p, innermost_loop, "inp[8*io:8*io+8]", "inp_vec") + p = simplify(p) + + # Replace with AVX instructinos + avx_instrs = [vector_assign_two, vector_multiply, vector_load, vector_store] + p = replace_all(p, avx_instrs) + + return p + + +w = wrong_schedule(vec_double) +print(w) diff --git a/examples/quiz1/quiz1/quiz1.c b/examples/quiz1/quiz1/quiz1.c new file mode 100644 index 00000000..5861f1ce --- /dev/null +++ b/examples/quiz1/quiz1/quiz1.c @@ -0,0 +1,56 @@ +#include "quiz1.h" + +#include +#include +#include + +// vec_double( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double( void *ctxt, int_fast32_t N, const float* inp, float* out ) { +EXO_ASSUME(N % 8 == 0); +for (int_fast32_t i = 0; i < N; i++) { + out[i] = 2.0f * inp[i]; +} +} + +// vec_double_optimized( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double_optimized( void *ctxt, int_fast32_t N, const float* inp, float* out ) { +EXO_ASSUME(N % 8 == 0); +__m256 two_vec; +two_vec = _mm256_broadcast_ss(2.0); +for (int_fast32_t io = 0; io < ((N) / (8)); io++) { + __m256 out_vec; + __m256 inp_vec; + inp_vec = _mm256_loadu_ps(&inp[8 * io]); + out_vec = _mm256_mul_ps(two_vec, inp_vec); + _mm256_storeu_ps(&out[8 * io], out_vec); +} +} + + +/* relying on the following instruction..." +vector_assign_two(out) +{out_data} = _mm256_broadcast_ss(2.0); +*/ + +/* relying on the following instruction..." +vector_load(dst,src) +{dst_data} = _mm256_loadu_ps(&{src_data}); +*/ + +/* relying on the following instruction..." +vector_multiply(out,x,y) +{out_data} = _mm256_mul_ps({x_data}, {y_data}); +*/ + +/* relying on the following instruction..." +vector_store(dst,src) +_mm256_storeu_ps(&{dst_data}, {src_data}); +*/ diff --git a/examples/quiz1/quiz1/quiz1.d b/examples/quiz1/quiz1/quiz1.d new file mode 100644 index 00000000..f87d2bc2 --- /dev/null +++ b/examples/quiz1/quiz1/quiz1.d @@ -0,0 +1,308 @@ +quiz1/quiz1.c quiz1/quiz1.h : /home/yuka/.local/bin/exocc \ + /home/yuka/.local/lib/python3.9/site-packages/_distutils_hack/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl_adt/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl_adt/adt.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl_adt/validators.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_cmp.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_compat.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_config.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_funcs.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_make.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_next_gen.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_version_info.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/converters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/exceptions.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/filters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/setters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/validators.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/converters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/exceptions.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/filters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/setters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/validators.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API_cursors.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API_scheduling.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API_types.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/LoopIR_compiler.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/mem_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/parallel_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/prec_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/win_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/LoopIR.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/LoopIR_pprint.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/configs.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/extern.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/internal_cursors.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/memory.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/prelude.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/proc_eqv.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/boundscheck.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/parse_fragment.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/pattern_match.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/pyparser.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/typecheck.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/libs/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/libs/memories.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/main.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/LoopIR_scheduling.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/LoopIR_unification.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/analysis_simplify.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/new_analysis_core.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/new_eff.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/range_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/stdlib/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/stdlib/analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/stdlib/scheduling.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_adapters.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_collections.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_compat.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_functools.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_itertools.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_meta.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_text.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/compat/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/compat/py39.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/api.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/unix.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/version.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/configuration.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/constants.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/decorators.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/environment.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/exceptions.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/factory.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/fnode.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/formula.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/logics.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/operators.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/oracles.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/printers.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/shortcuts.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/simplifier.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/annotations.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/commands.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/parser/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/parser/parser.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/printers.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/script.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/options.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/portfolio.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/qelim.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/smtlib.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/solver.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/z3.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/substituter.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/type_checker.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/typing.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/utils.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/dag.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/generic.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/identitydag.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/tree.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/pyparser.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/pyparser_utils.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/split_penalty_visitor.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/blank_line_calculator.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/comment_splicer.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/continuation_splicer.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/pytree_unwrapper.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/pytree_utils.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/pytree_visitor.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/split_penalty.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/subtype_assigner.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/errors.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/file_resources.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/format_decision_state.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/format_token.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/identify_container.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/line_joiner.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/logical_line.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/object_state.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/reformatter.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/split_penalty.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/style.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/subtypes.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/yapf_api.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/fixer_util.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/patcomp.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/driver.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/grammar.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/literals.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/parse.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/pgen.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/token.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/tokenize.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pygram.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pytree.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3consts.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3core.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3num.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3poly.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3printer.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3rcf.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3types.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3util.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/compat/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/compat/py310.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/glob.py \ + /usr/local/lib/python3.9/__future__.py \ + /usr/local/lib/python3.9/_bootlocale.py \ + /usr/local/lib/python3.9/_collections_abc.py \ + /usr/local/lib/python3.9/_compat_pickle.py \ + /usr/local/lib/python3.9/_compression.py \ + /usr/local/lib/python3.9/_sitebuiltins.py \ + /usr/local/lib/python3.9/_weakrefset.py \ + /usr/local/lib/python3.9/abc.py \ + /usr/local/lib/python3.9/argparse.py \ + /usr/local/lib/python3.9/ast.py \ + /usr/local/lib/python3.9/base64.py \ + /usr/local/lib/python3.9/bisect.py \ + /usr/local/lib/python3.9/calendar.py \ + /usr/local/lib/python3.9/codecs.py \ + /usr/local/lib/python3.9/collections/__init__.py \ + /usr/local/lib/python3.9/collections/abc.py \ + /usr/local/lib/python3.9/configparser.py \ + /usr/local/lib/python3.9/contextlib.py \ + /usr/local/lib/python3.9/copy.py \ + /usr/local/lib/python3.9/copyreg.py \ + /usr/local/lib/python3.9/ctypes/__init__.py \ + /usr/local/lib/python3.9/ctypes/_endian.py \ + /usr/local/lib/python3.9/dataclasses.py \ + /usr/local/lib/python3.9/datetime.py \ + /usr/local/lib/python3.9/decimal.py \ + /usr/local/lib/python3.9/difflib.py \ + /usr/local/lib/python3.9/dis.py \ + /usr/local/lib/python3.9/email/__init__.py \ + /usr/local/lib/python3.9/email/_encoded_words.py \ + /usr/local/lib/python3.9/email/_parseaddr.py \ + /usr/local/lib/python3.9/email/_policybase.py \ + /usr/local/lib/python3.9/email/base64mime.py \ + /usr/local/lib/python3.9/email/charset.py \ + /usr/local/lib/python3.9/email/encoders.py \ + /usr/local/lib/python3.9/email/errors.py \ + /usr/local/lib/python3.9/email/feedparser.py \ + /usr/local/lib/python3.9/email/header.py \ + /usr/local/lib/python3.9/email/iterators.py \ + /usr/local/lib/python3.9/email/message.py \ + /usr/local/lib/python3.9/email/parser.py \ + /usr/local/lib/python3.9/email/quoprimime.py \ + /usr/local/lib/python3.9/email/utils.py \ + /usr/local/lib/python3.9/encodings/__init__.py \ + /usr/local/lib/python3.9/encodings/aliases.py \ + /usr/local/lib/python3.9/encodings/latin_1.py \ + /usr/local/lib/python3.9/encodings/utf_8.py \ + /usr/local/lib/python3.9/enum.py \ + /usr/local/lib/python3.9/fnmatch.py \ + /usr/local/lib/python3.9/fractions.py \ + /usr/local/lib/python3.9/functools.py \ + /usr/local/lib/python3.9/genericpath.py \ + /usr/local/lib/python3.9/gettext.py \ + /usr/local/lib/python3.9/heapq.py \ + /usr/local/lib/python3.9/importlib/__init__.py \ + /usr/local/lib/python3.9/importlib/_bootstrap.py \ + /usr/local/lib/python3.9/importlib/_bootstrap_external.py \ + /usr/local/lib/python3.9/importlib/_common.py \ + /usr/local/lib/python3.9/importlib/abc.py \ + /usr/local/lib/python3.9/importlib/machinery.py \ + /usr/local/lib/python3.9/importlib/resources.py \ + /usr/local/lib/python3.9/importlib/util.py \ + /usr/local/lib/python3.9/inspect.py \ + /usr/local/lib/python3.9/io.py \ + /usr/local/lib/python3.9/json/__init__.py \ + /usr/local/lib/python3.9/json/decoder.py \ + /usr/local/lib/python3.9/json/encoder.py \ + /usr/local/lib/python3.9/json/scanner.py \ + /usr/local/lib/python3.9/keyword.py \ + /usr/local/lib/python3.9/lib-dynload/_bisect.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_datetime.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_decimal.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_heapq.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_json.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_lzma.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_opcode.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_pickle.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_posixsubprocess.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_random.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_sha512.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_socket.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_struct.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/array.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/binascii.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/grp.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/math.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/select.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/zlib.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/linecache.py \ + /usr/local/lib/python3.9/locale.py \ + /usr/local/lib/python3.9/logging/__init__.py \ + /usr/local/lib/python3.9/lzma.py \ + /usr/local/lib/python3.9/multiprocessing/__init__.py \ + /usr/local/lib/python3.9/multiprocessing/context.py \ + /usr/local/lib/python3.9/multiprocessing/process.py \ + /usr/local/lib/python3.9/multiprocessing/reduction.py \ + /usr/local/lib/python3.9/ntpath.py \ + /usr/local/lib/python3.9/numbers.py \ + /usr/local/lib/python3.9/opcode.py \ + /usr/local/lib/python3.9/operator.py \ + /usr/local/lib/python3.9/os.py \ + /usr/local/lib/python3.9/pathlib.py \ + /usr/local/lib/python3.9/pickle.py \ + /usr/local/lib/python3.9/pkgutil.py \ + /usr/local/lib/python3.9/platform.py \ + /usr/local/lib/python3.9/posixpath.py \ + /usr/local/lib/python3.9/quopri.py \ + /usr/local/lib/python3.9/random.py \ + /usr/local/lib/python3.9/re.py \ + /usr/local/lib/python3.9/reprlib.py \ + /usr/local/lib/python3.9/selectors.py \ + /usr/local/lib/python3.9/shutil.py \ + /usr/local/lib/python3.9/signal.py \ + /usr/local/lib/python3.9/site.py \ + /usr/local/lib/python3.9/socket.py \ + /usr/local/lib/python3.9/sre_compile.py \ + /usr/local/lib/python3.9/sre_constants.py \ + /usr/local/lib/python3.9/sre_parse.py \ + /usr/local/lib/python3.9/stat.py \ + /usr/local/lib/python3.9/string.py \ + /usr/local/lib/python3.9/struct.py \ + /usr/local/lib/python3.9/subprocess.py \ + /usr/local/lib/python3.9/tempfile.py \ + /usr/local/lib/python3.9/textwrap.py \ + /usr/local/lib/python3.9/threading.py \ + /usr/local/lib/python3.9/token.py \ + /usr/local/lib/python3.9/tokenize.py \ + /usr/local/lib/python3.9/traceback.py \ + /usr/local/lib/python3.9/types.py \ + /usr/local/lib/python3.9/typing.py \ + /usr/local/lib/python3.9/urllib/__init__.py \ + /usr/local/lib/python3.9/urllib/parse.py \ + /usr/local/lib/python3.9/uu.py \ + /usr/local/lib/python3.9/warnings.py \ + /usr/local/lib/python3.9/weakref.py \ + /usr/local/lib/python3.9/zipfile.py \ No newline at end of file diff --git a/examples/quiz1/quiz1/quiz1.h b/examples/quiz1/quiz1/quiz1.h new file mode 100644 index 00000000..534debbf --- /dev/null +++ b/examples/quiz1/quiz1/quiz1.h @@ -0,0 +1,66 @@ + +#pragma once +#ifndef QUIZ1_H +#define QUIZ1_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include + +// Compiler feature macros adapted from Hedley (public domain) +// https://github.com/nemequ/hedley + +#if defined(__has_builtin) +# define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +# define EXO_HAS_BUILTIN(builtin) (0) +#endif + +#if EXO_HAS_BUILTIN(__builtin_assume) +# define EXO_ASSUME(expr) __builtin_assume(expr) +#elif EXO_HAS_BUILTIN(__builtin_unreachable) +# define EXO_ASSUME(expr) \ + ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) +#else +# define EXO_ASSUME(expr) ((void)(expr)) +#endif + + +#ifndef EXO_WIN_1F32 +#define EXO_WIN_1F32 +struct exo_win_1f32{ + float * const data; + const int_fast32_t strides[1]; +}; +#endif +#ifndef EXO_WIN_1F32C +#define EXO_WIN_1F32C +struct exo_win_1f32c{ + const float * const data; + const int_fast32_t strides[1]; +}; +#endif +// vec_double( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double( void *ctxt, int_fast32_t N, const float* inp, float* out ); + +// vec_double_optimized( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double_optimized( void *ctxt, int_fast32_t N, const float* inp, float* out ); + + + +#ifdef __cplusplus +} +#endif +#endif // QUIZ1_H diff --git a/examples/quiz2/README.md b/examples/quiz2/README.md new file mode 100644 index 00000000..85b6527b --- /dev/null +++ b/examples/quiz2/README.md @@ -0,0 +1,78 @@ +# Quiz2! + +Loop fissioning and debugging via printing cursors. + +## Incorrect output (compiler error) +As written, the schedule has a bug which attempts to incorrectly fission a loop. +``` +Traceback (most recent call last): + File "/home/yuka/.local/bin/exocc", line 8, in + sys.exit(main()) + File "/home/yuka/.local/lib/python3.9/site-packages/exo/main.py", line 55, in main + library = [ + File "/home/yuka/.local/lib/python3.9/site-packages/exo/main.py", line 58, in + for proc in get_procs_from_module(load_user_code(mod)) + File "/home/yuka/.local/lib/python3.9/site-packages/exo/main.py", line 107, in load_user_code + loader.exec_module(user_module) + File "", line 790, in exec_module + File "", line 228, in _call_with_frames_removed + File "/home/yuka/exo/examples/quiz2/quiz2.py", line 42, in + w = wrong_schedule(scaled_add) + File "/home/yuka/exo/examples/quiz2/quiz2.py", line 38, in wrong_schedule + p = fission(p, vector_assign.after()) + File "/home/yuka/.local/lib/python3.9/site-packages/exo/API_scheduling.py", line 100, in __call__ + return self.func(*bound_args.args, **bound_args.kwargs) + File "/home/yuka/.local/lib/python3.9/site-packages/exo/API_scheduling.py", line 2066, in fission + ir, fwd = scheduling.DoFissionAfterSimple( + File "/home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/LoopIR_scheduling.py", line 2385, in DoFissionAfterSimple + alloc_check(pre, post) + File "/home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/LoopIR_scheduling.py", line 2352, in alloc_check + raise SchedulingError( +exo.rewrite.new_eff.SchedulingError: <<>>: Will not fission here, because doing so will hide the allocation of vec from a later use site. +``` + +## Correct Output +The correct output will divide the computation into individual, vectorizable loops. +``` +def scaled_add_scheduled(N: size, a: f32[N] @ DRAM, b: f32[N] @ DRAM, + c: f32[N] @ DRAM): + assert N % 8 == 0 + for io in seq(0, N / 8): + vec: R[8] @ DRAM + vec_1: R[8] @ DRAM + vec_2: f32[8] @ DRAM + vec_3: R[8] @ DRAM + vec_4: R[8] @ DRAM + vec_5: f32[8] @ DRAM + for ii in seq(0, 8): + vec_1[ii] = 2 + for ii in seq(0, 8): + vec_2[ii] = a[8 * io + ii] + for ii in seq(0, 8): + vec[ii] = vec_1[ii] * vec_2[ii] + for ii in seq(0, 8): + vec_4[ii] = 3 + for ii in seq(0, 8): + vec_5[ii] = b[8 * io + ii] + for ii in seq(0, 8): + vec_3[ii] = vec_4[ii] * vec_5[ii] + for ii in seq(0, 8): + c[8 * io + ii] = vec[ii] + vec_3[ii] +``` + +--- + +## Solution + +Have to +`print(vector_assign.after())` after line 37 +``` + for io in seq(0, N / 8): + vec: R[8] @ DRAM + for ii in seq(0, 8): + vec_1: R @ DRAM + vec_1 = 2 + [GAP - After] +``` + + diff --git a/examples/quiz2/quiz2.py b/examples/quiz2/quiz2.py new file mode 100644 index 00000000..f2a8c379 --- /dev/null +++ b/examples/quiz2/quiz2.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from exo import * +from exo.stdlib.scheduling import * + + +@proc +def scaled_add(N: size, a: f32[N], b: f32[N], c: f32[N]): + assert N % 8 == 0 + for i in seq(0, N): + c[i] = 2 * a[i] + 3 * b[i] + + +def stage_exprs(p, num_vectors, assign): + if isinstance(assign.rhs(), BinaryOpCursor): + p = bind_expr(p, assign.rhs().lhs(), "vec") + num_vectors += 1 + p, num_vectors = stage_exprs(p, num_vectors, p.forward(assign).prev()) + + p = bind_expr(p, assign.rhs().rhs(), "vec") + num_vectors += 1 + p, num_vectors = stage_exprs(p, num_vectors, p.forward(assign).prev()) + return p, num_vectors + + +def wrong_schedule(p): + p = rename(p, "scaled_add_scheduled") + num_vectors = 0 + + p = divide_loop(p, "i", 8, ["io", "ii"], perfect=True) + + p, num_vectors = stage_exprs(p, num_vectors, p.find("c[_] = _")) + + for i in range(num_vectors): + vector_reg = p.find(f"vec: _ #{i}") + p = expand_dim(p, vector_reg, 8, "ii") + p = lift_alloc(p, vector_reg) + + vector_assign = p.find(f"vec = _ #{i}") + p = fission(p, vector_assign.after()) + + return p + + +w = wrong_schedule(scaled_add) +print(w) diff --git a/examples/quiz2/quiz2/quiz2.c b/examples/quiz2/quiz2/quiz2.c new file mode 100644 index 00000000..abc4efdd --- /dev/null +++ b/examples/quiz2/quiz2/quiz2.c @@ -0,0 +1,63 @@ +#include "quiz2.h" + +#include +#include + +// scaled_add( +// N : size, +// a : f32[N] @DRAM, +// b : f32[N] @DRAM, +// c : f32[N] @DRAM +// ) +void scaled_add( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c ) { +EXO_ASSUME(N % 8 == 0); +for (int_fast32_t i = 0; i < N; i++) { + c[i] = 2.0f * a[i] + 3.0f * b[i]; +} +} + +// scaled_add_scheduled( +// N : size, +// a : f32[N] @DRAM, +// b : f32[N] @DRAM, +// c : f32[N] @DRAM +// ) +void scaled_add_scheduled( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c ) { +EXO_ASSUME(N % 8 == 0); +for (int_fast32_t io = 0; io < ((N) / (8)); io++) { + float *vec = (float*) malloc(8 * sizeof(*vec)); + float *vec_1 = (float*) malloc(8 * sizeof(*vec_1)); + float *vec_2 = (float*) malloc(8 * sizeof(*vec_2)); + float *vec_3 = (float*) malloc(8 * sizeof(*vec_3)); + float *vec_4 = (float*) malloc(8 * sizeof(*vec_4)); + float *vec_5 = (float*) malloc(8 * sizeof(*vec_5)); + for (int_fast32_t ii = 0; ii < 8; ii++) { + vec_1[ii] = 2.0f; + } + for (int_fast32_t ii = 0; ii < 8; ii++) { + vec_2[ii] = a[8 * io + ii]; + } + for (int_fast32_t ii = 0; ii < 8; ii++) { + vec[ii] = vec_1[ii] * vec_2[ii]; + } + free(vec_2); + free(vec_1); + for (int_fast32_t ii = 0; ii < 8; ii++) { + vec_4[ii] = 3.0f; + } + for (int_fast32_t ii = 0; ii < 8; ii++) { + vec_5[ii] = b[8 * io + ii]; + } + for (int_fast32_t ii = 0; ii < 8; ii++) { + vec_3[ii] = vec_4[ii] * vec_5[ii]; + } + free(vec_5); + free(vec_4); + for (int_fast32_t ii = 0; ii < 8; ii++) { + c[8 * io + ii] = vec[ii] + vec_3[ii]; + } + free(vec_3); + free(vec); +} +} + diff --git a/examples/quiz2/quiz2/quiz2.d b/examples/quiz2/quiz2/quiz2.d new file mode 100644 index 00000000..da7e13da --- /dev/null +++ b/examples/quiz2/quiz2/quiz2.d @@ -0,0 +1,306 @@ +quiz2/quiz2.c quiz2/quiz2.h : /home/yuka/.local/bin/exocc \ + /home/yuka/.local/lib/python3.9/site-packages/_distutils_hack/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl_adt/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl_adt/adt.py \ + /home/yuka/.local/lib/python3.9/site-packages/asdl_adt/validators.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_cmp.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_compat.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_config.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_funcs.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_make.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_next_gen.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/_version_info.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/converters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/exceptions.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/filters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/setters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attr/validators.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/converters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/exceptions.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/filters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/setters.py \ + /home/yuka/.local/lib/python3.9/site-packages/attrs/validators.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API_cursors.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API_scheduling.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/API_types.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/LoopIR_compiler.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/mem_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/parallel_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/prec_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/backend/win_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/LoopIR.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/LoopIR_pprint.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/configs.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/extern.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/internal_cursors.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/memory.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/prelude.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/core/proc_eqv.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/boundscheck.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/parse_fragment.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/pattern_match.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/pyparser.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/frontend/typecheck.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/main.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/LoopIR_scheduling.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/LoopIR_unification.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/analysis_simplify.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/new_analysis_core.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/new_eff.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/rewrite/range_analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/stdlib/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/stdlib/analysis.py \ + /home/yuka/.local/lib/python3.9/site-packages/exo/stdlib/scheduling.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_adapters.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_collections.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_compat.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_functools.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_itertools.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_meta.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/_text.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/compat/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/importlib_metadata/compat/py39.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/api.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/unix.py \ + /home/yuka/.local/lib/python3.9/site-packages/platformdirs/version.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/configuration.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/constants.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/decorators.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/environment.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/exceptions.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/factory.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/fnode.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/formula.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/logics.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/operators.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/oracles.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/printers.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/shortcuts.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/simplifier.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/annotations.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/commands.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/parser/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/parser/parser.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/printers.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/smtlib/script.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/options.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/portfolio.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/qelim.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/smtlib.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/solver.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/solvers/z3.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/substituter.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/type_checker.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/typing.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/utils.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/dag.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/generic.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/identitydag.py \ + /home/yuka/.local/lib/python3.9/site-packages/pysmt/walkers/tree.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/pyparser.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/pyparser_utils.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pyparser/split_penalty_visitor.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/blank_line_calculator.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/comment_splicer.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/continuation_splicer.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/pytree_unwrapper.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/pytree_utils.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/pytree_visitor.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/split_penalty.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/pytree/subtype_assigner.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/errors.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/file_resources.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/format_decision_state.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/format_token.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/identify_container.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/line_joiner.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/logical_line.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/object_state.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/reformatter.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/split_penalty.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/style.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/subtypes.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf/yapflib/yapf_api.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/fixer_util.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/patcomp.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/driver.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/grammar.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/literals.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/parse.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/pgen.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/token.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pgen2/tokenize.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pygram.py \ + /home/yuka/.local/lib/python3.9/site-packages/yapf_third_party/_ylib2to3/pytree.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3consts.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3core.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3num.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3poly.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3printer.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3rcf.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3types.py \ + /home/yuka/.local/lib/python3.9/site-packages/z3/z3util.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/compat/__init__.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/compat/py310.py \ + /home/yuka/.local/lib/python3.9/site-packages/zipp/glob.py \ + /usr/local/lib/python3.9/__future__.py \ + /usr/local/lib/python3.9/_bootlocale.py \ + /usr/local/lib/python3.9/_collections_abc.py \ + /usr/local/lib/python3.9/_compat_pickle.py \ + /usr/local/lib/python3.9/_compression.py \ + /usr/local/lib/python3.9/_sitebuiltins.py \ + /usr/local/lib/python3.9/_weakrefset.py \ + /usr/local/lib/python3.9/abc.py \ + /usr/local/lib/python3.9/argparse.py \ + /usr/local/lib/python3.9/ast.py \ + /usr/local/lib/python3.9/base64.py \ + /usr/local/lib/python3.9/bisect.py \ + /usr/local/lib/python3.9/calendar.py \ + /usr/local/lib/python3.9/codecs.py \ + /usr/local/lib/python3.9/collections/__init__.py \ + /usr/local/lib/python3.9/collections/abc.py \ + /usr/local/lib/python3.9/configparser.py \ + /usr/local/lib/python3.9/contextlib.py \ + /usr/local/lib/python3.9/copy.py \ + /usr/local/lib/python3.9/copyreg.py \ + /usr/local/lib/python3.9/ctypes/__init__.py \ + /usr/local/lib/python3.9/ctypes/_endian.py \ + /usr/local/lib/python3.9/dataclasses.py \ + /usr/local/lib/python3.9/datetime.py \ + /usr/local/lib/python3.9/decimal.py \ + /usr/local/lib/python3.9/difflib.py \ + /usr/local/lib/python3.9/dis.py \ + /usr/local/lib/python3.9/email/__init__.py \ + /usr/local/lib/python3.9/email/_encoded_words.py \ + /usr/local/lib/python3.9/email/_parseaddr.py \ + /usr/local/lib/python3.9/email/_policybase.py \ + /usr/local/lib/python3.9/email/base64mime.py \ + /usr/local/lib/python3.9/email/charset.py \ + /usr/local/lib/python3.9/email/encoders.py \ + /usr/local/lib/python3.9/email/errors.py \ + /usr/local/lib/python3.9/email/feedparser.py \ + /usr/local/lib/python3.9/email/header.py \ + /usr/local/lib/python3.9/email/iterators.py \ + /usr/local/lib/python3.9/email/message.py \ + /usr/local/lib/python3.9/email/parser.py \ + /usr/local/lib/python3.9/email/quoprimime.py \ + /usr/local/lib/python3.9/email/utils.py \ + /usr/local/lib/python3.9/encodings/__init__.py \ + /usr/local/lib/python3.9/encodings/aliases.py \ + /usr/local/lib/python3.9/encodings/latin_1.py \ + /usr/local/lib/python3.9/encodings/utf_8.py \ + /usr/local/lib/python3.9/enum.py \ + /usr/local/lib/python3.9/fnmatch.py \ + /usr/local/lib/python3.9/fractions.py \ + /usr/local/lib/python3.9/functools.py \ + /usr/local/lib/python3.9/genericpath.py \ + /usr/local/lib/python3.9/gettext.py \ + /usr/local/lib/python3.9/heapq.py \ + /usr/local/lib/python3.9/importlib/__init__.py \ + /usr/local/lib/python3.9/importlib/_bootstrap.py \ + /usr/local/lib/python3.9/importlib/_bootstrap_external.py \ + /usr/local/lib/python3.9/importlib/_common.py \ + /usr/local/lib/python3.9/importlib/abc.py \ + /usr/local/lib/python3.9/importlib/machinery.py \ + /usr/local/lib/python3.9/importlib/resources.py \ + /usr/local/lib/python3.9/importlib/util.py \ + /usr/local/lib/python3.9/inspect.py \ + /usr/local/lib/python3.9/io.py \ + /usr/local/lib/python3.9/json/__init__.py \ + /usr/local/lib/python3.9/json/decoder.py \ + /usr/local/lib/python3.9/json/encoder.py \ + /usr/local/lib/python3.9/json/scanner.py \ + /usr/local/lib/python3.9/keyword.py \ + /usr/local/lib/python3.9/lib-dynload/_bisect.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_datetime.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_decimal.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_heapq.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_json.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_lzma.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_opcode.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_pickle.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_posixsubprocess.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_random.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_sha512.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_socket.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/_struct.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/array.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/binascii.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/grp.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/math.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/select.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/lib-dynload/zlib.cpython-39-x86_64-linux-gnu.so \ + /usr/local/lib/python3.9/linecache.py \ + /usr/local/lib/python3.9/locale.py \ + /usr/local/lib/python3.9/logging/__init__.py \ + /usr/local/lib/python3.9/lzma.py \ + /usr/local/lib/python3.9/multiprocessing/__init__.py \ + /usr/local/lib/python3.9/multiprocessing/context.py \ + /usr/local/lib/python3.9/multiprocessing/process.py \ + /usr/local/lib/python3.9/multiprocessing/reduction.py \ + /usr/local/lib/python3.9/ntpath.py \ + /usr/local/lib/python3.9/numbers.py \ + /usr/local/lib/python3.9/opcode.py \ + /usr/local/lib/python3.9/operator.py \ + /usr/local/lib/python3.9/os.py \ + /usr/local/lib/python3.9/pathlib.py \ + /usr/local/lib/python3.9/pickle.py \ + /usr/local/lib/python3.9/pkgutil.py \ + /usr/local/lib/python3.9/platform.py \ + /usr/local/lib/python3.9/posixpath.py \ + /usr/local/lib/python3.9/quopri.py \ + /usr/local/lib/python3.9/random.py \ + /usr/local/lib/python3.9/re.py \ + /usr/local/lib/python3.9/reprlib.py \ + /usr/local/lib/python3.9/selectors.py \ + /usr/local/lib/python3.9/shutil.py \ + /usr/local/lib/python3.9/signal.py \ + /usr/local/lib/python3.9/site.py \ + /usr/local/lib/python3.9/socket.py \ + /usr/local/lib/python3.9/sre_compile.py \ + /usr/local/lib/python3.9/sre_constants.py \ + /usr/local/lib/python3.9/sre_parse.py \ + /usr/local/lib/python3.9/stat.py \ + /usr/local/lib/python3.9/string.py \ + /usr/local/lib/python3.9/struct.py \ + /usr/local/lib/python3.9/subprocess.py \ + /usr/local/lib/python3.9/tempfile.py \ + /usr/local/lib/python3.9/textwrap.py \ + /usr/local/lib/python3.9/threading.py \ + /usr/local/lib/python3.9/token.py \ + /usr/local/lib/python3.9/tokenize.py \ + /usr/local/lib/python3.9/traceback.py \ + /usr/local/lib/python3.9/types.py \ + /usr/local/lib/python3.9/typing.py \ + /usr/local/lib/python3.9/urllib/__init__.py \ + /usr/local/lib/python3.9/urllib/parse.py \ + /usr/local/lib/python3.9/uu.py \ + /usr/local/lib/python3.9/warnings.py \ + /usr/local/lib/python3.9/weakref.py \ + /usr/local/lib/python3.9/zipfile.py \ No newline at end of file diff --git a/examples/quiz2/quiz2/quiz2.h b/examples/quiz2/quiz2/quiz2.h new file mode 100644 index 00000000..e7c89722 --- /dev/null +++ b/examples/quiz2/quiz2/quiz2.h @@ -0,0 +1,55 @@ + +#pragma once +#ifndef QUIZ2_H +#define QUIZ2_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include + +// Compiler feature macros adapted from Hedley (public domain) +// https://github.com/nemequ/hedley + +#if defined(__has_builtin) +# define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +# define EXO_HAS_BUILTIN(builtin) (0) +#endif + +#if EXO_HAS_BUILTIN(__builtin_assume) +# define EXO_ASSUME(expr) __builtin_assume(expr) +#elif EXO_HAS_BUILTIN(__builtin_unreachable) +# define EXO_ASSUME(expr) \ + ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) +#else +# define EXO_ASSUME(expr) ((void)(expr)) +#endif + + + +// scaled_add( +// N : size, +// a : f32[N] @DRAM, +// b : f32[N] @DRAM, +// c : f32[N] @DRAM +// ) +void scaled_add( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c ); + +// scaled_add_scheduled( +// N : size, +// a : f32[N] @DRAM, +// b : f32[N] @DRAM, +// c : f32[N] @DRAM +// ) +void scaled_add_scheduled( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c ); + + + +#ifdef __cplusplus +} +#endif +#endif // QUIZ2_H diff --git a/examples/quiz3/README.md b/examples/quiz3/README.md new file mode 100644 index 00000000..3afbf934 --- /dev/null +++ b/examples/quiz3/README.md @@ -0,0 +1,56 @@ +# Quiz3!! + +## Correct Output +This code makes the optimization of shrinking the `blur_x` memory allocation from (H+2, W) to (34, 256). Since the code has been tiled, we don't need to store the entire intermediate `blur_x` buffer in memory. Instead, we can just reuse the same intermediate buffer for each tile. + +To do so, the schedule tries to sink the allocation within the tile, reduce the memory size to the bare minimum necessary for computing that tile, and then lift the allocation back up to the top level scope. +``` +def tile_and_fused_blur(W: size, H: size, blur_y: ui16[H, W] @ DRAM, + inp: ui16[H + 2, W + 2] @ DRAM): + assert H % 32 == 0 + assert W % 256 == 0 + blur_x: ui16[34, 256] @ DRAM + for yo in seq(0, H / 32): + for xo in seq(0, W / 256): + for yi in seq(0, 34): + for xi in seq(0, 256): + blur_x[yi + 32 * yo - 32 * yo, xi + 256 * xo - 256 * + xo] = (inp[yi + 32 * yo, xi + 256 * xo] + + inp[yi + 32 * yo, 1 + xi + 256 * xo] + + inp[yi + 32 * yo, 2 + xi + 256 * xo]) / 3.0 + for yi in seq(0, 32): + for xi in seq(0, 256): + blur_y[yi + 32 * yo, xi + + 256 * xo] = (blur_x[yi + 32 * yo - 32 * yo, + xi + 256 * xo - 256 * xo] + + blur_x[1 + yi + 32 * yo - 32 * yo, + xi + 256 * xo - 256 * xo] + + blur_x[2 + yi + 32 * yo - 32 * yo, + xi + 256 * xo - 256 * xo]) / 3.0 +``` + +## Incorrect Output +This output is partially correct: it manages to reduce the height dimension from H+2 to 34. However, it wasn't able to reduce the memory in the width direction. +``` +def tile_and_fused_blur(W: size, H: size, blur_y: ui16[H, W] @ DRAM, + inp: ui16[H + 2, W + 2] @ DRAM): + assert H % 32 == 0 + assert W % 256 == 0 + blur_x: ui16[34, W] @ DRAM + for yo in seq(0, H / 32): + for xo in seq(0, W / 256): + for yi in seq(0, 34): + for xi in seq(0, 256): + blur_x[yi + 32 * yo - 32 * yo, xi + 256 * + xo] = (inp[yi + 32 * yo, xi + 256 * xo] + + inp[yi + 32 * yo, 1 + xi + 256 * xo] + + inp[yi + 32 * yo, 2 + xi + 256 * xo]) / 3.0 + for yi in seq(0, 32): + for xi in seq(0, 256): + blur_y[yi + 32 * yo, xi + 256 * xo] = ( + blur_x[yi + 32 * yo - 32 * yo, xi + 256 * xo] + + blur_x[1 + yi + 32 * yo - 32 * yo, xi + 256 * xo] + + blur_x[2 + yi + 32 * yo - 32 * yo, + xi + 256 * xo]) / 3.0 +``` + diff --git a/examples/quiz3/quiz3.py b/examples/quiz3/quiz3.py new file mode 100644 index 00000000..ef45174b --- /dev/null +++ b/examples/quiz3/quiz3.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from exo import * +from exo.stdlib.scheduling import * + + +@proc +def tile_and_fused_blur( + W: size, H: size, blur_y: ui16[H, W] @ DRAM, inp: ui16[H + 2, W + 2] @ DRAM +): + assert H % 32 == 0 + assert W % 256 == 0 + blur_x: ui16[2 + H, W] @ DRAM + for yo in seq(0, H / 32): + for xo in seq(0, W / 256): + for yi in seq(0, 34): + for xi in seq(0, 256): + blur_x[yi + 32 * yo, xi + 256 * xo] = ( + inp[yi + 32 * yo, xi + 256 * xo] + + inp[yi + 32 * yo, 1 + xi + 256 * xo] + + inp[yi + 32 * yo, 2 + xi + 256 * xo] + ) / 3.0 + for yi in seq(0, 32): + for xi in seq(0, 256): + blur_y[yi + 32 * yo, xi + 256 * xo] = ( + blur_x[yi + 32 * yo, xi + 256 * xo] + + blur_x[1 + yi + 32 * yo, xi + 256 * xo] + + blur_x[2 + yi + 32 * yo, xi + 256 * xo] + ) / 3.0 + + +def get_loops_at_or_above(cursor): + loops = [] + while not isinstance((parent := cursor.parent()), InvalidCursor): + loops.append(parent) + cursor = parent + return list(reversed(loops)) + + +def wrong_schedule(p): + """ + Incorrect function get_loops_at_or_above is missing the initial loop + when initiating the loops array + """ + + p = rename(p, "tile_and_fused_blur_scheduled") + xo_loop = p.find_loop("xo") + producer_alloc = p.find("blur_x : _") + + # each output depends on 3 rows of blur_x, so computing a 32x256 subarray + # of output requires a 34x256 subarray of blur_x. + tile_size = [32, 256] + blur_x_tile_size = [34, 256] + + loops_to_lower_allocation_into = get_loops_at_or_above(xo_loop) + for i, loop in enumerate(loops_to_lower_allocation_into): + # Forward cursors before using + loop = p.forward(loop) + producer_alloc = p.forward(producer_alloc) + + # Sink the blur_x allocation into the next for loop + p = sink_alloc(p, producer_alloc) + + # Shrink blur_x size accordingly + offset_expr = f"{tile_size[i]} * {loop.name()}" + p = resize_dim(p, producer_alloc, i, blur_x_tile_size[i], offset_expr) + + p = lift_alloc(p, producer_alloc, 1) + + return p + + +w = wrong_schedule(tile_and_fused_blur) +print(w) diff --git a/tests/golden/test_examples/test_quiz1.txt b/tests/golden/test_examples/test_quiz1.txt new file mode 100644 index 00000000..27e523b1 --- /dev/null +++ b/tests/golden/test_examples/test_quiz1.txt @@ -0,0 +1,100 @@ + +#pragma once +#ifndef TEST_CASE_H +#define TEST_CASE_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include + +// Compiler feature macros adapted from Hedley (public domain) +// https://github.com/nemequ/hedley + +#if defined(__has_builtin) +# define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +# define EXO_HAS_BUILTIN(builtin) (0) +#endif + +#if EXO_HAS_BUILTIN(__builtin_assume) +# define EXO_ASSUME(expr) __builtin_assume(expr) +#elif EXO_HAS_BUILTIN(__builtin_unreachable) +# define EXO_ASSUME(expr) \ + ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) +#else +# define EXO_ASSUME(expr) ((void)(expr)) +#endif + + + +// vec_double( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double( void *ctxt, int_fast32_t N, const float* inp, float* out ); + +// vec_double_optimized( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double_optimized( void *ctxt, int_fast32_t N, const float* inp, float* out ); + + + +#ifdef __cplusplus +} +#endif +#endif // TEST_CASE_H + +#include "test_case.h" + +#include +#include + +// vec_double( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double( void *ctxt, int_fast32_t N, const float* inp, float* out ) { +EXO_ASSUME(N % 8 == 0); +for (int_fast32_t i = 0; i < N; i++) { + out[i] = 2.0f * inp[i]; +} +} + +// vec_double_optimized( +// N : size, +// inp : f32[N] @DRAM, +// out : f32[N] @DRAM +// ) +void vec_double_optimized( void *ctxt, int_fast32_t N, const float* inp, float* out ) { +EXO_ASSUME(N % 8 == 0); +float *two_vec = (float*) malloc(8 * sizeof(*two_vec)); +for (int_fast32_t ii = 0; ii < 8; ii++) { + two_vec[ii] = 2.0f; +} +for (int_fast32_t io = 0; io < ((N) / (8)); io++) { + float *out_vec = (float*) malloc(8 * sizeof(*out_vec)); + float *inp_vec = (float*) malloc(8 * sizeof(*inp_vec)); + for (int_fast32_t i0 = 0; i0 < 8; i0++) { + inp_vec[i0] = inp[i0 + 8 * io]; + } + for (int_fast32_t ii = 0; ii < 8; ii++) { + out_vec[ii] = two_vec[ii] * inp_vec[ii]; + } + free(inp_vec); + for (int_fast32_t i0 = 0; i0 < 8; i0++) { + out[i0 + 8 * io] = out_vec[i0]; + } + free(out_vec); +} +free(two_vec); +} + diff --git a/tests/golden/test_examples/test_quiz3.txt b/tests/golden/test_examples/test_quiz3.txt new file mode 100644 index 00000000..78f57121 --- /dev/null +++ b/tests/golden/test_examples/test_quiz3.txt @@ -0,0 +1,115 @@ + +#pragma once +#ifndef TEST_CASE_H +#define TEST_CASE_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#include +#include + +// Compiler feature macros adapted from Hedley (public domain) +// https://github.com/nemequ/hedley + +#if defined(__has_builtin) +# define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +# define EXO_HAS_BUILTIN(builtin) (0) +#endif + +#if EXO_HAS_BUILTIN(__builtin_assume) +# define EXO_ASSUME(expr) __builtin_assume(expr) +#elif EXO_HAS_BUILTIN(__builtin_unreachable) +# define EXO_ASSUME(expr) \ + ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) +#else +# define EXO_ASSUME(expr) ((void)(expr)) +#endif + + + +// tile_and_fused_blur( +// W : size, +// H : size, +// blur_y : ui16[H, W] @DRAM, +// inp : ui16[H + 2, W + 2] @DRAM +// ) +void tile_and_fused_blur( void *ctxt, int_fast32_t W, int_fast32_t H, uint16_t* blur_y, const uint16_t* inp ); + +// tile_and_fused_blur_scheduled( +// W : size, +// H : size, +// blur_y : ui16[H, W] @DRAM, +// inp : ui16[H + 2, W + 2] @DRAM +// ) +void tile_and_fused_blur_scheduled( void *ctxt, int_fast32_t W, int_fast32_t H, uint16_t* blur_y, const uint16_t* inp ); + + + +#ifdef __cplusplus +} +#endif +#endif // TEST_CASE_H + +#include "test_case.h" + +#include +#include + +// tile_and_fused_blur( +// W : size, +// H : size, +// blur_y : ui16[H, W] @DRAM, +// inp : ui16[H + 2, W + 2] @DRAM +// ) +void tile_and_fused_blur( void *ctxt, int_fast32_t W, int_fast32_t H, uint16_t* blur_y, const uint16_t* inp ) { +EXO_ASSUME(H % 32 == 0); +EXO_ASSUME(W % 256 == 0); +uint16_t *blur_x = (uint16_t*) malloc((2 + H) * W * sizeof(*blur_x)); +for (int_fast32_t yo = 0; yo < ((H) / (32)); yo++) { + for (int_fast32_t xo = 0; xo < ((W) / (256)); xo++) { + for (int_fast32_t yi = 0; yi < 34; yi++) { + for (int_fast32_t xi = 0; xi < 256; xi++) { + blur_x[(yi + 32 * yo) * W + xi + 256 * xo] = (inp[(yi + 32 * yo) * (W + 2) + xi + 256 * xo] + inp[(yi + 32 * yo) * (W + 2) + 1 + xi + 256 * xo] + inp[(yi + 32 * yo) * (W + 2) + 2 + xi + 256 * xo]) / ((uint16_t) 3.0); + } + } + for (int_fast32_t yi = 0; yi < 32; yi++) { + for (int_fast32_t xi = 0; xi < 256; xi++) { + blur_y[(yi + 32 * yo) * W + xi + 256 * xo] = (blur_x[(yi + 32 * yo) * W + xi + 256 * xo] + blur_x[(1 + yi + 32 * yo) * W + xi + 256 * xo] + blur_x[(2 + yi + 32 * yo) * W + xi + 256 * xo]) / ((uint16_t) 3.0); + } + } + } +} +free(blur_x); +} + +// tile_and_fused_blur_scheduled( +// W : size, +// H : size, +// blur_y : ui16[H, W] @DRAM, +// inp : ui16[H + 2, W + 2] @DRAM +// ) +void tile_and_fused_blur_scheduled( void *ctxt, int_fast32_t W, int_fast32_t H, uint16_t* blur_y, const uint16_t* inp ) { +EXO_ASSUME(H % 32 == 0); +EXO_ASSUME(W % 256 == 0); +uint16_t *blur_x = (uint16_t*) malloc(34 * W * sizeof(*blur_x)); +for (int_fast32_t yo = 0; yo < ((H) / (32)); yo++) { + for (int_fast32_t xo = 0; xo < ((W) / (256)); xo++) { + for (int_fast32_t yi = 0; yi < 34; yi++) { + for (int_fast32_t xi = 0; xi < 256; xi++) { + blur_x[(yi + 32 * yo - (32 * yo)) * W + xi + 256 * xo] = (inp[(yi + 32 * yo) * (W + 2) + xi + 256 * xo] + inp[(yi + 32 * yo) * (W + 2) + 1 + xi + 256 * xo] + inp[(yi + 32 * yo) * (W + 2) + 2 + xi + 256 * xo]) / ((uint16_t) 3.0); + } + } + for (int_fast32_t yi = 0; yi < 32; yi++) { + for (int_fast32_t xi = 0; xi < 256; xi++) { + blur_y[(yi + 32 * yo) * W + xi + 256 * xo] = (blur_x[(yi + 32 * yo - (32 * yo)) * W + xi + 256 * xo] + blur_x[(1 + yi + 32 * yo - (32 * yo)) * W + xi + 256 * xo] + blur_x[(2 + yi + 32 * yo - (32 * yo)) * W + xi + 256 * xo]) / ((uint16_t) 3.0); + } + } + } +} +free(blur_x); +} + diff --git a/tests/test_examples.py b/tests/test_examples.py index 8040214e..bde09d98 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -36,3 +36,13 @@ def test_cursors(golden): def test_rvm_conv1d(golden): module_file = REPO_ROOT / "examples" / "rvm_conv1d" / "exo" / "conv1d.py" assert _test_app(module_file) == golden + + +def test_quiz1(golden): + module_file = REPO_ROOT / "examples" / "quiz1" / "quiz1.py" + assert _test_app(module_file) == golden + + +def test_quiz3(golden): + module_file = REPO_ROOT / "examples" / "quiz3" / "quiz3.py" + assert _test_app(module_file) == golden