Skip to content

Commit

Permalink
Add quizzes
Browse files Browse the repository at this point in the history
  • Loading branch information
yamaguchi1024 committed Nov 10, 2024
1 parent d9772a3 commit 838b9e9
Show file tree
Hide file tree
Showing 16 changed files with 1,478 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ If you are new to Exo, we recommend going through the examples in the following
2. [Cursor](./cursors/README.md): This example shows how to use Cursors to efficiently write schedules and define a new scheduling operator.

3. [RVM](./rvm_conv1d/README.md): This example illustrates how to use Exo to define and target a new hardware accelerator entirely in the user code.

4. Quizzes ([quiz1](./quiz1/README.md), [quiz2](./quiz2/README.md), [quiz3](./quiz3/README.md)) contain common scheduling mistakes in Exo, presented as wrong schedules. You are invited to solve the quizzes.
59 changes: 59 additions & 0 deletions examples/quiz1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Quiz 1

Throughout the quiz, we provide incorrect code and the correct output as a reference. Your goal is to understand the code and fix the bug to match the correct output!

You can execute `quiz1.py` by running `exocc quiz1.py`. Without modification, it will show the incorrect output.

## Incorrect Output

The following output is incorrect because it does not call the vector intrinsics. While it matches the structure of SIMD vector code, the computation is still performed one element at a time:

```python
def double(N: size, inp: f32[N] @ DRAM, out: f32[N] @ DRAM):
assert N % 8 == 0
two_vec: R[8] @ DRAM
for ii in seq(0, 8):
two_vec[ii] = 2.0
for io in seq(0, N / 8):
out_vec: f32[8] @ DRAM
inp_vec: f32[8] @ DRAM
for i0 in seq(0, 8):
inp_vec[i0] = inp[i0 + 8 * io]
for ii in seq(0, 8):
out_vec[ii] = two_vec[ii] * inp_vec[ii]
for i0 in seq(0, 8):
out[i0 + 8 * io] = out_vec[i0]
```

## Correct Output

The correct output optimizes the function to use vectorized arithmetic operations to compute the result over the entire array:

```python
def double(N: size, inp: f32[N] @ DRAM, out: f32[N] @ DRAM):
assert N % 8 == 0
two_vec: R[8] @ AVX2
vector_assign_two(two_vec[0:8])
for io in seq(0, N / 8):
out_vec: f32[8] @ AVX2
inp_vec: f32[8] @ AVX2
vector_load(inp_vec[0:8], inp[8 * io + 0:8 * io + 8])
vector_multiply(out_vec[0:8], two_vec[0:8], inp_vec[0:8])
vector_store(out[8 * io + 0:8 * io + 8], out_vec[0:8])
```

---

## Solution

Before calling `replace_all(p, avx_instrs)`, you need to set the buffer memory types to AVX2, because `replace_all` is memory-aware and will only replace code chunks with instructions that have matching memory annotations.

Add the following code before the call to `replace_all`:

```python
# Set the memory types to be AVX2 vectors
for name in ["two", "out", "inp"]:
p = set_memory(p, f"{name}_vec", AVX2)
```

This will ensure that the memory types are correctly set to AVX2 before replacing the code with vector intrinsics.
84 changes: 84 additions & 0 deletions examples/quiz1/quiz1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from __future__ import annotations

from exo import *
from exo.libs.memories import AVX2
from exo.stdlib.scheduling import *


# Exo instruction: unaligned 8-float load from DRAM into an AVX2 vector.
# The body below is the instruction's semantics (an 8-wide elementwise copy);
# replace_all pattern-matches against it, and the @instr template string is
# the C that gets emitted in its place ({dst_data}/{src_data} bind to the
# parameter names).
@instr("{dst_data} = _mm256_loadu_ps(&{src_data});")
def vector_load(dst: [f32][8] @ AVX2, src: [f32][8] @ DRAM):
    # Unit strides required so a single unaligned vector load is valid.
    assert stride(src, 0) == 1
    assert stride(dst, 0) == 1

    for i in seq(0, 8):
        dst[i] = src[i]


# Exo instruction: unaligned 8-float store from an AVX2 vector back to DRAM.
# Mirror image of vector_load; the body is the matched semantics and the
# template string is the emitted C.
@instr("_mm256_storeu_ps(&{dst_data}, {src_data});")
def vector_store(dst: [f32][8] @ DRAM, src: [f32][8] @ AVX2):
    # Unit strides required so a single unaligned vector store is valid.
    assert stride(src, 0) == 1
    assert stride(dst, 0) == 1

    for i in seq(0, 8):
        dst[i] = src[i]


# Exo instruction: elementwise multiply of two 8-float AVX2 vectors.
# All three operands must already live in AVX2 memory for replace_all to
# substitute this instruction.
@instr("{out_data} = _mm256_mul_ps({x_data}, {y_data});")
def vector_multiply(out: [f32][8] @ AVX2, x: [f32][8] @ AVX2, y: [f32][8] @ AVX2):
    assert stride(out, 0) == 1
    assert stride(x, 0) == 1
    assert stride(y, 0) == 1

    for i in seq(0, 8):
        out[i] = x[i] * y[i]


# Exo instruction: fill an AVX2 vector with the constant 2.0.
# NOTE(review): _mm256_broadcast_ss takes a `float const *`, so the emitted
# C `_mm256_broadcast_ss(2.0)` may not compile as-is — confirm whether
# _mm256_set1_ps(2.0f) was the intended intrinsic.
@instr("{out_data} = _mm256_broadcast_ss(2.0);")
def vector_assign_two(out: [f32][8] @ AVX2):
    assert stride(out, 0) == 1

    for i in seq(0, 8):
        out[i] = 2.0


# Reference (unscheduled) procedure: out[i] = 2.0 * inp[i] for every i.
# N is asserted to be a multiple of 8 so the loop can later be split
# perfectly into 8-wide vector chunks by the schedule below.
@proc
def vec_double(N: size, inp: f32[N], out: f32[N]):
    assert N % 8 == 0
    for i in seq(0, N):
        out[i] = 2.0 * inp[i]


def wrong_schedule(p):
    """
    Vectorize `p` into 8-wide chunks — with a deliberate bug (this is the quiz):
    the staged buffers are never moved from DRAM to AVX2 memory, so the
    memory-aware `replace_all` at the end cannot match the AVX instructions
    and the output remains scalar element-at-a-time code.
    """
    p = rename(p, "vec_double_optimized")
    # Split the flat loop into an outer chunk loop (io) and an 8-wide inner
    # lane loop (ii); perfect=True relies on the N % 8 == 0 assertion.
    p = divide_loop(p, "i", 8, ["io", "ii"], perfect=True)

    # Create a vector of twos
    p = bind_expr(p, "2.0", "two_vec")
    two_alloc = p.find("two_vec: _")
    two_assign = p.find("two_vec = _")
    p = expand_dim(p, two_alloc, 8, "ii")

    # Hoist the allocation and assignment of two vector
    p = lift_alloc(p, two_alloc, 2)
    p = fission(p, two_assign.after(), 2)
    p = remove_loop(p, two_assign.parent().parent())

    # Create vectors for the input and output values
    innermost_loop = p.find_loop("ii #1")
    p = stage_mem(p, innermost_loop, "out[8*io:8*io+8]", "out_vec")
    p = stage_mem(p, innermost_loop, "inp[8*io:8*io+8]", "inp_vec")
    p = simplify(p)

    # Replace with AVX instructions — ineffective here, because the staged
    # buffers are still annotated @ DRAM rather than @ AVX2 (the quiz bug).
    avx_instrs = [vector_assign_two, vector_multiply, vector_load, vector_store]
    p = replace_all(p, avx_instrs)

    return p


# Apply the (intentionally broken) schedule and print the resulting Exo
# procedure so the quiz taker can compare it with the README's outputs.
w = wrong_schedule(vec_double)
print(w)
56 changes: 56 additions & 0 deletions examples/quiz1/quiz1/quiz1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "quiz1.h"

#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>

// vec_double(
// N : size,
// inp : f32[N] @DRAM,
// out : f32[N] @DRAM
// )
// Scalar reference kernel: out[i] = 2 * inp[i] for all i < N.
// exocc-generated from the unscheduled @proc; regenerate rather than
// hand-editing.
void vec_double( void *ctxt, int_fast32_t N, const float* inp, float* out ) {
EXO_ASSUME(N % 8 == 0);
for (int_fast32_t i = 0; i < N; i++) {
  out[i] = 2.0f * inp[i];
}
}

// vec_double_optimized(
// N : size,
// inp : f32[N] @DRAM,
// out : f32[N] @DRAM
// )
// Vectorized kernel: processes the array in 8-float AVX2 chunks.
// exocc-generated from the corrected schedule; regenerate rather than
// hand-editing.
// NOTE(review): _mm256_broadcast_ss expects a `float const *`; the literal
// argument below may not compile — confirm the intended intrinsic
// (e.g. _mm256_set1_ps(2.0f)).
void vec_double_optimized( void *ctxt, int_fast32_t N, const float* inp, float* out ) {
EXO_ASSUME(N % 8 == 0);
__m256 two_vec;
two_vec = _mm256_broadcast_ss(2.0);
for (int_fast32_t io = 0; io < ((N) / (8)); io++) {
__m256 out_vec;
__m256 inp_vec;
inp_vec = _mm256_loadu_ps(&inp[8 * io]);
out_vec = _mm256_mul_ps(two_vec, inp_vec);
_mm256_storeu_ps(&out[8 * io], out_vec);
}
}


/* relying on the following instruction..."
vector_assign_two(out)
{out_data} = _mm256_broadcast_ss(2.0);
*/

/* relying on the following instruction..."
vector_load(dst,src)
{dst_data} = _mm256_loadu_ps(&{src_data});
*/

/* relying on the following instruction..."
vector_multiply(out,x,y)
{out_data} = _mm256_mul_ps({x_data}, {y_data});
*/

/* relying on the following instruction..."
vector_store(dst,src)
_mm256_storeu_ps(&{dst_data}, {src_data});
*/
Loading

0 comments on commit 838b9e9

Please sign in to comment.