Merge pull request #28 from firedrakeproject/connorjward/merge-upstream
Merge upstream
connorjward authored Nov 7, 2024
2 parents d9876d8 + 8993e31 commit 6f02d31
Showing 71 changed files with 2,488 additions and 963 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -7,6 +7,10 @@ on:
schedule:
- cron: '17 3 * * 0'

concurrency:
group: ${{ github.head_ref || github.ref_name }}
cancel-in-progress: true

jobs:
ruff:
name: Ruff
@@ -20,6 +24,13 @@ jobs:
pipx install ruff
ruff check
typos:
name: Typos
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/typos@master

pylint:
name: Pylint
runs-on: ubuntu-latest
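The ``concurrency`` block added to ``.github/workflows/ci.yml`` above makes a newer push to the same branch cancel any still-running CI for that branch. As a sketch, the pattern drops into any workflow unchanged (surrounding keys shown for context only):

```yaml
# Cancel superseded runs: one active run per branch.
concurrency:
  # head_ref is set for pull requests; ref_name covers direct pushes.
  group: ${{ github.head_ref || github.ref_name }}
  cancel-in-progress: true
```

Without a ``group`` key scoped per branch, runs from unrelated branches would cancel each other.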
2 changes: 0 additions & 2 deletions .gitignore
@@ -21,8 +21,6 @@ lextab.py
yacctab.py
.pytest_cache/*

loopy/_git_rev.py

.cache
.env
virtualenv-[0-9]*[0-9]
2 changes: 1 addition & 1 deletion MEMO
@@ -7,7 +7,7 @@ Documentation Notes
Things to consider
^^^^^^^^^^^^^^^^^^

- Depedencies are pointwise for shared loop dimensions
- Dependencies are pointwise for shared loop dimensions
and global over non-shared ones (between dependent and ancestor)

- multiple insns could fight over which iname gets local axis 0
4 changes: 2 additions & 2 deletions README.rst
@@ -4,9 +4,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code
.. image:: https://gitlab.tiker.net/inducer/loopy/badges/main/pipeline.svg
:alt: Gitlab Build Status
:target: https://gitlab.tiker.net/inducer/loopy/commits/main
.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main&event=push
.. image:: https://github.com/inducer/loopy/workflows/CI/badge.svg?branch=main
:alt: Github Build Status
:target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI+event%3Apush
:target: https://github.com/inducer/loopy/actions?query=branch%3Amain+workflow%3ACI
.. image:: https://badge.fury.io/py/loopy.png
:alt: Python Package Index Release Page
:target: https://pypi.org/project/loopy/
2 changes: 1 addition & 1 deletion contrib/mem-pattern-explorer/pattern_vis.py
@@ -76,7 +76,7 @@ def tick(self):
class Array:
def __init__(self, ctx, name, shape, strides, elements_per_row=None):
# Each array element stores a tuple:
# (timestamp, subgroup, g0, g1, g2, ) of last acccess
# (timestamp, subgroup, g0, g1, g2, ) of last access

assert len(shape) == len(strides)

8 changes: 4 additions & 4 deletions doc/misc.rst
@@ -158,7 +158,7 @@ In the meantime, you can generate code simply by saying::
print(cg_result.host_code())
print(cg_result.device_code())

Additionally, for C-based languages, header defintions are available via::
Additionally, for C-based languages, header definitions are available via::

loopy.generate_header(knl)

@@ -338,8 +338,8 @@ This list is always growing, but here are a few pointers:

Use :func:`loopy.join_inames`.

In what sense does Loopy suport vectorization?
----------------------------------------------
In what sense does Loopy support vectorization?
-----------------------------------------------

There are really two ways in which the OpenCL/CUDA model of computation exposes
vectorization:
@@ -352,7 +352,7 @@ vectorization:
e.g. ``float4``, which support arithmetic with implicit vector semantics
as well as a number of 'intrinsic' functions.

Loopy suports both. The first one, SIMT, is accessible by tagging inames with,
Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
e.g., ``l.0```. Accessing the second one requires using both execution- and
data-reshaping capabilities in loopy. To start with, you need an array that
has an axis with the length of the desired vector. If that's not yet available,
4 changes: 4 additions & 0 deletions doc/ref_internals.rst
@@ -53,3 +53,7 @@ Schedule
--------

.. automodule:: loopy.schedule
.. automodule:: loopy.schedule.tools
.. automodule:: loopy.schedule.tree


1 change: 1 addition & 0 deletions doc/ref_kernel.rst
@@ -262,6 +262,7 @@ Instructions

.. {{{
.. autoclass:: HappensAfter
.. autoclass:: InstructionBase

.. _assignments:
5 changes: 5 additions & 0 deletions doc/ref_other.rst
@@ -1,6 +1,11 @@
Reference: Other Functionality
==============================

Auxiliary Data Types
--------------------

.. automodule:: loopy.typing

Obtaining Kernel Performance Statistics
---------------------------------------

42 changes: 23 additions & 19 deletions doc/tutorial.rst
@@ -438,7 +438,8 @@ with identical bounds, for the use of the transpose:
... out[ii,jj] = 2*out[ii,jj] {dep=transpose}
... """,
... [lp.GlobalArg("out", shape=lp.auto, is_input=False), ...])
>>> knl = lp.prioritize_loops(knl, "i,j,ii,jj")
>>> knl = lp.prioritize_loops(knl, "i,j")
>>> knl = lp.prioritize_loops(knl, "ii,jj")

:func:`loopy.duplicate_inames` can be used to achieve the same goal.
Now the intended code is generated and our test passes.
@@ -613,7 +614,7 @@ commonly called 'loop tiling':
... assumptions="n mod 16 = 0 and n >= 1")
>>> knl = lp.split_iname(knl, "i", 16)
>>> knl = lp.split_iname(knl, "j", 16)
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner")
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
>>> knl = lp.set_options(knl, write_code=True)
>>> evt, (out,) = knl(queue, a=a_mat_dev)
#define lid(N) ((int) get_local_id(N))
@@ -822,7 +823,7 @@ enabling some cost savings:
{
int const i_outer = -1 + n + -1 * ((3 * n) / 4);
<BLANKLINE>
if (-1 + n >= 0)
if (i_outer >= 0)
{
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
@@ -957,7 +958,7 @@ Consider the following example:
... "{ [i_outer,i_inner, k]: "
... "0<= 16*i_outer + i_inner <n and 0<= i_inner,k <16}",
... """
... <> a_temp[i_inner] = a[16*i_outer + i_inner] {priority=10}
... <> a_temp[i_inner] = a[16*i_outer + i_inner]
... out[16*i_outer + i_inner] = sum(k, a_temp[k])
... """)
>>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
@@ -1032,8 +1033,8 @@ transformation exists in :func:`loopy.add_prefetch`:
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
acc_k = 0.0f;
a_fetch = a[16 * gid(0) + lid(0)];
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch;
out[16 * gid(0) + lid(0)] = acc_k;
@@ -1056,12 +1057,11 @@ earlier:
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
#define lid(N) ((int) get_local_id(N))
...
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
acc_k = 0.0f;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
{
acc_k = 0.0f;
for (int k = 0; k <= 15; ++k)
acc_k = acc_k + a_fetch[lid(0)];
out[16 * gid(0) + lid(0)] = acc_k;
@@ -1209,6 +1209,12 @@ Let us start with an example. Consider the kernel from above with a
... assumptions="n mod 16 = 0")
>>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0")

.. testsetup::

>>> prog = prog.with_kernel(
... prog.default_entrypoint.copy(
... silenced_warnings=["v1_scheduler_fallback"]))

Here is what happens when we try to generate code for the kernel:

>>> cgr = lp.generate_code_v2(prog)
@@ -1312,7 +1318,7 @@ The kernel translates into two OpenCL kernels.
int tmp;
<BLANKLINE>
tmp = tmp_save_slot[16 * gid(0) + lid(0)];
arr[(lid(0) + gid(0) * 16 + 1) % n] = tmp;
arr[(1 + lid(0) + gid(0) * 16) % n] = tmp;
}

Now we can execute the kernel.
@@ -1903,18 +1909,16 @@ Now to make things more interesting, we'll create a kernel with barriers:
{
__local int c[50 * 10 * 99];
<BLANKLINE>
{
int const k_outer = 0;
<BLANKLINE>
for (int i = 0; i <= 49; ++i)
for (int j = 0; j <= 9; ++j)
for (int i = 0; i <= 49; ++i)
{
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}
{
int const k_outer = 0;
<BLANKLINE>
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
}
}

In this kernel, when a work-item performs the second instruction it uses data
13 changes: 2 additions & 11 deletions examples/fortran/ipython-integration-demo.ipynb
@@ -49,7 +49,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(prog)"
"print(prog) # noqa: F821"
]
},
{
@@ -105,17 +105,8 @@
"metadata": {},
"outputs": [],
"source": [
"print(prog)"
"print(prog) # noqa: F821"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
5 changes: 1 addition & 4 deletions examples/python/ispc-stream-harness.py
@@ -24,10 +24,7 @@ def transform(knl, vars, stream_dtype):
knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")

knl = lp.add_and_infer_dtypes(knl, {
var: stream_dtype
for var in vars
})
knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))

knl = lp.set_argument_order(knl, vars + ["n"])

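The ``ispc-stream-harness.py`` hunk above swaps a dict comprehension for ``dict.fromkeys``, which builds the same mapping whenever every key maps to one shared value. A standalone sketch (variable names mirror the diff; the dtype value is illustrative):

```python
stream_dtype = "float64"       # illustrative dtype name
stream_vars = ["a", "b", "c"]  # illustrative argument names

# Old style: explicit comprehension mapping each var to the dtype.
old_style = {var: stream_dtype for var in stream_vars}

# New style: dict.fromkeys assigns the same value object to every key.
new_style = dict.fromkeys(stream_vars, stream_dtype)

assert old_style == new_style
```

Because every key shares a single value object, this shorthand is only safe for immutable values such as dtype names; with a mutable default (e.g. a list), all keys would alias one object.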
14 changes: 5 additions & 9 deletions loopy/__init__.py
@@ -56,6 +56,7 @@
BarrierInstruction,
CallInstruction,
CInstruction,
HappensAfter,
InstructionBase,
LegacyStringInstructionTag,
MemoryOrdering,
@@ -203,15 +204,9 @@
find_rules_matching,
)
from loopy.translation_unit import TranslationUnit, for_each_kernel, make_program

# }}}
from loopy.type_inference import infer_unknown_types
from loopy.types import to_loopy_type

# {{{ imported user interface
from loopy.typing import auto

# {{{ import transforms
from loopy.version import MOST_RECENT_LANGUAGE_VERSION, VERSION


@@ -242,6 +237,7 @@
"ExecutorBase",
"GeneratedProgram",
"GlobalArg",
"HappensAfter",
"ISPCTarget",
"ImageArg",
"InKernelCallable",
@@ -563,18 +559,18 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None):

indices = ["i%d" % i for i in range(rank)]
shape = ["n%d" % i for i in range(rank)]
commad_indices = ", ".join(indices)
command_indices = ", ".join(indices)
bounds = " and ".join(
f"0<={ind}<{shape_i}"
for ind, shape_i in zip(indices, shape))

set_str = "{{[{}]: {} }}".format(
commad_indices,
command_indices,
bounds
)
result = make_kernel(set_str,
"output[%s] = input[%s]"
% (commad_indices, commad_indices),
% (command_indices, command_indices),
lang_version=MOST_RECENT_LANGUAGE_VERSION,
default_offset=auto)

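Besides renaming ``commad_indices`` to ``command_indices``, the ``make_copy_kernel`` hunk above shows how an ISL-style set string is assembled from index and shape names. A plain-Python sketch of just that string construction, using ``rank = 2`` for illustration (the real function derives the rank from its dim tags):

```python
rank = 2  # illustrative; make_copy_kernel computes this from new_dim_tags

indices = ["i%d" % i for i in range(rank)]  # ['i0', 'i1']
shape = ["n%d" % i for i in range(rank)]    # ['n0', 'n1']

command_indices = ", ".join(indices)
bounds = " and ".join(
    f"0<={ind}<{shape_i}"
    for ind, shape_i in zip(indices, shape))

# Doubled braces in the format string emit literal { and }.
set_str = "{{[{}]: {} }}".format(command_indices, bounds)
assert set_str == "{[i0, i1]: 0<=i0<n0 and 0<=i1<n1 }"
```

The resulting string is the domain passed to ``make_kernel``, alongside a matching ``output[...] = input[...]`` instruction over the same indices.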