From 755a00eef4dfe5c5abe3bd75b78e8c83169f245e Mon Sep 17 00:00:00 2001
From: stijn
Date: Wed, 4 Oct 2023 15:58:21 +0200
Subject: [PATCH 01/16] Add tiling object

---
 examples/CMakeLists.txt                   |   1 +
 examples/vector_add/main.cu               |   2 +-
 examples/vector_add_tiling/CMakeLists.txt |  12 +
 examples/vector_add_tiling/main.cu        |  97 ++++
 include/kernel_float/tiling.h             | 516 ++++++++++++++++++++++
 tests/tiling.cu                           |  92 ++++
 6 files changed, 719 insertions(+), 1 deletion(-)
 create mode 100644 examples/vector_add_tiling/CMakeLists.txt
 create mode 100644 examples/vector_add_tiling/main.cu
 create mode 100644 tests/tiling.cu

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 5342580..145f80b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(vector_add)
+add_subdirectory(vector_add_tiling)
diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu
index ea78d1a..465c707 100644
--- a/examples/vector_add/main.cu
+++ b/examples/vector_add/main.cu
@@ -17,7 +17,7 @@ __global__ void my_kernel(int length, const khalf<N>* input, double constant, kf
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i * N < length) {
-        output[i] = kf::cast<float>((input[i] * input[i]) * constant);
+        kf::cast_to(output[i]) = (input[i] * input[i]) * constant;
     }
 }
 
diff --git a/examples/vector_add_tiling/CMakeLists.txt b/examples/vector_add_tiling/CMakeLists.txt
new file mode 100644
index 0000000..a744c34
--- /dev/null
+++ b/examples/vector_add_tiling/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.17)
+
+set (PROJECT_NAME kernel_float_vecadd_tiling)
+project(${PROJECT_NAME} LANGUAGES CXX CUDA)
+set (CMAKE_CXX_STANDARD 17)
+
+add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
+target_link_libraries(${PROJECT_NAME} kernel_float)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+
+find_package(CUDA REQUIRED)
+target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
diff --git a/examples/vector_add_tiling/main.cu b/examples/vector_add_tiling/main.cu
new file mode 100644
index 0000000..1134778
--- /dev/null
+++ b/examples/vector_add_tiling/main.cu
@@ -0,0 +1,97 @@
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "kernel_float.h"
+#include "kernel_float/tiling.h"
+using namespace kernel_float::prelude;
+
+void cuda_check(cudaError_t code) {
+    if (code != cudaSuccess) {
+        throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(code));
+    }
+}
+
+template<int items_per_thread, int block_size>
+__global__ void my_kernel(
+    int length,
+    kf::aligned_ptr<const __half> input,
+    double constant,
+    kf::aligned_ptr<float> output) {
+    auto tiling = kf::tiling<
+        kf::tile_factor<items_per_thread>,
+        kf::block_size<block_size>,
+        kf::distributions<kf::dist::blocked>>();
+
+    auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
+    auto mask = tiling.local_mask();
+
+    auto a = kf::load(input.get(), points, mask);
+    auto b = (a * a) * constant;
+    kf::store(b, output.get(), points, mask);
+}
+
+template<int items_per_thread, int block_size = 256>
+void run_kernel(int n) {
+    double constant = 1.0;
+    std::vector<half> input(n);
+    std::vector<float> output_expected(n);
+    std::vector<float> output_result(n);
+
+    // Generate input data (values are kept small so they are exact in half precision)
+    for (int i = 0; i < n; i++) {
+        input[i] = half(i % 32);
+        output_expected[i] = float((i % 32) * (i % 32) * constant);
+    }
+
+    // Allocate device memory
+    __half* input_dev;
+    float* output_dev;
+    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
+    cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));
+
+    // Copy device memory
+    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));
+
+    // Launch kernel!
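+    // Each block covers block_size * items_per_thread elements, so the grid
+    // size below rounds the problem size up to a whole number of blocks.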
+    int items_per_block = block_size * items_per_thread;
+    int grid_size = (n + items_per_block - 1) / items_per_block;
+    my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr<const __half>(input_dev),
+        constant,
+        kf::aligned_ptr<float>(output_dev));
+
+    // Copy results back
+    cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
+
+    // Check results
+    for (int i = 0; i < n; i++) {
+        float result = output_result[i];
+        float answer = output_expected[i];
+
+        if (result != answer) {
+            std::stringstream msg;
+            msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
+            throw std::runtime_error(msg.str());
+        }
+    }
+
+    cuda_check(cudaFree(input_dev));
+    cuda_check(cudaFree(output_dev));
+}
+
+int main() {
+    int n = 84000;  // divisible by 1, 2, 3, 4, 5, 6, 7, 8
+    cuda_check(cudaSetDevice(0));
+
+    run_kernel<1>(n);
+    run_kernel<2>(n);
+    run_kernel<3>(n);
+    run_kernel<4>(n);
+    run_kernel<8>(n);
+
+    std::cout << "result correct\n";
+    return EXIT_SUCCESS;
+}
diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h
index e69de29..0561d01 100644
--- a/include/kernel_float/tiling.h
+++ b/include/kernel_float/tiling.h
@@ -0,0 +1,516 @@
+#ifndef KERNEL_FLOAT_TILING_H
+#define KERNEL_FLOAT_TILING_H
+
+#include "iterate.h"
+#include "vector.h"
+
+namespace kernel_float {
+
+template<size_t... Ns>
+struct block_size {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    block_size(dim3 thread_index) {
+        if (rank > 0 && size(0) > 1) {
+            thread_index_[0] = thread_index.x;
+        }
+
+        if (rank > 1 && size(1) > 1) {
+            thread_index_[1] = thread_index.y;
+        }
+
+        if (rank > 2 && size(2) > 1) {
+            thread_index_[2] = thread_index.z;
+        }
+    }
+
+    KERNEL_FLOAT_INLINE
+    size_t thread_index(size_t axis) const {
+        return axis < rank ? thread_index_[axis] : 0;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        size_t sizes[rank] = {Ns...};
+        return axis < rank ? sizes[axis] : 1;
+    }
+
+  private:
+    unsigned int thread_index_[rank] = {0};
+};
+
+template<size_t... Ns>
+struct virtual_block_size {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    virtual_block_size(dim3 thread_index) {
+        thread_index_ = thread_index.x;
+    }
+
+    KERNEL_FLOAT_INLINE
+    size_t thread_index(size_t axis) const {
+        size_t product_up_to_axis = 1;
+#pragma unroll
+        for (size_t i = 0; i < axis; i++) {
+            product_up_to_axis *= size(i);
+        }
+
+        return (thread_index_ / product_up_to_axis) % size(axis);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        size_t sizes[rank] = {Ns...};
+        return axis < rank ? sizes[axis] : 1;
+    }
+
+  private:
+    unsigned int thread_index_ = 0;
+};
+
+template<size_t... Ns>
+struct tile_size {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis, size_t block_size = 0) {
+        size_t sizes[rank] = {Ns...};
+        return axis < rank ? sizes[axis] : 1;
+    }
+};
+
+template<size_t... Ns>
+struct tile_factor {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis, size_t block_size) {
+        size_t factors[rank] = {Ns...};
+        return block_size * (axis < rank ? factors[axis] : 1);
+    }
+};
+
+namespace dist {
+template<size_t N, size_t K>
+struct blocked_impl {
+    static constexpr bool is_exhaustive = N % K == 0;
+    static constexpr size_t items_per_thread = (N / K) + (is_exhaustive ? 0 : 1);
+
+    KERNEL_FLOAT_INLINE
+    static constexpr bool local_is_present(size_t thread_index, size_t local_index) {
+        return is_exhaustive || (local_to_global(thread_index, local_index) < N);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t local_to_global(size_t thread_index, size_t local_index) {
+        return thread_index * items_per_thread + local_index;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_local(size_t global_index) {
+        return global_index % items_per_thread;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_owner(size_t global_index) {
+        return global_index / items_per_thread;
+    }
+};
+
+struct blocked {
+    template<size_t N, size_t K>
+    using type = blocked_impl<N, K>;
+};
+
+template<size_t M, size_t N, size_t K>
+struct cyclic_impl {
+    static constexpr bool is_exhaustive = N % (K * M) == 0;
+    static constexpr size_t items_per_thread = ((N / (K * M)) + (is_exhaustive ? 0 : 1)) * M;
+
+    KERNEL_FLOAT_INLINE
+    static constexpr bool local_is_present(size_t thread_index, size_t local_index) {
+        return is_exhaustive || (local_to_global(thread_index, local_index) < N);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t local_to_global(size_t thread_index, size_t local_index) {
+        return (local_index / M) * M * K + thread_index * M + (local_index % M);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_local(size_t global_index) {
+        return (global_index / (M * K)) * M + (global_index % M);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_owner(size_t global_index) {
+        return (global_index / M) % K;
+    }
+};
+
+struct cyclic {
+    template<size_t N, size_t K>
+    using type = cyclic_impl<1, N, K>;
+};
+
+template<size_t M>
+struct block_cyclic {
+    template<size_t N, size_t K>
+    using type = cyclic_impl<M, N, K>;
+};
+}  // namespace dist
+
+template<typename... Ds>
+struct distributions {};
+
+namespace detail {
+template<size_t I, typename D>
+struct instantiate_distribution_impl {
+    template<size_t N, size_t K>
+    using type = dist::cyclic::type<N, K>;
+};
+
+template<typename First, typename... Rest>
+struct instantiate_distribution_impl<0, distributions<First, Rest...>> {
+    template<size_t N, size_t K>
+    using type = typename First::template type<N, K>;
+};
+
+template<size_t I, typename First, typename... Rest>
+struct instantiate_distribution_impl<I, distributions<First, Rest...>>:
+    instantiate_distribution_impl<I - 1, distributions<Rest...>> {};
+
+template<
+    typename TileDim,
+    typename BlockDim,
+    typename Distributions,
+    typename = make_index_sequence<TileDim::rank>>
+struct tiling_impl;
+
+template<typename TileDim, typename BlockDim, typename Distributions, size_t... Is>
+struct tiling_impl<TileDim, BlockDim, Distributions, index_sequence<Is...>> {
+    template<size_t I>
+    using dist_type = typename instantiate_distribution_impl<I, Distributions>::
+        template type<TileDim::size(I), BlockDim::size(I)>;
+
+    static constexpr size_t rank = TileDim::rank;
+    static constexpr size_t items_per_thread = (dist_type<Is>::items_per_thread * ...
* 1);
+    static constexpr bool is_exhaustive = (dist_type<Is>::is_exhaustive && ...);
+
+    template<typename IndexType>
+    KERNEL_FLOAT_INLINE static vector_storage<IndexType, rank>
+    local_to_global(const BlockDim& block, size_t item) {
+        vector_storage<IndexType, rank> result;
+        ((result.data()[Is] = dist_type<Is>::local_to_global(
+              block.thread_index(Is),
+              item % dist_type<Is>::items_per_thread),
+          item /= dist_type<Is>::items_per_thread),
+         ...);
+        return result;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static bool local_is_present(const BlockDim& block, size_t item) {
+        bool is_present = true;
+        ((is_present &= dist_type<Is>::local_is_present(
+              block.thread_index(Is),
+              item % dist_type<Is>::items_per_thread),
+          item /= dist_type<Is>::items_per_thread),
+         ...);
+        return is_present;
+    }
+};
+}  // namespace detail
+
+template<typename T>
+struct tiling_iterator;
+
+template<
+    typename TileDim,
+    typename BlockDim,
+    typename Distributions = distributions<>,
+    typename IndexType = int>
+struct tiling {
+    using self_type = tiling;
+    using impl_type = detail::tiling_impl<TileDim, BlockDim, Distributions>;
+    using block_type = BlockDim;
+    using tile_type = TileDim;
+
+    static constexpr size_t rank = tile_type::rank;
+    static constexpr size_t num_locals = impl_type::items_per_thread;
+
+    using index_type = IndexType;
+    using point_type = vector<index_type, extent<rank>>;
+
+#if KERNEL_FLOAT_IS_DEVICE
+    __forceinline__ __device__ tiling() : block_(threadIdx) {}
+#endif
+
+    KERNEL_FLOAT_INLINE
+    tiling(BlockDim block, vec<index_type, rank> offset = {}) : block_(block), offset_(offset) {}
+
+    /**
+     * Returns the number of items per thread in the tiling.
+     *
+     * Note that this method is ``constexpr`` and can be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size() {
+        return impl_type::items_per_thread;
+    }
+
+    /**
+     * Checks if the tiling is exhaustive, meaning all items are always present for all threads. If this returns
+     * `true`, then ``is_present`` will always return `true` for any given index.
+     *
+     * Note that this method is ``constexpr`` and can thus be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr bool all_present() {
+        return impl_type::is_exhaustive;
+    }
+
+    /**
+     * Checks if a specific item is present for the current thread based on the distribution strategy. The number of
+     * items stored per thread is not always equal to the number of items _owned_ by each thread (for example,
+     * if the tile size is not divisible by the block size). In that case, ``is_present`` will return `false` for
+     * certain items.
+     */
+    KERNEL_FLOAT_INLINE
+    bool is_present(size_t item) const {
+        return all_present() || impl_type::local_is_present(block_, item);
+    }
+
+    /**
+     * Returns the global coordinates of a specific item for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> at(size_t item) const {
+        return impl_type::template local_to_global<index_type>(block_, item) + offset_;
+    }
+
+    /**
+     * Returns the global coordinates of a specific item along a specified axis for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    index_type at(size_t item, size_t axis) const {
+        return axis < rank ? at(item)[axis] : index_type {};
+    }
+
+    /**
+     * Returns the global coordinates of a specific item for the current thread (alias of ``at``).
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> operator[](size_t item) const {
+        return at(item);
+    }
+
+    /**
+     * Returns a vector of global coordinates of all items present for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<vector<index_type, extent<rank>>, extent<num_locals>> local_points() const {
+        return range<num_locals>([&](size_t i) { return at(i); });
+    }
+
+    /**
+     * Returns a vector of coordinate values along a specified axis for all items present for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<num_locals>> local_points(size_t axis) const {
+        return range<num_locals>([&](size_t i) { return at(i, axis); });
+    }
+
+    /**
+     * Returns a vector of boolean values representing the result of ``is_present`` of the items for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<bool, extent<num_locals>> local_mask() const {
+        return range<num_locals>([&](size_t i) { return is_present(i); });
+    }
+
+    /**
+     * Returns the thread index (position) along a specified axis for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    index_type thread_index(size_t axis) const {
+        return index_type(block_.thread_index(axis));
+    }
+
+    /**
+     * Returns the size of the block (number of threads) along a specified axis.
+     *
+     * Note that this method is ``constexpr`` and can thus be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr index_type block_size(size_t axis) {
+        return index_type(block_type::size(axis));
+    }
+
+    /**
+     * Returns the size of the tile along a specified axis.
+     *
+     * Note that this method is ``constexpr`` and can thus be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr index_type tile_size(size_t axis) {
+        return index_type(tile_type::size(axis, block_size(axis)));
+    }
+
+    /**
+     * Returns the offset of the tile along a specified axis.
+     */
+    KERNEL_FLOAT_INLINE
+    index_type tile_offset(size_t axis) const {
+        return index_type(offset_[axis]);
+    }
+
+    /**
+     * Returns a vector of thread indices for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> thread_index() const {
+        return range<rank>([&](size_t i) { return thread_index(i); });
+    }
+
+    /**
+     * Returns a vector of block sizes for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    static vector<index_type, extent<rank>> block_size() {
+        return range<rank>([&](size_t i) { return block_size(i); });
+    }
+
+    /**
+     * Returns a vector of tile sizes for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    static vector<index_type, extent<rank>> tile_size() {
+        return range<rank>([&](size_t i) { return tile_size(i); });
+    }
+
+    /**
+     * Returns the offset of the tile for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> tile_offset() const {
+        return range<rank>([&](size_t i) { return tile_offset(i); });
+    }
+
+    /**
+     * Returns an iterator pointing to the beginning of the tiling.
+     */
+    KERNEL_FLOAT_INLINE
+    tiling_iterator<tiling> begin() const {
+        return {*this, 0};
+    }
+
+    /**
+     * Returns an iterator pointing to the end of the tiling.
+     */
+    KERNEL_FLOAT_INLINE
+    tiling_iterator<tiling> end() const {
+        return {*this, num_locals};
+    }
+
+    /**
+     * Applies a provided function to each item present in the tiling for the current thread.
+     * The function should take an index and a ``vector`` of global coordinates as arguments.
+     */
+    template<typename F>
+    KERNEL_FLOAT_INLINE void for_each(F fun) const {
+#pragma unroll
+        for (size_t i = 0; i < num_locals; i++) {
+            if (is_present(i)) {
+                fun(i, at(i));
+            }
+        }
+    }
+
+    /**
+     * Adds ``offset`` to all points of this tiling and returns a new tiling.
+     */
+    KERNEL_FLOAT_INLINE friend tiling
+    operator+(const tiling& self, const vector<index_type, extent<rank>>& offset) {
+        return tiling {self.block_, self.offset_ + offset};
+    }
+
+    /**
+     * Adds ``offset`` to all points of this tiling and returns a new tiling.
+     */
+    KERNEL_FLOAT_INLINE friend tiling
+    operator+(const vector<index_type, extent<rank>>& offset, const tiling& self) {
+        return self + offset;
+    }
+
+    /**
+     * Adds ``offset`` to all points of this tiling; ``t += offset`` behaves like ``t = t + offset``.
+     */
+    KERNEL_FLOAT_INLINE friend tiling&
+    operator+=(tiling& self, const vector<index_type, extent<rank>>& offset) {
+        return self = self + offset;
+    }
+
+  private:
+    BlockDim block_;
+    vector<index_type, extent<rank>> offset_;
+};
+
+template<typename T>
+struct tiling_iterator {
+    using value_type = vector<typename T::index_type, extent<T::rank>>;
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator(const T& inner, size_t position = 0) : inner_(&inner), position_(position) {
+        while (position_ < T::num_locals && !inner_->is_present(position_)) {
+            position_++;
+        }
+    }
+
+    KERNEL_FLOAT_INLINE
+    value_type operator*() const {
+        return inner_->at(position_);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator& operator++() {
+        return *this = tiling_iterator(*inner_, position_ + 1);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator operator++(int) {
+        tiling_iterator old = *this;
+        ++*this;
+        return old;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator==(const tiling_iterator& a, const tiling_iterator& b) {
+        return a.position_ == b.position_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator!=(const tiling_iterator& a, const tiling_iterator& b) {
+        return !operator==(a, b);
+    }
+
+    size_t position_ = 0;
+    const T* inner_;
+};
+
+template<size_t N, size_t K, typename D = dist::cyclic, typename IndexType = int>
+using tiling_1d = tiling<tile_size<N>, block_size<K>, distributions<D>, IndexType>;
+
+// clang-format off
+#define KERNEL_FLOAT_TILING_FOR(TILING_VARIABLE__, INDEX_VARIABLE__, POINT_VARIABLE__) \
+    _Pragma("unroll") \
+    for (::std::size_t INDEX_VARIABLE__ = 0; INDEX_VARIABLE__ < TILING_VARIABLE__.size(); INDEX_VARIABLE__++) \
+        if (typename decltype(TILING_VARIABLE__)::point_type POINT_VARIABLE__ = TILING_VARIABLE__.at(INDEX_VARIABLE__); \
+            TILING_VARIABLE__.is_present(INDEX_VARIABLE__))
+// clang-format on
+
+}  // namespace kernel_float
+
+#endif  // KERNEL_FLOAT_TILING_H
\ No newline at end of file
diff --git a/tests/tiling.cu b/tests/tiling.cu
new file mode 100644
index 0000000..c1599aa
--- /dev/null
+++ b/tests/tiling.cu
@@ -0,0 +1,92 @@
+#include "common.h"
+#include "kernel_float/tiling.h"
+
+struct basic_tiling_test {
+    template<typename T>
+    __host__ __device__ void operator()(generator<T> gen) {
+        auto tiling = kf::tiling<
+            kf::tile_size<8, 8>,
+            kf::block_size<2, 4>,
+            kf::distributions<kf::dist::cyclic, kf::dist::cyclic>>(dim3(1, 2, 0));
+
+        ASSERT_EQ(tiling.size(), size_t(8));
+
+        ASSERT_EQ(
+            tiling.local_points(),
+            kf::make_vec(
+                kf::make_vec(1, 2),
+                kf::make_vec(3, 2),
+                kf::make_vec(5, 2),
+                kf::make_vec(7, 2),
+                kf::make_vec(1, 6),
+                kf::make_vec(3, 6),
+                kf::make_vec(5, 6),
+                kf::make_vec(7, 6)));
+
+        ASSERT_EQ(tiling.local_points(0), kf::make_vec(1, 3, 5, 7, 1, 3, 5, 7));
+
+        ASSERT_EQ(tiling.local_points(1), kf::make_vec(2, 2, 2, 2, 6, 6, 6, 6));
+
+        ASSERT_EQ(tiling.at(0), kf::make_vec(1, 2));
+        ASSERT_EQ(tiling.at(1), kf::make_vec(3, 2));
+        ASSERT_EQ(tiling.at(2), kf::make_vec(5, 2));
+
+        ASSERT_EQ(tiling.at(0, 0), 1);
+        ASSERT_EQ(tiling.at(0, 1), 2);
+        ASSERT_EQ(tiling.at(1, 0), 3);
+        ASSERT_EQ(tiling.at(1, 1), 2);
+
+        ASSERT_EQ(tiling[0], kf::make_vec(1, 2));
+        ASSERT_EQ(tiling[1], kf::make_vec(3, 2));
+        ASSERT_EQ(tiling[2], kf::make_vec(5, 2));
+        ASSERT_EQ(tiling[3], kf::make_vec(7, 2));
+
+        ASSERT_EQ(
+            tiling.local_mask(),
+            kf::make_vec(true, true, true, true, true, true, true, true));
+        ASSERT_EQ(tiling.is_present(0), true);
+        ASSERT_EQ(tiling.is_present(1), true);
+        ASSERT_EQ(tiling.is_present(2), true);
+        ASSERT_EQ(tiling.is_present(3), true);
+
+        ASSERT_EQ(tiling.thread_index(0), 1);
+        ASSERT_EQ(tiling.thread_index(1), 2);
+        ASSERT_EQ(tiling.thread_index(2), 0);
+        ASSERT_EQ(tiling.thread_index(), kf::make_vec(1, 2));
+
+        ASSERT_EQ(tiling.block_size(0), 2);
+        ASSERT_EQ(tiling.block_size(1), 4);
ASSERT_EQ(tiling.block_size(2), 1); + ASSERT_EQ(tiling.block_size(), kf::make_vec(2, 4)); + + ASSERT_EQ(tiling.tile_size(0), 8); + ASSERT_EQ(tiling.tile_size(1), 8); + ASSERT_EQ(tiling.tile_size(2), 1); + ASSERT_EQ(tiling.tile_size(), kf::make_vec(8, 8)); + + ASSERT_EQ(tiling.size(), size_t(8)); + + size_t counter = 0; + const int points[8][2] = { + {1, 2}, + {3, 2}, + {5, 2}, + {7, 2}, + {1, 6}, + {3, 6}, + {5, 6}, + {7, 6}, + }; + + KERNEL_FLOAT_TILING_FOR(tiling, i, point) { + ASSERT_EQ(counter, i); + ASSERT_EQ(point[0], points[i][0]); + ASSERT_EQ(point[1], points[i][1]); + counter++; + } + + ASSERT_EQ(counter, size_t(8)); + } +}; + +REGISTER_TEST_CASE("basic tiling tests", basic_tiling_test, int) From 4b68f09dd07ea9cba7611e63aa000d8dd157d62e Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 4 Oct 2023 15:58:41 +0200 Subject: [PATCH 02/16] Add `range(fun)` utility function --- include/kernel_float/iterate.h | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 68c1645..1b3d683 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -30,13 +30,13 @@ void for_each(V&& input, F fun) { namespace detail { template struct range_impl { - KERNEL_FLOAT_INLINE - static vector_storage call() { + template + KERNEL_FLOAT_INLINE static vector_storage call(F fun) { vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result.data()[i] = T(i); + result.data()[i] = fun(i); } return result; @@ -44,6 +44,21 @@ struct range_impl { }; } // namespace detail +/** + * Generate vector consisting of the result `fun(0)...fun(N-1)` + * + * Example + * ======= + * ``` + * // Returns [0.0f, 2.0f, 4.0f] + * vec vec = range<3>([](auto i){ return float(i * 2.0f); }); + * ``` + */ +template> +KERNEL_FLOAT_INLINE vector> range(F fun) { + return detail::range_impl::call(fun); +} + /** * Generate vector consisting of the numbers `0...N-1` of type `T` * @@ -56,7 +71,7 @@ struct range_impl { */ template KERNEL_FLOAT_INLINE vector> range() { - return detail::range_impl::call(); + return detail::range_impl::call(ops::cast()); } /** @@ -71,7 +86,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return detail::range_impl, vector_extent>::call(); + return range, vector_extent>(); } /** @@ -96,7 +111,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_impl>::call(); + return range>(); } namespace detail { From aaf864569f9339a6bb55528743e87f6f3f8f1953 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 4 Oct 2023 15:59:01 +0200 Subject: [PATCH 03/16] Fix bugs in `aligned_ptr` --- include/kernel_float/memory.h | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index 01fce2f..fbe60a1 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -215,13 +215,6 @@ KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t m return store(values, ptr, indices, indices < max_length); } -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (__builtin_assume_aligned(ptr, alignment)) -#else -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, 
alignment) (ptr) -#endif - template struct AssignConversionProxy { KERNEL_FLOAT_INLINE @@ -263,6 +256,20 @@ KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(vector& inp return AssignConversionProxy(input.data()); } +/** + * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. + * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! + */ +template +KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { +// TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) + return static_cast(__builtin_assume_aligned(ptr, alignment)); +#else + return ptr; +#endif +} + /** * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. */ @@ -281,7 +288,7 @@ struct aligned_ptr { */ KERNEL_FLOAT_INLINE T* get() const { - return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); + return unsafe_assume_aligned(ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -360,12 +367,18 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + /** * Return the pointer value. */ KERNEL_FLOAT_INLINE const T* get() const { - return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); + return unsafe_assume_aligned(ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -406,6 +419,9 @@ struct aligned_ptr { const T* ptr_ = nullptr; }; +template +aligned_ptr(T*) -> aligned_ptr; + } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H From 2a51e52229bd78ac1600ccd357bf0bfcd63bc9e6 Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 6 Oct 2023 15:18:33 +0200 Subject: [PATCH 04/16] Add ability for `KERNEL_FLOAT_TILING_FOR_IMPL` to accept two or three arguments --- include/kernel_float/macros.h | 5 ++++ include/kernel_float/meta.h | 2 +- include/kernel_float/tiling.h | 19 ++++++++++---- tests/tiling.cu | 47 +++++++++++++++++++++-------------- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h index 8f42895..bfc8595 100644 --- a/include/kernel_float/macros.h +++ b/include/kernel_float/macros.h @@ -44,4 +44,9 @@ } while (0) #define KERNEL_FLOAT_UNREACHABLE __builtin_unreachable() +// Somet utility macros +#define KERNEL_FLOAT_CONCAT_IMPL(A, B) A##B +#define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B) +#define KERNEL_FLOAT_CALL(F, ...) 
F(__VA_ARGS__) + #endif //KERNEL_FLOAT_MACROS_H diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index 9c133a3..5141f82 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -77,7 +77,7 @@ template using decay_t = typename detail::decay_impl::type; template -struct promote_type; +struct promote_type {}; template struct promote_type { diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h index 0561d01..427d044 100644 --- a/include/kernel_float/tiling.h +++ b/include/kernel_float/tiling.h @@ -504,11 +504,20 @@ template, block_size, distributions, IndexType>; // clang-format off -#define KERNEL_FLOAT_TILING_FOR(TILING_VARIABLE__, INDEX_VARIABLE__, POINT_VARIABLE__) \ - _Pragma("unroll") \ - for (::std::size_t INDEX_VARIABLE__ = 0; INDEX_VARIABLE__ < TILING_VARIABLE__.size(); INDEX_VARIABLE__++) \ - if (typename decltype(TILING_VARIABLE__)::point_type POINT_VARIABLE__ = TILING_VARIABLE__.at(INDEX_VARIABLE__); \ - TILING_VARIABLE__.is_present(INDEX_VARIABLE__)) +#define KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _) \ + _Pragma("unroll") \ + for (size_t ITER_VAR = 0; ITER_VAR < (TILING).size(); ITER_VAR++) \ + if (POINT_VAR = (TILING).at(ITER_VAR); (TILING).is_present(ITER_VAR)) \ + +#define KERNEL_FLOAT_TILING_FOR_IMPL2(ITER_VAR, TILING, INDEX_VAR, POINT_VAR) \ + KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _) \ + if (INDEX_VAR = ITER_VAR; true) + +#define KERNEL_FLOAT_TILING_FOR_IMPL(ITER_VAR, TILING, A, B, N, ...) \ + KERNEL_FLOAT_CALL(KERNEL_FLOAT_CONCAT(KERNEL_FLOAT_TILING_FOR_IMPL, N), ITER_VAR, TILING, A, B) + +#define KERNEL_FLOAT_TILING_FOR(...) \ + KERNEL_FLOAT_TILING_FOR_IMPL(KERNEL_FLOAT_CONCAT(__tiling_index_variable__, __LINE__), __VA_ARGS__, 2, 1) // clang-format on } // namespace kernel_float diff --git a/tests/tiling.cu b/tests/tiling.cu index c1599aa..5735e15 100644 --- a/tests/tiling.cu +++ b/tests/tiling.cu @@ -4,12 +4,14 @@ struct basic_tiling_test { template __host__ __device__ void operator()(generator gen) { - auto tiling = kf::tiling< + using TestTiling = kf::tiling< kf::tile_size<8, 8>, kf::block_size<2, 4>, - kf::distributions>(dim3(1, 2, 0)); + kf::distributions // + >; + auto tiling = TestTiling(dim3(1, 2, 0)); - ASSERT_EQ(tiling.size(), size_t(8)); + ASSERT_EQ(TestTiling::size(), size_t(8)); ASSERT_EQ( tiling.local_points(), @@ -24,7 +26,6 @@ struct basic_tiling_test { kf::make_vec(7, 6))); ASSERT_EQ(tiling.local_points(0), kf::make_vec(1, 3, 5, 7, 1, 3, 5, 7)); - ASSERT_EQ(tiling.local_points(1), kf::make_vec(2, 2, 2, 2, 6, 6, 6, 6)); ASSERT_EQ(tiling.at(0), kf::make_vec(1, 2)); @@ -44,6 +45,7 @@ struct basic_tiling_test { ASSERT_EQ( tiling.local_mask(), kf::make_vec(true, true, true, true, true, true, true, true)); + ASSERT_EQ(TestTiling::all_present(), true); ASSERT_EQ(tiling.is_present(0), true); ASSERT_EQ(tiling.is_present(1), true); ASSERT_EQ(tiling.is_present(2), true); @@ -54,19 +56,16 @@ struct basic_tiling_test { ASSERT_EQ(tiling.thread_index(2), 0); ASSERT_EQ(tiling.thread_index(), kf::make_vec(1, 2)); - ASSERT_EQ(tiling.block_size(0), 2); - ASSERT_EQ(tiling.block_size(1), 4); - ASSERT_EQ(tiling.block_size(2), 1); - ASSERT_EQ(tiling.block_size(), kf::make_vec(2, 4)); - - ASSERT_EQ(tiling.tile_size(0), 8); - ASSERT_EQ(tiling.tile_size(1), 8); - ASSERT_EQ(tiling.tile_size(2), 1); - ASSERT_EQ(tiling.tile_size(), kf::make_vec(8, 8)); + ASSERT_EQ(TestTiling::block_size(0), 2); + ASSERT_EQ(TestTiling::block_size(1), 4); + ASSERT_EQ(TestTiling::block_size(2), 1); + 
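+        // the vector overload below returns the sizes along all axes at once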
ASSERT_EQ(TestTiling::block_size(), kf::make_vec(2, 4)); - ASSERT_EQ(tiling.size(), size_t(8)); + ASSERT_EQ(TestTiling::tile_size(0), 8); + ASSERT_EQ(TestTiling::tile_size(1), 8); + ASSERT_EQ(TestTiling::tile_size(2), 1); + ASSERT_EQ(TestTiling::tile_size(), kf::make_vec(8, 8)); - size_t counter = 0; const int points[8][2] = { {1, 2}, {3, 2}, @@ -78,14 +77,26 @@ struct basic_tiling_test { {7, 6}, }; - KERNEL_FLOAT_TILING_FOR(tiling, i, point) { - ASSERT_EQ(counter, i); + size_t counter = 0; + KERNEL_FLOAT_TILING_FOR(tiling, auto point) { + ASSERT(counter < 8); + ASSERT_EQ(point[0], points[counter][0]); + ASSERT_EQ(point[1], points[counter][1]); + counter++; + } + + ASSERT(counter == 8); + + counter = 0; + KERNEL_FLOAT_TILING_FOR(tiling, int i, auto point) { + ASSERT(counter < 8); + ASSERT_EQ(counter, size_t(i)); ASSERT_EQ(point[0], points[i][0]); ASSERT_EQ(point[1], points[i][1]); counter++; } - ASSERT_EQ(counter, size_t(8)); + ASSERT(counter == 8); } }; From c51828f289aa8309623e3431d580a6960c1392be Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 6 Oct 2023 15:18:57 +0200 Subject: [PATCH 05/16] Add missing `KERNEL_FLOAT_INLINE` on `for_each` --- include/kernel_float/iterate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 1b3d683..5e91180 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -18,7 +18,7 @@ namespace kernel_float { * ``` */ template -void for_each(V&& input, F fun) { +KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll @@ -315,4 +315,4 @@ KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... in } // namespace kernel_float -#endif \ No newline at end of file +#endif From 257335f531058dda3871423c87f9d0f41771814f Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 6 Oct 2023 16:24:01 +0200 Subject: [PATCH 06/16] Move `cast_to` to from memory.h to conversion.h --- include/kernel_float/conversion.h | 41 +++++++++++++++++++++++++++++++ include/kernel_float/memory.h | 41 ------------------------------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 78a1927..61e0236 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -193,6 +193,47 @@ KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_s return convert_storage(input); } +template +struct AssignConversionProxy { + KERNEL_FLOAT_INLINE + explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {} + + template + KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) { + *ptr_ = detail::convert_impl< + vector_value_type, + vector_extent_type, + vector_value_type, + vector_extent_type, + M>::call(into_vector_storage(values)); + + return *this; + } + + private: + T* ptr_; +}; + +/** + * Takes a vector reference and gives back a helper object. This object helps when you want to assign one vector to another + * vector of a different type. It's a way to enable implicit type conversion. + * + * For example, if `x = expression;` does not compile because `x` and `expression` are different vector types, you can use + * `cast_to(x) = expression;` to make it work. + * + * Example + * ======= + * ``` + * vec x; + * vec y = {1.0, 2.0}; + * cast_to(x) = y; // Normally, `x = y;` would give an error, but `cast_to` fixes that. 
+ * ``` + */ +template +KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(T& input) { + return AssignConversionProxy(&input); +} + /** * Returns a vector containing `N` copies of `value`. * diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index fbe60a1..06c1aac 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -215,47 +215,6 @@ KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t m return store(values, ptr, indices, indices < max_length); } -template -struct AssignConversionProxy { - KERNEL_FLOAT_INLINE - explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {} - - template - KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) { - auto indices = range(); - detail::store_impl::call( - ptr_, - convert_storage(std::forward(values)).data(), - indices.data()); - - return *this; - } - - private: - T* ptr_; -}; - -/** - * Takes a reference to a vector and returns a special proxy object that automatically performs the correct conversion - * when a vector of a different element type is assigned. This is useful to perform implicit type conversions. - * - * For example, let assume that a line like `x = expression;` would not compile since `x` and `expressions` are - * vectors of different element types. Then it is possible to use `cast_to(x) = expression;` to fix this error, - * which possibly introduces a type conversion. - * - * Example - * ======= - * ``` - * vec x; - * vec y = {1.0, 2.0}; - * cast_to(x) = y; // normally, the line `x = y;` would not compile, but `cast_to` make this possible - * ``` - */ -template -KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(vector& input) { - return AssignConversionProxy(input.data()); -} - /** * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! 
From 33245d7f1cd0d3069ed534012a23ab6d73c7f606 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 10 Oct 2023 14:20:42 +0200 Subject: [PATCH 07/16] Add fast math functions to documentation --- docs/api.rst | 1 + docs/build_api.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 39ca304..09f7789 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -8,6 +8,7 @@ API Reference api/binary_operators.rst api/reductions.rst api/mathematical.rst + api/fast_math.rst api/conditional.rst api/memory_read_write.rst diff --git a/docs/build_api.py b/docs/build_api.py index 26d182c..b31e6c6 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -90,7 +90,8 @@ def build_index_page(groups): "for_each", ], "Generation": [ - "range", + ("range", "range()"), + ("range", "range(F fun)"), "range_like", "each_index", "fill", @@ -193,6 +194,14 @@ def build_index_page(groups): "isinf", "isnan", ], + "Fast math": [ + "fast_exp", + "fast_log", + "fast_cos", + "fast_sin", + "fast_tan", + "fast_div", + ], "Conditional": [ ("where", "where(const C&, const L&, const R&)"), ("where", "where(const C&, const L&)"), @@ -202,12 +211,12 @@ def build_index_page(groups): "cast_to", ("load", "load(const T*, const I&)"), ("load", "load(const T*, const I&, const M&)"), - ("loadn", "loadn(const T*, ptrdiff_t)"), - ("loadn", "loadn(const T*, ptrdiff_t, ptrdiff_t)"), + ("loadn", "loadn(const T*, size_t)"), + ("loadn", "loadn(const T*, size_t, size_t)"), ("store", "store(const V&, T *ptr, const I&)"), ("store", "store(const V&, T *ptr, const I&, const M&)"), - ("storen", "storen(const V&, T*, ptrdiff_t)"), - ("storen", "storen(const V&, T*, ptrdiff_t, ptrdiff_t)"), + ("storen", "storen(const V&, T*, size_t)"), + ("storen", "storen(const V&, T*, size_t, size_t)"), ("aligned_ptr", "aligned_ptr", "struct"), ] } From 2851957ed755e5e9113e690389f07d387cc61bff Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 10 Oct 2023 15:27:24 +0200 Subject: [PATCH 08/16] Document `constant` and `tiling` --- docs/api.rst | 1 + docs/build_api.py | 10 +++++++++- docs/guides.rst | 1 + docs/guides/constant.md | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 docs/guides/constant.md diff --git a/docs/api.rst b/docs/api.rst index 09f7789..d3b2e21 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -11,4 +11,5 @@ API Reference api/fast_math.rst api/conditional.rst api/memory_read_write.rst + api/utilities.rst diff --git a/docs/build_api.py b/docs/build_api.py index b31e6c6..b06f043 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -27,7 +27,10 @@ def build_doxygen_page(name, items): content += "-" * len(title) + "\n" for symbol in symbols: - content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n" + if directive == "define": + content += f".. doxygendefine:: {symbol}\n\n" + else: + content += f".. 
doxygen{directive}:: kernel_float::{symbol}\n\n" stripped_name = name.lower().replace(" ", "_").replace("/", "_") filename = f"api/{stripped_name}.rst" @@ -218,6 +221,11 @@ def build_index_page(groups): ("storen", "storen(const V&, T*, size_t)"), ("storen", "storen(const V&, T*, size_t, size_t)"), ("aligned_ptr", "aligned_ptr", "struct"), + ], + "Utilities": [ + ("constant", "constant", "struct"), + ("tiling", "tiling", "struct"), + ("KERNEL_FLOAT_TILING_FOR", "KERNEL_FLOAT_TILING_FOR", "define"), ] } diff --git a/docs/guides.rst b/docs/guides.rst index 609868b..72c2ead 100644 --- a/docs/guides.rst +++ b/docs/guides.rst @@ -6,3 +6,4 @@ Guides guides/introduction.rst guides/promotion.rst guides/prelude.rst + guides/constant.rst diff --git a/docs/guides/constant.md b/docs/guides/constant.md new file mode 100644 index 0000000..5d38501 --- /dev/null +++ b/docs/guides/constant.md @@ -0,0 +1,38 @@ +Using `kernel_float::constant` +=== + +When working with mixed precision types, you will find that working with constants presents a bit a challenge. + +For example, a simple expression such as `3.14 * x` where `x` is of type `vec` will NOT be performed +in `float` precision as you might expect, but instead in `double` precision. +This happens since the left-hand side of this expression +(a constant) is a `double` and thus `kernel_float` will also cast the right-hand side to `double`. + +To solve this problem, `kernel_float` offers a type called `constant` that can be used to represents +constants. Any binary operations between a value of type `U` and a `constant` will result in both +operands being cast to type `U` and the operation is performed in the precision of type `U`. This makes +`constant` useful for representing constant in your code. + + +For example, consider the following code: + +``` +#include "kernel_float.h" +namespace kf = kernel_float; + +int main() { + using Type = float; + const int N = 8; + static constexpr auto PI = kf::make_constant(3.14); + + kf::vec i = kf::range(); + kf::vec x = kf::cast(i) * PI; + kf::vec y = x * kf::sin(x); + Type result = kf::sum(y); + printf("result=%f", double(result)); + + return EXIT_SUCCESS; +} +``` + +This code example uses the ``make_constant`` utility function to create `constant`. From be9bbba87ac02e239a40a88d53ba1db7aea4ae13 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 10 Oct 2023 15:27:52 +0200 Subject: [PATCH 09/16] Fix small spelling mistakes in the documentation --- docs/guides/constant.md | 7 +++--- include/kernel_float/constant.h | 33 +++++++++++++++++++++++--- include/kernel_float/conversion.h | 20 ++++++++-------- include/kernel_float/iterate.h | 16 ++++++------- include/kernel_float/reduce.h | 2 +- include/kernel_float/tiling.h | 39 ++++++++++++++++++++++++++++++- include/kernel_float/vector.h | 10 ++++++-- 7 files changed, 98 insertions(+), 29 deletions(-) diff --git a/docs/guides/constant.md b/docs/guides/constant.md index 5d38501..628354b 100644 --- a/docs/guides/constant.md +++ b/docs/guides/constant.md @@ -1,18 +1,17 @@ Using `kernel_float::constant` === -When working with mixed precision types, you will find that working with constants presents a bit a challenge. +When working with mixed precision types, you will find that working with constants presents a bit of a challenge. For example, a simple expression such as `3.14 * x` where `x` is of type `vec` will NOT be performed in `float` precision as you might expect, but instead in `double` precision. 
This happens since the left-hand side of this expression (a constant) is a `double` and thus `kernel_float` will also cast the right-hand side to `double`. -To solve this problem, `kernel_float` offers a type called `constant` that can be used to represents +To solve this problem, `kernel_float` offers a type called `constant` that can be used to represent constants. Any binary operations between a value of type `U` and a `constant` will result in both operands being cast to type `U` and the operation is performed in the precision of type `U`. This makes -`constant` useful for representing constant in your code. - +`constant` useful for representing constants in your code. For example, consider the following code: diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h index 1b98925..be49d6f 100644 --- a/include/kernel_float/constant.h +++ b/include/kernel_float/constant.h @@ -6,17 +6,44 @@ namespace kernel_float { +/** + * `constant` represents a constant value of type `T`. + * + * The object has the property that for any binary operation involving + * a `constant` and a value of type `U`, the constant is automatically + * cast to also be of type `U`. + * + * For example: + * ``` + * float a = 5; + * constant b = 3; + * + * auto c = a + b; // The result will be of type `float` + * ``` + */ template struct constant { + /** + * Create a new constant from the given value. + */ + KERNEL_FLOAT_INLINE + constexpr constant(T value = {}) : value_(value) {} + + KERNEL_FLOAT_INLINE + constexpr constant(const constant& that) : value_(that.value) {} + + /** + * Create a new constant from another constant of type `R`. + */ template KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) { auto f = ops::cast(); value_ = f(that.get()); } - KERNEL_FLOAT_INLINE - constexpr constant(T value = {}) : value_(value) {} - + /** + * Return the value of the constant + */ KERNEL_FLOAT_INLINE constexpr T get() const { return value_; diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 61e0236..6f9e4fd 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -215,11 +215,11 @@ struct AssignConversionProxy { }; /** - * Takes a vector reference and gives back a helper object. This object helps when you want to assign one vector to another - * vector of a different type. It's a way to enable implicit type conversion. + * Takes a vector reference and gives back a helper object. This object allows you to assign + * a vector of a different type to another vector while perofrming implicit type converion. * - * For example, if `x = expression;` does not compile because `x` and `expression` are different vector types, you can use - * `cast_to(x) = expression;` to make it work. + * For example, if `x = expression;` does not compile because `x` and `expression` are + * different vector types, you can use `cast_to(x) = expression;` to make it work. 
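+ * The assignment goes through the same conversion machinery as ``convert``.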
* * Example * ======= @@ -240,7 +240,7 @@ KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(T& input) { * Example * ======= * ``` - * vec a = fill<3>(42); // return [42, 42, 42] + * vec a = fill<3>(42); // returns [42, 42, 42] * ``` */ template @@ -255,7 +255,7 @@ KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { * Example * ======= * ``` - * vec a = zeros(); // return [0, 0, 0] + * vec a = zeros(); // returns [0, 0, 0] * ``` */ template @@ -270,7 +270,7 @@ KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { * Example * ======= * ``` - * vec a = ones(); // return [1, 1, 1] + * vec a = ones(); // returns [1, 1, 1] * ``` */ template @@ -286,7 +286,7 @@ KERNEL_FLOAT_INLINE vector> ones(extent = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = fill_like(a, 42); // return [42, 42, 42] + * vec b = fill_like(a, 42); // returns [42, 42, 42] * ``` */ template, typename E = vector_extent_type> @@ -301,7 +301,7 @@ KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = zeros_like(a); // return [0, 0, 0] + * vec b = zeros_like(a); // returns [0, 0, 0] * ``` */ template, typename E = vector_extent_type> @@ -316,7 +316,7 @@ KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = ones_like(a); // return [1, 1, 1] + * vec b = ones_like(a); // returns [1, 1, 1] * ``` */ template, typename E = vector_extent_type> diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 5e91180..884d167 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -45,7 +45,7 @@ struct range_impl { } // namespace detail /** - * Generate vector consisting of the result `fun(0)...fun(N-1)` + * Generate vector consisting of the result `fun(0), ..., fun(N-1)` * * Example * ======= @@ -60,7 +60,7 @@ KERNEL_FLOAT_INLINE vector> range(F fun) { } /** - * Generate vector consisting of the numbers `0...N-1` of type `T` + * Generate vector consisting of the numbers `0, ..., N-1` of type `T` * * Example * ======= @@ -75,7 +75,7 @@ KERNEL_FLOAT_INLINE vector> range() { } /** - * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0, ..., N-1`` of type ``T`` * * Example * ======= @@ -90,7 +90,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { } /** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0, ..., N-1``. The data type used * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when * needing to iterate over the indices of a vector. 
* @@ -258,14 +258,14 @@ using concat_type = vector, extent>> * ======= * ``` * double vec1 = 1.0; - * double3 vec2 = {3.0, 4.0, 5.0); - * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; - * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] + * double3 vec2 = {2.0, 3.0, 4.0); + * double4 vec3 = {5.0, 6.0, 7.0, 8.0}; + * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8] * * int num1 = 42; * float num2 = 3.14159; * int2 num3 = {-10, 10}; - * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] + * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] * ``` */ template diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index dfa52c3..7056175 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -97,7 +97,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * ======= * ``` * vec x = {5, 0, 2, 1, 0}; - * int y = sum(x); // Returns 5*0*2*1*0 = 0 + * int y = product(x); // Returns 5*0*2*1*0 = 0 * ``` */ template> diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h index 427d044..9cac00d 100644 --- a/include/kernel_float/tiling.h +++ b/include/kernel_float/tiling.h @@ -227,6 +227,22 @@ struct tiling_impl> { template struct tiling_iterator; +/** + * Represents a tiling where the elements given by `TileDim` are distributed over the + * threads given by `BlockDim` according to the distributions given by `Distributions`. + * + * The template parameters should be the following: + * + * * ``TileDim``: Should be an instance of ``tile_size<...>``. For example, + * ``tile_size<16, 16>`` represents a 2-dimensional 16x16 tile. + * * ``BlockDim``: Should be an instance of ``block_dim<...>``. For example, + * ``block_dim<16, 4>`` represents a thread block having X dimension 16 + * and Y-dimension 4 for a total of 64 threads per block. + * * ``Distributions``: Should be an instance of ``distributions<...>``. For example, + * ``distributions`` will distribute elements in + * cyclic fashion along the X-axis and blocked fashion along the Y-axis. + * * ``IndexType``: The type used for index values (``int`` by default) + */ template< typename TileDim, typename BlockDim, @@ -516,10 +532,31 @@ using tiling_1d = tiling, block_size, distributions #define KERNEL_FLOAT_TILING_FOR_IMPL(ITER_VAR, TILING, A, B, N, ...) \ KERNEL_FLOAT_CALL(KERNEL_FLOAT_CONCAT(KERNEL_FLOAT_TILING_FOR_IMPL, N), ITER_VAR, TILING, A, B) +/** + * Iterate over the points in a ``tiling<...>`` using a for loop. + * + * There are two ways to use this macro. Using the 1 variable form: + * ``` + * auto t = tiling, block_size<4, 4>>; + * + * KERNEL_FLOAT_TILING_FOR(t, auto point) { + * printf("%d,%d\n", point[0], point[1]); + * } + * ``` + * + * Or using the 2 variables form: + * ``` + * auto t = tiling, block_size<4, 4>>; + * + * KERNEL_FLOAT_TILING_FOR(t, auto index, auto point) { + * printf("%d] %d,%d\n", index, point[0], point[1]); + * } + * ``` + */ #define KERNEL_FLOAT_TILING_FOR(...) 
\
     KERNEL_FLOAT_TILING_FOR_IMPL(KERNEL_FLOAT_CONCAT(__tiling_index_variable__, __LINE__), __VA_ARGS__, 2, 1)
 // clang-format on
 
 } // namespace kernel_float
 
-#endif // KERNEL_FLOAT_TILING_H
\ No newline at end of file
+#endif // KERNEL_FLOAT_TILING_H
diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h
index 4ed5eff..56cd1fe 100644
--- a/include/kernel_float/vector.h
+++ b/include/kernel_float/vector.h
@@ -11,9 +11,9 @@
 namespace kernel_float {
 
 /**
- * Container that stores ``N`` elements of type ``T``.
+ * Container that stores a fixed number of elements of type ``T``.
  *
- * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for
+ * It is not recommended to use this class directly; instead, use the type `vec<T, N>`, which is an alias for
  * `vector<T, extent<N>, vector_storage<T, N>>`.
  *
  * @tparam T The type of the values stored within the vector.
@@ -64,11 +64,17 @@ struct vector: public S {
         return E::size;
     }
 
+    /**
+     * Returns a reference to the underlying storage type.
+     */
     KERNEL_FLOAT_INLINE
     storage_type& storage() {
         return *this;
     }
 
+    /**
+     * Returns a reference to the underlying storage type.
+     */
     KERNEL_FLOAT_INLINE
     const storage_type& storage() const {
         return *this;
From b1d8f9c7b2b3cad8c4b6a359a4cf3d4b36112423 Mon Sep 17 00:00:00 2001
From: stijn
Date: Wed, 11 Oct 2023 15:46:12 +0200
Subject: [PATCH 10/16] Update single_include

---
 single_include/kernel_float.h | 844 ++++++++++++++++++++++++++++++++--
 1 file changed, 817 insertions(+), 27 deletions(-)

diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index aea324e..313cc85 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -16,8 +16,8 @@
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-10-13 14:55:52.284209
-// git hash: 3da5ba08788e4d89a1b20b6a12bb4ba0f8de6b40
+// date: 2023-10-11 15:46:04.149164
+// git hash: b1f6c1b73c2212223b10142054a28806f56b5ee6
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -66,6 +71,11 @@
 } while (0)
 #define KERNEL_FLOAT_UNREACHABLE __builtin_unreachable()
 
+// Some utility macros
+#define KERNEL_FLOAT_CONCAT_IMPL(A, B) A##B
+#define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B)
+#define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__)
+
 #endif //KERNEL_FLOAT_MACROS_H
 #ifndef KERNEL_FLOAT_CORE_H
 #define KERNEL_FLOAT_CORE_H
@@ -146,7 +156,7 @@
 template<typename T>
 using decay_t = typename detail::decay_impl<T>::type;
 
 template<typename T, typename U>
-struct promote_type;
+struct promote_type {};
 
 template<typename T>
 struct promote_type<T, T> {
@@ -1303,13 +1313,54 @@ KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_s
     return convert_storage(input);
 }
 
+template<typename T, RoundingMode M>
+struct AssignConversionProxy {
+    KERNEL_FLOAT_INLINE
+    explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {}
+
+    template<typename U>
+    KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) {
+        *ptr_ = detail::convert_impl<
+            vector_value_type<U>,
+            vector_extent_type<U>,
+            vector_value_type<T>,
+            vector_extent_type<T>,
+            M>::call(into_vector_storage(values));
+
+        return *this;
+    }
+
+  private:
+    T* ptr_;
+};
+
+/**
+ * Takes a vector reference and gives back a helper object. This object allows you to assign
+ * a vector of a different type to another vector while performing implicit type conversion.
+ * + * For example, if `x = expression;` does not compile because `x` and `expression` are + * different vector types, you can use `cast_to(x) = expression;` to make it work. + * + * Example + * ======= + * ``` + * vec x; + * vec y = {1.0, 2.0}; + * cast_to(x) = y; // Normally, `x = y;` would give an error, but `cast_to` fixes that. + * ``` + */ +template +KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(T& input) { + return AssignConversionProxy(&input); +} + /** * Returns a vector containing `N` copies of `value`. * * Example * ======= * ``` - * vec a = fill<3>(42); // return [42, 42, 42] + * vec a = fill<3>(42); // returns [42, 42, 42] * ``` */ template @@ -1324,7 +1375,7 @@ KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { * Example * ======= * ``` - * vec a = zeros(); // return [0, 0, 0] + * vec a = zeros(); // returns [0, 0, 0] * ``` */ template @@ -1339,7 +1390,7 @@ KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { * Example * ======= * ``` - * vec a = ones(); // return [1, 1, 1] + * vec a = ones(); // returns [1, 1, 1] * ``` */ template @@ -1355,7 +1406,7 @@ KERNEL_FLOAT_INLINE vector> ones(extent = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = fill_like(a, 42); // return [42, 42, 42] + * vec b = fill_like(a, 42); // returns [42, 42, 42] * ``` */ template, typename E = vector_extent_type> @@ -1370,7 +1421,7 @@ KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = zeros_like(a); // return [0, 0, 0] + * vec b = zeros_like(a); // returns [0, 0, 0] * ``` */ template, typename E = vector_extent_type> @@ -1385,7 +1436,7 @@ KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = ones_like(a); // return [1, 1, 1] + * vec b = ones_like(a); // returns [1, 1, 1] * ``` */ template, typename E = vector_extent_type> @@ -1765,17 +1816,44 @@ KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) { namespace kernel_float { +/** + * `constant` represents a constant value of type `T`. + * + * The object has the property that for any binary operation involving + * a `constant` and a value of type `U`, the constant is automatically + * cast to also be of type `U`. + * + * For example: + * ``` + * float a = 5; + * constant b = 3; + * + * auto c = a + b; // The result will be of type `float` + * ``` + */ template struct constant { + /** + * Create a new constant from the given value. + */ + KERNEL_FLOAT_INLINE + constexpr constant(T value = {}) : value_(value) {} + + KERNEL_FLOAT_INLINE + constexpr constant(const constant& that) : value_(that.value) {} + + /** + * Create a new constant from another constant of type `R`. 
+ */ template KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) { auto f = ops::cast(); value_ = f(that.get()); } - KERNEL_FLOAT_INLINE - constexpr constant(T value = {}) : value_(value) {} - + /** + * Return the value of the constant + */ KERNEL_FLOAT_INLINE constexpr T get() const { return value_; @@ -1893,7 +1971,7 @@ namespace kernel_float { * ``` */ template -void for_each(V&& input, F fun) { +KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll @@ -1905,13 +1983,13 @@ void for_each(V&& input, F fun) { namespace detail { template struct range_impl { - KERNEL_FLOAT_INLINE - static vector_storage call() { + template + KERNEL_FLOAT_INLINE static vector_storage call(F fun) { vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result.data()[i] = T(i); + result.data()[i] = fun(i); } return result; @@ -1920,7 +1998,22 @@ struct range_impl { } // namespace detail /** - * Generate vector consisting of the numbers `0...N-1` of type `T` + * Generate vector consisting of the result `fun(0), ..., fun(N-1)` + * + * Example + * ======= + * ``` + * // Returns [0.0f, 2.0f, 4.0f] + * vec vec = range<3>([](auto i){ return float(i * 2.0f); }); + * ``` + */ +template> +KERNEL_FLOAT_INLINE vector> range(F fun) { + return detail::range_impl::call(fun); +} + +/** + * Generate vector consisting of the numbers `0, ..., N-1` of type `T` * * Example * ======= @@ -1931,11 +2024,11 @@ struct range_impl { */ template KERNEL_FLOAT_INLINE vector> range() { - return detail::range_impl::call(); + return detail::range_impl::call(ops::cast()); } /** - * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0, ..., N-1`` of type ``T`` * * Example * ======= @@ -1946,11 +2039,11 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return detail::range_impl, vector_extent>::call(); + return range, vector_extent>(); } /** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0, ..., N-1``. The data type used * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when * needing to iterate over the indices of a vector. 
* @@ -1971,7 +2064,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_impl>::call(); + return range>(); } namespace detail { @@ -2118,14 +2211,14 @@ using concat_type = vector, extent>> * ======= * ``` * double vec1 = 1.0; - * double3 vec2 = {3.0, 4.0, 5.0); - * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; - * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] + * double3 vec2 = {2.0, 3.0, 4.0); + * double4 vec3 = {5.0, 6.0, 7.0, 8.0}; + * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8] * * int num1 = 42; * float num2 = 3.14159; * int2 num3 = {-10, 10}; - * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] + * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] * ``` */ template @@ -2393,6 +2486,7 @@ KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t m return store(values, ptr, indices, indices < max_length); } +<<<<<<< HEAD // TOOD: check if this way is support across all compilers #if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) #define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (__builtin_assume_aligned(ptr, alignment)) @@ -2537,13 +2631,135 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} +======= +/** + * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. + * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! + */ +template +KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { +// TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) + return static_cast(__builtin_assume_aligned(ptr, alignment)); +#else + return ptr; +#endif +} + +/** + * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. + */ +template +struct aligned_ptr { + static_assert(alignment >= alignof(T), "invalid alignment"); + + KERNEL_FLOAT_INLINE + aligned_ptr(nullptr_t = nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + + /** + * Return the pointer value. 
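+     *
+     * A short sketch of the intended use (names here are illustrative):
+     * ```
+     * aligned_ptr<float, 16> p(data);
+     * float* raw = p.get(); // the compiler may now assume 16-byte alignment
+     * ```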
+ */ + KERNEL_FLOAT_INLINE + T* get() const { + return unsafe_assume_aligned(ptr_, alignment); + } + + KERNEL_FLOAT_INLINE + operator T*() const { + return get(); + } + + template + KERNEL_FLOAT_INLINE T& operator[](I&& index) const { + return get()[std::forward(index)]; + } + + /** + * See ``kernel_float::load`` + */ + template> + KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { + return ::kernel_float::load(get(), indices, mask); + } + + /** + * See ``kernel_float::loadn`` + */ + template + KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { + return ::kernel_float::loadn(get(), offset); + } + + /** + * See ``kernel_float::loadn`` + */ + template + KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { + return ::kernel_float::loadn(get(), offset, max_length); + } + + /** + * See ``kernel_float::store`` + */ + template> + KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { + ::kernel_float::store(values, get(), indices, mask); + } + /** + * See ``kernel_float::storen`` + */ + template> + KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { + ::kernel_float::storen(values, get(), offset); + } + /** + * See ``kernel_float::storen`` + */ + template> + KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const { + ::kernel_float::storen(values, get(), offset, max_length); + } + + private: + T* ptr_ = nullptr; +}; + +/** + * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes. + */ +template +struct aligned_ptr { + static_assert(alignment >= alignof(T), "invalid alignment"); + + KERNEL_FLOAT_INLINE + aligned_ptr(nullptr_t = nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} +>>>>>>> 9bf416c (Update single_include) /** * Return the pointer value. */ KERNEL_FLOAT_INLINE const T* get() const { +<<<<<<< HEAD return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); +======= + return unsafe_assume_aligned(ptr_, alignment); +>>>>>>> 9bf416c (Update single_include) } KERNEL_FLOAT_INLINE @@ -2584,6 +2800,12 @@ struct aligned_ptr { const T* ptr_ = nullptr; }; +<<<<<<< HEAD +======= +template +aligned_ptr(T*) -> aligned_ptr; + +>>>>>>> 9bf416c (Update single_include) } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H @@ -2686,7 +2908,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * ======= * ``` * vec x = {5, 0, 2, 1, 0}; - * int y = sum(x); // Returns 5*0*2*1*0 = 0 + * int y = product(x); // Returns 5*0*2*1*0 = 0 * ``` */ template> @@ -2994,9 +3216,9 @@ KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { namespace kernel_float { /** - * Container that stores ``N`` elements of type ``T``. + * Container that store fixed number of elements of type ``T``. * - * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for + * It is not recommended to use this class directly, instead, use the type `vec` which is an alias for * `vector, vector_storage>`. * * @tparam T The type of the values stored within the vector. @@ -3047,11 +3269,17 @@ struct vector: public S { return E::size; } + /** + * Returns a reference to the underlying storage type. 
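+     *
+     * For example (a sketch):
+     * ```
+     * vec<float, 4> v;
+     * float* p = v.storage().data(); // direct access to the underlying elements
+     * ```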
+ */ KERNEL_FLOAT_INLINE storage_type& storage() { return *this; } + /** + * Returns a reference to the underlying storage type. + */ KERNEL_FLOAT_INLINE const storage_type& storage() const { return *this; @@ -4147,3 +4375,565 @@ kconstant(T&&) -> kconstant>; } // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_TILING_H +#define KERNEL_FLOAT_TILING_H + + + + +namespace kernel_float { + +template +struct block_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + block_size(dim3 thread_index) { + if (rank > 0 && size(0) > 1) { + thread_index_[0] = thread_index.x; + } + + if (rank > 1 && size(1) > 1) { + thread_index_[1] = thread_index.y; + } + + if (rank > 2 && size(2) > 1) { + thread_index_[2] = thread_index.z; + } + } + + KERNEL_FLOAT_INLINE + size_t thread_index(size_t axis) const { + return axis < rank ? thread_index_[axis] : 0; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } + + private: + unsigned int thread_index_[rank] = {0}; +}; + +template +struct virtual_block_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + virtual_block_size(dim3 thread_index) { + thread_index_ = thread_index.x; + } + + KERNEL_FLOAT_INLINE + size_t thread_index(size_t axis) const { + size_t product_up_to_axis = 1; +#pragma unroll + for (size_t i = 0; i < axis; i++) { + product_up_to_axis *= size(i); + } + + return (thread_index_ / product_up_to_axis) % size(axis); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } + + private: + unsigned int thread_index_ = 0; +}; + +template +struct tile_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis, size_t block_size = 0) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } +}; + +template +struct tile_factor { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis, size_t block_size) { + size_t factors[rank] = {Ns...}; + return block_size * (axis < rank ? factors[axis] : 1); + } +}; + +namespace dist { +template +struct blocked_impl { + static constexpr bool is_exhaustive = N % K == 0; + static constexpr size_t items_per_thread = (N / K) + (is_exhaustive ? 0 : 1); + + KERNEL_FLOAT_INLINE + static constexpr bool local_is_present(size_t thread_index, size_t local_index) { + return is_exhaustive || (local_to_global(thread_index, local_index) < N); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t local_to_global(size_t thread_index, size_t local_index) { + return thread_index * items_per_thread + local_index; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_local(size_t global_index) { + return global_index % items_per_thread; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_owner(size_t global_index) { + return global_index / items_per_thread; + } +}; + +struct blocked { + template + using type = blocked_impl; +}; + +template +struct cyclic_impl { + static constexpr bool is_exhaustive = N % (K * M) == 0; + static constexpr size_t items_per_thread = ((N / (K * M)) + (is_exhaustive ? 
0 : 1)) * M; + + KERNEL_FLOAT_INLINE + static constexpr bool local_is_present(size_t thread_index, size_t local_index) { + return is_exhaustive || (local_to_global(thread_index, local_index) < N); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t local_to_global(size_t thread_index, size_t local_index) { + return (local_index / M) * M * K + thread_index * M + (local_index % M); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_local(size_t global_index) { + return (global_index / (M * K)) * M + (global_index % M); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_owner(size_t global_index) { + return (global_index / M) % K; + } +}; + +struct cyclic { + template + using type = cyclic_impl<1, N, K>; +}; + +template +struct block_cyclic { + template + using type = cyclic_impl; +}; +} // namespace dist + +template +struct distributions {}; + +namespace detail { +template +struct instantiate_distribution_impl { + template + using type = dist::cyclic::type; +}; + +template +struct instantiate_distribution_impl<0, distributions> { + template + using type = typename First::type; +}; + +template +struct instantiate_distribution_impl>: + instantiate_distribution_impl> {}; + +template< + typename TileDim, + typename BlockDim, + typename Distributions, + typename = make_index_sequence> +struct tiling_impl; + +template +struct tiling_impl> { + template + using dist_type = typename instantiate_distribution_impl:: + type; + + static constexpr size_t rank = TileDim::rank; + static constexpr size_t items_per_thread = (dist_type::items_per_thread * ... * 1); + static constexpr bool is_exhaustive = (dist_type::is_exhaustive && ...); + + template + KERNEL_FLOAT_INLINE static vector_storage + local_to_global(const BlockDim& block, size_t item) { + vector_storage result; + ((result.data()[Is] = dist_type::local_to_global( + block.thread_index(Is), + item % dist_type::items_per_thread), + item /= dist_type::items_per_thread), + ...); + return result; + } + + KERNEL_FLOAT_INLINE + static bool local_is_present(const BlockDim& block, size_t item) { + bool is_present = true; + ((is_present &= dist_type::local_is_present( + block.thread_index(Is), + item % dist_type::items_per_thread), + item /= dist_type::items_per_thread), + ...); + return is_present; + } +}; +}; // namespace detail + +template +struct tiling_iterator; + +/** + * Represents a tiling where the elements given by `TileDim` are distributed over the + * threads given by `BlockDim` according to the distributions given by `Distributions`. + * + * The template parameters should be the following: + * + * * ``TileDim``: Should be an instance of ``tile_size<...>``. For example, + * ``tile_size<16, 16>`` represents a 2-dimensional 16x16 tile. + * * ``BlockDim``: Should be an instance of ``block_dim<...>``. For example, + * ``block_dim<16, 4>`` represents a thread block having X dimension 16 + * and Y-dimension 4 for a total of 64 threads per block. + * * ``Distributions``: Should be an instance of ``distributions<...>``. For example, + * ``distributions`` will distribute elements in + * cyclic fashion along the X-axis and blocked fashion along the Y-axis. 
+ * * ``IndexType``: The type used for index values (``int`` by default) + */ +template< + typename TileDim, + typename BlockDim, + typename Distributions = distributions<>, + typename IndexType = int> +struct tiling { + using self_type = tiling; + using impl_type = detail::tiling_impl; + using block_type = BlockDim; + using tile_type = TileDim; + + static constexpr size_t rank = tile_type::rank; + static constexpr size_t num_locals = impl_type::items_per_thread; + + using index_type = IndexType; + using point_type = vector>; + +#if KERNEL_FLOAT_IS_DEVICE + __forceinline__ __device__ tiling() : block_(threadIdx) {} +#endif + + KERNEL_FLOAT_INLINE + tiling(BlockDim block, vec offset = {}) : block_(block), offset_(offset) {} + + /** + * Returns the number of items per thread in the tiling. + * + * Note that this method is ``constexpr`` and can be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr size_t size() { + return impl_type::items_per_thread; + } + + /** + * Checks if the tiling is exhaustive, meaning all items are always present for all threads. If this returns + * `true`, then ``is_present`` will always true for any given index. + * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr bool all_present() { + return impl_type::is_exhaustive; + } + + /** + * Checks if a specific item is present for the current thread based on the distribution strategy. Not always + * is the number of items stored per thread equal to the number of items _owned_ by each thread (for example, + * if the tile size is not divisible by the block size). In this case, ``is_present`` will return `false` for + * certain items. + */ + KERNEL_FLOAT_INLINE + bool is_present(size_t item) const { + return all_present() || impl_type::local_is_present(block_, item); + } + + /** + * Returns the global coordinates of a specific item for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> at(size_t item) const { + return impl_type::template local_to_global(block_, item) + offset_; + } + + /** + * Returns the global coordinates of a specific item along a specified axis for the current thread. + */ + KERNEL_FLOAT_INLINE + index_type at(size_t item, size_t axis) const { + return axis < rank ? at(item)[axis] : index_type {}; + } + + /** + * Returns the global coordinates of a specific item for the current thread (alias of ``at``). + */ + KERNEL_FLOAT_INLINE + vector> operator[](size_t item) const { + return at(item); + } + + /** + * Returns a vector of global coordinates of all items present for the current thread. + */ + KERNEL_FLOAT_INLINE + vector>, extent> local_points() const { + return range([&](size_t i) { return at(i); }); + } + + /** + * Returns a vector of coordinate values along a specified axis for all items present for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> local_points(size_t axis) const { + return range([&](size_t i) { return at(i, axis); }); + } + + /** + * Returns a vector of boolean values representing the result of ``is_present`` of the items for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> local_mask() const { + return range([&](size_t i) { return is_present(i); }); + } + + /** + * Returns the thread index (position) along a specified axis for the current thread. + */ + KERNEL_FLOAT_INLINE + index_type thread_index(size_t axis) const { + return index_type(block_.thread_index(axis)); + } + + /** + * Returns the size of the block (number of threads) along a specified axis. 
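+     * For example, with ``BlockDim = block_size<16, 4>``, ``block_size(0)`` returns 16,
+     * ``block_size(1)`` returns 4, and any axis beyond the rank returns 1.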
+ * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr index_type block_size(size_t axis) { + return index_type(block_type::size(axis)); + } + + /** + * Returns the size of the tile along a specified axis. + * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr index_type tile_size(size_t axis) { + return index_type(tile_type::size(axis, block_size(axis))); + } + + /** + * Returns the offset of the tile along a specified axis. + */ + KERNEL_FLOAT_INLINE + index_type tile_offset(size_t axis) const { + return index_type(offset_[axis]); + } + + /** + * Returns a vector of thread indices for all axes. + */ + KERNEL_FLOAT_INLINE + vector> thread_index() const { + return range([&](size_t i) { return thread_index(i); }); + } + + /** + * Returns a vector of block sizes for all axes. + */ + KERNEL_FLOAT_INLINE + static vector> block_size() { + return range([&](size_t i) { return block_size(i); }); + } + + /** + * Returns a vector of tile sizes for all axes. + */ + KERNEL_FLOAT_INLINE + static vector> tile_size() { + return range([&](size_t i) { return tile_size(i); }); + } + + /** + * Returns the offset of the tile for all axes. + */ + KERNEL_FLOAT_INLINE + vector> tile_offset() const { + return range([&](size_t i) { return tile_offset(i); }); + } + + /** + * Returns an iterator pointing to the beginning of the tiling. + */ + KERNEL_FLOAT_INLINE + tiling_iterator begin() const { + return {*this, 0}; + } + + /** + * Returns an iterator pointing to the end of the tiling. + */ + KERNEL_FLOAT_INLINE + tiling_iterator end() const { + return {*this, num_locals}; + } + + /** + * Applies a provided function to each item present in the tiling for the current thread. + * The function should take an index and a ``vector`` of global coordinates as arguments. + */ + template + KERNEL_FLOAT_INLINE void for_each(F fun) const { +#pragma unroll + for (size_t i = 0; i < num_locals; i++) { + if (is_present(i)) { + fun(i, at(i)); + } + } + } + + /** + * Adds ``offset`` to all points of this tiling and returns a new tiling. + */ + KERNEL_FLOAT_INLINE friend tiling + operator+(const tiling& self, const vector>& offset) { + return tiling {self.block_, self.offset_ + offset}; + } + + /** + * Adds ``offset`` to all points of this tiling and returns a new tiling. + */ + KERNEL_FLOAT_INLINE friend tiling + operator+(const vector>& offset, const tiling& self) { + return self + offset; + } + + /** + * Adds ``offset`` to all points of this tiling. 
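+     *
+     * For example (sketch, for a 1-dimensional tiling ``t``):
+     * ```
+     * t += make_vec(16); // every point returned by `t.at(i)` is shifted by 16
+     * ```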
+     */
+    KERNEL_FLOAT_INLINE friend tiling&
+    operator+=(tiling& self, const vector<index_type, extent<rank>>& offset) {
+        return self = self + offset;
+    }
+
+  private:
+    BlockDim block_;
+    vector<index_type, extent<rank>> offset_;
+};
+
+template<typename T>
+struct tiling_iterator {
+    using value_type = vector<typename T::index_type, extent<T::rank>>;
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator(const T& inner, size_t position = 0) : inner_(&inner), position_(position) {
+        while (position_ < T::num_locals && !inner_->is_present(position_)) {
+            position_++;
+        }
+    }
+
+    KERNEL_FLOAT_INLINE
+    value_type operator*() const {
+        return inner_->at(position_);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator& operator++() {
+        return *this = tiling_iterator(*inner_, position_ + 1);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator operator++(int) {
+        tiling_iterator old = *this;
+        ++*this;
+        return old;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator==(const tiling_iterator& a, const tiling_iterator& b) {
+        return a.position_ == b.position_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator!=(const tiling_iterator& a, const tiling_iterator& b) {
+        return !operator==(a, b);
+    }
+
+    size_t position_ = 0;
+    const T* inner_;
+};
+
+template<size_t N, size_t K, typename D, typename IndexType = int>
+using tiling_1d = tiling<tile_size<N>, block_size<K>, distributions<D>, IndexType>;
+
+// clang-format off
+#define KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _)         \
+    _Pragma("unroll")                                                         \
+    for (size_t ITER_VAR = 0; ITER_VAR < (TILING).size(); ITER_VAR++)         \
+        if (POINT_VAR = (TILING).at(ITER_VAR); (TILING).is_present(ITER_VAR)) \
+
+#define KERNEL_FLOAT_TILING_FOR_IMPL2(ITER_VAR, TILING, INDEX_VAR, POINT_VAR) \
+    KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _)             \
+    if (INDEX_VAR = ITER_VAR; true)
+
+#define KERNEL_FLOAT_TILING_FOR_IMPL(ITER_VAR, TILING, A, B, N, ...) \
+    KERNEL_FLOAT_CALL(KERNEL_FLOAT_CONCAT(KERNEL_FLOAT_TILING_FOR_IMPL, N), ITER_VAR, TILING, A, B)
+
+/**
+ * Iterate over the points in a ``tiling<...>`` using a for loop.
+ *
+ * There are two ways to use this macro. Using the 1 variable form:
+ * ```
+ * auto t = tiling<tile_size<16, 16>, block_size<4, 4>>();
+ *
+ * KERNEL_FLOAT_TILING_FOR(t, auto point) {
+ *     printf("%d,%d\n", point[0], point[1]);
+ * }
+ * ```
+ *
+ * Or using the 2 variables form:
+ * ```
+ * auto t = tiling<tile_size<16, 16>, block_size<4, 4>>();
+ *
+ * KERNEL_FLOAT_TILING_FOR(t, auto index, auto point) {
+ *     printf("%d] %d,%d\n", int(index), point[0], point[1]);
+ * }
+ * ```
+ */
+#define KERNEL_FLOAT_TILING_FOR(...)
\ + KERNEL_FLOAT_TILING_FOR_IMPL(KERNEL_FLOAT_CONCAT(__tiling_index_variable__, __LINE__), __VA_ARGS__, 2, 1) +// clang-format on + +} // namespace kernel_float + +#endif // KERNEL_FLOAT_TILING_H From 79449e3375ef3c8ee245aea3d5310bedee189485 Mon Sep 17 00:00:00 2001 From: stijn Date: Sun, 22 Oct 2023 22:12:27 +0200 Subject: [PATCH 11/16] Refactor functions/types in `memory.h` --- examples/vector_add/main.cu | 16 +- examples/vector_add_tiling/main.cu | 4 +- include/kernel_float/macros.h | 28 +- include/kernel_float/memory.h | 523 ++++++++++++++++++----------- tests/common.h | 3 +- tests/memory.cu | 81 +++-- 6 files changed, 407 insertions(+), 248 deletions(-) diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 465c707..5d10239 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -13,11 +13,13 @@ void cuda_check(cudaError_t code) { } template -__global__ void my_kernel(int length, const khalf* input, double constant, kfloat* output) { +__global__ void my_kernel(int length, const __half* input, double constant, float* output) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i * N < length) { - kf::cast_to(output[i]) = (input[i] * input[i]) * constant; + auto a = kf::read_aligned(input + i * N); + auto b = (a * a) * constant; + kf::write_aligned(output + i * N, b); } } @@ -35,8 +37,8 @@ void run_kernel(int n) { } // Allocate device memory - khalf* input_dev; - kfloat* output_dev; + __half* input_dev; + float* output_dev; cuda_check(cudaMalloc(&input_dev, sizeof(half) * n)); cuda_check(cudaMalloc(&output_dev, sizeof(float) * n)); @@ -47,7 +49,11 @@ void run_kernel(int n) { int block_size = 256; int items_per_block = block_size * items_per_thread; int grid_size = (n + items_per_block - 1) / items_per_block; - my_kernel<<>>(n, input_dev, constant, output_dev); + my_kernel<<>>( + n, + kf::aligned_ptr(input_dev), + constant, + kf::aligned_ptr(output_dev)); // Copy results back cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault)); diff --git a/examples/vector_add_tiling/main.cu b/examples/vector_add_tiling/main.cu index 1134778..291cb05 100644 --- a/examples/vector_add_tiling/main.cu +++ b/examples/vector_add_tiling/main.cu @@ -27,9 +27,9 @@ __global__ void my_kernel( auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0); auto mask = tiling.local_mask(); - auto a = kf::load(input.get(), points, mask); + auto a = input.read(points, mask); auto b = (a * a) * constant; - kf::store(b, output.get(), points, mask); + output.write(points, b, mask); } template diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h index bfc8595..689b6b2 100644 --- a/include/kernel_float/macros.h +++ b/include/kernel_float/macros.h @@ -9,35 +9,35 @@ #define KERNEL_FLOAT_IS_DEVICE (1) #define KERNEL_FLOAT_IS_HOST (0) #define KERNEL_FLOAT_CUDA_ARCH (__CUDA_ARCH__) -#else +#else // __CUDA_ARCH__ #define KERNEL_FLOAT_INLINE __forceinline__ __host__ #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif -#else +#endif // __CUDA_ARCH__ +#else // __CUDACC__ #define KERNEL_FLOAT_INLINE inline #define KERNEL_FLOAT_CUDA (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif +#endif // __CUDACC__ #ifndef KERNEL_FLOAT_FP16_AVAILABLE #define KERNEL_FLOAT_FP16_AVAILABLE (1) -#endif +#endif // KERNEL_FLOAT_FP16_AVAILABLE #ifndef KERNEL_FLOAT_BF16_AVAILABLE #define 
KERNEL_FLOAT_BF16_AVAILABLE (1)
-#endif
+#endif  // KERNEL_FLOAT_BF16_AVAILABLE
 
 #ifndef KERNEL_FLOAT_FP8_AVAILABLE
 #ifdef __CUDACC_VER_MAJOR__
 #define KERNEL_FLOAT_FP8_AVAILABLE (__CUDACC_VER_MAJOR__ >= 12)
-#else
+#else  // __CUDACC_VER_MAJOR__
 #define KERNEL_FLOAT_FP8_AVAILABLE (0)
-#endif
-#endif
+#endif  // __CUDACC_VER_MAJOR__
+#endif  // KERNEL_FLOAT_FP8_AVAILABLE
 
 #define KERNEL_FLOAT_ASSERT(expr) \
     do { \
@@ -49,4 +49,14 @@
 #define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B)
 #define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__)
 
+// TODO: check whether this approach is supported across all compilers
+#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) && 0
+#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) \
+    static_cast<TYPE*>(__builtin_assume_aligned(static_cast<TYPE*>(PTR), (ALIGNMENT)))
+#else
+#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) (PTR)
+#endif
+
+#define KERNEL_FLOAT_MAX_ALIGNMENT (32)
+
 #endif //KERNEL_FLOAT_MACROS_H
diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h
index 06c1aac..badfcf4 100644
--- a/include/kernel_float/memory.h
+++ b/include/kernel_float/memory.h
@@ -6,30 +6,20 @@
 #include "iterate.h"
 
 namespace kernel_float {
-
 namespace detail {
 template<typename T, size_t N, typename Is = make_index_sequence<N>>
-struct load_impl;
+struct copy_impl;
 
 template<typename T, size_t N, size_t... Is>
-struct load_impl<T, N, index_sequence<Is...>> {
+struct copy_impl<T, N, index_sequence<Is...>> {
     KERNEL_FLOAT_INLINE
-    static vector_storage<T, N> call(const T* input, const size_t* offsets) {
-        return {input[offsets[Is]]...};
+    static vector_storage<T, N> load(const T* input, const size_t* offsets, const bool* mask) {
+        return {(mask[Is] ? input[offsets[Is]] : T {})...};
     }
 
     KERNEL_FLOAT_INLINE
-    static vector_storage<T, N> call(const T* input, const size_t* offsets, const bool* mask) {
-        bool all_valid = true;
-        for (size_t i = 0; i < N; i++) {
-            all_valid &= mask[i];
-        }
-
-        if (all_valid) {
-            return {input[offsets[Is]]...};
-        } else {
-            return {(mask[Is] ? input[offsets[Is]] : T())...};
-        }
+    static void store(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) {
+        ((mask[Is] ? outputs[offsets[Is]] = inputs[Is] : T {}), ...);
     }
 };
 }  // namespace detail
 
 /**
  * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``.
  *
+ * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
+ * be loaded and ``false`` indicates that the value should be skipped. This can be used
+ * to prevent reading out of bounds.
+ *
  * ```
- * // Load 4 elements at data[0], data[2], data[4], data[8]
- * vec<T, 4> values = load(data, make_vec(0, 2, 4, 8));
+ * // Load 2 elements at data[0] and data[8], skip data[2] and data[4]
+ * vec<T, 4> values = read(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
  * ```
  */
-template<typename T, typename I>
-KERNEL_FLOAT_INLINE vector<T, vector_extent_type<I>> load(const T* ptr, const I& indices) {
-    return detail::load_impl<T, vector_extent<I>>::call(ptr, cast<size_t>(indices).data());
+template<typename T, typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
+KERNEL_FLOAT_INLINE vector<T, E> read(const T* ptr, const I& indices, const M& mask = true) {
+    return detail::copy_impl<T, E::size>::load(
+        ptr,
+        convert_storage<size_t>(indices, E()).data(),
+        convert_storage<bool>(mask, E()).data());
 }
 
 /**
- * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``.
+ * Store the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
  *
  * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
- * be loaded and ``false`` indicates that the value should be skipped. This can be used
- * to prevent reading out of bounds.
+ * be stored and ``false`` indicates that the value should be skipped. This can be used
+ * to prevent writing out of bounds.
  *
  * ```
- * // Load 2 elements at data[0] and data[8], skip data[2] and data[4]
- * vec<T, 4> values = load(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
+ * // Store 2 elements at data[0] and data[8], skip data[2] and data[4]
+ * auto values = make_vec(42, 13, 87, 12);
+ * auto mask = make_vec(true, false, false, true);
+ * write(data, make_vec(0, 2, 4, 8), values, mask);
  * ```
  */
-template<typename T, typename I, typename M, typename E = broadcast_vector_extent_type<I, M>>
-KERNEL_FLOAT_INLINE vector<T, E> load(const T* ptr, const I& indices, const M& mask) {
-    static constexpr E new_size = {};
-
-    return detail::load_impl<T, E::size>::call(
+template<
+    typename T,
+    typename V,
+    typename I,
+    typename M = bool,
+    typename E = broadcast_vector_extent_type<V, I, M>>
+KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) {
+    return detail::copy_impl<T, E::size>::store(
         ptr,
-        convert_storage<size_t>(indices, new_size).data(),
-        convert_storage<bool>(mask, new_size).data());
+        convert_storage<T>(values, E()).data(),
+        convert_storage<size_t>(indices, E()).data(),
+        convert_storage<bool>(mask, E()).data());
 }
 
 /**
- * Load ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an
- * ``offset`` can be given that shifts all the indices by a fixed amount.
+ * Load ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``.
  *
  * ```
  * // Load 4 elements at locations data[0], data[1], data[2], data[3]
- * vec<T, 4> values = loadn<4>(data);
+ * vec<T, 4> values = read<4>(data);
  *
  * // Load 4 elements at locations data[10], data[11], data[12], data[13]
- * vec<T, 4> values2 = loadn<4>(data, 10);
+ * vec<T, 4> values2 = read<4>(data + 10);
  * ```
  */
 template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset = 0) {
-    return load(ptr, offset + range<size_t, N>());
+KERNEL_FLOAT_INLINE vector<T, extent<N>> read(const T* ptr) {
+    return read(ptr, range<size_t, N>());
 }
 
 /**
- * Load ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``.
- * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used
- * to prevent reading out of bounds.
+ * Store ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``.
  *
  * ```
- * // Returns {ptr[8], ptr[9], 0, 0};
- * vec<T, 4> values = loadn<4>(data, 8, 10);
+ * // Store 4 elements at locations data[0], data[1], data[2], data[3]
+ * vec<float, 4> values = {1.0f, 2.0f, 3.0f, 4.0f};
+ * write(data, values);
+ *
+ * // Store 4 elements at locations data[10], data[11], data[12], data[13]
+ * write(data + 10, values);
  * ```
  */
-template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset, size_t max_length) {
-    auto indices = offset + range<size_t, N>();
-    return load(ptr, indices, indices < max_length);
+template<typename V, typename T>
+KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) {
+    static constexpr size_t N = vector_extent<V>;
+    write(ptr, range<size_t, N>(), values);
 }
 
 namespace detail {
+KERNEL_FLOAT_INLINE
+constexpr size_t gcd(size_t a, size_t b) {
+    return b == 0 ? a : gcd(b, a % b);
+}
+
+template<typename T, size_t N, size_t alignment>
+struct copy_aligned_impl {
+    static constexpr size_t half = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ?
2 : 1)); + static constexpr size_t new_alignment = gcd(alignment, sizeof(T) * half); -template -struct store_impl> { KERNEL_FLOAT_INLINE - static void call(T* outputs, const T* inputs, const size_t* offsets) { - ((outputs[offsets[Is]] = inputs[Is]), ...); + static void load(T* output, const T* input) { + copy_aligned_impl::load(output, input); + copy_aligned_impl::load(output + half, input + half); } KERNEL_FLOAT_INLINE - static void call(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) { - bool all_valid = true; - for (size_t i = 0; i < N; i++) { - all_valid &= mask[i]; - } - - if (all_valid) { - ((outputs[offsets[Is]] = inputs[Is]), ...); - } else { -#pragma unroll - for (size_t i = 0; i < N; i++) { - if (mask[i]) { - outputs[offsets[i]] = inputs[i]; - } - } - } + static void store(T* output, const T* input) { + copy_aligned_impl::store(output, input); + copy_aligned_impl::store(output + half, input + half); } }; -} // namespace detail -/** - * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``. - * - * ``` - * // Store 4 elements at data[0], data[2], data[4], data[8] - * auto values = make_vec(42, 13, 87, 12); - * store(values, data, make_vec(0, 2, 4, 8)); - * ``` - */ -template< - typename T, - typename V, - typename I, - typename M, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices, const M& mask) { - return detail::store_impl::call( - ptr, - convert_storage(values, E()).data(), - convert_storage(indices, E()).data(), - convert_storage(mask, E()).data()); -} +template +struct copy_aligned_impl { + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) {} + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) {} +}; + +template +struct copy_aligned_impl { + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + output[0] = input[0]; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + output[0] = input[0]; + } +}; + +template +struct copy_aligned_impl { + static constexpr size_t new_alignment = gcd(alignment, 2 * sizeof(T)); + struct alignas(new_alignment) storage_type { + T v0, v1; + }; + + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type {input[0], input[1]}; + } +}; + +template +struct copy_aligned_impl { + static constexpr size_t new_alignment = gcd(alignment, 4 * sizeof(T)); + struct alignas(new_alignment) storage_type { + T v0, v1, v2, v3; + }; + + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = storage.v2; + output[3] = storage.v3; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3]}; + } +}; + +template +struct copy_aligned_impl { + static constexpr size_t new_alignment = gcd(alignment, 8 * sizeof(T)); + struct alignas(new_alignment) storage_type { + T v0, v1, v2, v3, v4, v5, v6, v7; + }; + + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = 
storage.v2; + output[3] = storage.v3; + output[4] = storage.v4; + output[5] = storage.v5; + output[6] = storage.v6; + output[7] = storage.v7; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3], + input[4], + input[5], + input[6], + input[7]}; + } +}; + +} // namespace detail /** - * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``. + * Load ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. * - * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should - * be store and ``false`` indicates that the value should be skipped. This can be used - * to prevent writing out of bounds. + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. * * ``` - * // Store 2 elements at data[0] and data[8], skip data[2] and data[4] - * auto values = make_vec(42, 13, 87, 12); - * auto mask = make_vec(true, false, false, true); - * store(values, data, make_vec(0, 2, 4, 8), mask); + * // Load 4 elements at locations data[0], data[1], data[2], data[3] + * vec values = read_aligned<4>(data); + * + * // Load 4 elements at locations data[10], data[11], data[12], data[13] + * vec values2 = read_aligned<4>(data + 10); * ``` */ -template> -KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices) { - return detail::store_impl::call( - ptr, - convert_storage(values, E()).data(), - convert_storage(indices, E()).data()); +template +KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { + static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + vector_storage result; + detail::copy_aligned_impl::load( + result.data(), + KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr, alignment)); + return result; } /** - * Store ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an - * ``offset`` can be given that shifts all the indices by a fixed amount. + * Store ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. + * + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. * * ``` * // Store 4 elements at locations data[0], data[1], data[2], data[3] * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; - * storen<4>(values, data); + * write_aligned(data, values); * * // Load 4 elements at locations data[10], data[11], data[12], data[13] - * storen<4>(values, data, 10); + * write_aligned(data + 10, values); * ``` */ -template> -KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset = 0) { - auto indices = offset + range(); - return store(values, ptr, indices); -} - -/** - * Store ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``. - * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used - * to prevent reading out of bounds. - * - * ``` - * // Store 1.0f at data[8] and 2.0f at data[9]. Ignores remaining values. 
- * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; - * storen<4>(values, data, 8, 10); - * ``` - */ -template> -KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t max_length) { - auto indices = offset + range(); - return store(values, ptr, indices, indices < max_length); -} - -/** - * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. - * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! - */ -template -KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) - return static_cast(__builtin_assume_aligned(ptr, alignment)); -#else - return ptr; -#endif +template +KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { + static constexpr size_t N = vector_extent; + static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + + return detail::copy_aligned_impl::store( + KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr, alignment), + convert_storage(values).data()); } /** * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. */ -template +template struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); @@ -242,12 +296,15 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + /** * Return the pointer value. */ KERNEL_FLOAT_INLINE T* get() const { - return unsafe_assume_aligned(ptr_, alignment); + return KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -261,58 +318,111 @@ struct aligned_ptr { } /** - * See ``kernel_float::load`` + * Returns a new pointer that is offset by ``Step * offset`` items + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns a pointer to element `vector[8]` with alignment of 4 + * ptr.offset(8); + * + * // Returns a pointer to element `vector[4]` with alignment of 16 + * ptr.offset<4>(); + * + * // Returns a pointer to element `vector[8]` with alignment of 16 + * ptr.offset<4>(2); + * ``` */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + template + KERNEL_FLOAT_INLINE aligned_ptr + offset(size_t n = 1) const { + return aligned_ptr {get() + Step * n}; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::read`` */ template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE vector> read() const { + vector_storage result; + detail::copy_aligned_impl::load(result.data(), get()); + return result; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::write`` */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); + template + KERNEL_FLOAT_INLINE void write(const V& values) const { + constexpr size_t N = vector_extent; + return detail::copy_aligned_impl::store( + get(), + convert_storage(values).data()); } /** - * See ``kernel_float::store`` + * See ``kernel_float::read`` */ - template> - KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { - ::kernel_float::store(values, get(), indices, 
mask); + template> + KERNEL_FLOAT_INLINE vector read(const I& indices, const M& mask = true) { + return ::kernel_float::read(get(), indices, mask); } + /** - * See ``kernel_float::storen`` + * See ``kernel_float::write`` */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { - ::kernel_float::storen(values, get(), offset); + template + KERNEL_FLOAT_INLINE void write(const I& indices, const V& values, const M& mask = true) { + return ::kernel_float::write(get(), indices, values, mask); } + + /** + * Offsets the pointer by `Step * offset` items and then read the subsequent `N` items. + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` + */ + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); + } + /** - * See ``kernel_float::storen`` + * Offsets the pointer by `Step * offset` items and then writes the subsequent `N` items. + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * vec values = {1, 2, 3, 4}; + * + * // Writes to vector[40], vector[41], vector[42], vector[43] + * ptr.write_at<4>(10, values); + * + * // Returns vector[20], vector[21], vector[22], vector[23] + * ptr.write_at<2>(10, values); + * ``` */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const { - ::kernel_float::storen(values, get(), offset, max_length); + template + KERNEL_FLOAT_INLINE void write_at(size_t offset, const V& values) const { + return this->offset(offset).template write(values); } private: T* ptr_ = nullptr; }; -/** - * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes. 
- */ template struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); @@ -337,7 +447,7 @@ struct aligned_ptr { */ KERNEL_FLOAT_INLINE const T* get() const { - return unsafe_assume_aligned(ptr_, alignment); + return KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -351,36 +461,71 @@ struct aligned_ptr { } /** - * See ``kernel_float::load`` + * Returns a new pointer that is offset by ``Step * offset`` items + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns a pointer to element `vector[8]` with alignment of 4 + * ptr.offset(8); + * + * // Returns a pointer to element `vector[4]` with alignment of 16 + * ptr.offset<4>(); + * + * // Returns a pointer to element `vector[8]` with alignment of 16 + * ptr.offset<4>(2); + * ``` */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + template + KERNEL_FLOAT_INLINE aligned_ptr + offset(size_t n = 1) const { + return aligned_ptr {get() + Step * n}; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::read`` */ template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE vector> read() const { + vector_storage result; + detail::copy_aligned_impl::load(result.data(), get()); + return result; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::write`` */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); + template> + KERNEL_FLOAT_INLINE vector read(const I& indices, const M& mask = true) { + return ::kernel_float::read(get(), indices, mask); + } + + /** + * Offsets the pointer by `Step * offset` items and then read the subsequent `N` items. 
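+     * This is equivalent to ``offset<Step>(offset)`` followed by ``read<N>()``.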
+ * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` + */ + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); } private: const T* ptr_ = nullptr; }; -template -aligned_ptr(T*) -> aligned_ptr; - } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H diff --git a/tests/common.h b/tests/common.h index dac12bb..ff8072d 100644 --- a/tests/common.h +++ b/tests/common.h @@ -11,7 +11,8 @@ namespace kf = kernel_float; namespace detail { -static __host__ __device__ void __assertion_failed(const char* expr, const char* file, int line) { +__attribute__((noinline)) static __host__ __device__ void +__assertion_failed(const char* expr, const char* file, int line) { #ifndef __CUDA_ARCH__ std::string msg = "assertion failed: " + std::string(expr) + " (" + file + ":" + std::to_string(line) + ")"; diff --git a/tests/memory.cu b/tests/memory.cu index 6fdc90d..ac29b46 100644 --- a/tests/memory.cu +++ b/tests/memory.cu @@ -7,44 +7,37 @@ struct load_test { { auto expected = kf::make_vec(T(3.0), T(2.0), T(7.0)); - auto output = kf::load(data, kf::make_vec(3, 2, 7)); + auto output = kf::read(data, kf::make_vec(3, 2, 7)); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(3.0), T(2.0), T(7.0)); - auto output = kf::load(data, kf::make_vec(3, 2, 7), kf::make_vec(true, true, true)); + auto output = kf::read(data, kf::make_vec(3, 2, 7), kf::make_vec(true, true, true)); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(3.0), T(), T(7.0)); - auto output = kf::load(data, kf::make_vec(3, 100, 7), kf::make_vec(true, false, true)); + auto output = kf::read(data, kf::make_vec(3, 100, 7), kf::make_vec(true, false, true)); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(0.0), T(1.0), T(2.0)); - auto output = kf::loadn<3>(data); + auto output = kf::read<3>(data); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(2.0), T(3.0), T(4.0)); - auto output = kf::loadn<3>(data, 2); - ASSERT_EQ(expected, output); - } - - { - auto expected = kf::make_vec(T(6.0), T(7.0), T()); - auto output = kf::loadn<3>(data, 6, 8); + auto output = kf::read<3>(data + 2); ASSERT_EQ(expected, output); } } }; -REGISTER_TEST_CASE("load", load_test, int, float, double) -REGISTER_TEST_CASE_GPU("load", load_test, __half, __nv_bfloat16) +REGISTER_TEST_CASE("load", load_test, int, float, double, __half, __nv_bfloat16) struct store_test { template @@ -53,7 +46,7 @@ struct store_test { T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; auto values = kf::make_vec(T(100.0), T(200.0)); auto offsets = kf::make_vec(1, 3); - kf::store(values, data, offsets); + kf::write(data, offsets, values); ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(2.0)); @@ -65,7 +58,7 @@ struct store_test { auto values = kf::make_vec(T(100.0), T(200.0)); auto offsets = kf::make_vec(1, 3); auto mask = kf::make_vec(true, true); - kf::store(values, data, offsets, mask); + kf::write(data, offsets, values, mask); ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(2.0)); @@ -77,7 +70,7 @@ struct store_test { auto values = kf::make_vec(T(100.0), T(200.0)); auto offsets = kf::make_vec(1, 3); auto mask = kf::make_vec(true, false); - kf::store(values, data, offsets, mask); + kf::write(data, offsets, values, mask); 
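+            // With mask = {true, false}, only data[1] is written; data[3] keeps its old value.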
ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(2.0)); @@ -87,7 +80,7 @@ struct store_test { { T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data); + kf::write(data, values); ASSERT_EQ(data[0], T(100.0)); ASSERT_EQ(data[1], T(200.0)); ASSERT_EQ(data[2], T(2.0)); @@ -97,37 +90,16 @@ struct store_test { { T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data, 1); + kf::write(data + 1, values); ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(200.0)); ASSERT_EQ(data[3], T(3.0)); } - - { - T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; - auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data, 1, 4); - ASSERT_EQ(data[0], T(0.0)); - ASSERT_EQ(data[1], T(100.0)); - ASSERT_EQ(data[2], T(200.0)); - ASSERT_EQ(data[3], T(3.0)); - } - - { - T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; - auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data, 3, 4); - ASSERT_EQ(data[0], T(0.0)); - ASSERT_EQ(data[1], T(1.0)); - ASSERT_EQ(data[2], T(2.0)); - ASSERT_EQ(data[3], T(100.0)); - } } }; -REGISTER_TEST_CASE("store", store_test, int, float, double) -REGISTER_TEST_CASE_GPU("store", store_test, __half, __nv_bfloat16) +REGISTER_TEST_CASE("store", store_test, int, float, double, __half, __nv_bfloat16) struct assign_conversion_test { template @@ -141,5 +113,30 @@ struct assign_conversion_test { } }; -REGISTER_TEST_CASE("assign conversion", assign_conversion_test, int, float, double) -REGISTER_TEST_CASE_GPU("assign conversion", assign_conversion_test, __half, __nv_bfloat16) \ No newline at end of file +REGISTER_TEST_CASE( + "assign conversion", + assign_conversion_test, + int, + float, + double, + __half, + __nv_bfloat16) + +struct aligned_ptr_test { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + struct alignas(32) storage_type { + T data[N]; + }; + + storage_type input = {T(double(I))...}; + storage_type output = {T(double(I * 0))...}; + + auto v = kf::read_aligned(input.data); + kf::write_aligned(output.data, v); + + ASSERT_EQ_ALL(output.data[I], T(double(I))); + } +}; + +REGISTER_TEST_CASE("aligned pointer", aligned_ptr_test, int, float, double, __half, __nv_bfloat16) \ No newline at end of file From ed7143d9b83a89f043191f4957760081bda60b4d Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 23 Oct 2023 10:39:30 +0200 Subject: [PATCH 12/16] Change function signature of `read/write_aligned` --- examples/vector_add/main.cu | 2 +- include/kernel_float/memory.h | 42 ++- single_include/kernel_float.h | 673 +++++++++++++++++----------------- tests/memory.cu | 10 +- 4 files changed, 363 insertions(+), 364 deletions(-) diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 5d10239..7b88292 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -19,7 +19,7 @@ __global__ void my_kernel(int length, const __half* input, double constant, floa if (i * N < length) { auto a = kf::read_aligned(input + i * N); auto b = (a * a) * constant; - kf::write_aligned(output + i * N, b); + kf::write_aligned(output + i * N, b); } } diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index badfcf4..d75cddf 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -112,21 +112,21 @@ constexpr size_t gcd(size_t a, size_t b) { return b == 0 ? 
a : gcd(b, a % b); } -template +template struct copy_aligned_impl { - static constexpr size_t half = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ? 2 : 1)); - static constexpr size_t new_alignment = gcd(alignment, sizeof(T) * half); + static constexpr size_t K = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ? 2 : 1)); + static constexpr size_t alignment_K = gcd(alignment, sizeof(T) * K); KERNEL_FLOAT_INLINE static void load(T* output, const T* input) { - copy_aligned_impl::load(output, input); - copy_aligned_impl::load(output + half, input + half); + copy_aligned_impl::load(output, input); + copy_aligned_impl::load(output + K, input + K); } KERNEL_FLOAT_INLINE static void store(T* output, const T* input) { - copy_aligned_impl::store(output, input); - copy_aligned_impl::store(output + half, input + half); + copy_aligned_impl::store(output, input); + copy_aligned_impl::store(output + K, input + K); } }; @@ -141,6 +141,8 @@ struct copy_aligned_impl { template struct copy_aligned_impl { + using storage_type = T; + KERNEL_FLOAT_INLINE static void load(T* output, const T* input) { output[0] = input[0]; @@ -153,9 +155,9 @@ struct copy_aligned_impl { }; template -struct copy_aligned_impl { - static constexpr size_t new_alignment = gcd(alignment, 2 * sizeof(T)); - struct alignas(new_alignment) storage_type { +struct copy_aligned_impl sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 2 * sizeof(T)); + struct alignas(storage_alignment) storage_type { T v0, v1; }; @@ -173,9 +175,9 @@ struct copy_aligned_impl { }; template -struct copy_aligned_impl { - static constexpr size_t new_alignment = gcd(alignment, 4 * sizeof(T)); - struct alignas(new_alignment) storage_type { +struct copy_aligned_impl 2 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 4 * sizeof(T)); + struct alignas(storage_alignment) storage_type { T v0, v1, v2, v3; }; @@ -199,9 +201,9 @@ struct copy_aligned_impl { }; template -struct copy_aligned_impl { - static constexpr size_t new_alignment = gcd(alignment, 8 * sizeof(T)); - struct alignas(new_alignment) storage_type { +struct copy_aligned_impl 4 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 8 * sizeof(T)); + struct alignas(storage_alignment) storage_type { T v0, v1, v2, v3, v4, v5, v6, v7; }; @@ -248,9 +250,9 @@ struct copy_aligned_impl { * vec values2 = read_aligned<4>(data + 10); * ``` */ -template +template KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { - static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); vector_storage result; detail::copy_aligned_impl::load( result.data(), @@ -273,10 +275,10 @@ KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { * write_aligned(data + 10, values); * ``` */ -template +template KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { static constexpr size_t N = vector_extent; - static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); return detail::copy_aligned_impl::store( KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr, alignment), diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 313cc85..c849e90 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,13 +16,8 @@ //================================================================================ // this 
file has been auto-generated, do not modify its contents! -<<<<<<< HEAD -// date: 2023-10-13 14:55:52.284209 -// git hash: 3da5ba08788e4d89a1b20b6a12bb4ba0f8de6b40 -======= -// date: 2023-10-11 15:46:04.149164 -// git hash: b1f6c1b73c2212223b10142054a28806f56b5ee6 ->>>>>>> 9bf416c (Update single_include) +// date: 2023-10-23 10:41:33.267957 +// git hash: 5844245b6d2d679f23d9ccc9693a3274b75917f9 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -36,35 +31,35 @@ #define KERNEL_FLOAT_IS_DEVICE (1) #define KERNEL_FLOAT_IS_HOST (0) #define KERNEL_FLOAT_CUDA_ARCH (__CUDA_ARCH__) -#else +#else // __CUDA_ARCH__ #define KERNEL_FLOAT_INLINE __forceinline__ __host__ #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif -#else +#endif // __CUDA_ARCH__ +#else // __CUDACC__ #define KERNEL_FLOAT_INLINE inline #define KERNEL_FLOAT_CUDA (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif +#endif // __CUDACC__ #ifndef KERNEL_FLOAT_FP16_AVAILABLE #define KERNEL_FLOAT_FP16_AVAILABLE (1) -#endif +#endif // KERNEL_FLOAT_FP16_AVAILABLE #ifndef KERNEL_FLOAT_BF16_AVAILABLE #define KERNEL_FLOAT_BF16_AVAILABLE (1) -#endif +#endif // KERNEL_FLOAT_BF16_AVAILABLE #ifndef KERNEL_FLOAT_FP8_AVAILABLE #ifdef __CUDACC_VER_MAJOR__ #define KERNEL_FLOAT_FP8_AVAILABLE (__CUDACC_VER_MAJOR__ >= 12) -#else +#else // __CUDACC_VER_MAJOR__ #define KERNEL_FLOAT_FP8_AVAILABLE (0) -#endif -#endif +#endif // __CUDACC_VER_MAJOR__ +#endif // KERNEL_FLOAT_FP8_AVAILABLE #define KERNEL_FLOAT_ASSERT(expr) \ do { \ @@ -76,6 +71,16 @@ #define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B) #define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__) +// TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) && 0 +#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) \ + static_cast(__builtin_assume_aligned(static_cast(PTR), (ALIGNMENT))) +#else +#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) (PTR) +#endif + +#define KERNEL_FLOAT_MAX_ALIGNMENT (32) + #endif //KERNEL_FLOAT_MACROS_H #ifndef KERNEL_FLOAT_CORE_H #define KERNEL_FLOAT_CORE_H @@ -855,7 +860,6 @@ struct promote_type> { } // namespace kernel_float #endif - #ifndef KERNEL_FLOAT_UNOPS_H #define KERNEL_FLOAT_UNOPS_H @@ -2277,47 +2281,24 @@ KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... in namespace kernel_float { - namespace detail { template> -struct load_impl; +struct copy_impl; template -struct load_impl> { +struct copy_impl> { KERNEL_FLOAT_INLINE - static vector_storage call(const T* input, const size_t* offsets) { - return {input[offsets[Is]]...}; + static vector_storage load(const T* input, const size_t* offsets, const bool* mask) { + return {(mask[Is] ? input[offsets[Is]] : T {})...}; } KERNEL_FLOAT_INLINE - static vector_storage call(const T* input, const size_t* offsets, const bool* mask) { - bool all_valid = true; - for (size_t i = 0; i < N; i++) { - all_valid &= mask[i]; - } - - if (all_valid) { - return {input[offsets[Is]]...}; - } else { - return {(mask[Is] ? input[offsets[Is]] : T())...}; - } + static void store(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) { + ((mask[Is] ? outputs[offsets[Is]] = inputs[Is] : T {}), ...); } }; } // namespace detail -/** - * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``. 
- *
- * ```
- * // Load 4 elements at data[0], data[2], data[4], data[8]
- * vec<T, 4> values = load(data, make_vec(0, 2, 4, 8));
- * ```
- */
-template<typename T, typename I>
-KERNEL_FLOAT_INLINE vector<T, vector_extent_type<I>> load(const T* ptr, const I& indices) {
    return detail::load_impl<T, vector_extent<I>>::call(ptr, cast<size_t>(indices).data());
-}
-
 /**
 * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``.
 *
@@ -2327,101 +2308,39 @@ KERNEL_FLOAT_INLINE vector<T, vector_extent_type<I>> load(const T* ptr, const I&
 *
 * ```
 * // Load 2 elements at data[0] and data[8], skip data[2] and data[4]
- * vec<T, 4> values = = load(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
+ * vec<T, 4> values = read(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
 * ```
 */
-template<typename T, typename I, typename M, typename E = broadcast_vector_extent_type<I, M>>
-KERNEL_FLOAT_INLINE vector<T, E> load(const T* ptr, const I& indices, const M& mask) {
-    static constexpr E new_size = {};
-
-    return detail::load_impl<T, E::value>::call(
+template<typename T, typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
+KERNEL_FLOAT_INLINE vector<T, E> read(const T* ptr, const I& indices, const M& mask = true) {
+    return detail::copy_impl<T, E::value>::load(
        ptr,
-        convert_storage<size_t>(indices, new_size).data(),
-        convert_storage<bool>(mask, new_size).data());
-}
-
-/**
- * Load ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an
- * ``offset`` can be given that shifts all the indices by a fixed amount.
- *
- * ```
- * // Load 4 elements at locations data[0], data[1], data[2], data[3]
- * vec<T, 4> values = loadn<4>(data);
- *
- * // Load 4 elements at locations data[10], data[11], data[12], data[13]
- * vec<T, 4> values2 = loadn<4>(data, 10);
- * ```
- */
-template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset = 0) {
-    return load(ptr, offset + range<size_t, N>());
+        convert_storage<size_t>(indices, E()).data(),
+        convert_storage<bool>(mask, E()).data());
 }
 
 /**
- * Load ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``.
- * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used
- * to prevent reading out of bounds.
+ * Store the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
 *
- * ```
- * // Returns {ptr[8], ptr[9], 0, 0};
- * vec<T, 4> values = loadn<4>(data, 8, 10);
- * ```
- */
-template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset, size_t max_length) {
-    auto indices = offset + range<size_t, N>();
-    return load(ptr, indices, indices < max_length);
-}
-
-namespace detail {
-template<typename T, size_t N, typename Is = make_index_sequence<N>>
-struct store_impl;
-
-template<typename T, size_t N, size_t... Is>
-struct store_impl<T, N, index_sequence<Is...>> {
-    KERNEL_FLOAT_INLINE
-    static void call(T* outputs, const T* inputs, const size_t* offsets) {
-        ((outputs[offsets[Is]] = inputs[Is]), ...);
-    }
-
-    KERNEL_FLOAT_INLINE
-    static void call(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) {
-        bool all_valid = true;
-        for (size_t i = 0; i < N; i++) {
-            all_valid &= mask[i];
-        }
-
-        if (all_valid) {
-            ((outputs[offsets[Is]] = inputs[Is]), ...);
-        } else {
-#pragma unroll
-            for (size_t i = 0; i < N; i++) {
-                if (mask[i]) {
-                    outputs[offsets[i]] = inputs[i];
-                }
-            }
-        }
-    }
-};
-} // namespace detail
-
-/**
- * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
+ * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
+ * be stored and ``false`` indicates that the value should be skipped. This can be used
+ * to prevent writing out of bounds.
 *
 * ```
- * // Store 4 elements at data[0], data[2], data[4], data[8]
+ * // Store 2 elements at data[0] and data[8], skip data[2] and data[4]
 * auto values = make_vec(42, 13, 87, 12);
- * store(values, data, make_vec(0, 2, 4, 8));
+ * auto mask = make_vec(true, false, false, true);
+ * write(data, make_vec(0, 2, 4, 8), values, mask);
 * ```
 */
 template<
     typename T,
     typename V,
     typename I,
-    typename M,
+    typename M = bool,
     typename E = broadcast_vector_extent_type<V, I, M>>
-KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices, const M& mask) {
-    return detail::store_impl<T, E::value>::call(
+KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) {
+    return detail::copy_impl<T, E::value>::store(
        ptr,
        convert_storage<T>(values, E()).data(),
        convert_storage<size_t>(indices, E()).data(),
@@ -2429,227 +2348,222 @@ KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices, const
 }
 
 /**
- * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
- *
- * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
- * be store and ``false`` indicates that the value should be skipped. This can be used
- * to prevent writing out of bounds.
+ * Load ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``.
 *
 * ```
- * // Store 2 elements at data[0] and data[8], skip data[2] and data[4]
- * auto values = make_vec(42, 13, 87, 12);
- * auto mask = make_vec(true, false, false, true);
- * store(values, data, make_vec(0, 2, 4, 8), mask);
+ * // Load 4 elements at locations data[0], data[1], data[2], data[3]
+ * vec<T, 4> values = read<4>(data);
+ *
+ * // Load 4 elements at locations data[10], data[11], data[12], data[13]
+ * vec<T, 4> values2 = read<4>(data + 10);
 * ```
 */
-template<typename T, typename V, typename I, typename E = broadcast_vector_extent_type<V, I>>
-KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices) {
-    return detail::store_impl<T, E::value>::call(
-        ptr,
-        convert_storage<T>(values, E()).data(),
-        convert_storage<size_t>(indices, E()).data());
+template<size_t N, typename T>
+KERNEL_FLOAT_INLINE vector<T, extent<N>> read(const T* ptr) {
+    return read(ptr, range<size_t, N>());
 }
 
 /**
- * Store ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an
- * ``offset`` can be given that shifts all the indices by a fixed amount.
+ * Store ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``.
 *
 * ```
 * // Store 4 elements at locations data[0], data[1], data[2], data[3]
 * vec<float, 4> values = {1.0f, 2.0f, 3.0f, 4.0f};
- * storen<4>(values, data);
+ * write(data, values);
 *
- * // Load 4 elements at locations data[10], data[11], data[12], data[13]
- * storen<4>(values, data, 10);
+ * // Store 4 elements at locations data[10], data[11], data[12], data[13]
+ * write(data + 10, values);
 * ```
 */
-template<typename V, typename T, typename E = broadcast_vector_extent_type<V>>
-KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset = 0) {
-    auto indices = offset + range<size_t, E::value>();
-    return store(values, ptr, indices);
+template<typename V, typename T>
+KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) {
+    static constexpr size_t N = vector_extent<V>;
+    write(ptr, range<size_t, N>(), values);
 }
 
-/**
- * Store ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``.
- * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used
- * to prevent reading out of bounds.
- *
- * ```
- * // Store 1.0f at data[8] and 2.0f at data[9]. Ignores remaining values.
- * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; - * storen<4>(values, data, 8, 10); - * ``` - */ -template> -KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t max_length) { - auto indices = offset + range(); - return store(values, ptr, indices, indices < max_length); +namespace detail { +KERNEL_FLOAT_INLINE +constexpr size_t gcd(size_t a, size_t b) { + return b == 0 ? a : gcd(b, a % b); } -<<<<<<< HEAD -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (__builtin_assume_aligned(ptr, alignment)) -#else -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (ptr) -#endif +template +struct copy_aligned_impl { + static constexpr size_t K = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ? 2 : 1)); + static constexpr size_t alignment_K = gcd(alignment, sizeof(T) * K); -template -struct AssignConversionProxy { KERNEL_FLOAT_INLINE - explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {} - - template - KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) { - auto indices = range(); - detail::store_impl::call( - ptr_, - convert_storage(std::forward(values)).data(), - indices.data()); - - return *this; + static void load(T* output, const T* input) { + copy_aligned_impl::load(output, input); + copy_aligned_impl::load(output + K, input + K); } - private: - T* ptr_; + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + copy_aligned_impl::store(output, input); + copy_aligned_impl::store(output + K, input + K); + } }; -/** - * Takes a reference to a vector and returns a special proxy object that automatically performs the correct conversion - * when a vector of a different element type is assigned. This is useful to perform implicit type conversions. - * - * For example, let assume that a line like `x = expression;` would not compile since `x` and `expressions` are - * vectors of different element types. Then it is possible to use `cast_to(x) = expression;` to fix this error, - * which possibly introduces a type conversion. - * - * Example - * ======= - * ``` - * vec x; - * vec y = {1.0, 2.0}; - * cast_to(x) = y; // normally, the line `x = y;` would not compile, but `cast_to` make this possible - * ``` - */ -template -KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(vector& input) { - return AssignConversionProxy(input.data()); -} - -/** - * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. - */ -template -struct aligned_ptr { - static_assert(alignment >= alignof(T), "invalid alignment"); - +template +struct copy_aligned_impl { KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + static void load(T* output, const T* input) {} KERNEL_FLOAT_INLINE - explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + static void store(T* output, const T* input) {} +}; + +template +struct copy_aligned_impl { + using storage_type = T; - /** - * Return the pointer value. 
- */ KERNEL_FLOAT_INLINE - T* get() const { - return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); + static void load(T* output, const T* input) { + output[0] = input[0]; } KERNEL_FLOAT_INLINE - operator T*() const { - return get(); + static void store(T* output, const T* input) { + output[0] = input[0]; } +}; - template - KERNEL_FLOAT_INLINE T& operator[](I&& index) const { - return get()[std::forward(index)]; - } +template +struct copy_aligned_impl sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 2 * sizeof(T)); + struct alignas(storage_alignment) storage_type { + T v0, v1; + }; - /** - * See ``kernel_float::load`` - */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; } - /** - * See ``kernel_float::loadn`` - */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type {input[0], input[1]}; } +}; - /** - * See ``kernel_float::loadn`` - */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); - } +template +struct copy_aligned_impl 2 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 4 * sizeof(T)); + struct alignas(storage_alignment) storage_type { + T v0, v1, v2, v3; + }; - /** - * See ``kernel_float::store`` - */ - template> - KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { - ::kernel_float::store(values, get(), indices, mask); - } - /** - * See ``kernel_float::storen`` - */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { - ::kernel_float::storen(values, get(), offset); - } - /** - * See ``kernel_float::storen`` - */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const { - ::kernel_float::storen(values, get(), offset, max_length); + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = storage.v2; + output[3] = storage.v3; } - private: - T* ptr_ = nullptr; + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3]}; + } }; -/** - * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes. 
- */ template -struct aligned_ptr { - static_assert(alignment >= alignof(T), "invalid alignment"); +struct copy_aligned_impl 4 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 8 * sizeof(T)); + struct alignas(storage_alignment) storage_type { + T v0, v1, v2, v3, v4, v5, v6, v7; + }; KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = storage.v2; + output[3] = storage.v3; + output[4] = storage.v4; + output[5] = storage.v5; + output[6] = storage.v6; + output[7] = storage.v7; + } KERNEL_FLOAT_INLINE - explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3], + input[4], + input[5], + input[6], + input[7]}; + } +}; + +} // namespace detail - KERNEL_FLOAT_INLINE - explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} -======= /** - * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. - * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! + * Load ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. + * + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. + * + * ``` + * // Load 4 elements at locations data[0], data[1], data[2], data[3] + * vec values = read_aligned<4>(data); + * + * // Load 4 elements at locations data[10], data[11], data[12], data[13] + * vec values2 = read_aligned<4>(data + 10); + * ``` */ -template -KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) - return static_cast(__builtin_assume_aligned(ptr, alignment)); -#else - return ptr; -#endif +template +KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + vector_storage result; + detail::copy_aligned_impl::load( + result.data(), + KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr, alignment)); + return result; +} + +/** + * Store ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. + * + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. + * + * ``` + * // Store 4 elements at locations data[0], data[1], data[2], data[3] + * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; + * write_aligned(data, values); + * + * // Load 4 elements at locations data[10], data[11], data[12], data[13] + * write_aligned(data + 10, values); + * ``` + */ +template +KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { + static constexpr size_t N = vector_extent; + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + + return detail::copy_aligned_impl::store( + KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr, alignment), + convert_storage(values).data()); } /** * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. 
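 *
 * A brief usage sketch (illustrative only; it assumes a 32-byte aligned buffer and uses
 * the ``read``/``write_at`` members defined below):
 * ```
 * alignas(32) float buffer[8];
 * aligned_ptr<float, 32> ptr {buffer};
 * vec<float, 4> values = ptr.read<4>();  // loads buffer[0]...buffer[3]
 * ptr.write_at<4>(1, values);            // stores to buffer[4]...buffer[7]
 * ```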
*/ -template +template struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); @@ -2659,12 +2573,15 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + /** * Return the pointer value. */ KERNEL_FLOAT_INLINE T* get() const { - return unsafe_assume_aligned(ptr_, alignment); + return KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -2678,58 +2595,111 @@ struct aligned_ptr { } /** - * See ``kernel_float::load`` + * Returns a new pointer that is offset by ``Step * offset`` items + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns a pointer to element `vector[8]` with alignment of 4 + * ptr.offset(8); + * + * // Returns a pointer to element `vector[4]` with alignment of 16 + * ptr.offset<4>(); + * + * // Returns a pointer to element `vector[8]` with alignment of 16 + * ptr.offset<4>(2); + * ``` */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + template + KERNEL_FLOAT_INLINE aligned_ptr + offset(size_t n = 1) const { + return aligned_ptr {get() + Step * n}; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::read`` */ template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE vector> read() const { + vector_storage result; + detail::copy_aligned_impl::load(result.data(), get()); + return result; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::write`` */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); + template + KERNEL_FLOAT_INLINE void write(const V& values) const { + constexpr size_t N = vector_extent; + return detail::copy_aligned_impl::store( + get(), + convert_storage(values).data()); } /** - * See ``kernel_float::store`` + * See ``kernel_float::read`` */ - template> - KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { - ::kernel_float::store(values, get(), indices, mask); + template> + KERNEL_FLOAT_INLINE vector read(const I& indices, const M& mask = true) { + return ::kernel_float::read(get(), indices, mask); } + /** - * See ``kernel_float::storen`` + * See ``kernel_float::write`` + */ + template + KERNEL_FLOAT_INLINE void write(const I& indices, const V& values, const M& mask = true) { + return ::kernel_float::write(get(), indices, values, mask); + } + + /** + * Offsets the pointer by `Step * offset` items and then read the subsequent `N` items. + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { - ::kernel_float::storen(values, get(), offset); + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); } + /** - * See ``kernel_float::storen`` + * Offsets the pointer by `Step * offset` items and then writes the subsequent `N` items. 
+     *
+     * Example
+     * =======
+     * ```
+     * aligned_ptr<float> ptr = vector.data();
+     * vec<float, 4> values = {1, 2, 3, 4};
+     *
+     * // Writes to vector[40], vector[41], vector[42], vector[43]
+     * ptr.write_at<4>(10, values);
+     *
+     * // Writes to vector[20], vector[21], vector[22], vector[23]
+     * ptr.write_at<2>(10, values);
+     * ```
     */
-    template<typename V, typename E = broadcast_vector_extent_type<V>>
-    KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const {
-        ::kernel_float::storen(values, get(), offset, max_length);
+    template<size_t Step = 1, typename V>
+    KERNEL_FLOAT_INLINE void write_at(size_t offset, const V& values) const {
+        return this->offset<Step>(offset).template write<V>(values);
    }

  private:
    T* ptr_ = nullptr;
 };
 
-/**
- * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes.
- */
 template<typename T, size_t alignment>
 struct aligned_ptr<const T, alignment> {
    static_assert(alignment >= alignof(T), "invalid alignment");
@@ -2748,18 +2718,13 @@ struct aligned_ptr<const T, alignment> {
    KERNEL_FLOAT_INLINE
    aligned_ptr(const aligned_ptr<T, alignment>& ptr) : ptr_(ptr.get()) {}

->>>>>>> 9bf416c (Update single_include)

    /**
     * Return the pointer value.
     */
    KERNEL_FLOAT_INLINE
    const T* get() const {
-<<<<<<< HEAD
-        return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment);
-=======
-        return unsafe_assume_aligned(ptr_, alignment);
->>>>>>> 9bf416c (Update single_include)
+        return KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr_, alignment);
    }

    KERNEL_FLOAT_INLINE
@@ -2773,39 +2738,71 @@ struct aligned_ptr<const T, alignment> {
    }

    /**
-     * See ``kernel_float::load``
+     * Returns a new pointer that is offset by ``Step * offset`` items.
+     *
+     * Example
+     * =======
+     * ```
+     * aligned_ptr<const float> ptr = vector.data();
+     *
+     * // Returns a pointer to element `vector[8]` with alignment of 4
+     * ptr.offset(8);
+     *
+     * // Returns a pointer to element `vector[4]` with alignment of 16
+     * ptr.offset<4>();
+     *
+     * // Returns a pointer to element `vector[8]` with alignment of 16
+     * ptr.offset<4>(2);
+     * ```
     */
-    template<typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
-    KERNEL_FLOAT_INLINE vector<T, E> load(const I& indices, const M& mask = true) const {
-        return ::kernel_float::load(get(), indices, mask);
+    template<size_t Step = 1>
+    KERNEL_FLOAT_INLINE aligned_ptr<const T, detail::gcd(Step * sizeof(T), alignment)>
+    offset(size_t n = 1) const {
+        return aligned_ptr<const T, detail::gcd(Step * sizeof(T), alignment)> {get() + Step * n};
    }

    /**
-     * See ``kernel_float::loadn``
+     * See ``kernel_float::read``
     */
    template<size_t N>
-    KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(size_t offset = 0) const {
-        return ::kernel_float::loadn<N>(get(), offset);
+    KERNEL_FLOAT_INLINE vector<T, extent<N>> read() const {
+        vector_storage<T, N> result;
+        detail::copy_aligned_impl<T, N, alignment>::load(result.data(), get());
+        return result;
    }

    /**
-     * See ``kernel_float::loadn``
+     * See ``kernel_float::read``
     */
-    template<size_t N>
-    KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(size_t offset, size_t max_length) const {
-        return ::kernel_float::loadn<N>(get(), offset, max_length);
+    template<typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
+    KERNEL_FLOAT_INLINE vector<T, E> read(const I& indices, const M& mask = true) {
+        return ::kernel_float::read(get(), indices, mask);
+    }
+
+    /**
+     * Offsets the pointer by `Step * offset` items and then reads the subsequent `N` items.
+ * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` + */ + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); } private: const T* ptr_ = nullptr; }; -<<<<<<< HEAD -======= -template -aligned_ptr(T*) -> aligned_ptr; - ->>>>>>> 9bf416c (Update single_include) } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H diff --git a/tests/memory.cu b/tests/memory.cu index ac29b46..3b281bd 100644 --- a/tests/memory.cu +++ b/tests/memory.cu @@ -124,19 +124,19 @@ REGISTER_TEST_CASE( struct aligned_ptr_test { template - __host__ __device__ void operator()(generator gen, std::index_sequence) { + __host__ __device__ void operator()(generator, std::index_sequence) { struct alignas(32) storage_type { T data[N]; }; storage_type input = {T(double(I))...}; - storage_type output = {T(double(I * 0))...}; - auto v = kf::read_aligned(input.data); - kf::write_aligned(output.data, v); + ASSERT_EQ_ALL(v[I], T(double(I))); + storage_type output = {T(double(I * 0))...}; + kf::write_aligned(output.data, v); ASSERT_EQ_ALL(output.data[I], T(double(I))); } }; -REGISTER_TEST_CASE("aligned pointer", aligned_ptr_test, int, float, double, __half, __nv_bfloat16) \ No newline at end of file +REGISTER_TEST_CASE("aligned pointer", aligned_ptr_test, int, float, double, __half, __nv_bfloat16) From f668fa44813fadb51dc48192391b51502ba17618 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 23 Oct 2023 12:10:22 +0200 Subject: [PATCH 13/16] Replace `nullptr_t` by `decltype(nullptr)` --- include/kernel_float/memory.h | 4 ++-- single_include/kernel_float.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index d75cddf..8768391 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -293,7 +293,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} @@ -430,7 +430,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index c849e90..229727b 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-10-23 10:41:33.267957 -// git hash: 5844245b6d2d679f23d9ccc9693a3274b75917f9 +// date: 2023-10-23 12:07:20.751837 +// git hash: ed7143d9b83a89f043191f4957760081bda60b4d //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -2568,7 +2568,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} @@ -2705,7 +2705,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} From a35b9f600525b2253f9b2e1fb3cb91d382ac2a7d Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 23 Oct 2023 12:19:28 +0200 Subject: [PATCH 14/16] Add missing keywords necessary for nvrtc --- include/kernel_float/tiling.h | 4 ++-- single_include/kernel_float.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h index 9cac00d..978d175 100644 --- a/include/kernel_float/tiling.h +++ b/include/kernel_float/tiling.h @@ -175,7 +175,7 @@ struct instantiate_distribution_impl { template struct instantiate_distribution_impl<0, distributions> { template - using type = typename First::type; + using type = typename First::template type; }; template @@ -193,7 +193,7 @@ template> { template using dist_type = typename instantiate_distribution_impl:: - type; + template type; static constexpr size_t rank = TileDim::rank; static constexpr size_t items_per_thread = (dist_type::items_per_thread * ... * 1); diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 229727b..035e87f 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-10-23 12:07:20.751837 -// git hash: ed7143d9b83a89f043191f4957760081bda60b4d +// date: 2023-10-23 12:19:00.301788 +// git hash: f668fa44813fadb51dc48192391b51502ba17618 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -4549,7 +4549,7 @@ struct instantiate_distribution_impl { template struct instantiate_distribution_impl<0, distributions> { template - using type = typename First::type; + using type = typename First::template type; }; template @@ -4567,7 +4567,7 @@ template> { template using dist_type = typename instantiate_distribution_impl:: - type; + template type; static constexpr size_t rank = TileDim::rank; static constexpr size_t items_per_thread = (dist_type::items_per_thread * ... 
* 1); From 28f811af866d73bef37acd541bac6a95df9a94c3 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 24 Oct 2023 14:04:27 +0200 Subject: [PATCH 15/16] Refactor how `apply_impl` is performed to simplify fp16/bf16 --- include/kernel_float/apply.h | 189 ++++++++ include/kernel_float/bf16.h | 190 +++----- include/kernel_float/conversion.h | 112 ----- include/kernel_float/fp16.h | 167 +++---- include/kernel_float/meta.h | 8 + include/kernel_float/reduce.h | 49 +- include/kernel_float/unops.h | 42 +- include/kernel_float/vector.h | 1 + single_include/kernel_float.h | 760 +++++++++++++++--------------- 9 files changed, 749 insertions(+), 769 deletions(-) create mode 100644 include/kernel_float/apply.h diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h new file mode 100644 index 0000000..72c301a --- /dev/null +++ b/include/kernel_float/apply.h @@ -0,0 +1,189 @@ +#ifndef KERNEL_FLOAT_APPLY_H +#define KERNEL_FLOAT_APPLY_H + +#include "base.h" + +namespace kernel_float { +namespace detail { + +template +struct broadcast_extent_helper; + +template +struct broadcast_extent_helper { + using type = E; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent<1>> { + using type = extent; +}; + +template<> +struct broadcast_extent_helper, extent<1>> { + using type = extent<1>; +}; + +template +struct broadcast_extent_helper: + broadcast_extent_helper::type, C, Rest...> {}; + +} // namespace detail + +template +using broadcast_extent = typename detail::broadcast_extent_helper::type; + +template +using broadcast_vector_extent_type = broadcast_extent...>; + +template +static constexpr bool is_broadcastable = is_same_type, To>; + +template +static constexpr bool is_vector_broadcastable = is_broadcastable, To>; + +namespace detail { + +template +struct broadcast_impl; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { + vector_storage output; + for (size_t i = 0; i < N; i++) { + output.data()[i] = input.data()[0]; + } + return output; + } +}; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +template +struct broadcast_impl, extent<1>> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +} // namespace detail + +/** + * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` + * is 1 or `N`. + * + * Example + * ======= + * ``` + * vec a = {1.0f}; + * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] + * + * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector, extent> +broadcast(const V& input, extent new_size = {}) { + using T = vector_value_type; + return detail::broadcast_impl, extent>::call( + into_vector_storage(input)); +} + +/** + * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the + * size of `input` is 1 or the same as `other`. 
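+ *
+ * Example
+ * =======
+ * A short illustrative sketch (mirroring the ``broadcast`` example above):
+ * ```
+ * vec<float, 1> a = {1.0f};
+ * vec<float, 5> b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
+ * vec<float, 5> c = broadcast_like(a, b);  // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f]
+ * ```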
+ */ +template +KERNEL_FLOAT_INLINE vector, vector_extent_type> +broadcast_like(const V& input, const R& other) { + return broadcast(input, vector_extent_type {}); +} + +namespace detail { + +template +struct apply_recur_impl; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + apply_recur_impl::call(fun, result, inputs...); + } +}; + +template +struct apply_recur_impl { + static constexpr size_t K = round_up_to_power_of_two(N) / 2; + + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + apply_impl::call(fun, result, inputs...); + apply_impl::call(fun, result + K, (inputs + K)...); + } +}; + +template<> +struct apply_recur_impl<0> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +}; + +template<> +struct apply_recur_impl<1> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + result[0] = fun(inputs[0]...); + } +}; +} // namespace detail + +template +using map_type = + vector...>, broadcast_vector_extent_type>; + +/** + * Apply the function `F` to each element from the vector `input` and return the results as a new vector. + * + * Examples + * ======== + * ``` + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... args) { + using Output = result_t...>; + using E = broadcast_vector_extent_type; + vector_storage result; + + detail::apply_impl...>::call( + fun, + result.data(), + (detail::broadcast_impl, vector_extent_type, E>::call( + into_vector_storage(args)) + .data())...); + + return result; +} + +} // namespace kernel_float + +#endif // KERNEL_FLOAT_APPLY_H \ No newline at end of file diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index b37d817..8488dee 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -31,103 +31,27 @@ template<> struct allow_float_fallback<__nv_bfloat16> { static constexpr bool value = true; }; - -template -struct map_bfloat16x2 { - KERNEL_FLOAT_INLINE - static __nv_bfloat162 call(F fun, __nv_bfloat162 input) { - __nv_bfloat16 a = fun(input.x); - __nv_bfloat16 b = fun(input.y); - return {a, b}; - } -}; - -template -struct zip_bfloat16x2 { - KERNEL_FLOAT_INLINE - static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) { - __nv_bfloat16 a = fun(left.x, left.y); - __nv_bfloat16 b = fun(right.y, right.y); - return {a, b}; - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, __nv_bfloat16* result, const __nv_bfloat16* input) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]}; - __nv_bfloat162 b = map_bfloat16x2::call(fun, a); - result[2 * i + 0] = b.x; - result[2 * i + 1] = b.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); - } - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void - call(F fun, __nv_bfloat16* result, const __nv_bfloat16* left, const __nv_bfloat16* right) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __nv_bfloat162 a = {left[2 * i], left[2 * i + 1]}; - __nv_bfloat162 b = {right[2 * i], right[2 * i + 1]}; - __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); - result[2 * i + 0] = c.x; - result[2 * i + 1] = c.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], 
right[N - 1]); - } - } -}; - -template -struct reduce_impl= 2)>> { - KERNEL_FLOAT_INLINE static __nv_bfloat16 call(F fun, const __nv_bfloat16* input) { - __nv_bfloat162 accum = {input[0], input[1]}; - -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]}; - accum = zip_bfloat16x2::call(fun, accum, a); - } - - __nv_bfloat16 result = fun(accum.x, accum.y); - - if (N % 2 != 0) { - result = fun(result, input[N - 1]); - } - - return result; - } -}; -} // namespace detail +}; // namespace detail #if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_bfloat16x2> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ - return FUN2(input); \ - } \ - }; \ +#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __nv_bfloat16, __nv_bfloat16> { \ + KERNEL_FLOAT_INLINE static void \ + call(ops::NAME<__nv_bfloat16>, __nv_bfloat16* result, const __nv_bfloat16* a) { \ + __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) @@ -156,24 +80,28 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) #endif #if KERNEL_FLOAT_CUDA_ARCH >= 800 -#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 \ - operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_bfloat16x2> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \ - return FUN2(left, right); \ - } \ - }; \ +#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 \ + operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> { \ + KERNEL_FLOAT_INLINE static void call( \ + ops::NAME<__nv_bfloat16>, \ + __nv_bfloat16* result, \ + const __nv_bfloat16* a, \ + const __nv_bfloat16* b) { \ + __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]}, __nv_bfloat162 {b[0], b[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) @@ -195,6 +123,42 @@ KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) +#if KERNEL_FLOAT_CUDA_ARCH >= 800 +namespace ops { +template<> +struct fma<__nv_bfloat16> { + KERNEL_FLOAT_INLINE __nv_bfloat16 + operator()(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) const { + return __hfma(a, b, c); + } +}; +} // namespace ops + +namespace detail { +template<> +struct apply_impl< + 
ops::fma<__nv_bfloat16>, + 2, + __nv_bfloat16, + __nv_bfloat16, + __nv_bfloat16, + __nv_bfloat16> { + KERNEL_FLOAT_INLINE static void call( + ops::fma<__nv_bfloat16>, + __nv_bfloat16* result, + const __nv_bfloat16* a, + const __nv_bfloat16* b, + const __nv_bfloat16* c) { + __nv_bfloat162 r = __hfma2( + __nv_bfloat162 {a[0], a[1]}, + __nv_bfloat162 {b[0], b[1]}, + __nv_bfloat162 {c[0], c[1]}); + result[0] = r.x, result[1] = r.y; + } +}; +} // namespace detail +#endif + namespace ops { template<> struct cast { diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 6f9e4fd..9faaeec 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -6,118 +6,6 @@ namespace kernel_float { -namespace detail { - -template -struct broadcast_extent_helper; - -template -struct broadcast_extent_helper { - using type = E; -}; - -template -struct broadcast_extent_helper, extent> { - using type = extent; -}; - -template -struct broadcast_extent_helper, extent> { - using type = extent; -}; - -template -struct broadcast_extent_helper, extent<1>> { - using type = extent; -}; - -template<> -struct broadcast_extent_helper, extent<1>> { - using type = extent<1>; -}; - -template -struct broadcast_extent_helper: - broadcast_extent_helper::type, C, Rest...> {}; - -} // namespace detail - -template -using broadcast_extent = typename detail::broadcast_extent_helper::type; - -template -using broadcast_vector_extent_type = broadcast_extent...>; - -template -static constexpr bool is_broadcastable = is_same_type, To>; - -template -static constexpr bool is_vector_broadcastable = is_broadcastable, To>; - -namespace detail { - -template -struct broadcast_impl; - -template -struct broadcast_impl, extent> { - KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { - vector_storage output; - for (size_t i = 0; i < N; i++) { - output.data()[i] = input.data()[0]; - } - return output; - } -}; - -template -struct broadcast_impl, extent> { - KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { - return input; - } -}; - -template -struct broadcast_impl, extent<1>> { - KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { - return input; - } -}; - -} // namespace detail - -/** - * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` - * is 1 or `N`. - * - * Example - * ======= - * ``` - * vec a = {1.0f}; - * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] - * - * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; - * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE vector, extent> -broadcast(const V& input, extent new_size = {}) { - using T = vector_value_type; - return detail::broadcast_impl, extent>::call( - into_vector_storage(input)); -} - -/** - * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the - * size of `input` is 1 or the same as `other`. - */ -template -KERNEL_FLOAT_INLINE vector, vector_extent_type> -broadcast_like(const V& input, const R& other) { - return broadcast(input, vector_extent_type {}); -} - namespace detail { /** * Convert vector of element type `T` and extent type `E` to vector of element type `T2` and extent type `E2`. 
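 *
 * An illustrative sketch: the user-facing ``cast`` function (used in the examples
 * elsewhere in this library) is built on this machinery, e.g.
 * ```
 * vec<float, 2> v = {1.0f, 2.0f};
 * vec<double, 2> d = cast<double>(v);  // same extent, element type converted
 * ```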
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index f36836a..7546fe3 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -29,103 +29,26 @@ template<> struct allow_float_fallback<__half> { static constexpr bool value = true; }; - -template -struct map_halfx2 { - KERNEL_FLOAT_INLINE - static __half2 call(F fun, __half2 input) { - __half a = fun(input.x); - __half b = fun(input.y); - return {a, b}; - } -}; - -template -struct zip_halfx2 { - KERNEL_FLOAT_INLINE - static __half2 call(F fun, __half2 left, __half2 right) { - __half a = fun(left.x, left.y); - __half b = fun(right.y, right.y); - return {a, b}; - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, __half* result, const __half* input) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __half2 a = {input[2 * i], input[2 * i + 1]}; - __half2 b = map_halfx2::call(fun, a); - result[2 * i + 0] = b.x; - result[2 * i + 1] = b.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); - } - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void - call(F fun, __half* result, const __half* left, const __half* right) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __half2 a = {left[2 * i], left[2 * i + 1]}; - __half2 b = {right[2 * i], right[2 * i + 1]}; - __half2 c = zip_halfx2::call(fun, a, b); - result[2 * i + 0] = c.x; - result[2 * i + 1] = c.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); - } - } -}; - -template -struct reduce_impl= 2)>> { - KERNEL_FLOAT_INLINE static __half call(F fun, const __half* input) { - __half2 accum = {input[0], input[1]}; - -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __half2 a = {input[2 * i], input[2 * i + 1]}; - accum = zip_halfx2::call(fun, accum, a); - } - - __half result = fun(accum.x, accum.y); - - if (N % 2 != 0) { - result = fun(result, input[N - 1]); - } - - return result; - } -}; - }; // namespace detail #if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_halfx2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \ - return FUN2(input); \ - } \ - }; \ +#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __half, __half> { \ + KERNEL_FLOAT_INLINE static void call(ops::NAME<__half>, __half* result, const __half* a) { \ + __half2 r = FUN2(__half2 {a[0], a[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) @@ -152,22 +75,24 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) #if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_halfx2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \ - return 
FUN2(left, right); \ - } \ - }; \ +#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __half, __half, __half> { \ + KERNEL_FLOAT_INLINE static void \ + call(ops::NAME<__half>, __half* result, const __half* a, const __half* b) { \ + __half2 r = FUN2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) @@ -188,6 +113,28 @@ KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) +#if KERNEL_FLOAT_IS_DEVICE +namespace ops { +template<> +struct fma<__half> { + KERNEL_FLOAT_INLINE __half operator()(__half a, __half b, __half c) const { + return __hfma(a, b, c); + } +}; +} // namespace ops + +namespace detail { +template<> +struct apply_impl, 2, __half, __half, __half, __half> { + KERNEL_FLOAT_INLINE static void + call(ops::fma<__half>, __half* result, const __half* a, const __half* b, const __half* c) { + __half2 r = __hfma2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]}, __half2 {c[0], c[1]}); + result[0] = r.x, result[1] = r.y; + } +}; +} // namespace detail +#endif + #define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ namespace ops { \ template<> \ diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index 5141f82..8becc0e 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -270,6 +270,14 @@ struct enable_if_impl { template using enable_if_t = typename detail::enable_if_impl::type; +constexpr size_t round_up_to_power_of_two(size_t n) { + size_t result = 1; + while (result < n) { + result *= 2; + } + return result; +} + } // namespace kernel_float #endif diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 7056175..8ec790d 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -5,23 +5,56 @@ namespace kernel_float { namespace detail { + +template +struct reduce_recur_impl; + template struct reduce_impl { KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { - return call(fun, input, make_index_sequence {}); + return reduce_recur_impl::call(fun, input); } +}; + +template +struct reduce_recur_impl { + static constexpr size_t K = round_up_to_power_of_two(N) / 2; + + template + KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { + vector_storage temp; + apply_impl::call(fun, temp.data(), input, input + K); - private: - template - KERNEL_FLOAT_INLINE static T call(F fun, const T* input, index_sequence<0, Is...>) { - T result = input[0]; + if constexpr (N < 2 * K) { #pragma unroll - for (size_t i = 1; i < N; i++) { - result = fun(result, input[i]); + for (size_t i = N - K; i < K; i++) { + temp.data()[i] = input[i]; + } } - return result; + + return reduce_impl::call(fun, temp.data()); + } +}; + +template<> +struct reduce_recur_impl<0> {}; + +template<> +struct reduce_recur_impl<1> { + template + KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { + return input[0]; + } +}; + +template<> +struct reduce_recur_impl<2> { + template + KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { + return fun(input[0], input[1]); } }; + } // namespace detail /** diff --git a/include/kernel_float/unops.h 
b/include/kernel_float/unops.h index b0900ae..d45e1cc 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -1,49 +1,9 @@ #ifndef KERNEL_FLOAT_UNOPS_H #define KERNEL_FLOAT_UNOPS_H -#include "base.h" +#include "apply.h" namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = fun(inputs[i]...); - } - } -}; -} // namespace detail - -template -using map_type = vector>, vector_extent_type>; - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. - * - * Examples - * ======== - * ``` - * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; - * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { - using Input = vector_value_type; - using Output = result_t; - vector_storage> result; - - detail::apply_impl, Output, Input>::call( - fun, - result.data(), - into_vector_storage(input).data()); - - return result; -} - namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 56cd1fe..0880d00 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -6,6 +6,7 @@ #include "iterate.h" #include "macros.h" #include "reduce.h" +#include "triops.h" #include "unops.h" namespace kernel_float { diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 035e87f..8c2ef41 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-10-23 12:19:00.301788 -// git hash: f668fa44813fadb51dc48192391b51502ba17618 +// date: 2023-10-24 14:03:49.849025 +// git hash: a35b9f600525b2253f9b2e1fb3cb91d382ac2a7d //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -354,6 +354,14 @@ struct enable_if_impl { template using enable_if_t = typename detail::enable_if_impl::type; +constexpr size_t round_up_to_power_of_two(size_t n) { + size_t result = 1; + while (result < n) { + result *= 2; + } + return result; +} + } // namespace kernel_float #endif @@ -611,6 +619,195 @@ KERNEL_FLOAT_INLINE vector_storage_type into_vector_storage(V&& input) { } // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_APPLY_H +#define KERNEL_FLOAT_APPLY_H + + + +namespace kernel_float { +namespace detail { + +template +struct broadcast_extent_helper; + +template +struct broadcast_extent_helper { + using type = E; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent<1>> { + using type = extent; +}; + +template<> +struct broadcast_extent_helper, extent<1>> { + using type = extent<1>; +}; + +template +struct broadcast_extent_helper: + broadcast_extent_helper::type, C, Rest...> {}; + +} // namespace detail + +template +using broadcast_extent = typename detail::broadcast_extent_helper::type; + +template +using broadcast_vector_extent_type = broadcast_extent...>; + +template +static constexpr bool is_broadcastable = is_same_type, To>; + +template +static constexpr bool is_vector_broadcastable = is_broadcastable, To>; + +namespace detail { + +template +struct broadcast_impl; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { + vector_storage output; + for (size_t i = 0; i < N; i++) { + output.data()[i] = input.data()[0]; + } + return output; + } +}; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +template +struct broadcast_impl, extent<1>> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +} // namespace detail + +/** + * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` + * is 1 or `N`. + * + * Example + * ======= + * ``` + * vec a = {1.0f}; + * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] + * + * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector, extent> +broadcast(const V& input, extent new_size = {}) { + using T = vector_value_type; + return detail::broadcast_impl, extent>::call( + into_vector_storage(input)); +} + +/** + * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the + * size of `input` is 1 or the same as `other`. + */ +template +KERNEL_FLOAT_INLINE vector, vector_extent_type> +broadcast_like(const V& input, const R& other) { + return broadcast(input, vector_extent_type {}); +} + +namespace detail { + +template +struct apply_recur_impl; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... 
inputs) { + apply_recur_impl::call(fun, result, inputs...); + } +}; + +template +struct apply_recur_impl { + static constexpr size_t K = round_up_to_power_of_two(N) / 2; + + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + apply_impl::call(fun, result, inputs...); + apply_impl::call(fun, result + K, (inputs + K)...); + } +}; + +template<> +struct apply_recur_impl<0> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +}; + +template<> +struct apply_recur_impl<1> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + result[0] = fun(inputs[0]...); + } +}; +} // namespace detail + +template +using map_type = + vector...>, broadcast_vector_extent_type>; + +/** + * Apply the function `F` to each element from the vector `input` and return the results as a new vector. + * + * Examples + * ======== + * ``` + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... args) { + using Output = result_t...>; + using E = broadcast_vector_extent_type; + vector_storage result; + + detail::apply_impl...>::call( + fun, + result.data(), + (detail::broadcast_impl, vector_extent_type, E>::call( + into_vector_storage(args)) + .data())...); + + return result; +} + +} // namespace kernel_float + +#endif // KERNEL_FLOAT_APPLY_H #ifndef KERNEL_FLOAT_COMPLEX_TYPE_H #define KERNEL_FLOAT_COMPLEX_TYPE_H @@ -866,46 +1063,6 @@ struct promote_type> { namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = fun(inputs[i]...); - } - } -}; -} // namespace detail - -template -using map_type = vector>, vector_extent_type>; - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. - * - * Examples - * ======== - * ``` - * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; - * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { - using Input = vector_value_type; - using Output = result_t; - vector_storage> result; - - detail::apply_impl, Output, Input>::call( - fun, - result.data(), - into_vector_storage(input).data()); - - return result; -} - namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. 
@@ -1130,118 +1287,6 @@ KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf)
 
 namespace kernel_float {
 
-namespace detail {
-
-template<typename... Es>
-struct broadcast_extent_helper;
-
-template<typename E>
-struct broadcast_extent_helper<E> {
-    using type = E;
-};
-
-template<size_t N>
-struct broadcast_extent_helper<extent<N>, extent<N>> {
-    using type = extent<N>;
-};
-
-template<size_t N>
-struct broadcast_extent_helper<extent<1>, extent<N>> {
-    using type = extent<N>;
-};
-
-template<size_t N>
-struct broadcast_extent_helper<extent<N>, extent<1>> {
-    using type = extent<N>;
-};
-
-template<>
-struct broadcast_extent_helper<extent<1>, extent<1>> {
-    using type = extent<1>;
-};
-
-template<typename A, typename B, typename C, typename... Rest>
-struct broadcast_extent_helper<A, B, C, Rest...>:
-    broadcast_extent_helper<typename broadcast_extent_helper<A, B>::type, C, Rest...> {};
-
-}  // namespace detail
-
-template<typename... Es>
-using broadcast_extent = typename detail::broadcast_extent_helper<Es...>::type;
-
-template<typename... Vs>
-using broadcast_vector_extent_type = broadcast_extent<vector_extent_type<Vs>...>;
-
-template<typename From, typename To>
-static constexpr bool is_broadcastable = is_same_type<broadcast_extent<From, To>, To>;
-
-template<typename V, typename To>
-static constexpr bool is_vector_broadcastable = is_broadcastable<vector_extent_type<V>, To>;
-
-namespace detail {
-
-template<typename T, typename From, typename To>
-struct broadcast_impl;
-
-template<typename T, size_t N>
-struct broadcast_impl<T, extent<1>, extent<N>> {
-    KERNEL_FLOAT_INLINE static vector_storage<T, N> call(const vector_storage<T, 1>& input) {
-        vector_storage<T, N> output;
-        for (size_t i = 0; i < N; i++) {
-            output.data()[i] = input.data()[0];
-        }
-        return output;
-    }
-};
-
-template<typename T, size_t N>
-struct broadcast_impl<T, extent<N>, extent<N>> {
-    KERNEL_FLOAT_INLINE static vector_storage<T, N> call(vector_storage<T, N> input) {
-        return input;
-    }
-};
-
-template<typename T>
-struct broadcast_impl<T, extent<1>, extent<1>> {
-    KERNEL_FLOAT_INLINE static vector_storage<T, 1> call(vector_storage<T, 1> input) {
-        return input;
-    }
-};
-
-}  // namespace detail
-
-/**
- * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input`
- * is 1 or `N`.
- *
- * Example
- * =======
- * ```
- * vec<float, 1> a = {1.0f};
- * vec<float, 5> x = broadcast<5>(a);  // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f]
- *
- * vec<float, 5> b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
- * vec<float, 5> y = broadcast<5>(b);  // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f]
- * ```
- */
-template<size_t N, typename V>
-KERNEL_FLOAT_INLINE vector<vector_value_type<V>, extent<N>>
-broadcast(const V& input, extent<N> new_size = {}) {
-    using T = vector_value_type<V>;
-    return detail::broadcast_impl<T, vector_extent_type<V>, extent<N>>::call(
-        into_vector_storage(input));
-}
-
-/**
- * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the
- * size of `input` is 1 or the same as `other`.
- */
-template<typename V, typename R>
-KERNEL_FLOAT_INLINE vector<vector_value_type<V>, vector_extent_type<R>>
-broadcast_like(const V& input, const R& other) {
-    return broadcast(input, vector_extent_type<R> {});
-}
-
 namespace detail {
 /**
  * Convert vector of element type `T` and extent type `E` to vector of element type `T2` and extent type `E2`.
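The conversion machinery referenced by the comment closing the hunk above pairs an element-wise cast with an extent broadcast. From user code this surfaces through `kf::cast`, roughly as follows (a sketch; the device function is hypothetical):

```
#include "kernel_float.h"
namespace kf = kernel_float;

// Hypothetical device function: element-wise __half -> float conversion.
// The extent (4) is preserved; only the element type changes.
__device__ kf::vec<float, 4> widen(kf::vec<__half, 4> h) {
    return kf::cast<float>(h);
}
```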
@@ -2813,23 +2858,56 @@ struct aligned_ptr {
 
 namespace kernel_float {
 namespace detail {
+
+template<size_t N>
+struct reduce_recur_impl;
+
 template<typename F, size_t N, typename T, typename = void>
 struct reduce_impl {
     KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
-        return call(fun, input, make_index_sequence<N> {});
+        return reduce_recur_impl<N>::call(fun, input);
     }
+};
 
-  private:
-    template<size_t... Is>
-    KERNEL_FLOAT_INLINE static T call(F fun, const T* input, index_sequence<0, Is...>) {
-        T result = input[0];
+template<size_t N>
+struct reduce_recur_impl {
+    static constexpr size_t K = round_up_to_power_of_two(N) / 2;
+
+    template<typename F, typename T>
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
+        vector_storage<T, K> temp;
+        apply_impl<F, N - K, T, T, T>::call(fun, temp.data(), input, input + K);
+
+        if constexpr (N < 2 * K) {
 #pragma unroll
-        for (size_t i = 1; i < N; i++) {
-            result = fun(result, input[i]);
+            for (size_t i = N - K; i < K; i++) {
+                temp.data()[i] = input[i];
+            }
         }
-        return result;
+
+        return reduce_impl<F, K, T>::call(fun, temp.data());
     }
 };
+
+template<>
+struct reduce_recur_impl<0> {};
+
+template<>
+struct reduce_recur_impl<1> {
+    template<typename F, typename T>
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
+        return input[0];
+    }
+};
+
+template<>
+struct reduce_recur_impl<2> {
+    template<typename F, typename T>
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
+        return fun(input[0], input[1]);
+    }
+};
+
 }  // namespace detail
 
 /**
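The reworked `reduce_recur_impl` above replaces the old left-to-right fold with a halving tree: each step pairs element `i` with element `i + K`, copies the unpaired tail through, and recurses on `K` elements, so reductions now route through the same 2-wide `apply_impl` specializations that the fp16/bf16 hunks below provide. A host-side analogue of the access pattern (illustrative only, not library code):

```
#include <cstddef>

// Illustrative host analogue of the halving reduction; assumes n >= 1.
template<typename F, typename T>
T tree_reduce(F fun, T* input, size_t n) {
    while (n > 1) {
        // k is half of the next power of two, so n - k <= k.
        size_t k = 1;
        while (k < n) {
            k *= 2;
        }
        k /= 2;

        for (size_t i = 0; i + k < n; i++) {
            input[i] = fun(input[i], input[i + k]);  // pair the two halves
        }
        // Elements [n - k, k) carry over unchanged; continue on k elements.
        n = k;
    }
    return input[0];
}
```

For n = 6 this combines (x0,x4) and (x1,x5), carries x2 and x3 through, and finishes on 4 then 2 elements, giving a log-depth combine order whose pairwise steps map cleanly onto `__half2`/`__nv_bfloat162` intrinsics.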
@@ -3210,6 +3288,7 @@ KERNEL_FLOAT_INLINE vector<T, E> fma(const A& a, const B& b, const C& c) {
 
 
 
+
 namespace kernel_float {
 
 /**
@@ -3573,103 +3652,26 @@ template<>
 struct allow_float_fallback<__half> {
     static constexpr bool value = true;
 };
-
-template<typename F>
-struct map_halfx2 {
-    KERNEL_FLOAT_INLINE
-    static __half2 call(F fun, __half2 input) {
-        __half a = fun(input.x);
-        __half b = fun(input.y);
-        return {a, b};
-    }
-};
-
-template<typename F>
-struct zip_halfx2 {
-    KERNEL_FLOAT_INLINE
-    static __half2 call(F fun, __half2 left, __half2 right) {
-        __half a = fun(left.x, left.y);
-        __half b = fun(right.y, right.y);
-        return {a, b};
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __half, __half> {
-    KERNEL_FLOAT_INLINE static void call(F fun, __half* result, const __half* input) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __half2 a = {input[2 * i], input[2 * i + 1]};
-            __half2 b = map_halfx2<F>::call(fun, a);
-            result[2 * i + 0] = b.x;
-            result[2 * i + 1] = b.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(input[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __half, __half, __half> {
-    KERNEL_FLOAT_INLINE static void
-    call(F fun, __half* result, const __half* left, const __half* right) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __half2 a = {left[2 * i], left[2 * i + 1]};
-            __half2 b = {right[2 * i], right[2 * i + 1]};
-            __half2 c = zip_halfx2<F>::call(fun, a, b);
-            result[2 * i + 0] = c.x;
-            result[2 * i + 1] = c.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(left[N - 1], right[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct reduce_impl<F, N, __half, enable_if_t<(N >= 2)>> {
-    KERNEL_FLOAT_INLINE static __half call(F fun, const __half* input) {
-        __half2 accum = {input[0], input[1]};
-
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __half2 a = {input[2 * i], input[2 * i + 1]};
-            accum = zip_halfx2<F>::call(fun, accum, a);
-        }
-
-        __half result = fun(accum.x, accum.y);
-
-        if (N % 2 != 0) {
-            result = fun(result, input[N - 1]);
-        }
-
-        return result;
-    }
-};
-
 };  // namespace detail
 
 #if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2)                                \
-    namespace ops {                                                                  \
-    template<>                                                                       \
-    struct NAME<__half> {                                                            \
-        KERNEL_FLOAT_INLINE __half operator()(__half input) {                        \
-            return FUN1(input);                                                      \
-        }                                                                            \
-    };                                                                               \
-    }                                                                                \
-    namespace detail {                                                               \
-    template<>                                                                       \
-    struct map_halfx2<ops::NAME<__half>> {                                           \
-        KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) {  \
-            return FUN2(input);                                                      \
-        }                                                                            \
-    };                                                                               \
+#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2)                                              \
+    namespace ops {                                                                                \
+    template<>                                                                                     \
+    struct NAME<__half> {                                                                          \
+        KERNEL_FLOAT_INLINE __half operator()(__half input) {                                      \
+            return FUN1(input);                                                                    \
+        }                                                                                          \
+    };                                                                                             \
+    }                                                                                              \
+    namespace detail {                                                                             \
+    template<>                                                                                     \
+    struct apply_impl<ops::NAME<__half>, 2, __half, __half> {                                      \
+        KERNEL_FLOAT_INLINE static void call(ops::NAME<__half>, __half* result, const __half* a) { \
+            __half2 r = FUN2(__half2 {a[0], a[1]});                                                \
+            result[0] = r.x, result[1] = r.y;                                                      \
+        }                                                                                          \
+    };                                                                                             \
     }
 #else
 #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2)
@@ -3696,22 +3698,24 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos)
 KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
 
 #if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2)                                            \
-    namespace ops {                                                                               \
-    template<>                                                                                    \
-    struct NAME<__half> {                                                                         \
-        KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const {                  \
-            return FUN1(left, right);                                                             \
-        }                                                                                         \
-    };                                                                                            \
-    }                                                                                             \
-    namespace detail {                                                                            \
-    template<>                                                                                    \
-    struct zip_halfx2<ops::NAME<__half>> {                                                        \
-        KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \
-            return FUN2(left, right);                                                             \
-        }                                                                                         \
-    };                                                                                            \
+#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2)                                \
+    namespace ops {                                                                   \
+    template<>                                                                        \
+    struct NAME<__half> {                                                             \
+        KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const {      \
+            return FUN1(left, right);                                                 \
+        }                                                                             \
+    };                                                                                \
+    }                                                                                 \
+    namespace detail {                                                                \
+    template<>                                                                        \
+    struct apply_impl<ops::NAME<__half>, 2, __half, __half, __half> {                 \
+        KERNEL_FLOAT_INLINE static void                                               \
+        call(ops::NAME<__half>, __half* result, const __half* a, const __half* b) {   \
+            __half2 r = FUN2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]});             \
+            result[0] = r.x, result[1] = r.y;                                         \
+        }                                                                             \
+    };                                                                                \
     }
 #else
 #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2)
@@ -3732,6 +3736,28 @@ KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2)
 KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2)
 KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hge2)
 
+#if KERNEL_FLOAT_IS_DEVICE
+namespace ops {
+template<>
+struct fma<__half> {
+    KERNEL_FLOAT_INLINE __half operator()(__half a, __half b, __half c) const {
+        return __hfma(a, b, c);
+    }
+};
+}  // namespace ops
+
+namespace detail {
+template<>
+struct apply_impl<ops::fma<__half>, 2, __half, __half, __half, __half> {
+    KERNEL_FLOAT_INLINE static void
+    call(ops::fma<__half>, __half* result, const __half* a, const __half* b, const __half* c) {
+        __half2 r = __hfma2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]}, __half2 {c[0], c[1]});
+        result[0] = r.x, result[1] = r.y;
+    }
+};
+}  // namespace detail
+#endif
+
 #define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \
     namespace ops {                                   \
     template<>                                        \
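With `ops::fma<__half>` and its 2-wide `apply_impl` specialization in place, `kf::fma` on half vectors lowers to one `__hfma2` per pair of lanes. A minimal usage sketch (the kernel is hypothetical; `read_aligned`/`write_aligned` are the helpers used by the vector_add example at the end of this series):

```
#include "kernel_float.h"
namespace kf = kernel_float;

// Hypothetical kernel computing out = a * b + c over __half vectors;
// each pair of lanes is evaluated with a single __hfma2 instruction.
__global__ void fma_kernel(const __half* a, const __half* b, const __half* c, __half* out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    kf::vec<__half, 4> x = kf::read_aligned<4>(a + 4 * i);
    kf::vec<__half, 4> y = kf::read_aligned<4>(b + 4 * i);
    kf::vec<__half, 4> z = kf::read_aligned<4>(c + 4 * i);

    kf::write_aligned(out + 4 * i, kf::fma(x, y, z));
}
```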
@@ -3857,103 +3883,27 @@ template<>
 struct allow_float_fallback<__nv_bfloat16> {
     static constexpr bool value = true;
 };
-
-template<typename F>
-struct map_bfloat16x2 {
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat162 call(F fun, __nv_bfloat162 input) {
-        __nv_bfloat16 a = fun(input.x);
-        __nv_bfloat16 b = fun(input.y);
-        return {a, b};
-    }
-};
-
-template<typename F>
-struct zip_bfloat16x2 {
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) {
-        __nv_bfloat16 a = fun(left.x, left.y);
-        __nv_bfloat16 b = fun(right.y, right.y);
-        return {a, b};
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16> {
-    KERNEL_FLOAT_INLINE static void call(F fun, __nv_bfloat16* result, const __nv_bfloat16* input) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
-            __nv_bfloat162 b = map_bfloat16x2<F>::call(fun, a);
-            result[2 * i + 0] = b.x;
-            result[2 * i + 1] = b.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(input[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> {
-    KERNEL_FLOAT_INLINE static void
-    call(F fun, __nv_bfloat16* result, const __nv_bfloat16* left, const __nv_bfloat16* right) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __nv_bfloat162 a = {left[2 * i], left[2 * i + 1]};
-            __nv_bfloat162 b = {right[2 * i], right[2 * i + 1]};
-            __nv_bfloat162 c = zip_bfloat16x2<F>::call(fun, a, b);
-            result[2 * i + 0] = c.x;
-            result[2 * i + 1] = c.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(left[N - 1], right[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct reduce_impl<F, N, __nv_bfloat16, enable_if_t<(N >= 2)>> {
-    KERNEL_FLOAT_INLINE static __nv_bfloat16 call(F fun, const __nv_bfloat16* input) {
-        __nv_bfloat162 accum = {input[0], input[1]};
-
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
-            accum = zip_bfloat16x2<F>::call(fun, accum, a);
-        }
-
-        __nv_bfloat16 result = fun(accum.x, accum.y);
-
-        if (N % 2 != 0) {
-            result = fun(result, input[N - 1]);
-        }
-
-        return result;
-    }
-};
-}  // namespace detail
+};  // namespace detail
 
 #if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                          \
-    namespace ops {                                                            \
-    template<>                                                                 \
-    struct NAME<__nv_bfloat16> {                                               \
-        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) {    \
-            return FUN1(input);                                                \
-        }                                                                      \
-    };                                                                         \
-    }                                                                          \
-    namespace detail {                                                         \
-    template<>                                                                 \
-    struct map_bfloat16x2<ops::NAME<__nv_bfloat16>> {                          \
-        KERNEL_FLOAT_INLINE static __nv_bfloat162                              \
-        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) {                 \
-            return FUN2(input);                                                \
-        }                                                                      \
-    };                                                                         \
+#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                                   \
+    namespace ops {                                                                     \
+    template<>                                                                          \
+    struct NAME<__nv_bfloat16> {                                                        \
+        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) {             \
+            return FUN1(input);                                                         \
+        }                                                                               \
+    };                                                                                  \
+    }                                                                                   \
+    namespace detail {                                                                  \
+    template<>                                                                          \
+    struct apply_impl<ops::NAME<__nv_bfloat16>, 2, __nv_bfloat16, __nv_bfloat16> {      \
+        KERNEL_FLOAT_INLINE static void                                                 \
+        call(ops::NAME<__nv_bfloat16>, __nv_bfloat16* result, const __nv_bfloat16* a) { \
+            __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]});                       \
+            result[0] = r.x, result[1] = r.y;                                           \
+        }                                                                               \
+    };                                                                                  \
     }
 #else
 #define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)
@@ -3982,24 +3932,28 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
 #endif
 
 #if KERNEL_FLOAT_CUDA_ARCH >= 800
-#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2)                                  \
-    namespace ops {                                                                     \
-    template<>                                                                          \
-    struct NAME<__nv_bfloat16> {                                                        \
-        KERNEL_FLOAT_INLINE __nv_bfloat16                                               \
-        operator()(__nv_bfloat16 left, __nv_bfloat16 right) const {                     \
-            return FUN1(left, right);                                                   \
-        }                                                                               \
-    };                                                                                  \
-    }                                                                                   \
-    namespace detail {                                                                  \
-    template<>                                                                          \
-    struct zip_bfloat16x2<ops::NAME<__nv_bfloat16>> {                                   \
-        KERNEL_FLOAT_INLINE static __nv_bfloat162                                       \
-        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) {     \
-            return FUN2(left, right);                                                   \
-        }                                                                               \
-    };                                                                                  \
+#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2)                                           \
+    namespace ops {                                                                              \
+    template<>                                                                                   \
+    struct NAME<__nv_bfloat16> {                                                                 \
+        KERNEL_FLOAT_INLINE __nv_bfloat16                                                        \
+        operator()(__nv_bfloat16 left, __nv_bfloat16 right) const {                              \
+            return FUN1(left, right);                                                            \
+        }                                                                                        \
+    };                                                                                           \
+    }                                                                                            \
+    namespace detail {                                                                           \
+    template<>                                                                                   \
+    struct apply_impl<ops::NAME<__nv_bfloat16>, 2, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> { \
+        KERNEL_FLOAT_INLINE static void call(                                                    \
+            ops::NAME<__nv_bfloat16>,                                                            \
+            __nv_bfloat16* result,                                                               \
+            const __nv_bfloat16* a,                                                              \
+            const __nv_bfloat16* b) {                                                            \
+            __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]}, __nv_bfloat162 {b[0], b[1]});   \
+            result[0] = r.x, result[1] = r.y;                                                    \
+        }                                                                                        \
+    };                                                                                           \
     }
 #else
 #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2)
 #endif
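For reference, here is what one instantiation of the macro above produces on `KERNEL_FLOAT_CUDA_ARCH >= 800`, written out for an assumed `KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2)` instance (an illustrative expansion, not text from the patch):

```
namespace ops {
template<>
struct add<__nv_bfloat16> {
    KERNEL_FLOAT_INLINE __nv_bfloat16
    operator()(__nv_bfloat16 left, __nv_bfloat16 right) const {
        return __hadd(left, right);  // scalar path
    }
};
}  // namespace ops

namespace detail {
template<>
struct apply_impl<ops::add<__nv_bfloat16>, 2, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> {
    KERNEL_FLOAT_INLINE static void call(
        ops::add<__nv_bfloat16>,
        __nv_bfloat16* result,
        const __nv_bfloat16* a,
        const __nv_bfloat16* b) {
        // Two lanes at a time through the packed __hadd2 intrinsic.
        __nv_bfloat162 r = __hadd2(__nv_bfloat162 {a[0], a[1]}, __nv_bfloat162 {b[0], b[1]});
        result[0] = r.x, result[1] = r.y;
    }
};
}  // namespace detail
```

On older architectures the macro expands to nothing and these operations fall back to `float` evaluation via `allow_float_fallback<__nv_bfloat16>`.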
@@ -4021,6 +3975,42 @@ KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2)
 KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2)
 KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hge2)
 
+#if KERNEL_FLOAT_CUDA_ARCH >= 800
+namespace ops {
+template<>
+struct fma<__nv_bfloat16> {
+    KERNEL_FLOAT_INLINE __nv_bfloat16
+    operator()(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) const {
+        return __hfma(a, b, c);
+    }
+};
+}  // namespace ops
+
+namespace detail {
+template<>
+struct apply_impl<
+    ops::fma<__nv_bfloat16>,
+    2,
+    __nv_bfloat16,
+    __nv_bfloat16,
+    __nv_bfloat16,
+    __nv_bfloat16> {
+    KERNEL_FLOAT_INLINE static void call(
+        ops::fma<__nv_bfloat16>,
+        __nv_bfloat16* result,
+        const __nv_bfloat16* a,
+        const __nv_bfloat16* b,
+        const __nv_bfloat16* c) {
+        __nv_bfloat162 r = __hfma2(
+            __nv_bfloat162 {a[0], a[1]},
+            __nv_bfloat162 {b[0], b[1]},
+            __nv_bfloat162 {c[0], c[1]});
+        result[0] = r.x, result[1] = r.y;
+    }
+};
+}  // namespace detail
+#endif
+
 namespace ops {
 template<>
 struct cast<double, __nv_bfloat16> {

From d6b7199e1fc35785109a2fdf10eb232b03d50c84 Mon Sep 17 00:00:00 2001
From: stijn
Date: Tue, 24 Oct 2023 14:16:18 +0200
Subject: [PATCH 16/16] Change example to use FMA

---
 examples/vector_add/main.cu   | 2 +-
 include/kernel_float/meta.h   | 1 +
 single_include/kernel_float.h | 5 +++--
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu
index 7b88292..448bbe5 100644
--- a/examples/vector_add/main.cu
+++ b/examples/vector_add/main.cu
@@ -18,7 +18,7 @@ __global__ void my_kernel(int length, const __half* input, double constant, float* output) {
 
     if (i * N < length) {
         auto a = kf::read_aligned<N>(input + i * N);
-        auto b = (a * a) * constant;
+        auto b = kf::fma(a, a, kf::cast<__half>(constant));
         kf::write_aligned(output + i * N, b);
     }
 }
diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h
index 8becc0e..5256129 100644
--- a/include/kernel_float/meta.h
+++ b/include/kernel_float/meta.h
@@ -270,6 +270,7 @@ struct enable_if_impl<true, T> {
 template<bool C, typename T = void>
 using enable_if_t = typename detail::enable_if_impl<C, T>::type;
 
+KERNEL_FLOAT_INLINE
 constexpr size_t round_up_to_power_of_two(size_t n) {
     size_t result = 1;
     while (result < n) {
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index 8c2ef41..b5f3c8a 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -16,8 +16,8 @@
 //================================================================================
 
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-10-24 14:03:49.849025
-// git hash: a35b9f600525b2253f9b2e1fb3cb91d382ac2a7d
+// date: 2023-10-24 14:14:37.228322
+// git hash: 28f811af866d73bef37acd541bac6a95df9a94c3
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -354,6 +354,7 @@ struct enable_if_impl<true, T> {
 template<bool C, typename T = void>
 using enable_if_t = typename detail::enable_if_impl<C, T>::type;
 
+KERNEL_FLOAT_INLINE
 constexpr size_t round_up_to_power_of_two(size_t n) {
     size_t result = 1;
     while (result < n) {