From 755a00eef4dfe5c5abe3bd75b78e8c83169f245e Mon Sep 17 00:00:00 2001
From: stijn
Date: Wed, 4 Oct 2023 15:58:21 +0200
Subject: [PATCH 01/16] Add tiling object

---
 examples/CMakeLists.txt                   |   1 +
 examples/vector_add/main.cu               |   2 +-
 examples/vector_add_tiling/CMakeLists.txt |  12 +
 examples/vector_add_tiling/main.cu        |  97 ++++
 include/kernel_float/tiling.h             | 516 ++++++++++++++++++++++
 tests/tiling.cu                           |  92 ++++
 6 files changed, 719 insertions(+), 1 deletion(-)
 create mode 100644 examples/vector_add_tiling/CMakeLists.txt
 create mode 100644 examples/vector_add_tiling/main.cu
 create mode 100644 tests/tiling.cu

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 5342580..145f80b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(vector_add)
+add_subdirectory(vector_add_tiling)
diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu
index ea78d1a..465c707 100644
--- a/examples/vector_add/main.cu
+++ b/examples/vector_add/main.cu
@@ -17,7 +17,7 @@ __global__ void my_kernel(int length, const khalf<N>* input, double constant, kf
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i * N < length) {
-        output[i] = kf::cast<float>((input[i] * input[i]) * constant);
+        kf::cast_to(output[i]) = (input[i] * input[i]) * constant;
     }
 }
 
diff --git a/examples/vector_add_tiling/CMakeLists.txt b/examples/vector_add_tiling/CMakeLists.txt
new file mode 100644
index 0000000..a744c34
--- /dev/null
+++ b/examples/vector_add_tiling/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.17)
+
+set (PROJECT_NAME kernel_float_vecadd_tiling)
+project(${PROJECT_NAME} LANGUAGES CXX CUDA)
+set (CMAKE_CXX_STANDARD 17)
+
+add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
+target_link_libraries(${PROJECT_NAME} kernel_float)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+
+find_package(CUDA REQUIRED)
+target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
diff --git a/examples/vector_add_tiling/main.cu b/examples/vector_add_tiling/main.cu
new file mode 100644
index 0000000..1134778
--- /dev/null
+++ b/examples/vector_add_tiling/main.cu
@@ -0,0 +1,97 @@
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "kernel_float.h"
+#include "kernel_float/tiling.h"
+using namespace kernel_float::prelude;
+
+void cuda_check(cudaError_t code) {
+    if (code != cudaSuccess) {
+        throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(code));
+    }
+}
+
+template<int items_per_thread, int block_size>
+__global__ void my_kernel(
+    int length,
+    kf::aligned_ptr<const __half> input,
+    double constant,
+    kf::aligned_ptr<float> output) {
+    auto tiling = kf::tiling<
+        kf::tile_factor<items_per_thread>,
+        kf::block_size<block_size>,
+        kf::distributions<kf::dist::blocked>>();
+
+    auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
+    auto mask = tiling.local_mask();
+
+    auto a = kf::load(input.get(), points, mask);
+    auto b = (a * a) * constant;
+    kf::store(b, output.get(), points, mask);
+}
+
+template<int items_per_thread, int block_size = 256>
+void run_kernel(int n) {
+    double constant = 1.0;
+    std::vector<half> input(n);
+    std::vector<float> output_expected(n);
+    std::vector<float> output_result(n);
+
+    // Generate input data (values are kept small so they are exact in half precision)
+    for (int i = 0; i < n; i++) {
+        input[i] = half(i % 32);
+        output_expected[i] = float((i % 32) * (i % 32) * constant);
+    }
+
+    // Allocate device memory
+    __half* input_dev;
+    float* output_dev;
+    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
+    cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));
+
+    // Copy device memory
+    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));
+
+    // Launch kernel!
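+    // Each block covers block_size * items_per_thread elements, so the grid
+    // size below rounds the problem size up to a whole number of blocks.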
+    int items_per_block = block_size * items_per_thread;
+    int grid_size = (n + items_per_block - 1) / items_per_block;
+    my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr<const __half>(input_dev),
+        constant,
+        kf::aligned_ptr<float>(output_dev));
+
+    // Copy results back
+    cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
+
+    // Check results
+    for (int i = 0; i < n; i++) {
+        float result = output_result[i];
+        float answer = output_expected[i];
+
+        if (result != answer) {
+            std::stringstream msg;
+            msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
+            throw std::runtime_error(msg.str());
+        }
+    }
+
+    cuda_check(cudaFree(input_dev));
+    cuda_check(cudaFree(output_dev));
+}
+
+int main() {
+    int n = 84000;  // divisible by 1, 2, 3, 4, 5, 6, 7, 8
+    cuda_check(cudaSetDevice(0));
+
+    run_kernel<1>(n);
+    run_kernel<2>(n);
+    run_kernel<3>(n);
+    run_kernel<4>(n);
+    run_kernel<8>(n);
+
+    std::cout << "result correct\n";
+    return EXIT_SUCCESS;
+}
diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h
index e69de29..0561d01 100644
--- a/include/kernel_float/tiling.h
+++ b/include/kernel_float/tiling.h
@@ -0,0 +1,516 @@
+#ifndef KERNEL_FLOAT_TILING_H
+#define KERNEL_FLOAT_TILING_H
+
+#include "iterate.h"
+#include "vector.h"
+
+namespace kernel_float {
+
+template<size_t... Ns>
+struct block_size {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    block_size(dim3 thread_index) {
+        if (rank > 0 && size(0) > 1) {
+            thread_index_[0] = thread_index.x;
+        }
+
+        if (rank > 1 && size(1) > 1) {
+            thread_index_[1] = thread_index.y;
+        }
+
+        if (rank > 2 && size(2) > 1) {
+            thread_index_[2] = thread_index.z;
+        }
+    }
+
+    KERNEL_FLOAT_INLINE
+    size_t thread_index(size_t axis) const {
+        return axis < rank ? thread_index_[axis] : 0;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        size_t sizes[rank] = {Ns...};
+        return axis < rank ? sizes[axis] : 1;
+    }
+
+  private:
+    unsigned int thread_index_[rank] = {0};
+};
+
+template<size_t... Ns>
+struct virtual_block_size {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    virtual_block_size(dim3 thread_index) {
+        thread_index_ = thread_index.x;
+    }
+
+    KERNEL_FLOAT_INLINE
+    size_t thread_index(size_t axis) const {
+        size_t product_up_to_axis = 1;
+#pragma unroll
+        for (size_t i = 0; i < axis; i++) {
+            product_up_to_axis *= size(i);
+        }
+
+        return (thread_index_ / product_up_to_axis) % size(axis);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        size_t sizes[rank] = {Ns...};
+        return axis < rank ? sizes[axis] : 1;
+    }
+
+  private:
+    unsigned int thread_index_ = 0;
+};
+
+template<size_t... Ns>
+struct tile_size {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis, size_t block_size = 0) {
+        size_t sizes[rank] = {Ns...};
+        return axis < rank ? sizes[axis] : 1;
+    }
+};
+
+template<size_t... Ns>
+struct tile_factor {
+    static constexpr size_t rank = sizeof...(Ns);
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis, size_t block_size) {
+        size_t factors[rank] = {Ns...};
+        return block_size * (axis < rank ? factors[axis] : 1);
+    }
+};
+
+namespace dist {
+template<size_t N, size_t K>
+struct blocked_impl {
+    static constexpr bool is_exhaustive = N % K == 0;
+    static constexpr size_t items_per_thread = (N / K) + (is_exhaustive ? 0 : 1);
+
+    KERNEL_FLOAT_INLINE
+    static constexpr bool local_is_present(size_t thread_index, size_t local_index) {
+        return is_exhaustive || (local_to_global(thread_index, local_index) < N);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t local_to_global(size_t thread_index, size_t local_index) {
+        return thread_index * items_per_thread + local_index;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_local(size_t global_index) {
+        return global_index % items_per_thread;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_owner(size_t global_index) {
+        return global_index / items_per_thread;
+    }
+};
+
+struct blocked {
+    template<size_t N, size_t K>
+    using type = blocked_impl<N, K>;
+};
+
+template<size_t M, size_t N, size_t K>
+struct cyclic_impl {
+    static constexpr bool is_exhaustive = N % (K * M) == 0;
+    static constexpr size_t items_per_thread = ((N / (K * M)) + (is_exhaustive ? 0 : 1)) * M;
+
+    KERNEL_FLOAT_INLINE
+    static constexpr bool local_is_present(size_t thread_index, size_t local_index) {
+        return is_exhaustive || (local_to_global(thread_index, local_index) < N);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t local_to_global(size_t thread_index, size_t local_index) {
+        return (local_index / M) * M * K + thread_index * M + (local_index % M);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_local(size_t global_index) {
+        return (global_index / (M * K)) * M + (global_index % M);
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t global_to_owner(size_t global_index) {
+        return (global_index / M) % K;
+    }
+};
+
+struct cyclic {
+    template<size_t N, size_t K>
+    using type = cyclic_impl<1, N, K>;
+};
+
+template<size_t M>
+struct block_cyclic {
+    template<size_t N, size_t K>
+    using type = cyclic_impl<M, N, K>;
+};
+}  // namespace dist
+
+template<typename... Ds>
+struct distributions {};
+
+namespace detail {
+template<size_t I, typename D>
+struct instantiate_distribution_impl {
+    template<size_t N, size_t K>
+    using type = dist::cyclic::type<N, K>;
+};
+
+template<typename First, typename... Rest>
+struct instantiate_distribution_impl<0, distributions<First, Rest...>> {
+    template<size_t N, size_t K>
+    using type = typename First::template type<N, K>;
+};
+
+template<size_t I, typename First, typename... Rest>
+struct instantiate_distribution_impl<I, distributions<First, Rest...>>:
+    instantiate_distribution_impl<I - 1, distributions<Rest...>> {};
+
+template<
+    typename TileDim,
+    typename BlockDim,
+    typename Distributions,
+    typename = make_index_sequence<TileDim::rank>>
+struct tiling_impl;
+
+template<typename TileDim, typename BlockDim, typename Distributions, size_t... Is>
+struct tiling_impl<TileDim, BlockDim, Distributions, index_sequence<Is...>> {
+    template<size_t I>
+    using dist_type = typename instantiate_distribution_impl<I, Distributions>::
+        template type<TileDim::size(I), BlockDim::size(I)>;
+
+    static constexpr size_t rank = TileDim::rank;
+    static constexpr size_t items_per_thread = (dist_type<Is>::items_per_thread * ...
* 1);
+    static constexpr bool is_exhaustive = (dist_type<Is>::is_exhaustive && ...);
+
+    template<typename IndexType>
+    KERNEL_FLOAT_INLINE static vector_storage<IndexType, rank>
+    local_to_global(const BlockDim& block, size_t item) {
+        vector_storage<IndexType, rank> result;
+        ((result.data()[Is] = dist_type<Is>::local_to_global(
+              block.thread_index(Is),
+              item % dist_type<Is>::items_per_thread),
+          item /= dist_type<Is>::items_per_thread),
+         ...);
+        return result;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static bool local_is_present(const BlockDim& block, size_t item) {
+        bool is_present = true;
+        ((is_present &= dist_type<Is>::local_is_present(
+              block.thread_index(Is),
+              item % dist_type<Is>::items_per_thread),
+          item /= dist_type<Is>::items_per_thread),
+         ...);
+        return is_present;
+    }
+};
+}  // namespace detail
+
+template<typename T>
+struct tiling_iterator;
+
+template<
+    typename TileDim,
+    typename BlockDim,
+    typename Distributions = distributions<>,
+    typename IndexType = int>
+struct tiling {
+    using self_type = tiling;
+    using impl_type = detail::tiling_impl<TileDim, BlockDim, Distributions>;
+    using block_type = BlockDim;
+    using tile_type = TileDim;
+
+    static constexpr size_t rank = tile_type::rank;
+    static constexpr size_t num_locals = impl_type::items_per_thread;
+
+    using index_type = IndexType;
+    using point_type = vector<index_type, extent<rank>>;
+
+#if KERNEL_FLOAT_IS_DEVICE
+    __forceinline__ __device__ tiling() : block_(threadIdx) {}
+#endif
+
+    KERNEL_FLOAT_INLINE
+    tiling(BlockDim block, vec<index_type, rank> offset = {}) : block_(block), offset_(offset) {}
+
+    /**
+     * Returns the number of items per thread in the tiling.
+     *
+     * Note that this method is ``constexpr`` and can be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size() {
+        return impl_type::items_per_thread;
+    }
+
+    /**
+     * Checks if the tiling is exhaustive, meaning all items are always present for all threads. If this returns
+     * `true`, then ``is_present`` will always return `true` for any given index.
+     *
+     * Note that this method is ``constexpr`` and can thus be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr bool all_present() {
+        return impl_type::is_exhaustive;
+    }
+
+    /**
+     * Checks if a specific item is present for the current thread based on the distribution strategy. The number of
+     * items stored per thread is not always equal to the number of items _owned_ by each thread (for example,
+     * if the tile size is not divisible by the block size). In that case, ``is_present`` will return `false` for
+     * certain items.
+     */
+    KERNEL_FLOAT_INLINE
+    bool is_present(size_t item) const {
+        return all_present() || impl_type::local_is_present(block_, item);
+    }
+
+    /**
+     * Returns the global coordinates of a specific item for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> at(size_t item) const {
+        return impl_type::template local_to_global<index_type>(block_, item) + offset_;
+    }
+
+    /**
+     * Returns the global coordinates of a specific item along a specified axis for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    index_type at(size_t item, size_t axis) const {
+        return axis < rank ? at(item)[axis] : index_type {};
+    }
+
+    /**
+     * Returns the global coordinates of a specific item for the current thread (alias of ``at``).
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> operator[](size_t item) const {
+        return at(item);
+    }
+
+    /**
+     * Returns a vector of global coordinates of all items present for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<vector<index_type, extent<rank>>, extent<num_locals>> local_points() const {
+        return range<num_locals>([&](size_t i) { return at(i); });
+    }
+
+    /**
+     * Returns a vector of coordinate values along a specified axis for all items present for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<num_locals>> local_points(size_t axis) const {
+        return range<num_locals>([&](size_t i) { return at(i, axis); });
+    }
+
+    /**
+     * Returns a vector of boolean values representing the result of ``is_present`` of the items for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<bool, extent<num_locals>> local_mask() const {
+        return range<num_locals>([&](size_t i) { return is_present(i); });
+    }
+
+    /**
+     * Returns the thread index (position) along a specified axis for the current thread.
+     */
+    KERNEL_FLOAT_INLINE
+    index_type thread_index(size_t axis) const {
+        return index_type(block_.thread_index(axis));
+    }
+
+    /**
+     * Returns the size of the block (number of threads) along a specified axis.
+     *
+     * Note that this method is ``constexpr`` and can thus be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr index_type block_size(size_t axis) {
+        return index_type(block_type::size(axis));
+    }
+
+    /**
+     * Returns the size of the tile along a specified axis.
+     *
+     * Note that this method is ``constexpr`` and can thus be called at compile-time.
+     */
+    KERNEL_FLOAT_INLINE
+    static constexpr index_type tile_size(size_t axis) {
+        return index_type(tile_type::size(axis, block_size(axis)));
+    }
+
+    /**
+     * Returns the offset of the tile along a specified axis.
+     */
+    KERNEL_FLOAT_INLINE
+    index_type tile_offset(size_t axis) const {
+        return index_type(offset_[axis]);
+    }
+
+    /**
+     * Returns a vector of thread indices for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> thread_index() const {
+        return range<rank>([&](size_t i) { return thread_index(i); });
+    }
+
+    /**
+     * Returns a vector of block sizes for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    static vector<index_type, extent<rank>> block_size() {
+        return range<rank>([&](size_t i) { return block_size(i); });
+    }
+
+    /**
+     * Returns a vector of tile sizes for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    static vector<index_type, extent<rank>> tile_size() {
+        return range<rank>([&](size_t i) { return tile_size(i); });
+    }
+
+    /**
+     * Returns the offset of the tile for all axes.
+     */
+    KERNEL_FLOAT_INLINE
+    vector<index_type, extent<rank>> tile_offset() const {
+        return range<rank>([&](size_t i) { return tile_offset(i); });
+    }
+
+    /**
+     * Returns an iterator pointing to the beginning of the tiling.
+     */
+    KERNEL_FLOAT_INLINE
+    tiling_iterator<tiling> begin() const {
+        return {*this, 0};
+    }
+
+    /**
+     * Returns an iterator pointing to the end of the tiling.
+     */
+    KERNEL_FLOAT_INLINE
+    tiling_iterator<tiling> end() const {
+        return {*this, num_locals};
+    }
+
+    /**
+     * Applies a provided function to each item present in the tiling for the current thread.
+     * The function should take an index and a ``vector`` of global coordinates as arguments.
+     */
+    template<typename F>
+    KERNEL_FLOAT_INLINE void for_each(F fun) const {
+#pragma unroll
+        for (size_t i = 0; i < num_locals; i++) {
+            if (is_present(i)) {
+                fun(i, at(i));
+            }
+        }
+    }
+
+    /**
+     * Adds ``offset`` to all points of this tiling and returns a new tiling.
+     */
+    KERNEL_FLOAT_INLINE friend tiling
+    operator+(const tiling& self, const vector<index_type, extent<rank>>& offset) {
+        return tiling {self.block_, self.offset_ + offset};
+    }
+
+    /**
+     * Adds ``offset`` to all points of this tiling and returns a new tiling.
+     */
+    KERNEL_FLOAT_INLINE friend tiling
+    operator+(const vector<index_type, extent<rank>>& offset, const tiling& self) {
+        return self + offset;
+    }
+
+    /**
+     * Adds ``offset`` to all points of this tiling; ``t += offset`` behaves like ``t = t + offset``.
+     */
+    KERNEL_FLOAT_INLINE friend tiling&
+    operator+=(tiling& self, const vector<index_type, extent<rank>>& offset) {
+        return self = self + offset;
+    }
+
+  private:
+    BlockDim block_;
+    vector<index_type, extent<rank>> offset_;
+};
+
+template<typename T>
+struct tiling_iterator {
+    using value_type = vector<typename T::index_type, extent<T::rank>>;
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator(const T& inner, size_t position = 0) : inner_(&inner), position_(position) {
+        while (position_ < T::num_locals && !inner_->is_present(position_)) {
+            position_++;
+        }
+    }
+
+    KERNEL_FLOAT_INLINE
+    value_type operator*() const {
+        return inner_->at(position_);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator& operator++() {
+        return *this = tiling_iterator(*inner_, position_ + 1);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator operator++(int) {
+        tiling_iterator old = *this;
+        ++*this;
+        return old;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator==(const tiling_iterator& a, const tiling_iterator& b) {
+        return a.position_ == b.position_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator!=(const tiling_iterator& a, const tiling_iterator& b) {
+        return !operator==(a, b);
+    }
+
+    size_t position_ = 0;
+    const T* inner_;
+};
+
+template<size_t N, size_t K, typename D = dist::cyclic, typename IndexType = int>
+using tiling_1d = tiling<tile_size<N>, block_size<K>, distributions<D>, IndexType>;
+
+// clang-format off
+#define KERNEL_FLOAT_TILING_FOR(TILING_VARIABLE__, INDEX_VARIABLE__, POINT_VARIABLE__) \
+    _Pragma("unroll") \
+    for (::std::size_t INDEX_VARIABLE__ = 0; INDEX_VARIABLE__ < TILING_VARIABLE__.size(); INDEX_VARIABLE__++) \
+        if (typename decltype(TILING_VARIABLE__)::point_type POINT_VARIABLE__ = TILING_VARIABLE__.at(INDEX_VARIABLE__); \
+            TILING_VARIABLE__.is_present(INDEX_VARIABLE__))
+// clang-format on
+
+}  // namespace kernel_float
+
+#endif  // KERNEL_FLOAT_TILING_H
\ No newline at end of file
diff --git a/tests/tiling.cu b/tests/tiling.cu
new file mode 100644
index 0000000..c1599aa
--- /dev/null
+++ b/tests/tiling.cu
@@ -0,0 +1,92 @@
+#include "common.h"
+#include "kernel_float/tiling.h"
+
+struct basic_tiling_test {
+    template<typename T>
+    __host__ __device__ void operator()(generator<T> gen) {
+        auto tiling = kf::tiling<
+            kf::tile_size<8, 8>,
+            kf::block_size<2, 4>,
+            kf::distributions<kf::dist::cyclic, kf::dist::cyclic>>(dim3(1, 2, 0));
+
+        ASSERT_EQ(tiling.size(), size_t(8));
+
+        ASSERT_EQ(
+            tiling.local_points(),
+            kf::make_vec(
+                kf::make_vec(1, 2),
+                kf::make_vec(3, 2),
+                kf::make_vec(5, 2),
+                kf::make_vec(7, 2),
+                kf::make_vec(1, 6),
+                kf::make_vec(3, 6),
+                kf::make_vec(5, 6),
+                kf::make_vec(7, 6)));
+
+        ASSERT_EQ(tiling.local_points(0), kf::make_vec(1, 3, 5, 7, 1, 3, 5, 7));
+
+        ASSERT_EQ(tiling.local_points(1), kf::make_vec(2, 2, 2, 2, 6, 6, 6, 6));
+
+        ASSERT_EQ(tiling.at(0), kf::make_vec(1, 2));
+        ASSERT_EQ(tiling.at(1), kf::make_vec(3, 2));
+        ASSERT_EQ(tiling.at(2), kf::make_vec(5, 2));
+
+        ASSERT_EQ(tiling.at(0, 0), 1);
+        ASSERT_EQ(tiling.at(0, 1), 2);
+        ASSERT_EQ(tiling.at(1, 0), 3);
+        ASSERT_EQ(tiling.at(1, 1), 2);
+
+        ASSERT_EQ(tiling[0], kf::make_vec(1, 2));
+        ASSERT_EQ(tiling[1], kf::make_vec(3, 2));
+        ASSERT_EQ(tiling[2], kf::make_vec(5, 2));
+        ASSERT_EQ(tiling[3], kf::make_vec(7, 2));
+
+        ASSERT_EQ(
+            tiling.local_mask(),
+            kf::make_vec(true, true, true, true, true, true, true, true));
+        ASSERT_EQ(tiling.is_present(0), true);
+        ASSERT_EQ(tiling.is_present(1), true);
+        ASSERT_EQ(tiling.is_present(2), true);
+        ASSERT_EQ(tiling.is_present(3), true);
+
+        ASSERT_EQ(tiling.thread_index(0), 1);
+        ASSERT_EQ(tiling.thread_index(1), 2);
+        ASSERT_EQ(tiling.thread_index(2), 0);
+        ASSERT_EQ(tiling.thread_index(), kf::make_vec(1, 2));
+
+        ASSERT_EQ(tiling.block_size(0), 2);
+        ASSERT_EQ(tiling.block_size(1), 4);
ASSERT_EQ(tiling.block_size(2), 1); + ASSERT_EQ(tiling.block_size(), kf::make_vec(2, 4)); + + ASSERT_EQ(tiling.tile_size(0), 8); + ASSERT_EQ(tiling.tile_size(1), 8); + ASSERT_EQ(tiling.tile_size(2), 1); + ASSERT_EQ(tiling.tile_size(), kf::make_vec(8, 8)); + + ASSERT_EQ(tiling.size(), size_t(8)); + + size_t counter = 0; + const int points[8][2] = { + {1, 2}, + {3, 2}, + {5, 2}, + {7, 2}, + {1, 6}, + {3, 6}, + {5, 6}, + {7, 6}, + }; + + KERNEL_FLOAT_TILING_FOR(tiling, i, point) { + ASSERT_EQ(counter, i); + ASSERT_EQ(point[0], points[i][0]); + ASSERT_EQ(point[1], points[i][1]); + counter++; + } + + ASSERT_EQ(counter, size_t(8)); + } +}; + +REGISTER_TEST_CASE("basic tiling tests", basic_tiling_test, int) From 4b68f09dd07ea9cba7611e63aa000d8dd157d62e Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 4 Oct 2023 15:58:41 +0200 Subject: [PATCH 02/16] Add `range(fun)` utility function --- include/kernel_float/iterate.h | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 68c1645..1b3d683 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -30,13 +30,13 @@ void for_each(V&& input, F fun) { namespace detail { template struct range_impl { - KERNEL_FLOAT_INLINE - static vector_storage call() { + template + KERNEL_FLOAT_INLINE static vector_storage call(F fun) { vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result.data()[i] = T(i); + result.data()[i] = fun(i); } return result; @@ -44,6 +44,21 @@ struct range_impl { }; } // namespace detail +/** + * Generate vector consisting of the result `fun(0)...fun(N-1)` + * + * Example + * ======= + * ``` + * // Returns [0.0f, 2.0f, 4.0f] + * vec vec = range<3>([](auto i){ return float(i * 2.0f); }); + * ``` + */ +template> +KERNEL_FLOAT_INLINE vector> range(F fun) { + return detail::range_impl::call(fun); +} + /** * Generate vector consisting of the numbers `0...N-1` of type `T` * @@ -56,7 +71,7 @@ struct range_impl { */ template KERNEL_FLOAT_INLINE vector> range() { - return detail::range_impl::call(); + return detail::range_impl::call(ops::cast()); } /** @@ -71,7 +86,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return detail::range_impl, vector_extent>::call(); + return range, vector_extent>(); } /** @@ -96,7 +111,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_impl>::call(); + return range>(); } namespace detail { From aaf864569f9339a6bb55528743e87f6f3f8f1953 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 4 Oct 2023 15:59:01 +0200 Subject: [PATCH 03/16] Fix bugs in `aligned_ptr` --- include/kernel_float/memory.h | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index 01fce2f..fbe60a1 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -215,13 +215,6 @@ KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t m return store(values, ptr, indices, indices < max_length); } -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (__builtin_assume_aligned(ptr, alignment)) -#else -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, 
alignment) (ptr) -#endif - template struct AssignConversionProxy { KERNEL_FLOAT_INLINE @@ -263,6 +256,20 @@ KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(vector& inp return AssignConversionProxy(input.data()); } +/** + * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. + * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! + */ +template +KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { +// TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) + return static_cast(__builtin_assume_aligned(ptr, alignment)); +#else + return ptr; +#endif +} + /** * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. */ @@ -281,7 +288,7 @@ struct aligned_ptr { */ KERNEL_FLOAT_INLINE T* get() const { - return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); + return unsafe_assume_aligned(ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -360,12 +367,18 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + /** * Return the pointer value. */ KERNEL_FLOAT_INLINE const T* get() const { - return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); + return unsafe_assume_aligned(ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -406,6 +419,9 @@ struct aligned_ptr { const T* ptr_ = nullptr; }; +template +aligned_ptr(T*) -> aligned_ptr; + } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H From 2a51e52229bd78ac1600ccd357bf0bfcd63bc9e6 Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 6 Oct 2023 15:18:33 +0200 Subject: [PATCH 04/16] Add ability for `KERNEL_FLOAT_TILING_FOR_IMPL` to accept two or three arguments --- include/kernel_float/macros.h | 5 ++++ include/kernel_float/meta.h | 2 +- include/kernel_float/tiling.h | 19 ++++++++++---- tests/tiling.cu | 47 +++++++++++++++++++++-------------- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h index 8f42895..bfc8595 100644 --- a/include/kernel_float/macros.h +++ b/include/kernel_float/macros.h @@ -44,4 +44,9 @@ } while (0) #define KERNEL_FLOAT_UNREACHABLE __builtin_unreachable() +// Somet utility macros +#define KERNEL_FLOAT_CONCAT_IMPL(A, B) A##B +#define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B) +#define KERNEL_FLOAT_CALL(F, ...) 
F(__VA_ARGS__) + #endif //KERNEL_FLOAT_MACROS_H diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index 9c133a3..5141f82 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -77,7 +77,7 @@ template using decay_t = typename detail::decay_impl::type; template -struct promote_type; +struct promote_type {}; template struct promote_type { diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h index 0561d01..427d044 100644 --- a/include/kernel_float/tiling.h +++ b/include/kernel_float/tiling.h @@ -504,11 +504,20 @@ template, block_size, distributions, IndexType>; // clang-format off -#define KERNEL_FLOAT_TILING_FOR(TILING_VARIABLE__, INDEX_VARIABLE__, POINT_VARIABLE__) \ - _Pragma("unroll") \ - for (::std::size_t INDEX_VARIABLE__ = 0; INDEX_VARIABLE__ < TILING_VARIABLE__.size(); INDEX_VARIABLE__++) \ - if (typename decltype(TILING_VARIABLE__)::point_type POINT_VARIABLE__ = TILING_VARIABLE__.at(INDEX_VARIABLE__); \ - TILING_VARIABLE__.is_present(INDEX_VARIABLE__)) +#define KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _) \ + _Pragma("unroll") \ + for (size_t ITER_VAR = 0; ITER_VAR < (TILING).size(); ITER_VAR++) \ + if (POINT_VAR = (TILING).at(ITER_VAR); (TILING).is_present(ITER_VAR)) \ + +#define KERNEL_FLOAT_TILING_FOR_IMPL2(ITER_VAR, TILING, INDEX_VAR, POINT_VAR) \ + KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _) \ + if (INDEX_VAR = ITER_VAR; true) + +#define KERNEL_FLOAT_TILING_FOR_IMPL(ITER_VAR, TILING, A, B, N, ...) \ + KERNEL_FLOAT_CALL(KERNEL_FLOAT_CONCAT(KERNEL_FLOAT_TILING_FOR_IMPL, N), ITER_VAR, TILING, A, B) + +#define KERNEL_FLOAT_TILING_FOR(...) \ + KERNEL_FLOAT_TILING_FOR_IMPL(KERNEL_FLOAT_CONCAT(__tiling_index_variable__, __LINE__), __VA_ARGS__, 2, 1) // clang-format on } // namespace kernel_float diff --git a/tests/tiling.cu b/tests/tiling.cu index c1599aa..5735e15 100644 --- a/tests/tiling.cu +++ b/tests/tiling.cu @@ -4,12 +4,14 @@ struct basic_tiling_test { template __host__ __device__ void operator()(generator gen) { - auto tiling = kf::tiling< + using TestTiling = kf::tiling< kf::tile_size<8, 8>, kf::block_size<2, 4>, - kf::distributions>(dim3(1, 2, 0)); + kf::distributions // + >; + auto tiling = TestTiling(dim3(1, 2, 0)); - ASSERT_EQ(tiling.size(), size_t(8)); + ASSERT_EQ(TestTiling::size(), size_t(8)); ASSERT_EQ( tiling.local_points(), @@ -24,7 +26,6 @@ struct basic_tiling_test { kf::make_vec(7, 6))); ASSERT_EQ(tiling.local_points(0), kf::make_vec(1, 3, 5, 7, 1, 3, 5, 7)); - ASSERT_EQ(tiling.local_points(1), kf::make_vec(2, 2, 2, 2, 6, 6, 6, 6)); ASSERT_EQ(tiling.at(0), kf::make_vec(1, 2)); @@ -44,6 +45,7 @@ struct basic_tiling_test { ASSERT_EQ( tiling.local_mask(), kf::make_vec(true, true, true, true, true, true, true, true)); + ASSERT_EQ(TestTiling::all_present(), true); ASSERT_EQ(tiling.is_present(0), true); ASSERT_EQ(tiling.is_present(1), true); ASSERT_EQ(tiling.is_present(2), true); @@ -54,19 +56,16 @@ struct basic_tiling_test { ASSERT_EQ(tiling.thread_index(2), 0); ASSERT_EQ(tiling.thread_index(), kf::make_vec(1, 2)); - ASSERT_EQ(tiling.block_size(0), 2); - ASSERT_EQ(tiling.block_size(1), 4); - ASSERT_EQ(tiling.block_size(2), 1); - ASSERT_EQ(tiling.block_size(), kf::make_vec(2, 4)); - - ASSERT_EQ(tiling.tile_size(0), 8); - ASSERT_EQ(tiling.tile_size(1), 8); - ASSERT_EQ(tiling.tile_size(2), 1); - ASSERT_EQ(tiling.tile_size(), kf::make_vec(8, 8)); + ASSERT_EQ(TestTiling::block_size(0), 2); + ASSERT_EQ(TestTiling::block_size(1), 4); + ASSERT_EQ(TestTiling::block_size(2), 1); + 
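+        // the vector overload below returns the sizes along all axes at once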
ASSERT_EQ(TestTiling::block_size(), kf::make_vec(2, 4)); - ASSERT_EQ(tiling.size(), size_t(8)); + ASSERT_EQ(TestTiling::tile_size(0), 8); + ASSERT_EQ(TestTiling::tile_size(1), 8); + ASSERT_EQ(TestTiling::tile_size(2), 1); + ASSERT_EQ(TestTiling::tile_size(), kf::make_vec(8, 8)); - size_t counter = 0; const int points[8][2] = { {1, 2}, {3, 2}, @@ -78,14 +77,26 @@ struct basic_tiling_test { {7, 6}, }; - KERNEL_FLOAT_TILING_FOR(tiling, i, point) { - ASSERT_EQ(counter, i); + size_t counter = 0; + KERNEL_FLOAT_TILING_FOR(tiling, auto point) { + ASSERT(counter < 8); + ASSERT_EQ(point[0], points[counter][0]); + ASSERT_EQ(point[1], points[counter][1]); + counter++; + } + + ASSERT(counter == 8); + + counter = 0; + KERNEL_FLOAT_TILING_FOR(tiling, int i, auto point) { + ASSERT(counter < 8); + ASSERT_EQ(counter, size_t(i)); ASSERT_EQ(point[0], points[i][0]); ASSERT_EQ(point[1], points[i][1]); counter++; } - ASSERT_EQ(counter, size_t(8)); + ASSERT(counter == 8); } }; From c51828f289aa8309623e3431d580a6960c1392be Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 6 Oct 2023 15:18:57 +0200 Subject: [PATCH 05/16] Add missing `KERNEL_FLOAT_INLINE` on `for_each` --- include/kernel_float/iterate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 1b3d683..5e91180 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -18,7 +18,7 @@ namespace kernel_float { * ``` */ template -void for_each(V&& input, F fun) { +KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll @@ -315,4 +315,4 @@ KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... in } // namespace kernel_float -#endif \ No newline at end of file +#endif From 257335f531058dda3871423c87f9d0f41771814f Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 6 Oct 2023 16:24:01 +0200 Subject: [PATCH 06/16] Move `cast_to` to from memory.h to conversion.h --- include/kernel_float/conversion.h | 41 +++++++++++++++++++++++++++++++ include/kernel_float/memory.h | 41 ------------------------------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 78a1927..61e0236 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -193,6 +193,47 @@ KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_s return convert_storage(input); } +template +struct AssignConversionProxy { + KERNEL_FLOAT_INLINE + explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {} + + template + KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) { + *ptr_ = detail::convert_impl< + vector_value_type, + vector_extent_type, + vector_value_type, + vector_extent_type, + M>::call(into_vector_storage(values)); + + return *this; + } + + private: + T* ptr_; +}; + +/** + * Takes a vector reference and gives back a helper object. This object helps when you want to assign one vector to another + * vector of a different type. It's a way to enable implicit type conversion. + * + * For example, if `x = expression;` does not compile because `x` and `expression` are different vector types, you can use + * `cast_to(x) = expression;` to make it work. + * + * Example + * ======= + * ``` + * vec x; + * vec y = {1.0, 2.0}; + * cast_to(x) = y; // Normally, `x = y;` would give an error, but `cast_to` fixes that. 
+ * ``` + */ +template +KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(T& input) { + return AssignConversionProxy(&input); +} + /** * Returns a vector containing `N` copies of `value`. * diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index fbe60a1..06c1aac 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -215,47 +215,6 @@ KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t m return store(values, ptr, indices, indices < max_length); } -template -struct AssignConversionProxy { - KERNEL_FLOAT_INLINE - explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {} - - template - KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) { - auto indices = range(); - detail::store_impl::call( - ptr_, - convert_storage(std::forward(values)).data(), - indices.data()); - - return *this; - } - - private: - T* ptr_; -}; - -/** - * Takes a reference to a vector and returns a special proxy object that automatically performs the correct conversion - * when a vector of a different element type is assigned. This is useful to perform implicit type conversions. - * - * For example, let assume that a line like `x = expression;` would not compile since `x` and `expressions` are - * vectors of different element types. Then it is possible to use `cast_to(x) = expression;` to fix this error, - * which possibly introduces a type conversion. - * - * Example - * ======= - * ``` - * vec x; - * vec y = {1.0, 2.0}; - * cast_to(x) = y; // normally, the line `x = y;` would not compile, but `cast_to` make this possible - * ``` - */ -template -KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(vector& input) { - return AssignConversionProxy(input.data()); -} - /** * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! 
From 33245d7f1cd0d3069ed534012a23ab6d73c7f606 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 10 Oct 2023 14:20:42 +0200 Subject: [PATCH 07/16] Add fast math functions to documentation --- docs/api.rst | 1 + docs/build_api.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 39ca304..09f7789 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -8,6 +8,7 @@ API Reference api/binary_operators.rst api/reductions.rst api/mathematical.rst + api/fast_math.rst api/conditional.rst api/memory_read_write.rst diff --git a/docs/build_api.py b/docs/build_api.py index 26d182c..b31e6c6 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -90,7 +90,8 @@ def build_index_page(groups): "for_each", ], "Generation": [ - "range", + ("range", "range()"), + ("range", "range(F fun)"), "range_like", "each_index", "fill", @@ -193,6 +194,14 @@ def build_index_page(groups): "isinf", "isnan", ], + "Fast math": [ + "fast_exp", + "fast_log", + "fast_cos", + "fast_sin", + "fast_tan", + "fast_div", + ], "Conditional": [ ("where", "where(const C&, const L&, const R&)"), ("where", "where(const C&, const L&)"), @@ -202,12 +211,12 @@ def build_index_page(groups): "cast_to", ("load", "load(const T*, const I&)"), ("load", "load(const T*, const I&, const M&)"), - ("loadn", "loadn(const T*, ptrdiff_t)"), - ("loadn", "loadn(const T*, ptrdiff_t, ptrdiff_t)"), + ("loadn", "loadn(const T*, size_t)"), + ("loadn", "loadn(const T*, size_t, size_t)"), ("store", "store(const V&, T *ptr, const I&)"), ("store", "store(const V&, T *ptr, const I&, const M&)"), - ("storen", "storen(const V&, T*, ptrdiff_t)"), - ("storen", "storen(const V&, T*, ptrdiff_t, ptrdiff_t)"), + ("storen", "storen(const V&, T*, size_t)"), + ("storen", "storen(const V&, T*, size_t, size_t)"), ("aligned_ptr", "aligned_ptr", "struct"), ] } From 2851957ed755e5e9113e690389f07d387cc61bff Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 10 Oct 2023 15:27:24 +0200 Subject: [PATCH 08/16] Document `constant` and `tiling` --- docs/api.rst | 1 + docs/build_api.py | 10 +++++++++- docs/guides.rst | 1 + docs/guides/constant.md | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 docs/guides/constant.md diff --git a/docs/api.rst b/docs/api.rst index 09f7789..d3b2e21 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -11,4 +11,5 @@ API Reference api/fast_math.rst api/conditional.rst api/memory_read_write.rst + api/utilities.rst diff --git a/docs/build_api.py b/docs/build_api.py index b31e6c6..b06f043 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -27,7 +27,10 @@ def build_doxygen_page(name, items): content += "-" * len(title) + "\n" for symbol in symbols: - content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n" + if directive == "define": + content += f".. doxygendefine:: {symbol}\n\n" + else: + content += f".. 
doxygen{directive}:: kernel_float::{symbol}\n\n" stripped_name = name.lower().replace(" ", "_").replace("/", "_") filename = f"api/{stripped_name}.rst" @@ -218,6 +221,11 @@ def build_index_page(groups): ("storen", "storen(const V&, T*, size_t)"), ("storen", "storen(const V&, T*, size_t, size_t)"), ("aligned_ptr", "aligned_ptr", "struct"), + ], + "Utilities": [ + ("constant", "constant", "struct"), + ("tiling", "tiling", "struct"), + ("KERNEL_FLOAT_TILING_FOR", "KERNEL_FLOAT_TILING_FOR", "define"), ] } diff --git a/docs/guides.rst b/docs/guides.rst index 609868b..72c2ead 100644 --- a/docs/guides.rst +++ b/docs/guides.rst @@ -6,3 +6,4 @@ Guides guides/introduction.rst guides/promotion.rst guides/prelude.rst + guides/constant.rst diff --git a/docs/guides/constant.md b/docs/guides/constant.md new file mode 100644 index 0000000..5d38501 --- /dev/null +++ b/docs/guides/constant.md @@ -0,0 +1,38 @@ +Using `kernel_float::constant` +=== + +When working with mixed precision types, you will find that working with constants presents a bit a challenge. + +For example, a simple expression such as `3.14 * x` where `x` is of type `vec` will NOT be performed +in `float` precision as you might expect, but instead in `double` precision. +This happens since the left-hand side of this expression +(a constant) is a `double` and thus `kernel_float` will also cast the right-hand side to `double`. + +To solve this problem, `kernel_float` offers a type called `constant` that can be used to represents +constants. Any binary operations between a value of type `U` and a `constant` will result in both +operands being cast to type `U` and the operation is performed in the precision of type `U`. This makes +`constant` useful for representing constant in your code. + + +For example, consider the following code: + +``` +#include "kernel_float.h" +namespace kf = kernel_float; + +int main() { + using Type = float; + const int N = 8; + static constexpr auto PI = kf::make_constant(3.14); + + kf::vec i = kf::range(); + kf::vec x = kf::cast(i) * PI; + kf::vec y = x * kf::sin(x); + Type result = kf::sum(y); + printf("result=%f", double(result)); + + return EXIT_SUCCESS; +} +``` + +This code example uses the ``make_constant`` utility function to create `constant`. From be9bbba87ac02e239a40a88d53ba1db7aea4ae13 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 10 Oct 2023 15:27:52 +0200 Subject: [PATCH 09/16] Fix small spelling mistakes in the documentation --- docs/guides/constant.md | 7 +++--- include/kernel_float/constant.h | 33 +++++++++++++++++++++++--- include/kernel_float/conversion.h | 20 ++++++++-------- include/kernel_float/iterate.h | 16 ++++++------- include/kernel_float/reduce.h | 2 +- include/kernel_float/tiling.h | 39 ++++++++++++++++++++++++++++++- include/kernel_float/vector.h | 10 ++++++-- 7 files changed, 98 insertions(+), 29 deletions(-) diff --git a/docs/guides/constant.md b/docs/guides/constant.md index 5d38501..628354b 100644 --- a/docs/guides/constant.md +++ b/docs/guides/constant.md @@ -1,18 +1,17 @@ Using `kernel_float::constant` === -When working with mixed precision types, you will find that working with constants presents a bit a challenge. +When working with mixed precision types, you will find that working with constants presents a bit of a challenge. For example, a simple expression such as `3.14 * x` where `x` is of type `vec` will NOT be performed in `float` precision as you might expect, but instead in `double` precision. 
This happens since the left-hand side of this expression (a constant) is a `double` and thus `kernel_float` will also cast the right-hand side to `double`. -To solve this problem, `kernel_float` offers a type called `constant` that can be used to represents +To solve this problem, `kernel_float` offers a type called `constant` that can be used to represent constants. Any binary operations between a value of type `U` and a `constant` will result in both operands being cast to type `U` and the operation is performed in the precision of type `U`. This makes -`constant` useful for representing constant in your code. - +`constant` useful for representing constants in your code. For example, consider the following code: diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h index 1b98925..be49d6f 100644 --- a/include/kernel_float/constant.h +++ b/include/kernel_float/constant.h @@ -6,17 +6,44 @@ namespace kernel_float { +/** + * `constant` represents a constant value of type `T`. + * + * The object has the property that for any binary operation involving + * a `constant` and a value of type `U`, the constant is automatically + * cast to also be of type `U`. + * + * For example: + * ``` + * float a = 5; + * constant b = 3; + * + * auto c = a + b; // The result will be of type `float` + * ``` + */ template struct constant { + /** + * Create a new constant from the given value. + */ + KERNEL_FLOAT_INLINE + constexpr constant(T value = {}) : value_(value) {} + + KERNEL_FLOAT_INLINE + constexpr constant(const constant& that) : value_(that.value) {} + + /** + * Create a new constant from another constant of type `R`. + */ template KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) { auto f = ops::cast(); value_ = f(that.get()); } - KERNEL_FLOAT_INLINE - constexpr constant(T value = {}) : value_(value) {} - + /** + * Return the value of the constant + */ KERNEL_FLOAT_INLINE constexpr T get() const { return value_; diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 61e0236..6f9e4fd 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -215,11 +215,11 @@ struct AssignConversionProxy { }; /** - * Takes a vector reference and gives back a helper object. This object helps when you want to assign one vector to another - * vector of a different type. It's a way to enable implicit type conversion. + * Takes a vector reference and gives back a helper object. This object allows you to assign + * a vector of a different type to another vector while perofrming implicit type converion. * - * For example, if `x = expression;` does not compile because `x` and `expression` are different vector types, you can use - * `cast_to(x) = expression;` to make it work. + * For example, if `x = expression;` does not compile because `x` and `expression` are + * different vector types, you can use `cast_to(x) = expression;` to make it work. 
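+ * The assignment goes through the same conversion machinery as ``convert``.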
* * Example * ======= @@ -240,7 +240,7 @@ KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(T& input) { * Example * ======= * ``` - * vec a = fill<3>(42); // return [42, 42, 42] + * vec a = fill<3>(42); // returns [42, 42, 42] * ``` */ template @@ -255,7 +255,7 @@ KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { * Example * ======= * ``` - * vec a = zeros(); // return [0, 0, 0] + * vec a = zeros(); // returns [0, 0, 0] * ``` */ template @@ -270,7 +270,7 @@ KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { * Example * ======= * ``` - * vec a = ones(); // return [1, 1, 1] + * vec a = ones(); // returns [1, 1, 1] * ``` */ template @@ -286,7 +286,7 @@ KERNEL_FLOAT_INLINE vector> ones(extent = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = fill_like(a, 42); // return [42, 42, 42] + * vec b = fill_like(a, 42); // returns [42, 42, 42] * ``` */ template, typename E = vector_extent_type> @@ -301,7 +301,7 @@ KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = zeros_like(a); // return [0, 0, 0] + * vec b = zeros_like(a); // returns [0, 0, 0] * ``` */ template, typename E = vector_extent_type> @@ -316,7 +316,7 @@ KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = ones_like(a); // return [1, 1, 1] + * vec b = ones_like(a); // returns [1, 1, 1] * ``` */ template, typename E = vector_extent_type> diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 5e91180..884d167 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -45,7 +45,7 @@ struct range_impl { } // namespace detail /** - * Generate vector consisting of the result `fun(0)...fun(N-1)` + * Generate vector consisting of the result `fun(0), ..., fun(N-1)` * * Example * ======= @@ -60,7 +60,7 @@ KERNEL_FLOAT_INLINE vector> range(F fun) { } /** - * Generate vector consisting of the numbers `0...N-1` of type `T` + * Generate vector consisting of the numbers `0, ..., N-1` of type `T` * * Example * ======= @@ -75,7 +75,7 @@ KERNEL_FLOAT_INLINE vector> range() { } /** - * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0, ..., N-1`` of type ``T`` * * Example * ======= @@ -90,7 +90,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { } /** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0, ..., N-1``. The data type used * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when * needing to iterate over the indices of a vector. 
* @@ -258,14 +258,14 @@ using concat_type = vector, extent>> * ======= * ``` * double vec1 = 1.0; - * double3 vec2 = {3.0, 4.0, 5.0); - * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; - * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] + * double3 vec2 = {2.0, 3.0, 4.0); + * double4 vec3 = {5.0, 6.0, 7.0, 8.0}; + * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8] * * int num1 = 42; * float num2 = 3.14159; * int2 num3 = {-10, 10}; - * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] + * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] * ``` */ template diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index dfa52c3..7056175 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -97,7 +97,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * ======= * ``` * vec x = {5, 0, 2, 1, 0}; - * int y = sum(x); // Returns 5*0*2*1*0 = 0 + * int y = product(x); // Returns 5*0*2*1*0 = 0 * ``` */ template> diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h index 427d044..9cac00d 100644 --- a/include/kernel_float/tiling.h +++ b/include/kernel_float/tiling.h @@ -227,6 +227,22 @@ struct tiling_impl> { template struct tiling_iterator; +/** + * Represents a tiling where the elements given by `TileDim` are distributed over the + * threads given by `BlockDim` according to the distributions given by `Distributions`. + * + * The template parameters should be the following: + * + * * ``TileDim``: Should be an instance of ``tile_size<...>``. For example, + * ``tile_size<16, 16>`` represents a 2-dimensional 16x16 tile. + * * ``BlockDim``: Should be an instance of ``block_dim<...>``. For example, + * ``block_dim<16, 4>`` represents a thread block having X dimension 16 + * and Y-dimension 4 for a total of 64 threads per block. + * * ``Distributions``: Should be an instance of ``distributions<...>``. For example, + * ``distributions`` will distribute elements in + * cyclic fashion along the X-axis and blocked fashion along the Y-axis. + * * ``IndexType``: The type used for index values (``int`` by default) + */ template< typename TileDim, typename BlockDim, @@ -516,10 +532,31 @@ using tiling_1d = tiling, block_size, distributions #define KERNEL_FLOAT_TILING_FOR_IMPL(ITER_VAR, TILING, A, B, N, ...) \ KERNEL_FLOAT_CALL(KERNEL_FLOAT_CONCAT(KERNEL_FLOAT_TILING_FOR_IMPL, N), ITER_VAR, TILING, A, B) +/** + * Iterate over the points in a ``tiling<...>`` using a for loop. + * + * There are two ways to use this macro. Using the 1 variable form: + * ``` + * auto t = tiling, block_size<4, 4>>; + * + * KERNEL_FLOAT_TILING_FOR(t, auto point) { + * printf("%d,%d\n", point[0], point[1]); + * } + * ``` + * + * Or using the 2 variables form: + * ``` + * auto t = tiling, block_size<4, 4>>; + * + * KERNEL_FLOAT_TILING_FOR(t, auto index, auto point) { + * printf("%d] %d,%d\n", index, point[0], point[1]); + * } + * ``` + */ #define KERNEL_FLOAT_TILING_FOR(...) 
\
     KERNEL_FLOAT_TILING_FOR_IMPL(KERNEL_FLOAT_CONCAT(__tiling_index_variable__, __LINE__), __VA_ARGS__, 2, 1)
 // clang-format on
 
 } // namespace kernel_float
 
-#endif // KERNEL_FLOAT_TILING_H
\ No newline at end of file
+#endif // KERNEL_FLOAT_TILING_H
diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h
index 4ed5eff..56cd1fe 100644
--- a/include/kernel_float/vector.h
+++ b/include/kernel_float/vector.h
@@ -11,9 +11,9 @@
 namespace kernel_float {
 
 /**
- * Container that stores ``N`` elements of type ``T``.
+ * Container that stores a fixed number of elements of type ``T``.
  *
- * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for
+ * It is not recommended to use this class directly; instead, use the type `vec<T, N>`, which is an alias for
  * `vector<T, extent<N>, vector_storage<T, N>>`.
  *
  * @tparam T The type of the values stored within the vector.
@@ -64,11 +64,17 @@ struct vector: public S {
         return E::size;
     }
 
+    /**
+     * Returns a reference to the underlying storage type.
+     */
     KERNEL_FLOAT_INLINE
     storage_type& storage() {
         return *this;
     }
 
+    /**
+     * Returns a reference to the underlying storage type.
+     */
     KERNEL_FLOAT_INLINE
     const storage_type& storage() const {
         return *this;
From b1d8f9c7b2b3cad8c4b6a359a4cf3d4b36112423 Mon Sep 17 00:00:00 2001
From: stijn
Date: Wed, 11 Oct 2023 15:46:12 +0200
Subject: [PATCH 10/16] Update single_include

---
 single_include/kernel_float.h | 844 ++++++++++++++++++++++++++++++++--
 1 file changed, 817 insertions(+), 27 deletions(-)

diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index aea324e..313cc85 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -16,8 +16,8 @@
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-10-13 14:55:52.284209
-// git hash: 3da5ba08788e4d89a1b20b6a12bb4ba0f8de6b40
+// date: 2023-10-11 15:46:04.149164
+// git hash: b1f6c1b73c2212223b10142054a28806f56b5ee6
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -66,6 +71,11 @@
 } while (0)
 #define KERNEL_FLOAT_UNREACHABLE __builtin_unreachable()
 
+// Some utility macros
+#define KERNEL_FLOAT_CONCAT_IMPL(A, B) A##B
+#define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B)
+#define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__)
+
 #endif //KERNEL_FLOAT_MACROS_H
 #ifndef KERNEL_FLOAT_CORE_H
 #define KERNEL_FLOAT_CORE_H
@@ -146,7 +156,7 @@
 template<typename T>
 using decay_t = typename detail::decay_impl<T>::type;
 
 template<typename T, typename U>
-struct promote_type;
+struct promote_type {};
 
 template<typename T>
 struct promote_type<T, T> {
@@ -1303,13 +1313,54 @@ KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_s
     return convert_storage(input);
 }
 
+template<typename T, RoundingMode M>
+struct AssignConversionProxy {
+    KERNEL_FLOAT_INLINE
+    explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {}
+
+    template<typename U>
+    KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) {
+        *ptr_ = detail::convert_impl<
+            vector_value_type<U>,
+            vector_extent_type<U>,
+            vector_value_type<T>,
+            vector_extent_type<T>,
+            M>::call(into_vector_storage(values));
+
+        return *this;
+    }
+
+  private:
+    T* ptr_;
+};
+
+/**
+ * Takes a vector reference and gives back a helper object. This object allows you to assign
+ * a vector of a different type to another vector while performing implicit type conversion.
+ * + * For example, if `x = expression;` does not compile because `x` and `expression` are + * different vector types, you can use `cast_to(x) = expression;` to make it work. + * + * Example + * ======= + * ``` + * vec x; + * vec y = {1.0, 2.0}; + * cast_to(x) = y; // Normally, `x = y;` would give an error, but `cast_to` fixes that. + * ``` + */ +template +KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(T& input) { + return AssignConversionProxy(&input); +} + /** * Returns a vector containing `N` copies of `value`. * * Example * ======= * ``` - * vec a = fill<3>(42); // return [42, 42, 42] + * vec a = fill<3>(42); // returns [42, 42, 42] * ``` */ template @@ -1324,7 +1375,7 @@ KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { * Example * ======= * ``` - * vec a = zeros(); // return [0, 0, 0] + * vec a = zeros(); // returns [0, 0, 0] * ``` */ template @@ -1339,7 +1390,7 @@ KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { * Example * ======= * ``` - * vec a = ones(); // return [1, 1, 1] + * vec a = ones(); // returns [1, 1, 1] * ``` */ template @@ -1355,7 +1406,7 @@ KERNEL_FLOAT_INLINE vector> ones(extent = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = fill_like(a, 42); // return [42, 42, 42] + * vec b = fill_like(a, 42); // returns [42, 42, 42] * ``` */ template, typename E = vector_extent_type> @@ -1370,7 +1421,7 @@ KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = zeros_like(a); // return [0, 0, 0] + * vec b = zeros_like(a); // returns [0, 0, 0] * ``` */ template, typename E = vector_extent_type> @@ -1385,7 +1436,7 @@ KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { * ======= * ``` * vec a = {1, 2, 3}; - * vec b = ones_like(a); // return [1, 1, 1] + * vec b = ones_like(a); // returns [1, 1, 1] * ``` */ template, typename E = vector_extent_type> @@ -1765,17 +1816,44 @@ KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) { namespace kernel_float { +/** + * `constant` represents a constant value of type `T`. + * + * The object has the property that for any binary operation involving + * a `constant` and a value of type `U`, the constant is automatically + * cast to also be of type `U`. + * + * For example: + * ``` + * float a = 5; + * constant b = 3; + * + * auto c = a + b; // The result will be of type `float` + * ``` + */ template struct constant { + /** + * Create a new constant from the given value. + */ + KERNEL_FLOAT_INLINE + constexpr constant(T value = {}) : value_(value) {} + + KERNEL_FLOAT_INLINE + constexpr constant(const constant& that) : value_(that.value) {} + + /** + * Create a new constant from another constant of type `R`. 
+ */ template KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) { auto f = ops::cast(); value_ = f(that.get()); } - KERNEL_FLOAT_INLINE - constexpr constant(T value = {}) : value_(value) {} - + /** + * Return the value of the constant + */ KERNEL_FLOAT_INLINE constexpr T get() const { return value_; @@ -1893,7 +1971,7 @@ namespace kernel_float { * ``` */ template -void for_each(V&& input, F fun) { +KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll @@ -1905,13 +1983,13 @@ void for_each(V&& input, F fun) { namespace detail { template struct range_impl { - KERNEL_FLOAT_INLINE - static vector_storage call() { + template + KERNEL_FLOAT_INLINE static vector_storage call(F fun) { vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result.data()[i] = T(i); + result.data()[i] = fun(i); } return result; @@ -1920,7 +1998,22 @@ struct range_impl { } // namespace detail /** - * Generate vector consisting of the numbers `0...N-1` of type `T` + * Generate vector consisting of the result `fun(0), ..., fun(N-1)` + * + * Example + * ======= + * ``` + * // Returns [0.0f, 2.0f, 4.0f] + * vec vec = range<3>([](auto i){ return float(i * 2.0f); }); + * ``` + */ +template> +KERNEL_FLOAT_INLINE vector> range(F fun) { + return detail::range_impl::call(fun); +} + +/** + * Generate vector consisting of the numbers `0, ..., N-1` of type `T` * * Example * ======= @@ -1931,11 +2024,11 @@ struct range_impl { */ template KERNEL_FLOAT_INLINE vector> range() { - return detail::range_impl::call(); + return detail::range_impl::call(ops::cast()); } /** - * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0, ..., N-1`` of type ``T`` * * Example * ======= @@ -1946,11 +2039,11 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return detail::range_impl, vector_extent>::call(); + return range, vector_extent>(); } /** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0, ..., N-1``. The data type used * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when * needing to iterate over the indices of a vector. 
* @@ -1971,7 +2064,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_impl>::call(); + return range>(); } namespace detail { @@ -2118,14 +2211,14 @@ using concat_type = vector, extent>> * ======= * ``` * double vec1 = 1.0; - * double3 vec2 = {3.0, 4.0, 5.0); - * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; - * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] + * double3 vec2 = {2.0, 3.0, 4.0); + * double4 vec3 = {5.0, 6.0, 7.0, 8.0}; + * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8] * * int num1 = 42; * float num2 = 3.14159; * int2 num3 = {-10, 10}; - * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] + * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] * ``` */ template @@ -2393,6 +2486,7 @@ KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t m return store(values, ptr, indices, indices < max_length); } +<<<<<<< HEAD // TOOD: check if this way is support across all compilers #if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) #define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (__builtin_assume_aligned(ptr, alignment)) @@ -2537,13 +2631,135 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} +======= +/** + * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. + * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! + */ +template +KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { +// TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) + return static_cast(__builtin_assume_aligned(ptr, alignment)); +#else + return ptr; +#endif +} + +/** + * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. + */ +template +struct aligned_ptr { + static_assert(alignment >= alignof(T), "invalid alignment"); + + KERNEL_FLOAT_INLINE + aligned_ptr(nullptr_t = nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + + /** + * Return the pointer value. 
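+     *
+     * A short sketch of the intended use (names here are illustrative):
+     * ```
+     * aligned_ptr<float, 16> p(data);
+     * float* raw = p.get(); // the compiler may now assume 16-byte alignment
+     * ```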
+ */ + KERNEL_FLOAT_INLINE + T* get() const { + return unsafe_assume_aligned(ptr_, alignment); + } + + KERNEL_FLOAT_INLINE + operator T*() const { + return get(); + } + + template + KERNEL_FLOAT_INLINE T& operator[](I&& index) const { + return get()[std::forward(index)]; + } + + /** + * See ``kernel_float::load`` + */ + template> + KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { + return ::kernel_float::load(get(), indices, mask); + } + + /** + * See ``kernel_float::loadn`` + */ + template + KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { + return ::kernel_float::loadn(get(), offset); + } + + /** + * See ``kernel_float::loadn`` + */ + template + KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { + return ::kernel_float::loadn(get(), offset, max_length); + } + + /** + * See ``kernel_float::store`` + */ + template> + KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { + ::kernel_float::store(values, get(), indices, mask); + } + /** + * See ``kernel_float::storen`` + */ + template> + KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { + ::kernel_float::storen(values, get(), offset); + } + /** + * See ``kernel_float::storen`` + */ + template> + KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const { + ::kernel_float::storen(values, get(), offset, max_length); + } + + private: + T* ptr_ = nullptr; +}; + +/** + * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes. + */ +template +struct aligned_ptr { + static_assert(alignment >= alignof(T), "invalid alignment"); + + KERNEL_FLOAT_INLINE + aligned_ptr(nullptr_t = nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} +>>>>>>> 9bf416c (Update single_include) /** * Return the pointer value. */ KERNEL_FLOAT_INLINE const T* get() const { +<<<<<<< HEAD return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); +======= + return unsafe_assume_aligned(ptr_, alignment); +>>>>>>> 9bf416c (Update single_include) } KERNEL_FLOAT_INLINE @@ -2584,6 +2800,12 @@ struct aligned_ptr { const T* ptr_ = nullptr; }; +<<<<<<< HEAD +======= +template +aligned_ptr(T*) -> aligned_ptr; + +>>>>>>> 9bf416c (Update single_include) } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H @@ -2686,7 +2908,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * ======= * ``` * vec x = {5, 0, 2, 1, 0}; - * int y = sum(x); // Returns 5*0*2*1*0 = 0 + * int y = product(x); // Returns 5*0*2*1*0 = 0 * ``` */ template> @@ -2994,9 +3216,9 @@ KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { namespace kernel_float { /** - * Container that stores ``N`` elements of type ``T``. + * Container that store fixed number of elements of type ``T``. * - * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for + * It is not recommended to use this class directly, instead, use the type `vec` which is an alias for * `vector, vector_storage>`. * * @tparam T The type of the values stored within the vector. @@ -3047,11 +3269,17 @@ struct vector: public S { return E::size; } + /** + * Returns a reference to the underlying storage type. 
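+     *
+     * For example (a sketch):
+     * ```
+     * vec<float, 4> v;
+     * float* p = v.storage().data(); // direct access to the underlying elements
+     * ```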
+ */ KERNEL_FLOAT_INLINE storage_type& storage() { return *this; } + /** + * Returns a reference to the underlying storage type. + */ KERNEL_FLOAT_INLINE const storage_type& storage() const { return *this; @@ -4147,3 +4375,565 @@ kconstant(T&&) -> kconstant>; } // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_TILING_H +#define KERNEL_FLOAT_TILING_H + + + + +namespace kernel_float { + +template +struct block_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + block_size(dim3 thread_index) { + if (rank > 0 && size(0) > 1) { + thread_index_[0] = thread_index.x; + } + + if (rank > 1 && size(1) > 1) { + thread_index_[1] = thread_index.y; + } + + if (rank > 2 && size(2) > 1) { + thread_index_[2] = thread_index.z; + } + } + + KERNEL_FLOAT_INLINE + size_t thread_index(size_t axis) const { + return axis < rank ? thread_index_[axis] : 0; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } + + private: + unsigned int thread_index_[rank] = {0}; +}; + +template +struct virtual_block_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + virtual_block_size(dim3 thread_index) { + thread_index_ = thread_index.x; + } + + KERNEL_FLOAT_INLINE + size_t thread_index(size_t axis) const { + size_t product_up_to_axis = 1; +#pragma unroll + for (size_t i = 0; i < axis; i++) { + product_up_to_axis *= size(i); + } + + return (thread_index_ / product_up_to_axis) % size(axis); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } + + private: + unsigned int thread_index_ = 0; +}; + +template +struct tile_size { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis, size_t block_size = 0) { + size_t sizes[rank] = {Ns...}; + return axis < rank ? sizes[axis] : 1; + } +}; + +template +struct tile_factor { + static constexpr size_t rank = sizeof...(Ns); + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis, size_t block_size) { + size_t factors[rank] = {Ns...}; + return block_size * (axis < rank ? factors[axis] : 1); + } +}; + +namespace dist { +template +struct blocked_impl { + static constexpr bool is_exhaustive = N % K == 0; + static constexpr size_t items_per_thread = (N / K) + (is_exhaustive ? 0 : 1); + + KERNEL_FLOAT_INLINE + static constexpr bool local_is_present(size_t thread_index, size_t local_index) { + return is_exhaustive || (local_to_global(thread_index, local_index) < N); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t local_to_global(size_t thread_index, size_t local_index) { + return thread_index * items_per_thread + local_index; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_local(size_t global_index) { + return global_index % items_per_thread; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_owner(size_t global_index) { + return global_index / items_per_thread; + } +}; + +struct blocked { + template + using type = blocked_impl; +}; + +template +struct cyclic_impl { + static constexpr bool is_exhaustive = N % (K * M) == 0; + static constexpr size_t items_per_thread = ((N / (K * M)) + (is_exhaustive ? 
0 : 1)) * M; + + KERNEL_FLOAT_INLINE + static constexpr bool local_is_present(size_t thread_index, size_t local_index) { + return is_exhaustive || (local_to_global(thread_index, local_index) < N); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t local_to_global(size_t thread_index, size_t local_index) { + return (local_index / M) * M * K + thread_index * M + (local_index % M); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_local(size_t global_index) { + return (global_index / (M * K)) * M + (global_index % M); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t global_to_owner(size_t global_index) { + return (global_index / M) % K; + } +}; + +struct cyclic { + template + using type = cyclic_impl<1, N, K>; +}; + +template +struct block_cyclic { + template + using type = cyclic_impl; +}; +} // namespace dist + +template +struct distributions {}; + +namespace detail { +template +struct instantiate_distribution_impl { + template + using type = dist::cyclic::type; +}; + +template +struct instantiate_distribution_impl<0, distributions> { + template + using type = typename First::type; +}; + +template +struct instantiate_distribution_impl>: + instantiate_distribution_impl> {}; + +template< + typename TileDim, + typename BlockDim, + typename Distributions, + typename = make_index_sequence> +struct tiling_impl; + +template +struct tiling_impl> { + template + using dist_type = typename instantiate_distribution_impl:: + type; + + static constexpr size_t rank = TileDim::rank; + static constexpr size_t items_per_thread = (dist_type::items_per_thread * ... * 1); + static constexpr bool is_exhaustive = (dist_type::is_exhaustive && ...); + + template + KERNEL_FLOAT_INLINE static vector_storage + local_to_global(const BlockDim& block, size_t item) { + vector_storage result; + ((result.data()[Is] = dist_type::local_to_global( + block.thread_index(Is), + item % dist_type::items_per_thread), + item /= dist_type::items_per_thread), + ...); + return result; + } + + KERNEL_FLOAT_INLINE + static bool local_is_present(const BlockDim& block, size_t item) { + bool is_present = true; + ((is_present &= dist_type::local_is_present( + block.thread_index(Is), + item % dist_type::items_per_thread), + item /= dist_type::items_per_thread), + ...); + return is_present; + } +}; +}; // namespace detail + +template +struct tiling_iterator; + +/** + * Represents a tiling where the elements given by `TileDim` are distributed over the + * threads given by `BlockDim` according to the distributions given by `Distributions`. + * + * The template parameters should be the following: + * + * * ``TileDim``: Should be an instance of ``tile_size<...>``. For example, + * ``tile_size<16, 16>`` represents a 2-dimensional 16x16 tile. + * * ``BlockDim``: Should be an instance of ``block_dim<...>``. For example, + * ``block_dim<16, 4>`` represents a thread block having X dimension 16 + * and Y-dimension 4 for a total of 64 threads per block. + * * ``Distributions``: Should be an instance of ``distributions<...>``. For example, + * ``distributions`` will distribute elements in + * cyclic fashion along the X-axis and blocked fashion along the Y-axis. 
+ * * ``IndexType``: The type used for index values (``int`` by default) + */ +template< + typename TileDim, + typename BlockDim, + typename Distributions = distributions<>, + typename IndexType = int> +struct tiling { + using self_type = tiling; + using impl_type = detail::tiling_impl; + using block_type = BlockDim; + using tile_type = TileDim; + + static constexpr size_t rank = tile_type::rank; + static constexpr size_t num_locals = impl_type::items_per_thread; + + using index_type = IndexType; + using point_type = vector>; + +#if KERNEL_FLOAT_IS_DEVICE + __forceinline__ __device__ tiling() : block_(threadIdx) {} +#endif + + KERNEL_FLOAT_INLINE + tiling(BlockDim block, vec offset = {}) : block_(block), offset_(offset) {} + + /** + * Returns the number of items per thread in the tiling. + * + * Note that this method is ``constexpr`` and can be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr size_t size() { + return impl_type::items_per_thread; + } + + /** + * Checks if the tiling is exhaustive, meaning all items are always present for all threads. If this returns + * `true`, then ``is_present`` will always true for any given index. + * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr bool all_present() { + return impl_type::is_exhaustive; + } + + /** + * Checks if a specific item is present for the current thread based on the distribution strategy. Not always + * is the number of items stored per thread equal to the number of items _owned_ by each thread (for example, + * if the tile size is not divisible by the block size). In this case, ``is_present`` will return `false` for + * certain items. + */ + KERNEL_FLOAT_INLINE + bool is_present(size_t item) const { + return all_present() || impl_type::local_is_present(block_, item); + } + + /** + * Returns the global coordinates of a specific item for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> at(size_t item) const { + return impl_type::template local_to_global(block_, item) + offset_; + } + + /** + * Returns the global coordinates of a specific item along a specified axis for the current thread. + */ + KERNEL_FLOAT_INLINE + index_type at(size_t item, size_t axis) const { + return axis < rank ? at(item)[axis] : index_type {}; + } + + /** + * Returns the global coordinates of a specific item for the current thread (alias of ``at``). + */ + KERNEL_FLOAT_INLINE + vector> operator[](size_t item) const { + return at(item); + } + + /** + * Returns a vector of global coordinates of all items present for the current thread. + */ + KERNEL_FLOAT_INLINE + vector>, extent> local_points() const { + return range([&](size_t i) { return at(i); }); + } + + /** + * Returns a vector of coordinate values along a specified axis for all items present for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> local_points(size_t axis) const { + return range([&](size_t i) { return at(i, axis); }); + } + + /** + * Returns a vector of boolean values representing the result of ``is_present`` of the items for the current thread. + */ + KERNEL_FLOAT_INLINE + vector> local_mask() const { + return range([&](size_t i) { return is_present(i); }); + } + + /** + * Returns the thread index (position) along a specified axis for the current thread. + */ + KERNEL_FLOAT_INLINE + index_type thread_index(size_t axis) const { + return index_type(block_.thread_index(axis)); + } + + /** + * Returns the size of the block (number of threads) along a specified axis. 
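+     * For example, with ``BlockDim = block_size<16, 4>``, ``block_size(0)`` returns 16,
+     * ``block_size(1)`` returns 4, and any axis beyond the rank returns 1.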
+ * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr index_type block_size(size_t axis) { + return index_type(block_type::size(axis)); + } + + /** + * Returns the size of the tile along a specified axis. + * + * Note that this method is ``constexpr`` and can thus be called at compile-time. + */ + KERNEL_FLOAT_INLINE + static constexpr index_type tile_size(size_t axis) { + return index_type(tile_type::size(axis, block_size(axis))); + } + + /** + * Returns the offset of the tile along a specified axis. + */ + KERNEL_FLOAT_INLINE + index_type tile_offset(size_t axis) const { + return index_type(offset_[axis]); + } + + /** + * Returns a vector of thread indices for all axes. + */ + KERNEL_FLOAT_INLINE + vector> thread_index() const { + return range([&](size_t i) { return thread_index(i); }); + } + + /** + * Returns a vector of block sizes for all axes. + */ + KERNEL_FLOAT_INLINE + static vector> block_size() { + return range([&](size_t i) { return block_size(i); }); + } + + /** + * Returns a vector of tile sizes for all axes. + */ + KERNEL_FLOAT_INLINE + static vector> tile_size() { + return range([&](size_t i) { return tile_size(i); }); + } + + /** + * Returns the offset of the tile for all axes. + */ + KERNEL_FLOAT_INLINE + vector> tile_offset() const { + return range([&](size_t i) { return tile_offset(i); }); + } + + /** + * Returns an iterator pointing to the beginning of the tiling. + */ + KERNEL_FLOAT_INLINE + tiling_iterator begin() const { + return {*this, 0}; + } + + /** + * Returns an iterator pointing to the end of the tiling. + */ + KERNEL_FLOAT_INLINE + tiling_iterator end() const { + return {*this, num_locals}; + } + + /** + * Applies a provided function to each item present in the tiling for the current thread. + * The function should take an index and a ``vector`` of global coordinates as arguments. + */ + template + KERNEL_FLOAT_INLINE void for_each(F fun) const { +#pragma unroll + for (size_t i = 0; i < num_locals; i++) { + if (is_present(i)) { + fun(i, at(i)); + } + } + } + + /** + * Adds ``offset`` to all points of this tiling and returns a new tiling. + */ + KERNEL_FLOAT_INLINE friend tiling + operator+(const tiling& self, const vector>& offset) { + return tiling {self.block_, self.offset_ + offset}; + } + + /** + * Adds ``offset`` to all points of this tiling and returns a new tiling. + */ + KERNEL_FLOAT_INLINE friend tiling + operator+(const vector>& offset, const tiling& self) { + return self + offset; + } + + /** + * Adds ``offset`` to all points of this tiling. 
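+     *
+     * For example (sketch, for a 1-dimensional tiling ``t``):
+     * ```
+     * t += make_vec(16); // every point returned by `t.at(i)` is shifted by 16
+     * ```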
+     */
+    KERNEL_FLOAT_INLINE friend tiling&
+    operator+=(tiling& self, const vector<index_type, extent<rank>>& offset) {
+        return self = self + offset;
+    }
+
+  private:
+    BlockDim block_;
+    vector<index_type, extent<rank>> offset_;
+};
+
+template<typename T>
+struct tiling_iterator {
+    using value_type = vector<typename T::index_type, extent<T::rank>>;
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator(const T& inner, size_t position = 0) : inner_(&inner), position_(position) {
+        while (position_ < T::num_locals && !inner_->is_present(position_)) {
+            position_++;
+        }
+    }
+
+    KERNEL_FLOAT_INLINE
+    value_type operator*() const {
+        return inner_->at(position_);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator& operator++() {
+        return *this = tiling_iterator(*inner_, position_ + 1);
+    }
+
+    KERNEL_FLOAT_INLINE
+    tiling_iterator operator++(int) {
+        tiling_iterator old = *this;
+        ++*this;
+        return old;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator==(const tiling_iterator& a, const tiling_iterator& b) {
+        return a.position_ == b.position_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    friend bool operator!=(const tiling_iterator& a, const tiling_iterator& b) {
+        return !operator==(a, b);
+    }
+
+    size_t position_ = 0;
+    const T* inner_;
+};
+
+template<size_t N, size_t K, typename D, typename IndexType = int>
+using tiling_1d = tiling<tile_size<N>, block_size<K>, distributions<D>, IndexType>;
+
+// clang-format off
+#define KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _)         \
+    _Pragma("unroll")                                                         \
+    for (size_t ITER_VAR = 0; ITER_VAR < (TILING).size(); ITER_VAR++)         \
+        if (POINT_VAR = (TILING).at(ITER_VAR); (TILING).is_present(ITER_VAR)) \
+
+#define KERNEL_FLOAT_TILING_FOR_IMPL2(ITER_VAR, TILING, INDEX_VAR, POINT_VAR) \
+    KERNEL_FLOAT_TILING_FOR_IMPL1(ITER_VAR, TILING, POINT_VAR, _)             \
+    if (INDEX_VAR = ITER_VAR; true)
+
+#define KERNEL_FLOAT_TILING_FOR_IMPL(ITER_VAR, TILING, A, B, N, ...) \
+    KERNEL_FLOAT_CALL(KERNEL_FLOAT_CONCAT(KERNEL_FLOAT_TILING_FOR_IMPL, N), ITER_VAR, TILING, A, B)
+
+/**
+ * Iterate over the points in a ``tiling<...>`` using a for loop.
+ *
+ * There are two ways to use this macro. Using the 1 variable form:
+ * ```
+ * auto t = tiling<tile_size<16, 16>, block_size<4, 4>>();
+ *
+ * KERNEL_FLOAT_TILING_FOR(t, auto point) {
+ *     printf("%d,%d\n", point[0], point[1]);
+ * }
+ * ```
+ *
+ * Or using the 2 variables form:
+ * ```
+ * auto t = tiling<tile_size<16, 16>, block_size<4, 4>>();
+ *
+ * KERNEL_FLOAT_TILING_FOR(t, auto index, auto point) {
+ *     printf("%d] %d,%d\n", int(index), point[0], point[1]);
+ * }
+ * ```
+ */
+#define KERNEL_FLOAT_TILING_FOR(...)
\ + KERNEL_FLOAT_TILING_FOR_IMPL(KERNEL_FLOAT_CONCAT(__tiling_index_variable__, __LINE__), __VA_ARGS__, 2, 1) +// clang-format on + +} // namespace kernel_float + +#endif // KERNEL_FLOAT_TILING_H From 79449e3375ef3c8ee245aea3d5310bedee189485 Mon Sep 17 00:00:00 2001 From: stijn Date: Sun, 22 Oct 2023 22:12:27 +0200 Subject: [PATCH 11/16] Refactor functions/types in `memory.h` --- examples/vector_add/main.cu | 16 +- examples/vector_add_tiling/main.cu | 4 +- include/kernel_float/macros.h | 28 +- include/kernel_float/memory.h | 523 ++++++++++++++++++----------- tests/common.h | 3 +- tests/memory.cu | 81 +++-- 6 files changed, 407 insertions(+), 248 deletions(-) diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 465c707..5d10239 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -13,11 +13,13 @@ void cuda_check(cudaError_t code) { } template -__global__ void my_kernel(int length, const khalf* input, double constant, kfloat* output) { +__global__ void my_kernel(int length, const __half* input, double constant, float* output) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i * N < length) { - kf::cast_to(output[i]) = (input[i] * input[i]) * constant; + auto a = kf::read_aligned(input + i * N); + auto b = (a * a) * constant; + kf::write_aligned(output + i * N, b); } } @@ -35,8 +37,8 @@ void run_kernel(int n) { } // Allocate device memory - khalf* input_dev; - kfloat* output_dev; + __half* input_dev; + float* output_dev; cuda_check(cudaMalloc(&input_dev, sizeof(half) * n)); cuda_check(cudaMalloc(&output_dev, sizeof(float) * n)); @@ -47,7 +49,11 @@ void run_kernel(int n) { int block_size = 256; int items_per_block = block_size * items_per_thread; int grid_size = (n + items_per_block - 1) / items_per_block; - my_kernel<<>>(n, input_dev, constant, output_dev); + my_kernel<<>>( + n, + kf::aligned_ptr(input_dev), + constant, + kf::aligned_ptr(output_dev)); // Copy results back cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault)); diff --git a/examples/vector_add_tiling/main.cu b/examples/vector_add_tiling/main.cu index 1134778..291cb05 100644 --- a/examples/vector_add_tiling/main.cu +++ b/examples/vector_add_tiling/main.cu @@ -27,9 +27,9 @@ __global__ void my_kernel( auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0); auto mask = tiling.local_mask(); - auto a = kf::load(input.get(), points, mask); + auto a = input.read(points, mask); auto b = (a * a) * constant; - kf::store(b, output.get(), points, mask); + output.write(points, b, mask); } template diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h index bfc8595..689b6b2 100644 --- a/include/kernel_float/macros.h +++ b/include/kernel_float/macros.h @@ -9,35 +9,35 @@ #define KERNEL_FLOAT_IS_DEVICE (1) #define KERNEL_FLOAT_IS_HOST (0) #define KERNEL_FLOAT_CUDA_ARCH (__CUDA_ARCH__) -#else +#else // __CUDA_ARCH__ #define KERNEL_FLOAT_INLINE __forceinline__ __host__ #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif -#else +#endif // __CUDA_ARCH__ +#else // __CUDACC__ #define KERNEL_FLOAT_INLINE inline #define KERNEL_FLOAT_CUDA (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif +#endif // __CUDACC__ #ifndef KERNEL_FLOAT_FP16_AVAILABLE #define KERNEL_FLOAT_FP16_AVAILABLE (1) -#endif +#endif // KERNEL_FLOAT_FP16_AVAILABLE #ifndef KERNEL_FLOAT_BF16_AVAILABLE #define 
KERNEL_FLOAT_BF16_AVAILABLE (1)
-#endif
+#endif  // KERNEL_FLOAT_BF16_AVAILABLE
 
 #ifndef KERNEL_FLOAT_FP8_AVAILABLE
 #ifdef __CUDACC_VER_MAJOR__
 #define KERNEL_FLOAT_FP8_AVAILABLE (__CUDACC_VER_MAJOR__ >= 12)
-#else
+#else  // __CUDACC_VER_MAJOR__
 #define KERNEL_FLOAT_FP8_AVAILABLE (0)
-#endif
-#endif
+#endif  // __CUDACC_VER_MAJOR__
+#endif  // KERNEL_FLOAT_FP8_AVAILABLE
 
 #define KERNEL_FLOAT_ASSERT(expr) \
     do { \
@@ -49,4 +49,14 @@
 #define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B)
 #define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__)
 
+// TODO: check whether this approach is supported across all compilers
+#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) && 0
+#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) \
+    static_cast<TYPE*>(__builtin_assume_aligned(static_cast<TYPE*>(PTR), (ALIGNMENT)))
+#else
+#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) (PTR)
+#endif
+
+#define KERNEL_FLOAT_MAX_ALIGNMENT (32)
+
 #endif //KERNEL_FLOAT_MACROS_H
diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h
index 06c1aac..badfcf4 100644
--- a/include/kernel_float/memory.h
+++ b/include/kernel_float/memory.h
@@ -6,30 +6,20 @@
 #include "iterate.h"
 
 namespace kernel_float {
-
 namespace detail {
 template<typename T, size_t N, typename Is = make_index_sequence<N>>
-struct load_impl;
+struct copy_impl;
 
 template<typename T, size_t N, size_t... Is>
-struct load_impl<T, N, index_sequence<Is...>> {
+struct copy_impl<T, N, index_sequence<Is...>> {
     KERNEL_FLOAT_INLINE
-    static vector_storage<T, N> call(const T* input, const size_t* offsets) {
-        return {input[offsets[Is]]...};
+    static vector_storage<T, N> load(const T* input, const size_t* offsets, const bool* mask) {
+        return {(mask[Is] ? input[offsets[Is]] : T {})...};
     }
 
     KERNEL_FLOAT_INLINE
-    static vector_storage<T, N> call(const T* input, const size_t* offsets, const bool* mask) {
-        bool all_valid = true;
-        for (size_t i = 0; i < N; i++) {
-            all_valid &= mask[i];
-        }
-
-        if (all_valid) {
-            return {input[offsets[Is]]...};
-        } else {
-            return {(mask[Is] ? input[offsets[Is]] : T())...};
-        }
+    static void store(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) {
+        ((mask[Is] ? outputs[offsets[Is]] = inputs[Is] : T {}), ...);
     }
 };
 }  // namespace detail
 
 /**
  * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``.
  *
+ * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
+ * be loaded and ``false`` indicates that the value should be skipped. This can be used
+ * to prevent reading out of bounds.
+ *
  * ```
- * // Load 4 elements at data[0], data[2], data[4], data[8]
- * vec<T, 4> values = load(data, make_vec(0, 2, 4, 8));
+ * // Load 2 elements at data[0] and data[8], skip data[2] and data[4]
+ * vec<T, 4> values = read(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
  * ```
  */
-template<typename T, typename I>
-KERNEL_FLOAT_INLINE vector<T, vector_extent_type<I>> load(const T* ptr, const I& indices) {
-    return detail::load_impl<T, vector_extent<I>>::call(ptr, cast<size_t>(indices).data());
+template<typename T, typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
+KERNEL_FLOAT_INLINE vector<T, E> read(const T* ptr, const I& indices, const M& mask = true) {
+    return detail::copy_impl<T, E::size>::load(
+        ptr,
+        convert_storage<size_t>(indices, E()).data(),
+        convert_storage<bool>(mask, E()).data());
 }
 
 /**
- * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``.
+ * Store the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
  *
  * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
- * be loaded and ``false`` indicates that the value should be skipped. This can be used
- * to prevent reading out of bounds.
+ * be stored and ``false`` indicates that the value should be skipped. This can be used
+ * to prevent writing out of bounds.
  *
  * ```
- * // Load 2 elements at data[0] and data[8], skip data[2] and data[4]
- * vec<T, 4> values = load(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
+ * // Store 2 elements at data[0] and data[8], skip data[2] and data[4]
+ * auto values = make_vec(42, 13, 87, 12);
+ * auto mask = make_vec(true, false, false, true);
+ * write(data, make_vec(0, 2, 4, 8), values, mask);
  * ```
  */
-template<typename T, typename I, typename M, typename E = broadcast_vector_extent_type<I, M>>
-KERNEL_FLOAT_INLINE vector<T, E> load(const T* ptr, const I& indices, const M& mask) {
-    static constexpr E new_size = {};
-
-    return detail::load_impl<T, E::size>::call(
+template<
+    typename T,
+    typename V,
+    typename I,
+    typename M = bool,
+    typename E = broadcast_vector_extent_type<V, I, M>>
+KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) {
+    return detail::copy_impl<T, E::size>::store(
         ptr,
-        convert_storage<size_t>(indices, new_size).data(),
-        convert_storage<bool>(mask, new_size).data());
+        convert_storage<T>(values, E()).data(),
+        convert_storage<size_t>(indices, E()).data(),
+        convert_storage<bool>(mask, E()).data());
 }
 
 /**
- * Load ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an
- * ``offset`` can be given that shifts all the indices by a fixed amount.
+ * Load ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``.
  *
  * ```
  * // Load 4 elements at locations data[0], data[1], data[2], data[3]
- * vec<T, 4> values = loadn<4>(data);
+ * vec<T, 4> values = read<4>(data);
  *
  * // Load 4 elements at locations data[10], data[11], data[12], data[13]
- * vec<T, 4> values2 = loadn<4>(data, 10);
+ * vec<T, 4> values2 = read<4>(data + 10);
  * ```
  */
 template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset = 0) {
-    return load(ptr, offset + range<size_t, N>());
+KERNEL_FLOAT_INLINE vector<T, extent<N>> read(const T* ptr) {
+    return read(ptr, range<size_t, N>());
 }
 
 /**
- * Load ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``.
- * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used
- * to prevent reading out of bounds.
+ * Store ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``.
  *
  * ```
- * // Returns {ptr[8], ptr[9], 0, 0};
- * vec<T, 4> values = loadn<4>(data, 8, 10);
+ * // Store 4 elements at locations data[0], data[1], data[2], data[3]
+ * vec<float, 4> values = {1.0f, 2.0f, 3.0f, 4.0f};
+ * write(data, values);
+ *
+ * // Store 4 elements at locations data[10], data[11], data[12], data[13]
+ * write(data + 10, values);
  * ```
  */
-template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset, size_t max_length) {
-    auto indices = offset + range<size_t, N>();
-    return load(ptr, indices, indices < max_length);
+template<typename V, typename T>
+KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) {
+    static constexpr size_t N = vector_extent<V>;
+    write(ptr, range<size_t, N>(), values);
 }
 
 namespace detail {
+KERNEL_FLOAT_INLINE
+constexpr size_t gcd(size_t a, size_t b) {
+    return b == 0 ? a : gcd(b, a % b);
+}
+
+template<typename T, size_t N, size_t alignment>
+struct copy_aligned_impl {
+    static constexpr size_t half = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ?
2 : 1)); + static constexpr size_t new_alignment = gcd(alignment, sizeof(T) * half); -template -struct store_impl> { KERNEL_FLOAT_INLINE - static void call(T* outputs, const T* inputs, const size_t* offsets) { - ((outputs[offsets[Is]] = inputs[Is]), ...); + static void load(T* output, const T* input) { + copy_aligned_impl::load(output, input); + copy_aligned_impl::load(output + half, input + half); } KERNEL_FLOAT_INLINE - static void call(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) { - bool all_valid = true; - for (size_t i = 0; i < N; i++) { - all_valid &= mask[i]; - } - - if (all_valid) { - ((outputs[offsets[Is]] = inputs[Is]), ...); - } else { -#pragma unroll - for (size_t i = 0; i < N; i++) { - if (mask[i]) { - outputs[offsets[i]] = inputs[i]; - } - } - } + static void store(T* output, const T* input) { + copy_aligned_impl::store(output, input); + copy_aligned_impl::store(output + half, input + half); } }; -} // namespace detail -/** - * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``. - * - * ``` - * // Store 4 elements at data[0], data[2], data[4], data[8] - * auto values = make_vec(42, 13, 87, 12); - * store(values, data, make_vec(0, 2, 4, 8)); - * ``` - */ -template< - typename T, - typename V, - typename I, - typename M, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices, const M& mask) { - return detail::store_impl::call( - ptr, - convert_storage(values, E()).data(), - convert_storage(indices, E()).data(), - convert_storage(mask, E()).data()); -} +template +struct copy_aligned_impl { + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) {} + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) {} +}; + +template +struct copy_aligned_impl { + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + output[0] = input[0]; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + output[0] = input[0]; + } +}; + +template +struct copy_aligned_impl { + static constexpr size_t new_alignment = gcd(alignment, 2 * sizeof(T)); + struct alignas(new_alignment) storage_type { + T v0, v1; + }; + + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type {input[0], input[1]}; + } +}; + +template +struct copy_aligned_impl { + static constexpr size_t new_alignment = gcd(alignment, 4 * sizeof(T)); + struct alignas(new_alignment) storage_type { + T v0, v1, v2, v3; + }; + + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = storage.v2; + output[3] = storage.v3; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3]}; + } +}; + +template +struct copy_aligned_impl { + static constexpr size_t new_alignment = gcd(alignment, 8 * sizeof(T)); + struct alignas(new_alignment) storage_type { + T v0, v1, v2, v3, v4, v5, v6, v7; + }; + + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = 
storage.v2; + output[3] = storage.v3; + output[4] = storage.v4; + output[5] = storage.v5; + output[6] = storage.v6; + output[7] = storage.v7; + } + + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3], + input[4], + input[5], + input[6], + input[7]}; + } +}; + +} // namespace detail /** - * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``. + * Load ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. * - * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should - * be store and ``false`` indicates that the value should be skipped. This can be used - * to prevent writing out of bounds. + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. * * ``` - * // Store 2 elements at data[0] and data[8], skip data[2] and data[4] - * auto values = make_vec(42, 13, 87, 12); - * auto mask = make_vec(true, false, false, true); - * store(values, data, make_vec(0, 2, 4, 8), mask); + * // Load 4 elements at locations data[0], data[1], data[2], data[3] + * vec values = read_aligned<4>(data); + * + * // Load 4 elements at locations data[10], data[11], data[12], data[13] + * vec values2 = read_aligned<4>(data + 10); * ``` */ -template> -KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices) { - return detail::store_impl::call( - ptr, - convert_storage(values, E()).data(), - convert_storage(indices, E()).data()); +template +KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { + static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + vector_storage result; + detail::copy_aligned_impl::load( + result.data(), + KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr, alignment)); + return result; } /** - * Store ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an - * ``offset`` can be given that shifts all the indices by a fixed amount. + * Store ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. + * + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. * * ``` * // Store 4 elements at locations data[0], data[1], data[2], data[3] * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; - * storen<4>(values, data); + * write_aligned(data, values); * * // Load 4 elements at locations data[10], data[11], data[12], data[13] - * storen<4>(values, data, 10); + * write_aligned(data + 10, values); * ``` */ -template> -KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset = 0) { - auto indices = offset + range(); - return store(values, ptr, indices); -} - -/** - * Store ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``. - * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used - * to prevent reading out of bounds. - * - * ``` - * // Store 1.0f at data[8] and 2.0f at data[9]. Ignores remaining values. 
- * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; - * storen<4>(values, data, 8, 10); - * ``` - */ -template> -KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t max_length) { - auto indices = offset + range(); - return store(values, ptr, indices, indices < max_length); -} - -/** - * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. - * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! - */ -template -KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) - return static_cast(__builtin_assume_aligned(ptr, alignment)); -#else - return ptr; -#endif +template +KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { + static constexpr size_t N = vector_extent; + static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + + return detail::copy_aligned_impl::store( + KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr, alignment), + convert_storage(values).data()); } /** * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. */ -template +template struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); @@ -242,12 +296,15 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + /** * Return the pointer value. */ KERNEL_FLOAT_INLINE T* get() const { - return unsafe_assume_aligned(ptr_, alignment); + return KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -261,58 +318,111 @@ struct aligned_ptr { } /** - * See ``kernel_float::load`` + * Returns a new pointer that is offset by ``Step * offset`` items + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns a pointer to element `vector[8]` with alignment of 4 + * ptr.offset(8); + * + * // Returns a pointer to element `vector[4]` with alignment of 16 + * ptr.offset<4>(); + * + * // Returns a pointer to element `vector[8]` with alignment of 16 + * ptr.offset<4>(2); + * ``` */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + template + KERNEL_FLOAT_INLINE aligned_ptr + offset(size_t n = 1) const { + return aligned_ptr {get() + Step * n}; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::read`` */ template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE vector> read() const { + vector_storage result; + detail::copy_aligned_impl::load(result.data(), get()); + return result; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::write`` */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); + template + KERNEL_FLOAT_INLINE void write(const V& values) const { + constexpr size_t N = vector_extent; + return detail::copy_aligned_impl::store( + get(), + convert_storage(values).data()); } /** - * See ``kernel_float::store`` + * See ``kernel_float::read`` */ - template> - KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { - ::kernel_float::store(values, get(), indices, 
mask); + template> + KERNEL_FLOAT_INLINE vector read(const I& indices, const M& mask = true) { + return ::kernel_float::read(get(), indices, mask); } + /** - * See ``kernel_float::storen`` + * See ``kernel_float::write`` */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { - ::kernel_float::storen(values, get(), offset); + template + KERNEL_FLOAT_INLINE void write(const I& indices, const V& values, const M& mask = true) { + return ::kernel_float::write(get(), indices, values, mask); } + + /** + * Offsets the pointer by `Step * offset` items and then read the subsequent `N` items. + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` + */ + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); + } + /** - * See ``kernel_float::storen`` + * Offsets the pointer by `Step * offset` items and then writes the subsequent `N` items. + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * vec values = {1, 2, 3, 4}; + * + * // Writes to vector[40], vector[41], vector[42], vector[43] + * ptr.write_at<4>(10, values); + * + * // Returns vector[20], vector[21], vector[22], vector[23] + * ptr.write_at<2>(10, values); + * ``` */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const { - ::kernel_float::storen(values, get(), offset, max_length); + template + KERNEL_FLOAT_INLINE void write_at(size_t offset, const V& values) const { + return this->offset(offset).template write(values); } private: T* ptr_ = nullptr; }; -/** - * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes. 
- */ template struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); @@ -337,7 +447,7 @@ struct aligned_ptr { */ KERNEL_FLOAT_INLINE const T* get() const { - return unsafe_assume_aligned(ptr_, alignment); + return KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -351,36 +461,71 @@ struct aligned_ptr { } /** - * See ``kernel_float::load`` + * Returns a new pointer that is offset by ``Step * offset`` items + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns a pointer to element `vector[8]` with alignment of 4 + * ptr.offset(8); + * + * // Returns a pointer to element `vector[4]` with alignment of 16 + * ptr.offset<4>(); + * + * // Returns a pointer to element `vector[8]` with alignment of 16 + * ptr.offset<4>(2); + * ``` */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + template + KERNEL_FLOAT_INLINE aligned_ptr + offset(size_t n = 1) const { + return aligned_ptr {get() + Step * n}; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::read`` */ template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE vector> read() const { + vector_storage result; + detail::copy_aligned_impl::load(result.data(), get()); + return result; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::write`` */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); + template> + KERNEL_FLOAT_INLINE vector read(const I& indices, const M& mask = true) { + return ::kernel_float::read(get(), indices, mask); + } + + /** + * Offsets the pointer by `Step * offset` items and then read the subsequent `N` items. 
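+     * This is equivalent to ``offset<Step>(offset)`` followed by ``read<N>()``.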
+ * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` + */ + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); } private: const T* ptr_ = nullptr; }; -template -aligned_ptr(T*) -> aligned_ptr; - } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H diff --git a/tests/common.h b/tests/common.h index dac12bb..ff8072d 100644 --- a/tests/common.h +++ b/tests/common.h @@ -11,7 +11,8 @@ namespace kf = kernel_float; namespace detail { -static __host__ __device__ void __assertion_failed(const char* expr, const char* file, int line) { +__attribute__((noinline)) static __host__ __device__ void +__assertion_failed(const char* expr, const char* file, int line) { #ifndef __CUDA_ARCH__ std::string msg = "assertion failed: " + std::string(expr) + " (" + file + ":" + std::to_string(line) + ")"; diff --git a/tests/memory.cu b/tests/memory.cu index 6fdc90d..ac29b46 100644 --- a/tests/memory.cu +++ b/tests/memory.cu @@ -7,44 +7,37 @@ struct load_test { { auto expected = kf::make_vec(T(3.0), T(2.0), T(7.0)); - auto output = kf::load(data, kf::make_vec(3, 2, 7)); + auto output = kf::read(data, kf::make_vec(3, 2, 7)); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(3.0), T(2.0), T(7.0)); - auto output = kf::load(data, kf::make_vec(3, 2, 7), kf::make_vec(true, true, true)); + auto output = kf::read(data, kf::make_vec(3, 2, 7), kf::make_vec(true, true, true)); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(3.0), T(), T(7.0)); - auto output = kf::load(data, kf::make_vec(3, 100, 7), kf::make_vec(true, false, true)); + auto output = kf::read(data, kf::make_vec(3, 100, 7), kf::make_vec(true, false, true)); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(0.0), T(1.0), T(2.0)); - auto output = kf::loadn<3>(data); + auto output = kf::read<3>(data); ASSERT_EQ(expected, output); } { auto expected = kf::make_vec(T(2.0), T(3.0), T(4.0)); - auto output = kf::loadn<3>(data, 2); - ASSERT_EQ(expected, output); - } - - { - auto expected = kf::make_vec(T(6.0), T(7.0), T()); - auto output = kf::loadn<3>(data, 6, 8); + auto output = kf::read<3>(data + 2); ASSERT_EQ(expected, output); } } }; -REGISTER_TEST_CASE("load", load_test, int, float, double) -REGISTER_TEST_CASE_GPU("load", load_test, __half, __nv_bfloat16) +REGISTER_TEST_CASE("load", load_test, int, float, double, __half, __nv_bfloat16) struct store_test { template @@ -53,7 +46,7 @@ struct store_test { T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; auto values = kf::make_vec(T(100.0), T(200.0)); auto offsets = kf::make_vec(1, 3); - kf::store(values, data, offsets); + kf::write(data, offsets, values); ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(2.0)); @@ -65,7 +58,7 @@ struct store_test { auto values = kf::make_vec(T(100.0), T(200.0)); auto offsets = kf::make_vec(1, 3); auto mask = kf::make_vec(true, true); - kf::store(values, data, offsets, mask); + kf::write(data, offsets, values, mask); ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(2.0)); @@ -77,7 +70,7 @@ struct store_test { auto values = kf::make_vec(T(100.0), T(200.0)); auto offsets = kf::make_vec(1, 3); auto mask = kf::make_vec(true, false); - kf::store(values, data, offsets, mask); + kf::write(data, offsets, values, mask); 
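+            // With mask = {true, false}, only data[1] is written; data[3] keeps its old value.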
ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(2.0)); @@ -87,7 +80,7 @@ struct store_test { { T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data); + kf::write(data, values); ASSERT_EQ(data[0], T(100.0)); ASSERT_EQ(data[1], T(200.0)); ASSERT_EQ(data[2], T(2.0)); @@ -97,37 +90,16 @@ struct store_test { { T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data, 1); + kf::write(data + 1, values); ASSERT_EQ(data[0], T(0.0)); ASSERT_EQ(data[1], T(100.0)); ASSERT_EQ(data[2], T(200.0)); ASSERT_EQ(data[3], T(3.0)); } - - { - T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; - auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data, 1, 4); - ASSERT_EQ(data[0], T(0.0)); - ASSERT_EQ(data[1], T(100.0)); - ASSERT_EQ(data[2], T(200.0)); - ASSERT_EQ(data[3], T(3.0)); - } - - { - T data[4] = {T(0.0), T(1.0), T(2.0), T(3.0)}; - auto values = kf::make_vec(T(100.0), T(200.0)); - kf::storen(values, data, 3, 4); - ASSERT_EQ(data[0], T(0.0)); - ASSERT_EQ(data[1], T(1.0)); - ASSERT_EQ(data[2], T(2.0)); - ASSERT_EQ(data[3], T(100.0)); - } } }; -REGISTER_TEST_CASE("store", store_test, int, float, double) -REGISTER_TEST_CASE_GPU("store", store_test, __half, __nv_bfloat16) +REGISTER_TEST_CASE("store", store_test, int, float, double, __half, __nv_bfloat16) struct assign_conversion_test { template @@ -141,5 +113,30 @@ struct assign_conversion_test { } }; -REGISTER_TEST_CASE("assign conversion", assign_conversion_test, int, float, double) -REGISTER_TEST_CASE_GPU("assign conversion", assign_conversion_test, __half, __nv_bfloat16) \ No newline at end of file +REGISTER_TEST_CASE( + "assign conversion", + assign_conversion_test, + int, + float, + double, + __half, + __nv_bfloat16) + +struct aligned_ptr_test { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + struct alignas(32) storage_type { + T data[N]; + }; + + storage_type input = {T(double(I))...}; + storage_type output = {T(double(I * 0))...}; + + auto v = kf::read_aligned(input.data); + kf::write_aligned(output.data, v); + + ASSERT_EQ_ALL(output.data[I], T(double(I))); + } +}; + +REGISTER_TEST_CASE("aligned pointer", aligned_ptr_test, int, float, double, __half, __nv_bfloat16) \ No newline at end of file From ed7143d9b83a89f043191f4957760081bda60b4d Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 23 Oct 2023 10:39:30 +0200 Subject: [PATCH 12/16] Change function signature of `read/write_aligned` --- examples/vector_add/main.cu | 2 +- include/kernel_float/memory.h | 42 ++- single_include/kernel_float.h | 673 +++++++++++++++++----------------- tests/memory.cu | 10 +- 4 files changed, 363 insertions(+), 364 deletions(-) diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 5d10239..7b88292 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -19,7 +19,7 @@ __global__ void my_kernel(int length, const __half* input, double constant, floa if (i * N < length) { auto a = kf::read_aligned(input + i * N); auto b = (a * a) * constant; - kf::write_aligned(output + i * N, b); + kf::write_aligned(output + i * N, b); } } diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index badfcf4..d75cddf 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -112,21 +112,21 @@ constexpr size_t gcd(size_t a, size_t b) { return b == 0 ? 
a : gcd(b, a % b); } -template +template struct copy_aligned_impl { - static constexpr size_t half = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ? 2 : 1)); - static constexpr size_t new_alignment = gcd(alignment, sizeof(T) * half); + static constexpr size_t K = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ? 2 : 1)); + static constexpr size_t alignment_K = gcd(alignment, sizeof(T) * K); KERNEL_FLOAT_INLINE static void load(T* output, const T* input) { - copy_aligned_impl::load(output, input); - copy_aligned_impl::load(output + half, input + half); + copy_aligned_impl::load(output, input); + copy_aligned_impl::load(output + K, input + K); } KERNEL_FLOAT_INLINE static void store(T* output, const T* input) { - copy_aligned_impl::store(output, input); - copy_aligned_impl::store(output + half, input + half); + copy_aligned_impl::store(output, input); + copy_aligned_impl::store(output + K, input + K); } }; @@ -141,6 +141,8 @@ struct copy_aligned_impl { template struct copy_aligned_impl { + using storage_type = T; + KERNEL_FLOAT_INLINE static void load(T* output, const T* input) { output[0] = input[0]; @@ -153,9 +155,9 @@ struct copy_aligned_impl { }; template -struct copy_aligned_impl { - static constexpr size_t new_alignment = gcd(alignment, 2 * sizeof(T)); - struct alignas(new_alignment) storage_type { +struct copy_aligned_impl sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 2 * sizeof(T)); + struct alignas(storage_alignment) storage_type { T v0, v1; }; @@ -173,9 +175,9 @@ struct copy_aligned_impl { }; template -struct copy_aligned_impl { - static constexpr size_t new_alignment = gcd(alignment, 4 * sizeof(T)); - struct alignas(new_alignment) storage_type { +struct copy_aligned_impl 2 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 4 * sizeof(T)); + struct alignas(storage_alignment) storage_type { T v0, v1, v2, v3; }; @@ -199,9 +201,9 @@ struct copy_aligned_impl { }; template -struct copy_aligned_impl { - static constexpr size_t new_alignment = gcd(alignment, 8 * sizeof(T)); - struct alignas(new_alignment) storage_type { +struct copy_aligned_impl 4 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 8 * sizeof(T)); + struct alignas(storage_alignment) storage_type { T v0, v1, v2, v3, v4, v5, v6, v7; }; @@ -248,9 +250,9 @@ struct copy_aligned_impl { * vec values2 = read_aligned<4>(data + 10); * ``` */ -template +template KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { - static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); vector_storage result; detail::copy_aligned_impl::load( result.data(), @@ -273,10 +275,10 @@ KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { * write_aligned(data + 10, values); * ``` */ -template +template KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { static constexpr size_t N = vector_extent; - static constexpr size_t alignment = detail::gcd(N * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); return detail::copy_aligned_impl::store( KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr, alignment), diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 313cc85..c849e90 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,13 +16,8 @@ //================================================================================ // this 
file has been auto-generated, do not modify its contents! -<<<<<<< HEAD -// date: 2023-10-13 14:55:52.284209 -// git hash: 3da5ba08788e4d89a1b20b6a12bb4ba0f8de6b40 -======= -// date: 2023-10-11 15:46:04.149164 -// git hash: b1f6c1b73c2212223b10142054a28806f56b5ee6 ->>>>>>> 9bf416c (Update single_include) +// date: 2023-10-23 10:41:33.267957 +// git hash: 5844245b6d2d679f23d9ccc9693a3274b75917f9 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -36,35 +31,35 @@ #define KERNEL_FLOAT_IS_DEVICE (1) #define KERNEL_FLOAT_IS_HOST (0) #define KERNEL_FLOAT_CUDA_ARCH (__CUDA_ARCH__) -#else +#else // __CUDA_ARCH__ #define KERNEL_FLOAT_INLINE __forceinline__ __host__ #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif -#else +#endif // __CUDA_ARCH__ +#else // __CUDACC__ #define KERNEL_FLOAT_INLINE inline #define KERNEL_FLOAT_CUDA (0) #define KERNEL_FLOAT_IS_HOST (1) #define KERNEL_FLOAT_IS_DEVICE (0) #define KERNEL_FLOAT_CUDA_ARCH (0) -#endif +#endif // __CUDACC__ #ifndef KERNEL_FLOAT_FP16_AVAILABLE #define KERNEL_FLOAT_FP16_AVAILABLE (1) -#endif +#endif // KERNEL_FLOAT_FP16_AVAILABLE #ifndef KERNEL_FLOAT_BF16_AVAILABLE #define KERNEL_FLOAT_BF16_AVAILABLE (1) -#endif +#endif // KERNEL_FLOAT_BF16_AVAILABLE #ifndef KERNEL_FLOAT_FP8_AVAILABLE #ifdef __CUDACC_VER_MAJOR__ #define KERNEL_FLOAT_FP8_AVAILABLE (__CUDACC_VER_MAJOR__ >= 12) -#else +#else // __CUDACC_VER_MAJOR__ #define KERNEL_FLOAT_FP8_AVAILABLE (0) -#endif -#endif +#endif // __CUDACC_VER_MAJOR__ +#endif // KERNEL_FLOAT_FP8_AVAILABLE #define KERNEL_FLOAT_ASSERT(expr) \ do { \ @@ -76,6 +71,16 @@ #define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B) #define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__) +// TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) && 0 +#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) \ + static_cast(__builtin_assume_aligned(static_cast(PTR), (ALIGNMENT))) +#else +#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) (PTR) +#endif + +#define KERNEL_FLOAT_MAX_ALIGNMENT (32) + #endif //KERNEL_FLOAT_MACROS_H #ifndef KERNEL_FLOAT_CORE_H #define KERNEL_FLOAT_CORE_H @@ -855,7 +860,6 @@ struct promote_type> { } // namespace kernel_float #endif - #ifndef KERNEL_FLOAT_UNOPS_H #define KERNEL_FLOAT_UNOPS_H @@ -2277,47 +2281,24 @@ KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... in namespace kernel_float { - namespace detail { template> -struct load_impl; +struct copy_impl; template -struct load_impl> { +struct copy_impl> { KERNEL_FLOAT_INLINE - static vector_storage call(const T* input, const size_t* offsets) { - return {input[offsets[Is]]...}; + static vector_storage load(const T* input, const size_t* offsets, const bool* mask) { + return {(mask[Is] ? input[offsets[Is]] : T {})...}; } KERNEL_FLOAT_INLINE - static vector_storage call(const T* input, const size_t* offsets, const bool* mask) { - bool all_valid = true; - for (size_t i = 0; i < N; i++) { - all_valid &= mask[i]; - } - - if (all_valid) { - return {input[offsets[Is]]...}; - } else { - return {(mask[Is] ? input[offsets[Is]] : T())...}; - } + static void store(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) { + ((mask[Is] ? outputs[offsets[Is]] = inputs[Is] : T {}), ...); } }; } // namespace detail -/** - * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``. 
- *
- * ```
- * // Load 4 elements at data[0], data[2], data[4], data[8]
- * vec<T, 4> values = load(data, make_vec(0, 2, 4, 8));
- * ```
- */
-template<typename T, typename I>
-KERNEL_FLOAT_INLINE vector<T, vector_extent_type<I>> load(const T* ptr, const I& indices) {
    return detail::load_impl<T, vector_extent<I>>::call(ptr, cast<size_t>(indices).data());
-}
-
 /**
 * Load the elements from the buffer ``ptr`` at the locations specified by ``indices``.
 *
@@ -2327,101 +2308,39 @@ KERNEL_FLOAT_INLINE vector<T, vector_extent_type<I>> load(const T* ptr, const I&
 *
 * ```
 * // Load 2 elements at data[0] and data[8], skip data[2] and data[4]
- * vec<T, 4> values = = load(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
+ * vec<T, 4> values = read(data, make_vec(0, 2, 4, 8), make_vec(true, false, false, true));
 * ```
 */
-template<typename T, typename I, typename M, typename E = broadcast_vector_extent_type<I, M>>
-KERNEL_FLOAT_INLINE vector<T, E> load(const T* ptr, const I& indices, const M& mask) {
-    static constexpr E new_size = {};
-
-    return detail::load_impl<T, E::value>::call(
+template<typename T, typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
+KERNEL_FLOAT_INLINE vector<T, E> read(const T* ptr, const I& indices, const M& mask = true) {
+    return detail::copy_impl<T, E::value>::load(
        ptr,
-        convert_storage<size_t>(indices, new_size).data(),
-        convert_storage<bool>(mask, new_size).data());
-}
-
-/**
- * Load ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an
- * ``offset`` can be given that shifts all the indices by a fixed amount.
- *
- * ```
- * // Load 4 elements at locations data[0], data[1], data[2], data[3]
- * vec<T, 4> values = loadn<4>(data);
- *
- * // Load 4 elements at locations data[10], data[11], data[12], data[13]
- * vec<T, 4> values2 = loadn<4>(data, 10);
- * ```
- */
-template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset = 0) {
-    return load(ptr, offset + range<size_t, N>());
+        convert_storage<size_t>(indices, E()).data(),
+        convert_storage<bool>(mask, E()).data());
 }
 
 /**
- * Load ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``.
- * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used
- * to prevent reading out of bounds.
+ * Store the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
 *
- * ```
- * // Returns {ptr[8], ptr[9], 0, 0};
- * vec<T, 4> values = loadn<4>(data, 8, 10);
- * ```
- */
-template<size_t N, typename T>
-KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(const T* ptr, size_t offset, size_t max_length) {
-    auto indices = offset + range<size_t, N>();
-    return load(ptr, indices, indices < max_length);
-}
-
-namespace detail {
-template<typename T, size_t N, typename Is = make_index_sequence<N>>
-struct store_impl;
-
-template<typename T, size_t N, size_t... Is>
-struct store_impl<T, N, index_sequence<Is...>> {
-    KERNEL_FLOAT_INLINE
-    static void call(T* outputs, const T* inputs, const size_t* offsets) {
-        ((outputs[offsets[Is]] = inputs[Is]), ...);
-    }
-
-    KERNEL_FLOAT_INLINE
-    static void call(T* outputs, const T* inputs, const size_t* offsets, const bool* mask) {
-        bool all_valid = true;
-        for (size_t i = 0; i < N; i++) {
-            all_valid &= mask[i];
-        }
-
-        if (all_valid) {
-            ((outputs[offsets[Is]] = inputs[Is]), ...);
-        } else {
-#pragma unroll
-            for (size_t i = 0; i < N; i++) {
-                if (mask[i]) {
-                    outputs[offsets[i]] = inputs[i];
-                }
-            }
-        }
-    }
-};
-} // namespace detail
-
-/**
- * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
+ * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
+ * be stored and ``false`` indicates that the value should be skipped. This can be used
+ * to prevent writing out of bounds.
 *
 * ```
- * // Store 4 elements at data[0], data[2], data[4], data[8]
+ * // Store 2 elements at data[0] and data[8], skip data[2] and data[4]
 * auto values = make_vec(42, 13, 87, 12);
- * store(values, data, make_vec(0, 2, 4, 8));
+ * auto mask = make_vec(true, false, false, true);
+ * write(data, make_vec(0, 2, 4, 8), values, mask);
 * ```
 */
 template<
     typename T,
     typename V,
     typename I,
-    typename M,
+    typename M = bool,
     typename E = broadcast_vector_extent_type<V, I, M>>
-KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices, const M& mask) {
-    return detail::store_impl<T, E::value>::call(
+KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) {
+    return detail::copy_impl<T, E::value>::store(
        ptr,
        convert_storage<T>(values, E()).data(),
        convert_storage<size_t>(indices, E()).data(),
@@ -2429,227 +2348,222 @@ KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices, const
 }
 
 /**
- * Load the elements from the vector `values` in the buffer ``ptr`` at the locations specified by ``indices``.
- *
- * The ``mask`` should be a vector of booleans where ``true`` indicates that the value should
- * be store and ``false`` indicates that the value should be skipped. This can be used
- * to prevent writing out of bounds.
+ * Load ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``.
 *
 * ```
- * // Store 2 elements at data[0] and data[8], skip data[2] and data[4]
- * auto values = make_vec(42, 13, 87, 12);
- * auto mask = make_vec(true, false, false, true);
- * store(values, data, make_vec(0, 2, 4, 8), mask);
+ * // Load 4 elements at locations data[0], data[1], data[2], data[3]
+ * vec<T, 4> values = read<4>(data);
+ *
+ * // Load 4 elements at locations data[10], data[11], data[12], data[13]
+ * vec<T, 4> values2 = read<4>(data + 10);
 * ```
 */
-template<typename T, typename V, typename I, typename E = broadcast_vector_extent_type<V, I>>
-KERNEL_FLOAT_INLINE void store(const V& values, T* ptr, const I& indices) {
-    return detail::store_impl<T, E::value>::call(
-        ptr,
-        convert_storage<T>(values, E()).data(),
-        convert_storage<size_t>(indices, E()).data());
+template<size_t N, typename T>
+KERNEL_FLOAT_INLINE vector<T, extent<N>> read(const T* ptr) {
+    return read(ptr, range<size_t, N>());
 }
 
 /**
- * Store ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``. Optionally, an
- * ``offset`` can be given that shifts all the indices by a fixed amount.
+ * Store ``N`` elements at the location ``ptr[0], ptr[1], ptr[2], ...``.
 *
 * ```
 * // Store 4 elements at locations data[0], data[1], data[2], data[3]
 * vec<float, 4> values = {1.0f, 2.0f, 3.0f, 4.0f};
- * storen<4>(values, data);
+ * write(data, values);
 *
- * // Load 4 elements at locations data[10], data[11], data[12], data[13]
- * storen<4>(values, data, 10);
+ * // Store 4 elements at locations data[10], data[11], data[12], data[13]
+ * write(data + 10, values);
 * ```
 */
-template<typename V, typename T, typename E = broadcast_vector_extent_type<V>>
-KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset = 0) {
-    auto indices = offset + range<size_t, E::value>();
-    return store(values, ptr, indices);
+template<typename V, typename T>
+KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) {
+    static constexpr size_t N = vector_extent<V>;
+    write(ptr, range<size_t, N>(), values);
 }
 
-/**
- * Store ``N`` elements at the location ``ptr[offset+0], ptr[offset+1], ptr[offset+2], ...``.
- * Locations for which the index equals or exceeds ``max_length`` are ignored. This can be used
- * to prevent reading out of bounds.
- *
- * ```
- * // Store 1.0f at data[8] and 2.0f at data[9]. Ignores remaining values.
- * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; - * storen<4>(values, data, 8, 10); - * ``` - */ -template> -KERNEL_FLOAT_INLINE void storen(const V& values, T* ptr, size_t offset, size_t max_length) { - auto indices = offset + range(); - return store(values, ptr, indices, indices < max_length); +namespace detail { +KERNEL_FLOAT_INLINE +constexpr size_t gcd(size_t a, size_t b) { + return b == 0 ? a : gcd(b, a % b); } -<<<<<<< HEAD -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (__builtin_assume_aligned(ptr, alignment)) -#else -#define KERNEL_FLOAT_ASSUME_ALIGNED(ptr, alignment) (ptr) -#endif +template +struct copy_aligned_impl { + static constexpr size_t K = N > 8 ? 8 : (N > 4 ? 4 : (N > 2 ? 2 : 1)); + static constexpr size_t alignment_K = gcd(alignment, sizeof(T) * K); -template -struct AssignConversionProxy { KERNEL_FLOAT_INLINE - explicit AssignConversionProxy(T* ptr) : ptr_(ptr) {} - - template - KERNEL_FLOAT_INLINE AssignConversionProxy& operator=(U&& values) { - auto indices = range(); - detail::store_impl::call( - ptr_, - convert_storage(std::forward(values)).data(), - indices.data()); - - return *this; + static void load(T* output, const T* input) { + copy_aligned_impl::load(output, input); + copy_aligned_impl::load(output + K, input + K); } - private: - T* ptr_; + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + copy_aligned_impl::store(output, input); + copy_aligned_impl::store(output + K, input + K); + } }; -/** - * Takes a reference to a vector and returns a special proxy object that automatically performs the correct conversion - * when a vector of a different element type is assigned. This is useful to perform implicit type conversions. - * - * For example, let assume that a line like `x = expression;` would not compile since `x` and `expressions` are - * vectors of different element types. Then it is possible to use `cast_to(x) = expression;` to fix this error, - * which possibly introduces a type conversion. - * - * Example - * ======= - * ``` - * vec x; - * vec y = {1.0, 2.0}; - * cast_to(x) = y; // normally, the line `x = y;` would not compile, but `cast_to` make this possible - * ``` - */ -template -KERNEL_FLOAT_INLINE AssignConversionProxy cast_to(vector& input) { - return AssignConversionProxy(input.data()); -} - -/** - * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. - */ -template -struct aligned_ptr { - static_assert(alignment >= alignof(T), "invalid alignment"); - +template +struct copy_aligned_impl { KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + static void load(T* output, const T* input) {} KERNEL_FLOAT_INLINE - explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + static void store(T* output, const T* input) {} +}; + +template +struct copy_aligned_impl { + using storage_type = T; - /** - * Return the pointer value. 
- */ KERNEL_FLOAT_INLINE - T* get() const { - return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment); + static void load(T* output, const T* input) { + output[0] = input[0]; } KERNEL_FLOAT_INLINE - operator T*() const { - return get(); + static void store(T* output, const T* input) { + output[0] = input[0]; } +}; - template - KERNEL_FLOAT_INLINE T& operator[](I&& index) const { - return get()[std::forward(index)]; - } +template +struct copy_aligned_impl sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 2 * sizeof(T)); + struct alignas(storage_alignment) storage_type { + T v0, v1; + }; - /** - * See ``kernel_float::load`` - */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; } - /** - * See ``kernel_float::loadn`` - */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type {input[0], input[1]}; } +}; - /** - * See ``kernel_float::loadn`` - */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); - } +template +struct copy_aligned_impl 2 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 4 * sizeof(T)); + struct alignas(storage_alignment) storage_type { + T v0, v1, v2, v3; + }; - /** - * See ``kernel_float::store`` - */ - template> - KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { - ::kernel_float::store(values, get(), indices, mask); - } - /** - * See ``kernel_float::storen`` - */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { - ::kernel_float::storen(values, get(), offset); - } - /** - * See ``kernel_float::storen`` - */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const { - ::kernel_float::storen(values, get(), offset, max_length); + KERNEL_FLOAT_INLINE + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = storage.v2; + output[3] = storage.v3; } - private: - T* ptr_ = nullptr; + KERNEL_FLOAT_INLINE + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3]}; + } }; -/** - * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes. 
- */ template -struct aligned_ptr { - static_assert(alignment >= alignof(T), "invalid alignment"); +struct copy_aligned_impl 4 * sizeof(T))>> { + static constexpr size_t storage_alignment = gcd(alignment, 8 * sizeof(T)); + struct alignas(storage_alignment) storage_type { + T v0, v1, v2, v3, v4, v5, v6, v7; + }; KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + static void load(T* output, const T* input) { + storage_type storage = *reinterpret_cast(input); + output[0] = storage.v0; + output[1] = storage.v1; + output[2] = storage.v2; + output[3] = storage.v3; + output[4] = storage.v4; + output[5] = storage.v5; + output[6] = storage.v6; + output[7] = storage.v7; + } KERNEL_FLOAT_INLINE - explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + static void store(T* output, const T* input) { + *reinterpret_cast(output) = storage_type { + input[0], // + input[1], + input[2], + input[3], + input[4], + input[5], + input[6], + input[7]}; + } +}; + +} // namespace detail - KERNEL_FLOAT_INLINE - explicit aligned_ptr(const T* ptr) : ptr_(ptr) {} -======= /** - * Returns the original pointer ``ptr`` and hints to the compiler that this pointer is aligned to ``alignment`` bytes. - * If this is not actually the case, compiler optimizations will break things and generate invalid code. Be careful! + * Load ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. + * + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. + * + * ``` + * // Load 4 elements at locations data[0], data[1], data[2], data[3] + * vec values = read_aligned<4>(data); + * + * // Load 4 elements at locations data[10], data[11], data[12], data[13] + * vec values2 = read_aligned<4>(data + 10); + * ``` */ -template -KERNEL_FLOAT_INLINE T* unsafe_assume_aligned(T* ptr, size_t alignment) { -// TOOD: check if this way is support across all compilers -#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) - return static_cast(__builtin_assume_aligned(ptr, alignment)); -#else - return ptr; -#endif +template +KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + vector_storage result; + detail::copy_aligned_impl::load( + result.data(), + KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr, alignment)); + return result; +} + +/** + * Store ``N`` elements at the locations ``ptr[0], ptr[1], ptr[2], ...``. + * + * It is assumed that ``ptr`` is maximum aligned such that all ``N`` elements can be loaded at once using a vector + * operation. If the pointer is not aligned, undefined behavior will occur. + * + * ``` + * // Store 4 elements at locations data[0], data[1], data[2], data[3] + * vec values = {1.0f, 2.0f, 3.0f, 4.0f}; + * write_aligned(data, values); + * + * // Load 4 elements at locations data[10], data[11], data[12], data[13] + * write_aligned(data + 10, values); + * ``` + */ +template +KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { + static constexpr size_t N = vector_extent; + static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); + + return detail::copy_aligned_impl::store( + KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr, alignment), + convert_storage(values).data()); } /** * Represents a pointer of type ``T*`` that is guaranteed to be aligned to ``alignment`` bytes. 
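 *
 * A brief usage sketch (illustrative only; it assumes a 32-byte aligned buffer and uses
 * the ``read``/``write_at`` members defined below):
 * ```
 * alignas(32) float buffer[8];
 * aligned_ptr<float, 32> ptr {buffer};
 * vec<float, 4> values = ptr.read<4>();  // loads buffer[0]...buffer[3]
 * ptr.write_at<4>(1, values);            // stores to buffer[4]...buffer[7]
 * ```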
*/ -template +template struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); @@ -2659,12 +2573,15 @@ struct aligned_ptr { KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} + KERNEL_FLOAT_INLINE + aligned_ptr(const aligned_ptr& ptr) : ptr_(ptr.get()) {} + /** * Return the pointer value. */ KERNEL_FLOAT_INLINE T* get() const { - return unsafe_assume_aligned(ptr_, alignment); + return KERNEL_FLOAT_ASSUME_ALIGNED(T, ptr_, alignment); } KERNEL_FLOAT_INLINE @@ -2678,58 +2595,111 @@ struct aligned_ptr { } /** - * See ``kernel_float::load`` + * Returns a new pointer that is offset by ``Step * offset`` items + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns a pointer to element `vector[8]` with alignment of 4 + * ptr.offset(8); + * + * // Returns a pointer to element `vector[4]` with alignment of 16 + * ptr.offset<4>(); + * + * // Returns a pointer to element `vector[8]` with alignment of 16 + * ptr.offset<4>(2); + * ``` */ - template> - KERNEL_FLOAT_INLINE vector load(const I& indices, const M& mask = true) const { - return ::kernel_float::load(get(), indices, mask); + template + KERNEL_FLOAT_INLINE aligned_ptr + offset(size_t n = 1) const { + return aligned_ptr {get() + Step * n}; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::read`` */ template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset = 0) const { - return ::kernel_float::loadn(get(), offset); + KERNEL_FLOAT_INLINE vector> read() const { + vector_storage result; + detail::copy_aligned_impl::load(result.data(), get()); + return result; } /** - * See ``kernel_float::loadn`` + * See ``kernel_float::write`` */ - template - KERNEL_FLOAT_INLINE vector> loadn(size_t offset, size_t max_length) const { - return ::kernel_float::loadn(get(), offset, max_length); + template + KERNEL_FLOAT_INLINE void write(const V& values) const { + constexpr size_t N = vector_extent; + return detail::copy_aligned_impl::store( + get(), + convert_storage(values).data()); } /** - * See ``kernel_float::store`` + * See ``kernel_float::read`` */ - template> - KERNEL_FLOAT_INLINE void store(const V& values, const I& indices, const M& mask = true) const { - ::kernel_float::store(values, get(), indices, mask); + template> + KERNEL_FLOAT_INLINE vector read(const I& indices, const M& mask = true) { + return ::kernel_float::read(get(), indices, mask); } + /** - * See ``kernel_float::storen`` + * See ``kernel_float::write`` + */ + template + KERNEL_FLOAT_INLINE void write(const I& indices, const V& values, const M& mask = true) { + return ::kernel_float::write(get(), indices, values, mask); + } + + /** + * Offsets the pointer by `Step * offset` items and then read the subsequent `N` items. + * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` */ - template> - KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset = 0) const { - ::kernel_float::storen(values, get(), offset); + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); } + /** - * See ``kernel_float::storen`` + * Offsets the pointer by `Step * offset` items and then writes the subsequent `N` items. 
+     *
+     * Example
+     * =======
+     * ```
+     * aligned_ptr<float> ptr = vector.data();
+     * vec<float, 4> values = {1, 2, 3, 4};
+     *
+     * // Writes to vector[40], vector[41], vector[42], vector[43]
+     * ptr.write_at<4>(10, values);
+     *
+     * // Writes to vector[20], vector[21], vector[22], vector[23]
+     * ptr.write_at<2>(10, values);
+     * ```
     */
-    template<typename V, typename E = broadcast_vector_extent_type<V>>
-    KERNEL_FLOAT_INLINE void storen(const V& values, size_t offset, size_t max_length) const {
-        ::kernel_float::storen(values, get(), offset, max_length);
+    template<size_t Step = 1, typename V>
+    KERNEL_FLOAT_INLINE void write_at(size_t offset, const V& values) const {
+        return this->offset<Step>(offset).template write<V>(values);
    }

  private:
    T* ptr_ = nullptr;
 };
 
-/**
- * Represents a pointer of type ``const T*`` that is guaranteed to be aligned to ``alignment`` bytes.
- */
 template<typename T, size_t alignment>
 struct aligned_ptr<const T, alignment> {
    static_assert(alignment >= alignof(T), "invalid alignment");
@@ -2748,18 +2718,13 @@ struct aligned_ptr<const T, alignment> {
    KERNEL_FLOAT_INLINE
    aligned_ptr(const aligned_ptr<T, alignment>& ptr) : ptr_(ptr.get()) {}

->>>>>>> 9bf416c (Update single_include)

    /**
     * Return the pointer value.
     */
    KERNEL_FLOAT_INLINE
    const T* get() const {
-<<<<<<< HEAD
-        return KERNEL_FLOAT_ASSUME_ALIGNED(ptr_, alignment);
-=======
-        return unsafe_assume_aligned(ptr_, alignment);
->>>>>>> 9bf416c (Update single_include)
+        return KERNEL_FLOAT_ASSUME_ALIGNED(const T, ptr_, alignment);
    }

    KERNEL_FLOAT_INLINE
@@ -2773,39 +2738,71 @@ struct aligned_ptr<const T, alignment> {
    }

    /**
-     * See ``kernel_float::load``
+     * Returns a new pointer that is offset by ``Step * offset`` items.
+     *
+     * Example
+     * =======
+     * ```
+     * aligned_ptr<const float> ptr = vector.data();
+     *
+     * // Returns a pointer to element `vector[8]` with alignment of 4
+     * ptr.offset(8);
+     *
+     * // Returns a pointer to element `vector[4]` with alignment of 16
+     * ptr.offset<4>();
+     *
+     * // Returns a pointer to element `vector[8]` with alignment of 16
+     * ptr.offset<4>(2);
+     * ```
     */
-    template<typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
-    KERNEL_FLOAT_INLINE vector<T, E> load(const I& indices, const M& mask = true) const {
-        return ::kernel_float::load(get(), indices, mask);
+    template<size_t Step = 1>
+    KERNEL_FLOAT_INLINE aligned_ptr<const T, detail::gcd(Step * sizeof(T), alignment)>
+    offset(size_t n = 1) const {
+        return aligned_ptr<const T, detail::gcd(Step * sizeof(T), alignment)> {get() + Step * n};
    }

    /**
-     * See ``kernel_float::loadn``
+     * See ``kernel_float::read``
     */
    template<size_t N>
-    KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(size_t offset = 0) const {
-        return ::kernel_float::loadn<N>(get(), offset);
+    KERNEL_FLOAT_INLINE vector<T, extent<N>> read() const {
+        vector_storage<T, N> result;
+        detail::copy_aligned_impl<T, N, alignment>::load(result.data(), get());
+        return result;
    }

    /**
-     * See ``kernel_float::loadn``
+     * See ``kernel_float::read``
     */
-    template<size_t N>
-    KERNEL_FLOAT_INLINE vector<T, extent<N>> loadn(size_t offset, size_t max_length) const {
-        return ::kernel_float::loadn<N>(get(), offset, max_length);
+    template<typename I, typename M = bool, typename E = broadcast_vector_extent_type<I, M>>
+    KERNEL_FLOAT_INLINE vector<T, E> read(const I& indices, const M& mask = true) {
+        return ::kernel_float::read(get(), indices, mask);
+    }
+
+    /**
+     * Offsets the pointer by `Step * offset` items and then reads the subsequent `N` items.
+ * + * Example + * ======= + * ``` + * aligned_ptr ptr = vector.data(); + * + * // Returns vector[40], vector[41], vector[42], vector[43] + * ptr.read_at<4>(10); + * + * // Returns vector[20], vector[21], vector[22] + * ptr.read_at<2, 3>(10); + * ``` + */ + template + KERNEL_FLOAT_INLINE vector> read_at(size_t offset) const { + return this->offset(offset).template read(); } private: const T* ptr_ = nullptr; }; -<<<<<<< HEAD -======= -template -aligned_ptr(T*) -> aligned_ptr; - ->>>>>>> 9bf416c (Update single_include) } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H diff --git a/tests/memory.cu b/tests/memory.cu index ac29b46..3b281bd 100644 --- a/tests/memory.cu +++ b/tests/memory.cu @@ -124,19 +124,19 @@ REGISTER_TEST_CASE( struct aligned_ptr_test { template - __host__ __device__ void operator()(generator gen, std::index_sequence) { + __host__ __device__ void operator()(generator, std::index_sequence) { struct alignas(32) storage_type { T data[N]; }; storage_type input = {T(double(I))...}; - storage_type output = {T(double(I * 0))...}; - auto v = kf::read_aligned(input.data); - kf::write_aligned(output.data, v); + ASSERT_EQ_ALL(v[I], T(double(I))); + storage_type output = {T(double(I * 0))...}; + kf::write_aligned(output.data, v); ASSERT_EQ_ALL(output.data[I], T(double(I))); } }; -REGISTER_TEST_CASE("aligned pointer", aligned_ptr_test, int, float, double, __half, __nv_bfloat16) \ No newline at end of file +REGISTER_TEST_CASE("aligned pointer", aligned_ptr_test, int, float, double, __half, __nv_bfloat16) From f668fa44813fadb51dc48192391b51502ba17618 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 23 Oct 2023 12:10:22 +0200 Subject: [PATCH 13/16] Replace `nullptr_t` by `decltype(nullptr)` --- include/kernel_float/memory.h | 4 ++-- single_include/kernel_float.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index d75cddf..8768391 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -293,7 +293,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} @@ -430,7 +430,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index c849e90..229727b 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-10-23 10:41:33.267957 -// git hash: 5844245b6d2d679f23d9ccc9693a3274b75917f9 +// date: 2023-10-23 12:07:20.751837 +// git hash: ed7143d9b83a89f043191f4957760081bda60b4d //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -2568,7 +2568,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} @@ -2705,7 +2705,7 @@ struct aligned_ptr { static_assert(alignment >= alignof(T), "invalid alignment"); KERNEL_FLOAT_INLINE - aligned_ptr(nullptr_t = nullptr) {} + aligned_ptr(decltype(nullptr) = nullptr) {} KERNEL_FLOAT_INLINE explicit aligned_ptr(T* ptr) : ptr_(ptr) {} From a35b9f600525b2253f9b2e1fb3cb91d382ac2a7d Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 23 Oct 2023 12:19:28 +0200 Subject: [PATCH 14/16] Add missing keywords necessary for nvrtc --- include/kernel_float/tiling.h | 4 ++-- single_include/kernel_float.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h index 9cac00d..978d175 100644 --- a/include/kernel_float/tiling.h +++ b/include/kernel_float/tiling.h @@ -175,7 +175,7 @@ struct instantiate_distribution_impl { template struct instantiate_distribution_impl<0, distributions> { template - using type = typename First::type; + using type = typename First::template type; }; template @@ -193,7 +193,7 @@ template> { template using dist_type = typename instantiate_distribution_impl:: - type; + template type; static constexpr size_t rank = TileDim::rank; static constexpr size_t items_per_thread = (dist_type::items_per_thread * ... * 1); diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 229727b..035e87f 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-10-23 12:07:20.751837 -// git hash: ed7143d9b83a89f043191f4957760081bda60b4d +// date: 2023-10-23 12:19:00.301788 +// git hash: f668fa44813fadb51dc48192391b51502ba17618 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -4549,7 +4549,7 @@ struct instantiate_distribution_impl { template struct instantiate_distribution_impl<0, distributions> { template - using type = typename First::type; + using type = typename First::template type; }; template @@ -4567,7 +4567,7 @@ template> { template using dist_type = typename instantiate_distribution_impl:: - type; + template type; static constexpr size_t rank = TileDim::rank; static constexpr size_t items_per_thread = (dist_type::items_per_thread * ... 
* 1); From 28f811af866d73bef37acd541bac6a95df9a94c3 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 24 Oct 2023 14:04:27 +0200 Subject: [PATCH 15/16] Refactor how `apply_impl` is performed to simplify fp16/bf16 --- include/kernel_float/apply.h | 189 ++++++++ include/kernel_float/bf16.h | 190 +++----- include/kernel_float/conversion.h | 112 ----- include/kernel_float/fp16.h | 167 +++---- include/kernel_float/meta.h | 8 + include/kernel_float/reduce.h | 49 +- include/kernel_float/unops.h | 42 +- include/kernel_float/vector.h | 1 + single_include/kernel_float.h | 760 +++++++++++++++--------------- 9 files changed, 749 insertions(+), 769 deletions(-) create mode 100644 include/kernel_float/apply.h diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h new file mode 100644 index 0000000..72c301a --- /dev/null +++ b/include/kernel_float/apply.h @@ -0,0 +1,189 @@ +#ifndef KERNEL_FLOAT_APPLY_H +#define KERNEL_FLOAT_APPLY_H + +#include "base.h" + +namespace kernel_float { +namespace detail { + +template +struct broadcast_extent_helper; + +template +struct broadcast_extent_helper { + using type = E; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent<1>> { + using type = extent; +}; + +template<> +struct broadcast_extent_helper, extent<1>> { + using type = extent<1>; +}; + +template +struct broadcast_extent_helper: + broadcast_extent_helper::type, C, Rest...> {}; + +} // namespace detail + +template +using broadcast_extent = typename detail::broadcast_extent_helper::type; + +template +using broadcast_vector_extent_type = broadcast_extent...>; + +template +static constexpr bool is_broadcastable = is_same_type, To>; + +template +static constexpr bool is_vector_broadcastable = is_broadcastable, To>; + +namespace detail { + +template +struct broadcast_impl; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { + vector_storage output; + for (size_t i = 0; i < N; i++) { + output.data()[i] = input.data()[0]; + } + return output; + } +}; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +template +struct broadcast_impl, extent<1>> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +} // namespace detail + +/** + * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` + * is 1 or `N`. + * + * Example + * ======= + * ``` + * vec a = {1.0f}; + * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] + * + * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector, extent> +broadcast(const V& input, extent new_size = {}) { + using T = vector_value_type; + return detail::broadcast_impl, extent>::call( + into_vector_storage(input)); +} + +/** + * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the + * size of `input` is 1 or the same as `other`. 
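+ *
+ * Example
+ * =======
+ * A short illustrative sketch (mirroring the ``broadcast`` example above):
+ * ```
+ * vec<float, 1> a = {1.0f};
+ * vec<float, 5> b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
+ * vec<float, 5> c = broadcast_like(a, b);  // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f]
+ * ```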
+ */ +template +KERNEL_FLOAT_INLINE vector, vector_extent_type> +broadcast_like(const V& input, const R& other) { + return broadcast(input, vector_extent_type {}); +} + +namespace detail { + +template +struct apply_recur_impl; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + apply_recur_impl::call(fun, result, inputs...); + } +}; + +template +struct apply_recur_impl { + static constexpr size_t K = round_up_to_power_of_two(N) / 2; + + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + apply_impl::call(fun, result, inputs...); + apply_impl::call(fun, result + K, (inputs + K)...); + } +}; + +template<> +struct apply_recur_impl<0> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +}; + +template<> +struct apply_recur_impl<1> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + result[0] = fun(inputs[0]...); + } +}; +} // namespace detail + +template +using map_type = + vector...>, broadcast_vector_extent_type>; + +/** + * Apply the function `F` to each element from the vector `input` and return the results as a new vector. + * + * Examples + * ======== + * ``` + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... args) { + using Output = result_t...>; + using E = broadcast_vector_extent_type; + vector_storage result; + + detail::apply_impl...>::call( + fun, + result.data(), + (detail::broadcast_impl, vector_extent_type, E>::call( + into_vector_storage(args)) + .data())...); + + return result; +} + +} // namespace kernel_float + +#endif // KERNEL_FLOAT_APPLY_H \ No newline at end of file diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index b37d817..8488dee 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -31,103 +31,27 @@ template<> struct allow_float_fallback<__nv_bfloat16> { static constexpr bool value = true; }; - -template -struct map_bfloat16x2 { - KERNEL_FLOAT_INLINE - static __nv_bfloat162 call(F fun, __nv_bfloat162 input) { - __nv_bfloat16 a = fun(input.x); - __nv_bfloat16 b = fun(input.y); - return {a, b}; - } -}; - -template -struct zip_bfloat16x2 { - KERNEL_FLOAT_INLINE - static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) { - __nv_bfloat16 a = fun(left.x, left.y); - __nv_bfloat16 b = fun(right.y, right.y); - return {a, b}; - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, __nv_bfloat16* result, const __nv_bfloat16* input) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]}; - __nv_bfloat162 b = map_bfloat16x2::call(fun, a); - result[2 * i + 0] = b.x; - result[2 * i + 1] = b.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); - } - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void - call(F fun, __nv_bfloat16* result, const __nv_bfloat16* left, const __nv_bfloat16* right) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __nv_bfloat162 a = {left[2 * i], left[2 * i + 1]}; - __nv_bfloat162 b = {right[2 * i], right[2 * i + 1]}; - __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); - result[2 * i + 0] = c.x; - result[2 * i + 1] = c.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], 
right[N - 1]); - } - } -}; - -template -struct reduce_impl= 2)>> { - KERNEL_FLOAT_INLINE static __nv_bfloat16 call(F fun, const __nv_bfloat16* input) { - __nv_bfloat162 accum = {input[0], input[1]}; - -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]}; - accum = zip_bfloat16x2::call(fun, accum, a); - } - - __nv_bfloat16 result = fun(accum.x, accum.y); - - if (N % 2 != 0) { - result = fun(result, input[N - 1]); - } - - return result; - } -}; -} // namespace detail +}; // namespace detail #if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_bfloat16x2> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ - return FUN2(input); \ - } \ - }; \ +#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __nv_bfloat16, __nv_bfloat16> { \ + KERNEL_FLOAT_INLINE static void \ + call(ops::NAME<__nv_bfloat16>, __nv_bfloat16* result, const __nv_bfloat16* a) { \ + __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) @@ -156,24 +80,28 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) #endif #if KERNEL_FLOAT_CUDA_ARCH >= 800 -#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 \ - operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_bfloat16x2> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \ - return FUN2(left, right); \ - } \ - }; \ +#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 \ + operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> { \ + KERNEL_FLOAT_INLINE static void call( \ + ops::NAME<__nv_bfloat16>, \ + __nv_bfloat16* result, \ + const __nv_bfloat16* a, \ + const __nv_bfloat16* b) { \ + __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]}, __nv_bfloat162 {b[0], b[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) @@ -195,6 +123,42 @@ KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) +#if KERNEL_FLOAT_CUDA_ARCH >= 800 +namespace ops { +template<> +struct fma<__nv_bfloat16> { + KERNEL_FLOAT_INLINE __nv_bfloat16 + operator()(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) const { + return __hfma(a, b, c); + } +}; +} // namespace ops + +namespace detail { +template<> +struct apply_impl< + 
ops::fma<__nv_bfloat16>, + 2, + __nv_bfloat16, + __nv_bfloat16, + __nv_bfloat16, + __nv_bfloat16> { + KERNEL_FLOAT_INLINE static void call( + ops::fma<__nv_bfloat16>, + __nv_bfloat16* result, + const __nv_bfloat16* a, + const __nv_bfloat16* b, + const __nv_bfloat16* c) { + __nv_bfloat162 r = __hfma2( + __nv_bfloat162 {a[0], a[1]}, + __nv_bfloat162 {b[0], b[1]}, + __nv_bfloat162 {c[0], c[1]}); + result[0] = r.x, result[1] = r.y; + } +}; +} // namespace detail +#endif + namespace ops { template<> struct cast { diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 6f9e4fd..9faaeec 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -6,118 +6,6 @@ namespace kernel_float { -namespace detail { - -template -struct broadcast_extent_helper; - -template -struct broadcast_extent_helper { - using type = E; -}; - -template -struct broadcast_extent_helper, extent> { - using type = extent; -}; - -template -struct broadcast_extent_helper, extent> { - using type = extent; -}; - -template -struct broadcast_extent_helper, extent<1>> { - using type = extent; -}; - -template<> -struct broadcast_extent_helper, extent<1>> { - using type = extent<1>; -}; - -template -struct broadcast_extent_helper: - broadcast_extent_helper::type, C, Rest...> {}; - -} // namespace detail - -template -using broadcast_extent = typename detail::broadcast_extent_helper::type; - -template -using broadcast_vector_extent_type = broadcast_extent...>; - -template -static constexpr bool is_broadcastable = is_same_type, To>; - -template -static constexpr bool is_vector_broadcastable = is_broadcastable, To>; - -namespace detail { - -template -struct broadcast_impl; - -template -struct broadcast_impl, extent> { - KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { - vector_storage output; - for (size_t i = 0; i < N; i++) { - output.data()[i] = input.data()[0]; - } - return output; - } -}; - -template -struct broadcast_impl, extent> { - KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { - return input; - } -}; - -template -struct broadcast_impl, extent<1>> { - KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { - return input; - } -}; - -} // namespace detail - -/** - * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` - * is 1 or `N`. - * - * Example - * ======= - * ``` - * vec a = {1.0f}; - * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] - * - * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; - * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE vector, extent> -broadcast(const V& input, extent new_size = {}) { - using T = vector_value_type; - return detail::broadcast_impl, extent>::call( - into_vector_storage(input)); -} - -/** - * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the - * size of `input` is 1 or the same as `other`. - */ -template -KERNEL_FLOAT_INLINE vector, vector_extent_type> -broadcast_like(const V& input, const R& other) { - return broadcast(input, vector_extent_type {}); -} - namespace detail { /** * Convert vector of element type `T` and extent type `E` to vector of element type `T2` and extent type `E2`. 
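 *
 * An illustrative sketch: the user-facing ``cast`` function (used in the examples
 * elsewhere in this library) is built on this machinery, e.g.
 * ```
 * vec<float, 2> v = {1.0f, 2.0f};
 * vec<double, 2> d = cast<double>(v);  // same extent, element type converted
 * ```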
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index f36836a..7546fe3 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -29,103 +29,26 @@ template<> struct allow_float_fallback<__half> { static constexpr bool value = true; }; - -template -struct map_halfx2 { - KERNEL_FLOAT_INLINE - static __half2 call(F fun, __half2 input) { - __half a = fun(input.x); - __half b = fun(input.y); - return {a, b}; - } -}; - -template -struct zip_halfx2 { - KERNEL_FLOAT_INLINE - static __half2 call(F fun, __half2 left, __half2 right) { - __half a = fun(left.x, left.y); - __half b = fun(right.y, right.y); - return {a, b}; - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, __half* result, const __half* input) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __half2 a = {input[2 * i], input[2 * i + 1]}; - __half2 b = map_halfx2::call(fun, a); - result[2 * i + 0] = b.x; - result[2 * i + 1] = b.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); - } - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void - call(F fun, __half* result, const __half* left, const __half* right) { -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __half2 a = {left[2 * i], left[2 * i + 1]}; - __half2 b = {right[2 * i], right[2 * i + 1]}; - __half2 c = zip_halfx2::call(fun, a, b); - result[2 * i + 0] = c.x; - result[2 * i + 1] = c.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); - } - } -}; - -template -struct reduce_impl= 2)>> { - KERNEL_FLOAT_INLINE static __half call(F fun, const __half* input) { - __half2 accum = {input[0], input[1]}; - -#pragma unroll - for (size_t i = 0; 2 * i + 1 < N; i++) { - __half2 a = {input[2 * i], input[2 * i + 1]}; - accum = zip_halfx2::call(fun, accum, a); - } - - __half result = fun(accum.x, accum.y); - - if (N % 2 != 0) { - result = fun(result, input[N - 1]); - } - - return result; - } -}; - }; // namespace detail #if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_halfx2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \ - return FUN2(input); \ - } \ - }; \ +#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __half, __half> { \ + KERNEL_FLOAT_INLINE static void call(ops::NAME<__half>, __half* result, const __half* a) { \ + __half2 r = FUN2(__half2 {a[0], a[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) @@ -152,22 +75,24 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) #if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_halfx2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \ - return 
FUN2(left, right); \ - } \ - }; \ +#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct apply_impl, 2, __half, __half, __half> { \ + KERNEL_FLOAT_INLINE static void \ + call(ops::NAME<__half>, __half* result, const __half* a, const __half* b) { \ + __half2 r = FUN2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]}); \ + result[0] = r.x, result[1] = r.y; \ + } \ + }; \ } #else #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) @@ -188,6 +113,28 @@ KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) +#if KERNEL_FLOAT_IS_DEVICE +namespace ops { +template<> +struct fma<__half> { + KERNEL_FLOAT_INLINE __half operator()(__half a, __half b, __half c) const { + return __hfma(a, b, c); + } +}; +} // namespace ops + +namespace detail { +template<> +struct apply_impl, 2, __half, __half, __half, __half> { + KERNEL_FLOAT_INLINE static void + call(ops::fma<__half>, __half* result, const __half* a, const __half* b, const __half* c) { + __half2 r = __hfma2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]}, __half2 {c[0], c[1]}); + result[0] = r.x, result[1] = r.y; + } +}; +} // namespace detail +#endif + #define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ namespace ops { \ template<> \ diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index 5141f82..8becc0e 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -270,6 +270,14 @@ struct enable_if_impl { template using enable_if_t = typename detail::enable_if_impl::type; +constexpr size_t round_up_to_power_of_two(size_t n) { + size_t result = 1; + while (result < n) { + result *= 2; + } + return result; +} + } // namespace kernel_float #endif diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 7056175..8ec790d 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -5,23 +5,56 @@ namespace kernel_float { namespace detail { + +template +struct reduce_recur_impl; + template struct reduce_impl { KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { - return call(fun, input, make_index_sequence {}); + return reduce_recur_impl::call(fun, input); } +}; + +template +struct reduce_recur_impl { + static constexpr size_t K = round_up_to_power_of_two(N) / 2; + + template + KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { + vector_storage temp; + apply_impl::call(fun, temp.data(), input, input + K); - private: - template - KERNEL_FLOAT_INLINE static T call(F fun, const T* input, index_sequence<0, Is...>) { - T result = input[0]; + if constexpr (N < 2 * K) { #pragma unroll - for (size_t i = 1; i < N; i++) { - result = fun(result, input[i]); + for (size_t i = N - K; i < K; i++) { + temp.data()[i] = input[i]; + } } - return result; + + return reduce_impl::call(fun, temp.data()); + } +}; + +template<> +struct reduce_recur_impl<0> {}; + +template<> +struct reduce_recur_impl<1> { + template + KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { + return input[0]; + } +}; + +template<> +struct reduce_recur_impl<2> { + template + KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { + return fun(input[0], input[1]); } }; + } // namespace detail /** diff --git a/include/kernel_float/unops.h 
b/include/kernel_float/unops.h index b0900ae..d45e1cc 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -1,49 +1,9 @@ #ifndef KERNEL_FLOAT_UNOPS_H #define KERNEL_FLOAT_UNOPS_H -#include "base.h" +#include "apply.h" namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = fun(inputs[i]...); - } - } -}; -} // namespace detail - -template -using map_type = vector>, vector_extent_type>; - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. - * - * Examples - * ======== - * ``` - * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; - * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { - using Input = vector_value_type; - using Output = result_t; - vector_storage> result; - - detail::apply_impl, Output, Input>::call( - fun, - result.data(), - into_vector_storage(input).data()); - - return result; -} - namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 56cd1fe..0880d00 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -6,6 +6,7 @@ #include "iterate.h" #include "macros.h" #include "reduce.h" +#include "triops.h" #include "unops.h" namespace kernel_float { diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 035e87f..8c2ef41 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-10-23 12:19:00.301788 -// git hash: f668fa44813fadb51dc48192391b51502ba17618 +// date: 2023-10-24 14:03:49.849025 +// git hash: a35b9f600525b2253f9b2e1fb3cb91d382ac2a7d //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -354,6 +354,14 @@ struct enable_if_impl { template using enable_if_t = typename detail::enable_if_impl::type; +constexpr size_t round_up_to_power_of_two(size_t n) { + size_t result = 1; + while (result < n) { + result *= 2; + } + return result; +} + } // namespace kernel_float #endif @@ -611,6 +619,195 @@ KERNEL_FLOAT_INLINE vector_storage_type into_vector_storage(V&& input) { } // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_APPLY_H +#define KERNEL_FLOAT_APPLY_H + + + +namespace kernel_float { +namespace detail { + +template +struct broadcast_extent_helper; + +template +struct broadcast_extent_helper { + using type = E; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent> { + using type = extent; +}; + +template +struct broadcast_extent_helper, extent<1>> { + using type = extent; +}; + +template<> +struct broadcast_extent_helper, extent<1>> { + using type = extent<1>; +}; + +template +struct broadcast_extent_helper: + broadcast_extent_helper::type, C, Rest...> {}; + +} // namespace detail + +template +using broadcast_extent = typename detail::broadcast_extent_helper::type; + +template +using broadcast_vector_extent_type = broadcast_extent...>; + +template +static constexpr bool is_broadcastable = is_same_type, To>; + +template +static constexpr bool is_vector_broadcastable = is_broadcastable, To>; + +namespace detail { + +template +struct broadcast_impl; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { + vector_storage output; + for (size_t i = 0; i < N; i++) { + output.data()[i] = input.data()[0]; + } + return output; + } +}; + +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +template +struct broadcast_impl, extent<1>> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + +} // namespace detail + +/** + * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` + * is 1 or `N`. + * + * Example + * ======= + * ``` + * vec a = {1.0f}; + * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] + * + * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector, extent> +broadcast(const V& input, extent new_size = {}) { + using T = vector_value_type; + return detail::broadcast_impl, extent>::call( + into_vector_storage(input)); +} + +/** + * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the + * size of `input` is 1 or the same as `other`. + */ +template +KERNEL_FLOAT_INLINE vector, vector_extent_type> +broadcast_like(const V& input, const R& other) { + return broadcast(input, vector_extent_type {}); +} + +namespace detail { + +template +struct apply_recur_impl; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... 
inputs) { + apply_recur_impl::call(fun, result, inputs...); + } +}; + +template +struct apply_recur_impl { + static constexpr size_t K = round_up_to_power_of_two(N) / 2; + + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + apply_impl::call(fun, result, inputs...); + apply_impl::call(fun, result + K, (inputs + K)...); + } +}; + +template<> +struct apply_recur_impl<0> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +}; + +template<> +struct apply_recur_impl<1> { + template + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { + result[0] = fun(inputs[0]...); + } +}; +} // namespace detail + +template +using map_type = + vector...>, broadcast_vector_extent_type>; + +/** + * Apply the function `F` to each element from the vector `input` and return the results as a new vector. + * + * Examples + * ======== + * ``` + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... args) { + using Output = result_t...>; + using E = broadcast_vector_extent_type; + vector_storage result; + + detail::apply_impl...>::call( + fun, + result.data(), + (detail::broadcast_impl, vector_extent_type, E>::call( + into_vector_storage(args)) + .data())...); + + return result; +} + +} // namespace kernel_float + +#endif // KERNEL_FLOAT_APPLY_H #ifndef KERNEL_FLOAT_COMPLEX_TYPE_H #define KERNEL_FLOAT_COMPLEX_TYPE_H @@ -866,46 +1063,6 @@ struct promote_type> { namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = fun(inputs[i]...); - } - } -}; -} // namespace detail - -template -using map_type = vector>, vector_extent_type>; - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. - * - * Examples - * ======== - * ``` - * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; - * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { - using Input = vector_value_type; - using Output = result_t; - vector_storage> result; - - detail::apply_impl, Output, Input>::call( - fun, - result.data(), - into_vector_storage(input).data()); - - return result; -} - namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. 
@@ -1130,118 +1287,6 @@ KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf)
 
 namespace kernel_float {
 
-namespace detail {
-
-template<typename... Es>
-struct broadcast_extent_helper;
-
-template<typename E>
-struct broadcast_extent_helper<E> {
-    using type = E;
-};
-
-template<size_t N>
-struct broadcast_extent_helper<extent<N>, extent<N>> {
-    using type = extent<N>;
-};
-
-template<size_t N>
-struct broadcast_extent_helper<extent<1>, extent<N>> {
-    using type = extent<N>;
-};
-
-template<size_t N>
-struct broadcast_extent_helper<extent<N>, extent<1>> {
-    using type = extent<N>;
-};
-
-template<>
-struct broadcast_extent_helper<extent<1>, extent<1>> {
-    using type = extent<1>;
-};
-
-template<typename A, typename B, typename C, typename... Rest>
-struct broadcast_extent_helper<A, B, C, Rest...>:
-    broadcast_extent_helper<typename broadcast_extent_helper<A, B>::type, C, Rest...> {};
-
-}  // namespace detail
-
-template<typename... Es>
-using broadcast_extent = typename detail::broadcast_extent_helper<Es...>::type;
-
-template<typename... Vs>
-using broadcast_vector_extent_type = broadcast_extent<vector_extent_type<Vs>...>;
-
-template<typename From, typename To>
-static constexpr bool is_broadcastable = is_same_type<broadcast_extent<From, To>, To>;
-
-template<typename V, typename To>
-static constexpr bool is_vector_broadcastable = is_broadcastable<vector_extent_type<V>, To>;
-
-namespace detail {
-
-template<typename T, typename From, typename To>
-struct broadcast_impl;
-
-template<typename T, size_t N>
-struct broadcast_impl<T, extent<1>, extent<N>> {
-    KERNEL_FLOAT_INLINE static vector_storage<T, N> call(const vector_storage<T, 1>& input) {
-        vector_storage<T, N> output;
-        for (size_t i = 0; i < N; i++) {
-            output.data()[i] = input.data()[0];
-        }
-        return output;
-    }
-};
-
-template<typename T, size_t N>
-struct broadcast_impl<T, extent<N>, extent<N>> {
-    KERNEL_FLOAT_INLINE static vector_storage<T, N> call(vector_storage<T, N> input) {
-        return input;
-    }
-};
-
-template<typename T>
-struct broadcast_impl<T, extent<1>, extent<1>> {
-    KERNEL_FLOAT_INLINE static vector_storage<T, 1> call(vector_storage<T, 1> input) {
-        return input;
-    }
-};
-
-}  // namespace detail
-
-/**
- * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input`
- * is 1 or `N`.
- *
- * Example
- * =======
- * ```
- * vec<float, 1> a = {1.0f};
- * vec<float, 5> x = broadcast<5>(a);  // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f]
- *
- * vec<float, 5> b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
- * vec<float, 5> y = broadcast<5>(b);  // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f]
- * ```
- */
-template<size_t N, typename V>
-KERNEL_FLOAT_INLINE vector<vector_value_type<V>, extent<N>>
-broadcast(const V& input, extent<N> new_size = {}) {
-    using T = vector_value_type<V>;
-    return detail::broadcast_impl<T, vector_extent_type<V>, extent<N>>::call(
-        into_vector_storage(input));
-}
-
-/**
- * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the
- * size of `input` is 1 or the same as `other`.
- */
-template<typename V, typename R>
-KERNEL_FLOAT_INLINE vector<vector_value_type<V>, vector_extent_type<R>>
-broadcast_like(const V& input, const R& other) {
-    return broadcast(input, vector_extent_type<R> {});
-}
-
 namespace detail {
 /**
  * Convert vector of element type `T` and extent type `E` to vector of element type `T2` and extent type `E2`.
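The conversion machinery referenced by the comment closing the hunk above pairs an element-wise cast with an extent broadcast. From user code this surfaces through `kf::cast`, roughly as follows (a sketch; the device function is hypothetical):

```
#include "kernel_float.h"
namespace kf = kernel_float;

// Hypothetical device function: element-wise __half -> float conversion.
// The extent (4) is preserved; only the element type changes.
__device__ kf::vec<float, 4> widen(kf::vec<__half, 4> h) {
    return kf::cast<float>(h);
}
```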
@@ -2813,23 +2858,56 @@ struct aligned_ptr {
 
 namespace kernel_float {
 namespace detail {
+
+template<size_t N>
+struct reduce_recur_impl;
+
 template<typename F, size_t N, typename T, typename = void>
 struct reduce_impl {
     KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
-        return call(fun, input, make_index_sequence<N> {});
+        return reduce_recur_impl<N>::call(fun, input);
     }
+};
 
-  private:
-    template<size_t... Is>
-    KERNEL_FLOAT_INLINE static T call(F fun, const T* input, index_sequence<0, Is...>) {
-        T result = input[0];
+template<size_t N>
+struct reduce_recur_impl {
+    static constexpr size_t K = round_up_to_power_of_two(N) / 2;
+
+    template<typename F, typename T>
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
+        vector_storage<T, K> temp;
+        apply_impl<F, N - K, T, T, T>::call(fun, temp.data(), input, input + K);
+
+        if constexpr (N < 2 * K) {
 #pragma unroll
-        for (size_t i = 1; i < N; i++) {
-            result = fun(result, input[i]);
+            for (size_t i = N - K; i < K; i++) {
+                temp.data()[i] = input[i];
+            }
         }
-        return result;
+
+        return reduce_impl<F, K, T>::call(fun, temp.data());
     }
 };
+
+template<>
+struct reduce_recur_impl<0> {};
+
+template<>
+struct reduce_recur_impl<1> {
+    template<typename F, typename T>
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
+        return input[0];
+    }
+};
+
+template<>
+struct reduce_recur_impl<2> {
+    template<typename F, typename T>
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
+        return fun(input[0], input[1]);
+    }
+};
+
 }  // namespace detail
 
 /**
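The reworked `reduce_recur_impl` above replaces the old left-to-right fold with a halving tree: each step pairs element `i` with element `i + K`, copies the unpaired tail through, and recurses on `K` elements, so reductions now route through the same 2-wide `apply_impl` specializations that the fp16/bf16 hunks below provide. A host-side analogue of the access pattern (illustrative only, not library code):

```
#include <cstddef>

// Illustrative host analogue of the halving reduction; assumes n >= 1.
template<typename F, typename T>
T tree_reduce(F fun, T* input, size_t n) {
    while (n > 1) {
        // k is half of the next power of two, so n - k <= k.
        size_t k = 1;
        while (k < n) {
            k *= 2;
        }
        k /= 2;

        for (size_t i = 0; i + k < n; i++) {
            input[i] = fun(input[i], input[i + k]);  // pair the two halves
        }
        // Elements [n - k, k) carry over unchanged; continue on k elements.
        n = k;
    }
    return input[0];
}
```

For n = 6 this combines (x0,x4) and (x1,x5), carries x2 and x3 through, and finishes on 4 then 2 elements, giving a log-depth combine order whose pairwise steps map cleanly onto `__half2`/`__nv_bfloat162` intrinsics.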
@@ -3210,6 +3288,7 @@ KERNEL_FLOAT_INLINE vector<T, E> fma(const A& a, const B& b, const C& c) {
 
 
 
+
 namespace kernel_float {
 
 /**
@@ -3573,103 +3652,26 @@ template<>
 struct allow_float_fallback<__half> {
     static constexpr bool value = true;
 };
-
-template<typename F>
-struct map_halfx2 {
-    KERNEL_FLOAT_INLINE
-    static __half2 call(F fun, __half2 input) {
-        __half a = fun(input.x);
-        __half b = fun(input.y);
-        return {a, b};
-    }
-};
-
-template<typename F>
-struct zip_halfx2 {
-    KERNEL_FLOAT_INLINE
-    static __half2 call(F fun, __half2 left, __half2 right) {
-        __half a = fun(left.x, left.y);
-        __half b = fun(right.y, right.y);
-        return {a, b};
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __half, __half> {
-    KERNEL_FLOAT_INLINE static void call(F fun, __half* result, const __half* input) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __half2 a = {input[2 * i], input[2 * i + 1]};
-            __half2 b = map_halfx2<F>::call(fun, a);
-            result[2 * i + 0] = b.x;
-            result[2 * i + 1] = b.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(input[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __half, __half, __half> {
-    KERNEL_FLOAT_INLINE static void
-    call(F fun, __half* result, const __half* left, const __half* right) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __half2 a = {left[2 * i], left[2 * i + 1]};
-            __half2 b = {right[2 * i], right[2 * i + 1]};
-            __half2 c = zip_halfx2<F>::call(fun, a, b);
-            result[2 * i + 0] = c.x;
-            result[2 * i + 1] = c.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(left[N - 1], right[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct reduce_impl<F, N, __half, enable_if_t<(N >= 2)>> {
-    KERNEL_FLOAT_INLINE static __half call(F fun, const __half* input) {
-        __half2 accum = {input[0], input[1]};
-
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __half2 a = {input[2 * i], input[2 * i + 1]};
-            accum = zip_halfx2<F>::call(fun, accum, a);
-        }
-
-        __half result = fun(accum.x, accum.y);
-
-        if (N % 2 != 0) {
-            result = fun(result, input[N - 1]);
-        }
-
-        return result;
-    }
-};
-
 };  // namespace detail
 
 #if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2)                                \
-    namespace ops {                                                                  \
-    template<>                                                                       \
-    struct NAME<__half> {                                                            \
-        KERNEL_FLOAT_INLINE __half operator()(__half input) {                        \
-            return FUN1(input);                                                      \
-        }                                                                            \
-    };                                                                               \
-    }                                                                                \
-    namespace detail {                                                               \
-    template<>                                                                       \
-    struct map_halfx2<ops::NAME<__half>> {                                           \
-        KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) {  \
-            return FUN2(input);                                                      \
-        }                                                                            \
-    };                                                                               \
+#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2)                                              \
+    namespace ops {                                                                                \
+    template<>                                                                                     \
+    struct NAME<__half> {                                                                          \
+        KERNEL_FLOAT_INLINE __half operator()(__half input) {                                      \
+            return FUN1(input);                                                                    \
+        }                                                                                          \
+    };                                                                                             \
+    }                                                                                              \
+    namespace detail {                                                                             \
+    template<>                                                                                     \
+    struct apply_impl<ops::NAME<__half>, 2, __half, __half> {                                      \
+        KERNEL_FLOAT_INLINE static void call(ops::NAME<__half>, __half* result, const __half* a) { \
+            __half2 r = FUN2(__half2 {a[0], a[1]});                                                \
+            result[0] = r.x, result[1] = r.y;                                                      \
+        }                                                                                          \
+    };                                                                                             \
     }
 #else
 #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2)
@@ -3696,22 +3698,24 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos)
 KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
 
 #if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2)                                            \
-    namespace ops {                                                                               \
-    template<>                                                                                    \
-    struct NAME<__half> {                                                                         \
-        KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const {                  \
-            return FUN1(left, right);                                                             \
-        }                                                                                         \
-    };                                                                                            \
-    }                                                                                             \
-    namespace detail {                                                                            \
-    template<>                                                                                    \
-    struct zip_halfx2<ops::NAME<__half>> {                                                        \
-        KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \
-            return FUN2(left, right);                                                             \
-        }                                                                                         \
-    };                                                                                            \
+#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2)                                \
+    namespace ops {                                                                   \
+    template<>                                                                        \
+    struct NAME<__half> {                                                             \
+        KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const {      \
+            return FUN1(left, right);                                                 \
+        }                                                                             \
+    };                                                                                \
+    }                                                                                 \
+    namespace detail {                                                                \
+    template<>                                                                        \
+    struct apply_impl<ops::NAME<__half>, 2, __half, __half, __half> {                 \
+        KERNEL_FLOAT_INLINE static void                                               \
+        call(ops::NAME<__half>, __half* result, const __half* a, const __half* b) {   \
+            __half2 r = FUN2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]});             \
+            result[0] = r.x, result[1] = r.y;                                         \
+        }                                                                             \
+    };                                                                                \
     }
 #else
 #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2)
@@ -3732,6 +3736,28 @@ KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2)
 KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2)
 KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hge2)
 
+#if KERNEL_FLOAT_IS_DEVICE
+namespace ops {
+template<>
+struct fma<__half> {
+    KERNEL_FLOAT_INLINE __half operator()(__half a, __half b, __half c) const {
+        return __hfma(a, b, c);
+    }
+};
+}  // namespace ops
+
+namespace detail {
+template<>
+struct apply_impl<ops::fma<__half>, 2, __half, __half, __half, __half> {
+    KERNEL_FLOAT_INLINE static void
+    call(ops::fma<__half>, __half* result, const __half* a, const __half* b, const __half* c) {
+        __half2 r = __hfma2(__half2 {a[0], a[1]}, __half2 {b[0], b[1]}, __half2 {c[0], c[1]});
+        result[0] = r.x, result[1] = r.y;
+    }
+};
+}  // namespace detail
+#endif
+
 #define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \
     namespace ops {                                   \
     template<>                                        \
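With `ops::fma<__half>` and its 2-wide `apply_impl` specialization in place, `kf::fma` on half vectors lowers to one `__hfma2` per pair of lanes. A minimal usage sketch (the kernel is hypothetical; `read_aligned`/`write_aligned` are the helpers used by the vector_add example at the end of this series):

```
#include "kernel_float.h"
namespace kf = kernel_float;

// Hypothetical kernel computing out = a * b + c over __half vectors;
// each pair of lanes is evaluated with a single __hfma2 instruction.
__global__ void fma_kernel(const __half* a, const __half* b, const __half* c, __half* out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    kf::vec<__half, 4> x = kf::read_aligned<4>(a + 4 * i);
    kf::vec<__half, 4> y = kf::read_aligned<4>(b + 4 * i);
    kf::vec<__half, 4> z = kf::read_aligned<4>(c + 4 * i);

    kf::write_aligned(out + 4 * i, kf::fma(x, y, z));
}
```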
@@ -3857,103 +3883,27 @@ template<>
 struct allow_float_fallback<__nv_bfloat16> {
     static constexpr bool value = true;
 };
-
-template<typename F>
-struct map_bfloat16x2 {
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat162 call(F fun, __nv_bfloat162 input) {
-        __nv_bfloat16 a = fun(input.x);
-        __nv_bfloat16 b = fun(input.y);
-        return {a, b};
-    }
-};
-
-template<typename F>
-struct zip_bfloat16x2 {
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) {
-        __nv_bfloat16 a = fun(left.x, left.y);
-        __nv_bfloat16 b = fun(right.y, right.y);
-        return {a, b};
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16> {
-    KERNEL_FLOAT_INLINE static void call(F fun, __nv_bfloat16* result, const __nv_bfloat16* input) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
-            __nv_bfloat162 b = map_bfloat16x2<F>::call(fun, a);
-            result[2 * i + 0] = b.x;
-            result[2 * i + 1] = b.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(input[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> {
-    KERNEL_FLOAT_INLINE static void
-    call(F fun, __nv_bfloat16* result, const __nv_bfloat16* left, const __nv_bfloat16* right) {
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __nv_bfloat162 a = {left[2 * i], left[2 * i + 1]};
-            __nv_bfloat162 b = {right[2 * i], right[2 * i + 1]};
-            __nv_bfloat162 c = zip_bfloat16x2<F>::call(fun, a, b);
-            result[2 * i + 0] = c.x;
-            result[2 * i + 1] = c.y;
-        }
-
-        if (N % 2 != 0) {
-            result[N - 1] = fun(left[N - 1], right[N - 1]);
-        }
-    }
-};
-
-template<typename F, size_t N>
-struct reduce_impl<F, N, __nv_bfloat16, enable_if_t<(N >= 2)>> {
-    KERNEL_FLOAT_INLINE static __nv_bfloat16 call(F fun, const __nv_bfloat16* input) {
-        __nv_bfloat162 accum = {input[0], input[1]};
-
-#pragma unroll
-        for (size_t i = 0; 2 * i + 1 < N; i++) {
-            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
-            accum = zip_bfloat16x2<F>::call(fun, accum, a);
-        }
-
-        __nv_bfloat16 result = fun(accum.x, accum.y);
-
-        if (N % 2 != 0) {
-            result = fun(result, input[N - 1]);
-        }
-
-        return result;
-    }
-};
-}  // namespace detail
+};  // namespace detail
 
 #if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                          \
-    namespace ops {                                                            \
-    template<>                                                                 \
-    struct NAME<__nv_bfloat16> {                                               \
-        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) {    \
-            return FUN1(input);                                                \
-        }                                                                      \
-    };                                                                         \
-    }                                                                          \
-    namespace detail {                                                         \
-    template<>                                                                 \
-    struct map_bfloat16x2<ops::NAME<__nv_bfloat16>> {                          \
-        KERNEL_FLOAT_INLINE static __nv_bfloat162                              \
-        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) {                 \
-            return FUN2(input);                                                \
-        }                                                                      \
-    };                                                                         \
+#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                                   \
+    namespace ops {                                                                     \
+    template<>                                                                          \
+    struct NAME<__nv_bfloat16> {                                                        \
+        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) {             \
+            return FUN1(input);                                                         \
+        }                                                                               \
+    };                                                                                  \
+    }                                                                                   \
+    namespace detail {                                                                  \
+    template<>                                                                          \
+    struct apply_impl<ops::NAME<__nv_bfloat16>, 2, __nv_bfloat16, __nv_bfloat16> {      \
+        KERNEL_FLOAT_INLINE static void                                                 \
+        call(ops::NAME<__nv_bfloat16>, __nv_bfloat16* result, const __nv_bfloat16* a) { \
+            __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]});                       \
+            result[0] = r.x, result[1] = r.y;                                           \
+        }                                                                               \
+    };                                                                                  \
     }
 #else
 #define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)
@@ -3982,24 +3932,28 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
 #endif
 
 #if KERNEL_FLOAT_CUDA_ARCH >= 800
-#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2)                                  \
-    namespace ops {                                                                     \
-    template<>                                                                          \
-    struct NAME<__nv_bfloat16> {                                                        \
-        KERNEL_FLOAT_INLINE __nv_bfloat16                                               \
-        operator()(__nv_bfloat16 left, __nv_bfloat16 right) const {                     \
-            return FUN1(left, right);                                                   \
-        }                                                                               \
-    };                                                                                  \
-    }                                                                                   \
-    namespace detail {                                                                  \
-    template<>                                                                          \
-    struct zip_bfloat16x2<ops::NAME<__nv_bfloat16>> {                                   \
-        KERNEL_FLOAT_INLINE static __nv_bfloat162                                       \
-        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) {     \
-            return FUN2(left, right);                                                   \
-        }                                                                               \
-    };                                                                                  \
+#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2)                                           \
+    namespace ops {                                                                              \
+    template<>                                                                                   \
+    struct NAME<__nv_bfloat16> {                                                                 \
+        KERNEL_FLOAT_INLINE __nv_bfloat16                                                        \
+        operator()(__nv_bfloat16 left, __nv_bfloat16 right) const {                              \
+            return FUN1(left, right);                                                            \
+        }                                                                                        \
+    };                                                                                           \
+    }                                                                                            \
+    namespace detail {                                                                           \
+    template<>                                                                                   \
+    struct apply_impl<ops::NAME<__nv_bfloat16>, 2, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> { \
+        KERNEL_FLOAT_INLINE static void call(                                                    \
+            ops::NAME<__nv_bfloat16>,                                                            \
+            __nv_bfloat16* result,                                                               \
+            const __nv_bfloat16* a,                                                              \
+            const __nv_bfloat16* b) {                                                            \
+            __nv_bfloat162 r = FUN2(__nv_bfloat162 {a[0], a[1]}, __nv_bfloat162 {b[0], b[1]});   \
+            result[0] = r.x, result[1] = r.y;                                                    \
+        }                                                                                        \
+    };                                                                                           \
     }
 #else
 #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2)
 #endif
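For reference, here is what one instantiation of the macro above produces on `KERNEL_FLOAT_CUDA_ARCH >= 800`, written out for an assumed `KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2)` instance (an illustrative expansion, not text from the patch):

```
namespace ops {
template<>
struct add<__nv_bfloat16> {
    KERNEL_FLOAT_INLINE __nv_bfloat16
    operator()(__nv_bfloat16 left, __nv_bfloat16 right) const {
        return __hadd(left, right);  // scalar path
    }
};
}  // namespace ops

namespace detail {
template<>
struct apply_impl<ops::add<__nv_bfloat16>, 2, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> {
    KERNEL_FLOAT_INLINE static void call(
        ops::add<__nv_bfloat16>,
        __nv_bfloat16* result,
        const __nv_bfloat16* a,
        const __nv_bfloat16* b) {
        // Two lanes at a time through the packed __hadd2 intrinsic.
        __nv_bfloat162 r = __hadd2(__nv_bfloat162 {a[0], a[1]}, __nv_bfloat162 {b[0], b[1]});
        result[0] = r.x, result[1] = r.y;
    }
};
}  // namespace detail
```

On older architectures the macro expands to nothing and these operations fall back to `float` evaluation via `allow_float_fallback<__nv_bfloat16>`.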
@@ -4021,6 +3975,42 @@ KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2)
 KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2)
 KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hge2)
 
+#if KERNEL_FLOAT_CUDA_ARCH >= 800
+namespace ops {
+template<>
+struct fma<__nv_bfloat16> {
+    KERNEL_FLOAT_INLINE __nv_bfloat16
+    operator()(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) const {
+        return __hfma(a, b, c);
+    }
+};
+}  // namespace ops
+
+namespace detail {
+template<>
+struct apply_impl<
+    ops::fma<__nv_bfloat16>,
+    2,
+    __nv_bfloat16,
+    __nv_bfloat16,
+    __nv_bfloat16,
+    __nv_bfloat16> {
+    KERNEL_FLOAT_INLINE static void call(
+        ops::fma<__nv_bfloat16>,
+        __nv_bfloat16* result,
+        const __nv_bfloat16* a,
+        const __nv_bfloat16* b,
+        const __nv_bfloat16* c) {
+        __nv_bfloat162 r = __hfma2(
+            __nv_bfloat162 {a[0], a[1]},
+            __nv_bfloat162 {b[0], b[1]},
+            __nv_bfloat162 {c[0], c[1]});
+        result[0] = r.x, result[1] = r.y;
+    }
+};
+}  // namespace detail
+#endif
+
 namespace ops {
 template<>
 struct cast<double, __nv_bfloat16> {

From d6b7199e1fc35785109a2fdf10eb232b03d50c84 Mon Sep 17 00:00:00 2001
From: stijn
Date: Tue, 24 Oct 2023 14:16:18 +0200
Subject: [PATCH 16/16] Change example to use FMA

---
 examples/vector_add/main.cu   | 2 +-
 include/kernel_float/meta.h   | 1 +
 single_include/kernel_float.h | 5 +++--
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu
index 7b88292..448bbe5 100644
--- a/examples/vector_add/main.cu
+++ b/examples/vector_add/main.cu
@@ -18,7 +18,7 @@ __global__ void my_kernel(int length, const __half* input, double constant, float* output) {
 
     if (i * N < length) {
         auto a = kf::read_aligned<N>(input + i * N);
-        auto b = (a * a) * constant;
+        auto b = kf::fma(a, a, kf::cast<__half>(constant));
         kf::write_aligned(output + i * N, b);
     }
 }
diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h
index 8becc0e..5256129 100644
--- a/include/kernel_float/meta.h
+++ b/include/kernel_float/meta.h
@@ -270,6 +270,7 @@ struct enable_if_impl<true, T> {
 template<bool C, typename T = void>
 using enable_if_t = typename detail::enable_if_impl<C, T>::type;
 
+KERNEL_FLOAT_INLINE
 constexpr size_t round_up_to_power_of_two(size_t n) {
     size_t result = 1;
     while (result < n) {
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index 8c2ef41..b5f3c8a 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -16,8 +16,8 @@
 //================================================================================
 
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-10-24 14:03:49.849025
-// git hash: a35b9f600525b2253f9b2e1fb3cb91d382ac2a7d
+// date: 2023-10-24 14:14:37.228322
+// git hash: 28f811af866d73bef37acd541bac6a95df9a94c3
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -354,6 +354,7 @@ struct enable_if_impl<true, T> {
 template<bool C, typename T = void>
 using enable_if_t = typename detail::enable_if_impl<C, T>::type;
 
+KERNEL_FLOAT_INLINE
 constexpr size_t round_up_to_power_of_two(size_t n) {
     size_t result = 1;
     while (result < n) {