Merge branch 'dev'
stijnh committed Oct 30, 2023
2 parents f40ff58 + d6b7199 commit 9760b9b
Showing 25 changed files with 3,200 additions and 1,369 deletions.
2 changes: 2 additions & 0 deletions docs/api.rst
@@ -8,6 +8,8 @@ API Reference
api/binary_operators.rst
api/reductions.rst
api/mathematical.rst
api/fast_math.rst
api/conditional.rst
api/memory_read_write.rst
api/utilities.rst

29 changes: 23 additions & 6 deletions docs/build_api.py
@@ -27,7 +27,10 @@ def build_doxygen_page(name, items):
content += "-" * len(title) + "\n"

for symbol in symbols:
content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n"
if directive == "define":
content += f".. doxygendefine:: {symbol}\n\n"
else:
content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n"

stripped_name = name.lower().replace(" ", "_").replace("/", "_")
filename = f"api/{stripped_name}.rst"
@@ -90,7 +93,8 @@ def build_index_page(groups):
"for_each",
],
"Generation": [
"range",
("range", "range()"),
("range", "range(F fun)"),
"range_like",
"each_index",
"fill",
@@ -193,6 +197,14 @@ def build_index_page(groups):
"isinf",
"isnan",
],
"Fast math": [
"fast_exp",
"fast_log",
"fast_cos",
"fast_sin",
"fast_tan",
"fast_div",
],
"Conditional": [
("where", "where(const C&, const L&, const R&)"),
("where", "where(const C&, const L&)"),
@@ -202,13 +214,18 @@ def build_index_page(groups):
"cast_to",
("load", "load(const T*, const I&)"),
("load", "load(const T*, const I&, const M&)"),
("loadn", "loadn(const T*, ptrdiff_t)"),
("loadn", "loadn(const T*, ptrdiff_t, ptrdiff_t)"),
("loadn", "loadn(const T*, size_t)"),
("loadn", "loadn(const T*, size_t, size_t)"),
("store", "store(const V&, T *ptr, const I&)"),
("store", "store(const V&, T *ptr, const I&, const M&)"),
("storen", "storen(const V&, T*, ptrdiff_t)"),
("storen", "storen(const V&, T*, ptrdiff_t, ptrdiff_t)"),
("storen", "storen(const V&, T*, size_t)"),
("storen", "storen(const V&, T*, size_t, size_t)"),
("aligned_ptr", "aligned_ptr", "struct"),
],
"Utilities": [
("constant", "constant", "struct"),
("tiling", "tiling", "struct"),
("KERNEL_FLOAT_TILING_FOR", "KERNEL_FLOAT_TILING_FOR", "define"),
]
}

1 change: 1 addition & 0 deletions docs/guides.rst
@@ -6,3 +6,4 @@ Guides
guides/introduction.rst
guides/promotion.rst
guides/prelude.rst
guides/constant.rst
37 changes: 37 additions & 0 deletions docs/guides/constant.md
@@ -0,0 +1,37 @@
Using `kernel_float::constant`
===

When working with mixed-precision types, you will find that constants present a bit of a challenge.

For example, a simple expression such as `3.14 * x`, where `x` is of type `vec<float, 2>`, will NOT be performed
in `float` precision as you might expect, but instead in `double` precision.
This happens because the left-hand side of the expression (the literal `3.14`) is a `double`,
so `kernel_float` also casts the right-hand side to `double`.

To solve this problem, `kernel_float` offers a type called `constant<T>` that represents a constant value.
Any binary operation between a value of type `U` and a `constant<T>` results in both operands being cast
to type `U`, with the operation performed in the precision of type `U`.

For example, consider the following code:

```
#include "kernel_float.h"
namespace kf = kernel_float;
int main() {
    using Type = float;
    const int N = 8;
    static constexpr auto PI = kf::make_constant(3.14);
    kf::vec<int, N> i = kf::range<int, N>();
    kf::vec<Type, N> x = kf::cast<Type>(i) * PI;
    kf::vec<Type, N> y = x * kf::sin(x);
    Type result = kf::sum(y);
    printf("result=%f", double(result));
    return EXIT_SUCCESS;
}
```

This code example uses the `make_constant` utility function to create a `constant<T>`.
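
As a minimal sketch of the difference, the two helper functions below (illustrative, not part of the library) perform the same multiplication once with a plain `double` literal and once with a constant created by `make_constant`; only the second stays in `float` precision:

```
#include "kernel_float.h"
namespace kf = kernel_float;

// Plain double literal: 3.14 is a double, so the promotion rules also cast x
// to double and the multiplication is carried out in double precision.
kf::vec<float, 4> scale_slow(kf::vec<float, 4> x) {
    return kf::cast<float>(3.14 * x);
}

// Constant created by make_constant: both operands are cast to the element
// type of x (float), so the multiplication stays in float precision.
kf::vec<float, 4> scale_fast(kf::vec<float, 4> x) {
    return kf::make_constant(3.14) * x;
}
```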
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -1 +1,2 @@
add_subdirectory(vector_add)
add_subdirectory(vector_add_tiling)
16 changes: 11 additions & 5 deletions examples/vector_add/main.cu
@@ -13,11 +13,13 @@ void cuda_check(cudaError_t code) {
}

template<int N>
__global__ void my_kernel(int length, const khalf<N>* input, double constant, kfloat<N>* output) {
__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
int i = blockIdx.x * blockDim.x + threadIdx.x;

if (i * N < length) {
output[i] = kf::cast<float>((input[i] * input[i]) * constant);
auto a = kf::read_aligned<N>(input + i * N);
auto b = kf::fma(a, a, kf::cast<__half>(constant));
kf::write_aligned<N>(output + i * N, b);
}
}

@@ -35,8 +37,8 @@ void run_kernel(int n) {
}

// Allocate device memory
khalf<items_per_thread>* input_dev;
kfloat<items_per_thread>* output_dev;
__half* input_dev;
float* output_dev;
cuda_check(cudaMalloc(&input_dev, sizeof(half) * n));
cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));

@@ -47,7 +49,11 @@ void run_kernel(int n) {
int block_size = 256;
int items_per_block = block_size * items_per_thread;
int grid_size = (n + items_per_block - 1) / items_per_block;
my_kernel<items_per_thread><<<grid_size, block_size>>>(n, input_dev, constant, output_dev);
my_kernel<items_per_thread><<<grid_size, block_size>>>(
n,
kf::aligned_ptr(input_dev),
constant,
kf::aligned_ptr(output_dev));

// Copy results back
cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
12 changes: 12 additions & 0 deletions examples/vector_add_tiling/CMakeLists.txt
@@ -0,0 +1,12 @@
cmake_minimum_required(VERSION 3.17)

set (PROJECT_NAME kernel_float_vecadd_tiling)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
set (CMAKE_CXX_STANDARD 17)

add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
97 changes: 97 additions & 0 deletions examples/vector_add_tiling/main.cu
@@ -0,0 +1,97 @@
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>

#include "kernel_float.h"
#include "kernel_float/tiling.h"
using namespace kernel_float::prelude;

void cuda_check(cudaError_t code) {
if (code != cudaSuccess) {
throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(code));
}
}

template<int N, int B>
__global__ void my_kernel(
int length,
kf::aligned_ptr<const __half> input,
double constant,
kf::aligned_ptr<float> output) {
auto tiling = kf::tiling<
kf::tile_factor<N>,
kf::block_size<B>,
kf::distributions<kf::dist::block_cyclic<2>>>();

auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
auto mask = tiling.local_mask();

auto a = input.read(points, mask);
auto b = (a * a) * constant;
output.write(points, b, mask);
}

template<int items_per_thread, int block_size = 256>
void run_kernel(int n) {
double constant = 1.0;
std::vector<half> input(n);
std::vector<float> output_expected;
std::vector<float> output_result;

// Generate input data
for (int i = 0; i < n; i++) {
input[i] = half(i);
output_expected[i] = float(i + constant);
}

// Allocate device memory
__half* input_dev;
float* output_dev;
cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));

// Copy device memory
cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));

// Launch kernel!
int items_per_block = block_size * items_per_thread;
int grid_size = (n + items_per_block - 1) / items_per_block;
my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
n,
kf::aligned_ptr(input_dev),
constant,
kf::aligned_ptr(output_dev));

// Copy results back
cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));

// Check results
for (int i = 0; i < n; i++) {
float result = output_result[i];
float answer = output_expected[i];

if (result != answer) {
std::stringstream msg;
msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
throw std::runtime_error(msg.str());
}
}

cuda_check(cudaFree(input_dev));
cuda_check(cudaFree(output_dev));
}

int main() {
int n = 84000; // divisible by 1, 2, 3, 4, 5, 6, 7, 8
cuda_check(cudaSetDevice(0));

run_kernel<1>(n);
run_kernel<2>(n);
run_kernel<3>(n);
run_kernel<4>(n);
run_kernel<8>(n);

std::cout << "result correct\n";
return EXIT_SUCCESS;
}