-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
25 changed files
with
3,200 additions
and
1,369 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ Guides | |
guides/introduction.rst | ||
guides/promotion.rst | ||
guides/prelude.rst | ||
guides/constant.rst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
Using `kernel_float::constant` | ||
=== | ||
|
||
When working with mixed precision types, you will find that working with constants presents a bit of a challenge. | ||
|
||
For example, a simple expression such as `3.14 * x` where `x` is of type `vec<float, 2>` will NOT be performed | ||
in `float` precision as you might expect, but instead in `double` precision. | ||
This happens because the left-hand side of the expression (the constant `3.14`) is a
`double`, so `kernel_float` will also cast the right-hand side to `double`.
|
||
To solve this problem, `kernel_float` offers a type called `constant<T>` that can be used to represent | ||
constants. Any binary operations between a value of type `U` and a `constant<T>` will result in both | ||
operands being cast to type `U` and the operation is performed in the precision of type `U`. This makes | ||
`constant<T>` useful for representing constants in your code. | ||
|
||
For example, consider the following code: | ||
|
||
``` | ||
#include "kernel_float.h" | ||
namespace kf = kernel_float; | ||
int main() { | ||
using Type = float; | ||
const int N = 8; | ||
static constexpr auto PI = kf::make_constant(3.14); | ||
kf::vec<int, N> i = kf::range<int, N>(); | ||
kf::vec<Type, N> x = kf::cast<Type>(i) * PI; | ||
kf::vec<Type, N> y = x * kf::sin(x); | ||
Type result = kf::sum(y); | ||
printf("result=%f", double(result)); | ||
return EXIT_SUCCESS; | ||
} | ||
``` | ||
|
||
This code example uses the `make_constant` utility function to create a `constant<T>`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
# Example programs demonstrating kernel_float usage.
add_subdirectory(vector_add)
add_subdirectory(vector_add_tiling)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
cmake_minimum_required(VERSION 3.17)

set(PROJECT_NAME kernel_float_vecadd_tiling)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)

# main.cu uses C++17 features (e.g. class template argument deduction for
# kf::aligned_ptr), so the CUDA standard must be set as well as the C++ one.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)

add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

# The FindCUDA module is deprecated; FindCUDAToolkit (available since the
# CMake 3.17 minimum required above) provides imported targets that carry
# the toolkit include directories, replacing ${CUDA_TOOLKIT_INCLUDE}.
find_package(CUDAToolkit REQUIRED)
target_link_libraries(${PROJECT_NAME} CUDA::cudart)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#include <iostream> | ||
#include <sstream> | ||
#include <stdexcept> | ||
#include <vector> | ||
|
||
#include "kernel_float.h" | ||
#include "kernel_float/tiling.h" | ||
using namespace kernel_float::prelude; | ||
|
||
// Throws std::runtime_error carrying the CUDA error string when a runtime
// API call did not return cudaSuccess; does nothing otherwise.
void cuda_check(cudaError_t code) {
    if (code == cudaSuccess) {
        return;
    }

    std::string message = "CUDA error: ";
    message += cudaGetErrorString(code);
    throw std::runtime_error(message);
}
|
||
// Computes output[i] = (input[i] * input[i]) * constant for every element
// this thread is responsible for. Work is assigned through kernel_float's
// tiling helper: B threads per block, N elements per thread, distributed
// block-cyclically with a cycle width of 2.
//
// NOTE(review): `length` is currently unused — the tiling mask is assumed to
// cover the tail of the array; confirm against kf::tiling's semantics.
template<int N, int B>
__global__ void my_kernel(
    int length,
    kf::aligned_ptr<const __half> input,
    double constant,
    kf::aligned_ptr<float> output) {
    auto tiler = kf::tiling<
        kf::tile_factor<N>,
        kf::block_size<B>,
        kf::distributions<kf::dist::block_cyclic<2>>>();

    // Which elements this thread touches, and which of them are in range.
    auto valid = tiler.local_mask();
    auto indices = int(blockIdx.x * tiler.tile_size(0)) + tiler.local_points(0);

    auto value = input.read(indices, valid);
    auto squared_scaled = (value * value) * constant;
    output.write(indices, squared_scaled, valid);
}
|
||
// Runs my_kernel over n elements and verifies the device results against a
// host-side reference. Throws std::runtime_error on any CUDA failure or on
// the first mismatching element.
//
// items_per_thread / block_size select the tiling configuration; the grid
// size is the ceiling division of n by (block_size * items_per_thread).
template<int items_per_thread, int block_size = 256>
void run_kernel(int n) {
    double constant = 1.0;
    std::vector<half> input(n);
    // Both host buffers must be sized up front: the original code indexed
    // default-constructed (empty) vectors, which is undefined behavior.
    std::vector<float> output_expected(n);
    std::vector<float> output_result(n);

    // Generate input data and the host-side reference. The kernel computes
    // (a * a) * constant, so the reference must match that expression (the
    // original `i + constant` did not).
    // NOTE(review): this assumes the product is evaluated in double after
    // promotion by `constant` — confirm against kernel_float's promotion
    // rules if an exact comparison ever fails.
    for (int i = 0; i < n; i++) {
        input[i] = half(i);
        float a = float(input[i]);
        output_expected[i] = float(double(a) * double(a) * constant);
    }

    // Allocate device memory.
    __half* input_dev;
    float* output_dev;
    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
    cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));

    // Copy input to the device.
    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));

    // Launch kernel!
    int items_per_block = block_size * items_per_thread;
    int grid_size = (n + items_per_block - 1) / items_per_block;
    my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
        n,
        kf::aligned_ptr(input_dev),
        constant,
        kf::aligned_ptr(output_dev));
    // Kernel launches do not return errors directly: pick up launch-config
    // errors, then synchronize to surface asynchronous execution errors.
    cuda_check(cudaGetLastError());
    cuda_check(cudaDeviceSynchronize());

    // Copy results back. The original call had source and destination
    // reversed (host -> device); the destination must be the host buffer.
    cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));

    // Free device memory before verification so a mismatch exception below
    // does not leak the allocations.
    cuda_check(cudaFree(input_dev));
    cuda_check(cudaFree(output_dev));

    // Check results.
    for (int i = 0; i < n; i++) {
        float result = output_result[i];
        float answer = output_expected[i];

        if (result != answer) {
            std::stringstream msg;
            msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
            throw std::runtime_error(msg.str());
        }
    }
}
|
||
// Entry point: exercises the tiling kernel with several items-per-thread
// configurations. run_kernel and cuda_check report failures by throwing;
// catch them here so the program exits with a message and EXIT_FAILURE
// instead of calling std::terminate on an uncaught exception.
int main() {
    int n = 84000; // divisible by 1, 2, 3, 4, 5, 6, 7, 8

    try {
        cuda_check(cudaSetDevice(0));

        run_kernel<1>(n);
        run_kernel<2>(n);
        run_kernel<3>(n);
        run_kernel<4>(n);
        run_kernel<8>(n);
    } catch (const std::exception& e) {
        std::cerr << e.what() << "\n";
        return EXIT_FAILURE;
    }

    std::cout << "result correct\n";
    return EXIT_SUCCESS;
}
Oops, something went wrong.