From 9f8660cf385acc86a337d6ab4c781e31922110d7 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 20:56:17 +0200 Subject: [PATCH 01/63] Fix bug in `export.cpp` when grid/block size was derived from tunable parameters --- src/export.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/export.cpp b/src/export.cpp index 37c43bd..45f1b4e 100644 --- a/src/export.cpp +++ b/src/export.cpp @@ -204,13 +204,21 @@ struct KernelBuilderSerializerHack { result["name"] = builder.kernel_name_; result["compile_flags"] = expr_list_to_json(builder.compile_flags_); - result["block_size"] = expr_list_to_json(builder.block_size_); - result["grid_size"] = expr_list_to_json(builder.grid_size_); result["shared_memory"] = expr_to_json(builder.shared_mem_); result["template_args"] = expr_list_to_json(builder.template_args_); result["defines"] = std::move(defines); result["headers"] = std::move(headers); + result["block_size"] = expr_list_to_json(std::array { + builder.determine_block_size(0), + builder.determine_block_size(1), + builder.determine_block_size(2)}); + + result["grid_size"] = expr_list_to_json(std::array { + builder.determine_block_size(0), + builder.determine_block_size(1), + builder.determine_block_size(2)}); + return result; } }; From 766abb69b7ce016669c1d94ac0f01b44b832b3dc Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:12:05 +0200 Subject: [PATCH 02/63] Print debug information in `KernelInstance::launch` --- include/kernel_launcher/arg.h | 2 ++ include/kernel_launcher/cuda.h | 11 ++++++++++- include/kernel_launcher/utils.h | 4 ++++ src/arg.cpp | 27 +++++++++++++++++++++++++++ src/builder.cpp | 29 +++++++++++++++++++++++++++++ src/compiler.cpp | 3 ++- src/cuda.cpp | 11 +++++++++-- src/utils.cpp | 23 ++++++++++++++++++++--- tests/arg.cpp | 30 ++++++++++++++++++++++++++++++ 9 files changed, 133 insertions(+), 7 deletions(-) diff --git a/include/kernel_launcher/arg.h b/include/kernel_launcher/arg.h index 8b146e4..e4030ed 100644 --- a/include/kernel_launcher/arg.h +++ b/include/kernel_launcher/arg.h @@ -98,6 +98,8 @@ struct KernelArg { std::vector to_bytes() const; void* as_void_ptr() const; + friend std::ostream& operator<<(std::ostream&, const KernelArg&); + private: TypeInfo type_; bool scalar_; diff --git a/include/kernel_launcher/cuda.h b/include/kernel_launcher/cuda.h index 3dd7050..1d08e11 100644 --- a/include/kernel_launcher/cuda.h +++ b/include/kernel_launcher/cuda.h @@ -38,7 +38,10 @@ void cuda_check(CUresult result, const char* msg); * Wrapper around `CUfunction` and the accompanying `CUmodule`. */ struct CudaModule { - CudaModule(const char* image, const char* fun_name); + CudaModule( + const char* image, + const char* lowered_name, + const char* human_name = nullptr); ~CudaModule(); CudaModule() = default; CudaModule(const CudaModule&) = delete; @@ -51,6 +54,7 @@ struct CudaModule { CudaModule& operator=(CudaModule&& that) noexcept { std::swap(that.module_, module_); std::swap(that.fun_ptr_, fun_ptr_); + std::swap(that.fun_name_, fun_name_); return *this; } @@ -61,6 +65,10 @@ struct CudaModule { uint32_t shared_mem, void** args) const; + const std::string& function_name() const { + return fun_name_; + } + CUfunction function() const { return fun_ptr_; } @@ -70,6 +78,7 @@ struct CudaModule { } private: + std::string fun_name_; CUfunction fun_ptr_ = nullptr; CUmodule module_ = nullptr; }; diff --git a/include/kernel_launcher/utils.h b/include/kernel_launcher/utils.h index ac9399f..c8e068e 100644 --- a/include/kernel_launcher/utils.h +++ b/include/kernel_launcher/utils.h @@ -14,6 +14,10 @@ namespace kernel_launcher { +bool log_debug_enabled(); +bool log_info_enabled(); +bool log_warning_enabled(); + std::ostream& log_debug(); std::ostream& log_info(); std::ostream& log_warning(); diff --git a/src/arg.cpp b/src/arg.cpp index 1f34c5c..a0bca93 100644 --- a/src/arg.cpp +++ b/src/arg.cpp @@ -207,4 +207,31 @@ void* KernelArg::as_void_ptr() const { } } +std::ostream& operator<<(std::ostream& os, const KernelArg& arg) { + // There are four possible representations: + // - pointer which is an array (length is known) + // - pointer which is not an array (length is unknown) + // - scalars convertible to `Value` + // - scalars without a representation + if (arg.type().is_pointer()) { + void* ptr; + ::memcpy(&ptr, arg.as_void_ptr(), sizeof(ptr)); + os << "array " << ptr; + + if (arg.is_array()) { + os << " of length " << arg.data_.array.nelements; + } + } else { + Value v = arg.to_value_or_empty(); + + if (!v.is_empty()) { + os << "scalar " << v; + } else { + os << "scalar <...>"; + } + } + + return os << " (type: " << arg.type_.name() << ")"; +} + } // namespace kernel_launcher \ No newline at end of file diff --git a/src/builder.cpp b/src/builder.cpp index 9ab14b7..602f546 100644 --- a/src/builder.cpp +++ b/src/builder.cpp @@ -102,6 +102,35 @@ void KernelInstance::launch( ptrs[i] = args[i].as_void_ptr(); } + if (log_debug_enabled()) { + auto p = problem_size; + auto b = block_size; + auto g = grid_size; + + log_debug() << "launching kernel " << module_.function_name() << "\n"; + log_debug() << " - device: " << CudaDevice::current().name() << "\n"; + log_debug() << " - problem size: [" // + << p.x << ", " << p.y << ", " << p.z << "]\n"; + log_debug() << " - grid size: [" // + << g.x << ", " << g.y << ", " << g.z << "]\n"; + log_debug() << " - block size: [" // + << b.x << ", " << b.y << ", " << b.z << "]\n"; + + if (smem > 0) { + log_debug() << " - shared memory: " << smem << " bytes\n"; + } + + if (stream != nullptr) { + log_debug() << " - stream: " << stream << "\n"; + } + + log_debug() << " - using " << args.size() << " arguments:\n"; + + for (const auto& arg : args) { + log_debug() << " - - " << arg << "\n"; + } + } + module_.launch(stream, grid_size, block_size, smem, ptrs.data()); } diff --git a/src/compiler.cpp b/src/compiler.cpp index fa362c7..5dc55c7 100644 --- a/src/compiler.cpp +++ b/src/compiler.cpp @@ -42,10 +42,11 @@ void KernelDef::add_compiler_option(std::string option) { } CudaModule ICompiler::compile(CudaContextHandle ctx, KernelDef def) const { + std::string human_name = def.name; std::string lowered_name; std::string ptx; compile_ptx(std::move(def), ctx.device().arch(), ptx, lowered_name); - return {ptx.c_str(), lowered_name.c_str()}; + return {ptx.c_str(), lowered_name.c_str(), human_name.c_str()}; } void Compiler::compile_ptx( diff --git a/src/cuda.cpp b/src/cuda.cpp index cc3241a..9247c6e 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -24,11 +24,18 @@ void cuda_check(CUresult result, const char* msg) { } } -CudaModule::CudaModule(const char* image, const char* fun_name) { +CudaModule::CudaModule( + const char* image, + const char* lowered_name, + const char* human_name) { + if (human_name != nullptr) { + fun_name_ = human_name; + } + KERNEL_LAUNCHER_CUDA_CHECK( cuModuleLoadDataEx(&module_, image, 0, nullptr, nullptr)); KERNEL_LAUNCHER_CUDA_CHECK( - cuModuleGetFunction(&fun_ptr_, module_, fun_name)); + cuModuleGetFunction(&fun_ptr_, module_, lowered_name)); } CudaModule::~CudaModule() { diff --git a/src/utils.cpp b/src/utils.cpp index 76b26ea..03ac230 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -46,9 +46,8 @@ struct DummyStream: std::ostream { DummyStream() = default; }; -static std::ostream& log_level(LogLevel level) { +bool log_enabled(LogLevel level) { static constexpr const char* ENV_KEY = "KERNEL_LAUNCHER_LOG"; - static DummyStream dummy_stream; static LogLevel min_level = LogLevel::Unknown; if (min_level == LogLevel::Unknown) { @@ -70,7 +69,25 @@ static std::ostream& log_level(LogLevel level) { } } - if (level < min_level) { + return level >= min_level; +} + +bool log_debug_enabled() { + return log_enabled(LogLevel::Debug); +} + +bool log_info_enabled() { + return log_enabled(LogLevel::Info); +} + +bool log_warning_enabled() { + return log_enabled(LogLevel::Warning); +} + +static std::ostream& log_level(LogLevel level) { + static DummyStream dummy_stream; + + if (!log_enabled(level)) { return dummy_stream; } diff --git a/tests/arg.cpp b/tests/arg.cpp index 9eeac20..5d4949a 100644 --- a/tests/arg.cpp +++ b/tests/arg.cpp @@ -1,9 +1,13 @@ +#include + #include "catch.hpp" #include "kernel_launcher/kernel.h" #include "test_utils.h" using namespace kernel_launcher; +struct MyObject {}; + TEST_CASE("test KernelArg") { SECTION("scalar int") { KernelArg v = into_kernel_arg(5); @@ -109,4 +113,30 @@ TEST_CASE("test KernelArg") { CHECK_THROWS( KernelArg::from_array(input.data(), input.size()).to_array(5)); } + + SECTION("operator<<") { + std::stringstream stream; + + SECTION("scalar primitive") { + stream << KernelArg::from_scalar(int(5)); + CHECK(stream.str() == "scalar 5 (type: int)"); + } + + SECTION("scalar arbitrary") { + stream << KernelArg::from_scalar(MyObject {}); + CHECK(stream.str() == "scalar <...> (type: MyObject)"); + } + + SECTION("scalar pointer") { + int* ptr = reinterpret_cast(0x123); + stream << KernelArg::from_scalar(ptr); + CHECK(stream.str() == "array 0x123 (type: int*)"); + } + + SECTION("array") { + int* ptr = reinterpret_cast(0x123); + stream << KernelArg::from_array(ptr, 5); + CHECK(stream.str() == "array 0x123 of length 5 (type: int*)"); + } + } } \ No newline at end of file From 73e04a23ef1695920197466d323a0be11f889ac8 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:12:41 +0200 Subject: [PATCH 03/63] Fix incorrect device in `DeviceAttrEval` --- src/builder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/builder.cpp b/src/builder.cpp index 602f546..bb7868d 100644 --- a/src/builder.cpp +++ b/src/builder.cpp @@ -39,7 +39,7 @@ struct DeviceAttrEval: Eval { bool lookup(const Variable& v, Value& out) const override { if (const auto* that = dynamic_cast(&v)) { - out = CudaDevice::current().attribute(that->get()); + out = device_.attribute(that->get()); return true; } From fdd5ffda5a7ea144dda2885adceb3d1332748731 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:13:27 +0200 Subject: [PATCH 04/63] Check if config is valid before compiling in `KernelBuilder::compile` --- src/builder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/builder.cpp b/src/builder.cpp index bb7868d..ab3bcd2 100644 --- a/src/builder.cpp +++ b/src/builder.cpp @@ -446,7 +446,6 @@ KernelInstance KernelBuilder::compile( const ICompiler& compiler, CudaContextHandle ctx) const { DeviceAttrEval eval = {ctx.device(), config}; - CudaModule module = compiler.compile(ctx, build(eval, param_types)); if (!is_valid(eval)) { std::stringstream ss; @@ -466,6 +465,7 @@ KernelInstance KernelBuilder::compile( TypedExpr shared_mem = shared_mem_.resolve(eval); + CudaModule module = compiler.compile(ctx, build(eval, param_types)); return { std::move(module), std::move(block_size), From 2b42931fcc53f16fc25e33a16595a563c0a90fad Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:20:04 +0200 Subject: [PATCH 05/63] Fix incorrect grid size calculation in captures --- src/export.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/export.cpp b/src/export.cpp index 45f1b4e..b53a32f 100644 --- a/src/export.cpp +++ b/src/export.cpp @@ -215,9 +215,9 @@ struct KernelBuilderSerializerHack { builder.determine_block_size(2)}); result["grid_size"] = expr_list_to_json(std::array { - builder.determine_block_size(0), - builder.determine_block_size(1), - builder.determine_block_size(2)}); + builder.determine_grid_size(0), + builder.determine_grid_size(1), + builder.determine_grid_size(2)}); return result; } From 92df853c6d07c91b26aee157c90a1620ec932d05 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:20:17 +0200 Subject: [PATCH 06/63] Add `make pretty` --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 87bb8da..fb6b000 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,10 @@ BUILD_DIR=build -fmt: +pretty: clang-format -i include/kernel_launcher/*.h src/*.cpp tests/*.cpp examples/*/*.cu +fmt: pretty + test: ${BUILD_DIR} cd ${BUILD_DIR} && make kernel_launcher_tests cd tests && KERNEL_LAUNCHER_LOG=debug ../${BUILD_DIR}/tests/kernel_launcher_tests ${TEST} @@ -11,7 +13,7 @@ ${BUILD_DIR}: mkdir ${BUILD_DIR} cd ${BUILD_DIR} && cmake -DKERNEL_LAUNCHER_BUILD_TEST=1 -DCMAKE_BUILD_TYPE=debug .. -all: fmt test +all: pretty test clean: -.PHONY: fmt test all clean +.PHONY: pretty fmt test all clean From 52d77038685fb01968cbb794543486809270ef31 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 2 Feb 2023 21:54:58 +0100 Subject: [PATCH 07/63] Initial work on the pragma parser --- CMakeLists.txt | 2 +- Makefile | 2 +- include/kernel_launcher/internal/directives.h | 18 + include/kernel_launcher/internal/parser.h | 35 ++ include/kernel_launcher/internal/tokens.h | 139 +++++++ src/internal/directives.cpp | 325 +++++++++++++++ src/internal/parser.cpp | 188 +++++++++ src/internal/tokens.cpp | 390 ++++++++++++++++++ tests/internal.cpp | 142 +++++++ 9 files changed, 1239 insertions(+), 2 deletions(-) create mode 100644 include/kernel_launcher/internal/directives.h create mode 100644 include/kernel_launcher/internal/parser.h create mode 100644 include/kernel_launcher/internal/tokens.h create mode 100644 src/internal/directives.cpp create mode 100644 src/internal/parser.cpp create mode 100644 src/internal/tokens.cpp create mode 100644 tests/internal.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b7f3e38..fb72ebd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ if(NOT CMAKE_BUILD_TYPE) endif() -file(GLOB sources "${PROJECT_SOURCE_DIR}/src/*.cpp") +file(GLOB sources "${PROJECT_SOURCE_DIR}/src/*.cpp" "${PROJECT_SOURCE_DIR}/src/*/*.cpp") add_library(${PROJECT_NAME} STATIC ${sources}) set(KERNEL_LAUNCHER_CLANG_TIDY clang-tidy -checks=-*,readability-*,bugprone-*,-readability-magic-numbers,-readability-use-anyofallof,-readability-else-after-return) diff --git a/Makefile b/Makefile index fb6b000..6316928 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ BUILD_DIR=build pretty: - clang-format -i include/kernel_launcher/*.h src/*.cpp tests/*.cpp examples/*/*.cu + clang-format -i include/*.h include/*/*.h include/*/*/*.h src/*.cpp src/*/*.cpp tests/*.cpp examples/*/*.cu fmt: pretty diff --git a/include/kernel_launcher/internal/directives.h b/include/kernel_launcher/internal/directives.h new file mode 100644 index 0000000..6d3abdf --- /dev/null +++ b/include/kernel_launcher/internal/directives.h @@ -0,0 +1,18 @@ +#ifndef KERNEL_LAUNCHER_DIRECTIVES_H +#define KERNEL_LAUNCHER_DIRECTIVES_H + +#include "../builder.h" +#include "tokens.h" + +namespace kernel_launcher { +namespace internal { + +KernelBuilder process_kernel( + TokenStream& stream, + const KernelDef& def, + const std::vector& template_args); + +} +} // namespace kernel_launcher + +#endif //KERNEL_LAUNCHER_DIRECTIVES_H diff --git a/include/kernel_launcher/internal/parser.h b/include/kernel_launcher/internal/parser.h new file mode 100644 index 0000000..6c116c0 --- /dev/null +++ b/include/kernel_launcher/internal/parser.h @@ -0,0 +1,35 @@ +#ifndef KERNEL_LAUNCHER_PARSER_H +#define KERNEL_LAUNCHER_PARSER_H + +#include "../value.h" +#include "tokens.h" + +namespace kernel_launcher { +namespace internal { + +struct TemplateParam { + Token name; + bool is_integral; + std::string integral_type; + Value default_value; +}; + +struct FunctionParam { + std::string type; + Token name; +}; + +struct KernelDef { + std::string qualified_name; + Token name; + std::vector directives; + std::vector template_params; + std::vector fun_params; +}; + +std::vector parse_kernels(TokenStream& stream); + +} // namespace internal +} // namespace kernel_launcher + +#endif //KERNEL_LAUNCHER_PARSER_H diff --git a/include/kernel_launcher/internal/tokens.h b/include/kernel_launcher/internal/tokens.h new file mode 100644 index 0000000..d423bd0 --- /dev/null +++ b/include/kernel_launcher/internal/tokens.h @@ -0,0 +1,139 @@ +#ifndef KERNEL_LAUNCHER_TOKENIZER_H +#define KERNEL_LAUNCHER_TOKENIZER_H + +#include +#include +#include +#include + +namespace kernel_launcher { +namespace internal { + +enum class TokenKind { + String, + Number, + ParenL, + ParenR, + BracketL, + BracketR, + BraceL, + BraceR, + AngleL, + AngleR, + DirectiveBegin, + DirectiveEnd, + Ident, + Punct, + Comma, + Unknown, + EndOfFile, +}; + +struct Token { + uint16_t begin = 0; + uint16_t end = 0; + TokenKind kind = TokenKind::Unknown; + + Token() = default; + Token(uint16_t begin, uint16_t end, TokenKind kind) : + begin(begin), + end(end), + kind(kind) {} + + bool operator==(const Token& that) const { + return begin == that.begin && end == that.end && kind == that.kind; + } + + bool operator!=(const Token& that) const { + return !(*this == that); + } +}; + +struct TokenStream; +struct TokenStreamImpl { + friend TokenStream; + + std::string file_; + std::string text_; + std::vector tokens_; +}; + +struct TokenStream { + explicit TokenStream(const TokenStream&) = default; + TokenStream(TokenStream&&) = default; + + TokenStream(std::string file, std::string input); + void reset(Token t); + bool has_next() const; + Token next(); + Token peek(); + void prev(); + + bool matches(Token t, char c) const; + bool matches(Token t, const char* c) const; + + bool matches(Token t, const std::string& s) const { + return matches(t, s.c_str()); + } + + bool matches(Token t, TokenKind kind) const { + return t.kind == kind; + } + + template + bool next_if(T&& pattern) { + if (!matches(peek(), std::forward(pattern))) { + return false; + } + + next(); + return true; + } + + [[noreturn]] void throw_expecting_token(Token t, TokenKind k) const; + [[noreturn]] void throw_expecting_token(Token t, const char* c) const; + + [[noreturn]] void + throw_expecting_token(Token t, const std::string& s) const { + throw_expecting_token(t, s.c_str()); + } + + [[noreturn]] void throw_expecting_token(Token t, char c) const { + char str[2] = {c, '\0'}; + throw_expecting_token(t, str); + } + + template + Token consume(const T& pattern) { + Token t = next(); + if (!matches(t, pattern)) { + throw_expecting_token(t, pattern); + } + + return t; + } + + std::string span(size_t begin, size_t end) const; + + std::string span(Token t) const { + return span(t.begin, t.end); + } + + std::string span(Token begin, Token end) const { + return span(begin.begin, end.end); + } + + std::pair extract_line_column(Token t) const; + + [[noreturn]] void + throw_unexpected_token(Token t, const std::string& reason = "") const; + + private: + size_t index_; + std::shared_ptr impl_; +}; + +} // namespace internal +} // namespace kernel_launcher + +#endif \ No newline at end of file diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp new file mode 100644 index 0000000..a5d5e74 --- /dev/null +++ b/src/internal/directives.cpp @@ -0,0 +1,325 @@ +#include "kernel_launcher/internal/directives.h" + +#include +#include + +#include "kernel_launcher/builder.h" +#include "kernel_launcher/expr.h" +#include "kernel_launcher/internal/parser.h" + +namespace kernel_launcher { +namespace internal { + +struct Context { + std::unordered_map runtime_args; + std::unordered_map compile_args; + std::unordered_map config_args; +}; + +static Expr parse_expr(TokenStream& stream, const Context& ctx, int prec = 0); + +static Expr process_function_call( + Token t, + const TokenStream& stream, + std::vector args) { + std::string name = stream.span(t); + auto assert_nargs = [&](size_t n) { + if (n != args.size()) { + stream.throw_unexpected_token( + t, + "function expects " + std::to_string(n) + "arguments but " + + std::to_string(args.size()) + " arguments were given"); + } + }; + + if (name == "round") { + assert_nargs(1); + return round(args[0]); + } else if (name == "ceil") { + assert_nargs(1); + return ceil(args[0]); + } else if (name == "floor") { + assert_nargs(1); + return floor(args[0]); + } else if (name == "div_ceil") { + assert_nargs(2); + return div_ceil(args[0], args[1]); + } else if (name == "float") { + assert_nargs(1); + return cast(args[0]); + } else { + stream.throw_unexpected_token(t, "unknown function name"); + } +} + +static Expr parse_ident(Token t, TokenStream& stream, const Context& ctx) { + if (stream.next_if(TokenKind::ParenL)) { + std::vector args; + + do { + args.push_back(parse_expr(stream, ctx)); + } while (stream.next_if(TokenKind::Comma)); + + stream.consume(TokenKind::ParenR); + + return process_function_call(t, stream, args); + } + + std::string name = stream.span(t); + + if (name == "null") { + return ScalarExpr(Value {}); + } + + if (name == "true") { + return ScalarExpr(true); + } + + if (name == "false") { + return ScalarExpr(false); + } + + // Is it a config parameter? + { + auto it = ctx.config_args.find(name); + if (it != ctx.config_args.end()) { + return it->second; + } + } + + // Is it a compile-time parameter? + { + auto it = ctx.compile_args.find(name); + if (it != ctx.compile_args.end()) { + return it->second; + } + } + + // Is it a runtime parameter? + { + auto it = ctx.runtime_args.find(name); + if (it != ctx.runtime_args.end()) { + return it->second; + } + } + + stream.throw_unexpected_token(t, "unknown variable name"); +} + +static bool parse_long(const std::string& input, long& output) { + char* endptr = nullptr; + output = strtol(input.c_str(), &endptr, 10); + return *endptr == '\0'; +} + +static Expr parse_prim(TokenStream& stream, const Context& ctx) { + Token t = stream.next(); + + if (t.kind == TokenKind::Ident) { + return parse_ident(t, stream, ctx); + } else if (t.kind == TokenKind::ParenL) { + Expr e = parse_expr(stream, ctx); + stream.consume(TokenKind::ParenR); + return e; + } else if (t.kind == TokenKind::Number) { + long l; + if (!parse_long(stream.span(t), l)) { + stream.throw_unexpected_token(t, "failed to parse as integer"); + } + return ScalarExpr(l); + } else if (stream.matches(t, '-')) { + return -parse_expr(stream, ctx); + } else if (stream.matches(t, '+')) { + return +parse_expr(stream, ctx); + } else if (stream.matches(t, '!')) { + return !parse_expr(stream, ctx); + } else { + stream.throw_unexpected_token(t, "expecting expression"); + } +} + +static Expr parse_expr(TokenStream& stream, const Context& ctx, int prec) { + // TODO: == != <= >= && || % + Expr lhs = parse_prim(stream, ctx); + + while (true) { + if (prec < 6 && stream.next_if('*')) { + lhs = lhs * parse_expr(stream, ctx, 6); + } else if (prec < 6 && stream.next_if('/')) { + lhs = lhs / parse_expr(stream, ctx, 6); + } else if (prec < 5 && stream.next_if('+')) { + lhs = lhs + parse_expr(stream, ctx, 5); + } else if (prec < 5 && stream.next_if('-')) { + lhs = lhs - parse_expr(stream, ctx, 5); + } else if (prec < 3 && stream.next_if('<')) { + lhs = lhs < parse_expr(stream, ctx, 3); + } else if (prec < 3 && stream.next_if('>')) { + lhs = lhs > parse_expr(stream, ctx, 3); + } else { + return lhs; + } + } +} + +static std::vector parse_expr_list( + TokenStream& stream, + const Context& ctx, + size_t max_params = 1024) { + std::vector output; + + stream.consume(TokenKind::ParenL); + + // Empty list + if (stream.next_if(TokenKind::ParenR)) { + return output; + } + + while (output.size() < max_params) { + output.push_back(parse_expr(stream, ctx)); + + if (!stream.next_if(TokenKind::Comma)) { + break; + } + } + + stream.consume(TokenKind::ParenR); + return output; +} + +static std::array +parse_expr_list3(TokenStream& stream, const Context& ctx) { + auto list = parse_expr_list(stream, ctx, 3); + return { + list.size() > 0 ? list[0] : 1, + list.size() > 1 ? list[1] : 1, + list.size() > 2 ? list[2] : 1, + }; +} + +struct DummyEval: Eval { + bool lookup(const Variable& v, Value& out) const override { + throw std::runtime_error("internal error"); + } +}; + +static void +process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { + stream.consume("pragma"); + stream.consume("kernel_tuner"); + + while (true) { + if (stream.next_if(TokenKind::DirectiveEnd)) { + break; + } + + Token t = stream.consume(TokenKind::Ident); + std::string name = stream.span(t); + + if (name == "tune") { + stream.consume(TokenKind::ParenL); + Token var_token = stream.consume(TokenKind::Ident); + stream.consume('='); + + std::string var = stream.span(var_token); + + if (ctx.config_args.count(var)) { + stream.throw_unexpected_token(var_token, "variable redefined"); + } + + if (ctx.compile_args.count(var)) { + stream.throw_unexpected_token( + var_token, + "variable already passed as compile-time value"); + } + + std::vector values; + std::vector priors; + + do { + Value v = parse_expr(stream, {}).eval(DummyEval {}); + + values.push_back(v); + priors.push_back(1.0); + } while (stream.next_if(TokenKind::Comma)); + + stream.consume(TokenKind::ParenR); + + auto param = builder.add(var, values, priors, values.front()); + + ctx.config_args.insert({var, param}); + } else if (name == "grid_size") { + auto l = parse_expr_list3(stream, ctx); + builder.grid_size(l[0], l[1], l[2]); + } else if (name == "block_size") { + auto l = parse_expr_list3(stream, ctx); + builder.block_size(l[0], l[1], l[2]); + } else if (name == "grid_divisor") { + auto l = parse_expr_list3(stream, ctx); + builder.grid_divisors(l[0], l[1], l[2]); + } else if (name == "problem_size") { + auto l = parse_expr_list3(stream, ctx); + builder.problem_size(l[0], l[1], l[2]); + } else { + stream.throw_unexpected_token( + t, + "this is not a supported action in kernel_launcher"); + } + } +} + +KernelBuilder process_kernel( + TokenStream& stream, + const KernelDef& def, + const std::vector& template_args) { + auto builder = + KernelBuilder(def.qualified_name, KernelSource("TODO", "TODO")); + + Context ctx; + + for (size_t i = 0; i < def.fun_params.size(); i++) { + std::string name = stream.span(def.fun_params[i].name); + ctx.runtime_args.insert({name, ArgExpr(uint8_t(i))}); + } + + if (template_args.size() > def.template_params.size()) { + throw std::runtime_error( + "cannot provide " + std::to_string(template_args.size()) + + " arguments to kernel " + def.qualified_name + + " since it takes at most " + + std::to_string(def.template_params.size()) + " arguments"); + } + + for (size_t i = 0; i < template_args.size(); i++) { + std::string name = stream.span(def.template_params[i].name); + ctx.compile_args.insert({name, template_args[i]}); + } + + for (const auto& directive : def.directives) { + stream.reset(directive); + process_directive(stream, builder, ctx); + } + + for (const auto& param : def.template_params) { + std::string name = stream.span(param.name); + Expr e = nullptr; + + if (ctx.compile_args.count(name) > 0) { + e = ctx.compile_args.at(name); + } else if (ctx.config_args.count(name) > 0) { + e = ctx.config_args.at(name); + } else { + stream.throw_unexpected_token( + param.name, + "this parameter is not defined, please add " + "`#pragma kernel_tuner tune(" + + name + "=...)`"); + } + + builder.template_arg(e); + } + + return builder; +} + +} // namespace internal +} // namespace kernel_launcher \ No newline at end of file diff --git a/src/internal/parser.cpp b/src/internal/parser.cpp new file mode 100644 index 0000000..4dda812 --- /dev/null +++ b/src/internal/parser.cpp @@ -0,0 +1,188 @@ +#include "kernel_launcher/internal/parser.h" + +#include + +#include "kernel_launcher/expr.h" + +namespace kernel_launcher { +namespace internal { + +static bool parse_long(const std::string& s, long& output) { + char* end_ptr = nullptr; + output = strtol(s.c_str(), &end_ptr, 10); + return *end_ptr == '\0'; +} + +static std::vector parse_template_params(TokenStream& stream) { + std::vector params; + + do { + std::string ty = stream.span(stream.consume(TokenKind::Ident)); + Token name = stream.consume(TokenKind::Ident); + Value default_value; + + bool is_integral = ty != "typename" && ty != "class"; + + if (is_integral && stream.next_if('=')) { + Token v = stream.consume(TokenKind::Number); + long l; + + if (!parse_long(stream.span(v), l)) { + stream.throw_unexpected_token(v, "invalid integer"); + } + + default_value = l; + } + + params.push_back(TemplateParam {name, is_integral, ty, default_value}); + } while (stream.next_if(TokenKind::Comma)); + + return params; +} + +static std::vector parse_kernel_params(TokenStream& stream) { + std::vector params; + + do { + Token begin = stream.next(); + Token before_name = begin; + Token name = stream.next(); + Token end = stream.peek(); + + while (end.kind != TokenKind::Comma && end.kind != TokenKind::ParenR) { + before_name = name; + name = stream.next(); + end = stream.peek(); + } + + if (name.kind != TokenKind::Ident) { + stream.throw_expecting_token(name, TokenKind::Ident); + } + + params.push_back({ + stream.span(begin, before_name), + name, + }); + } while (stream.next_if(TokenKind::Comma)); + + return params; +} + +static KernelDef +parse_kernel(TokenStream& stream, const std::vector& namespaces) { + std::vector directives; + std::vector template_params; + + // Advance the stream past all directives + while (stream.next_if(TokenKind::DirectiveBegin)) { + Token t = stream.next(); + directives.push_back(t); + + // Find the directive end + while (t.kind != TokenKind::DirectiveEnd) { + t = stream.next(); + } + } + + if (stream.next_if("template")) { + stream.consume(TokenKind::AngleL); + template_params = parse_template_params(stream); + stream.consume(TokenKind::AngleR); + } + + stream.consume("__global__"); + stream.consume("void"); + Token name = stream.consume(TokenKind::Ident); + + stream.consume(TokenKind::ParenL); + auto fun_params = parse_kernel_params(stream); + stream.consume(TokenKind::ParenR); + + std::string qualified_name; + for (const auto& n : namespaces) { + qualified_name += n; + qualified_name += "::"; + } + qualified_name += stream.span(name); + + return { + qualified_name, + name, + directives, + template_params, + fun_params, + }; +} + +enum struct Scope { + Paren, + Bracket, + Brace, + Namespace, +}; + +std::vector parse_kernels(TokenStream& stream) { + std::vector namespace_stack; + std::vector scope_stack; + std::vector kernels; + + while (stream.has_next()) { + Token t = stream.next(); + + if (t.kind == TokenKind::Ident && stream.matches(t, "namespace")) { + t = stream.consume(TokenKind::Ident); + namespace_stack.push_back(stream.span(t)); + + stream.consume(TokenKind::BraceL); + scope_stack.push_back(Scope::Namespace); + } else if (t.kind == TokenKind::BraceL) { + scope_stack.push_back(Scope::Brace); + } else if (t.kind == TokenKind::BraceR) { + if (scope_stack.empty()) { + stream.throw_unexpected_token(t, "no matching '{' found"); + } + + Scope s = scope_stack.back(); + scope_stack.pop_back(); + + if (s == Scope::Namespace) { + namespace_stack.pop_back(); + } else if (s != Scope::Brace) { + stream.throw_unexpected_token(t, "no matching '{' found"); + } + } else if (t.kind == TokenKind::ParenL) { + scope_stack.push_back(Scope::Paren); + } else if (t.kind == TokenKind::ParenR) { + if (scope_stack.empty() || scope_stack.back() != Scope::Paren) { + stream.throw_unexpected_token(t, "no matching '(' found"); + } + + scope_stack.pop_back(); + } else if (t.kind == TokenKind::BracketL) { + scope_stack.push_back(Scope::Bracket); + } else if (t.kind == TokenKind::BracketR) { + if (scope_stack.empty() || scope_stack.back() != Scope::Bracket) { + stream.throw_unexpected_token(t, "no matching '[' found"); + } + + scope_stack.pop_back(); + } else if (t.kind == TokenKind::DirectiveBegin) { + bool is_pragma = + stream.next_if("pragma") && stream.next_if("kernel_tuner"); + stream.reset(t); + + if (is_pragma) { + kernels.push_back(parse_kernel(stream, namespace_stack)); + } else { + while (t.kind != TokenKind::DirectiveEnd) { + t = stream.next(); + } + } + } + } + + return kernels; +} + +} // namespace internal +} // namespace kernel_launcher \ No newline at end of file diff --git a/src/internal/tokens.cpp b/src/internal/tokens.cpp new file mode 100644 index 0000000..53c77fe --- /dev/null +++ b/src/internal/tokens.cpp @@ -0,0 +1,390 @@ +#include "kernel_launcher/internal/tokens.h" + +#include +#include +#include +#include +#include + +namespace kernel_launcher { +namespace internal { + +using index_t = uint16_t; + +static bool iswhitespace(char c) { + return c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v' + || c == '\f'; +} + +static bool isdigit(char c) { + return c >= '0' && c <= '9'; +} + +static bool isident(char c) { + return isdigit(c) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') + || c == '_' || c == '$'; +} + +static index_t +advance_single_line_comment(index_t i, const std::string& input) { + while (i < input.size() && input[i] != '\n') { + i++; + } + + return i; +} + +static index_t advance_multi_line_comment(index_t i, const std::string& input) { + char prev = '\0'; + + while (i < input.size()) { + char curr = input[i++]; + + if (prev == '*' && curr == '/') { + break; + } + + prev = curr; + } + + return i; +} + +static index_t advance_number(index_t i, const std::string& input) { + while (isdigit(input[i])) { + i++; + } + + return i; +} + +static index_t advance_ident(index_t i, const std::string& input) { + while (isident(input[i])) { + i++; + } + + return i; +} + +static index_t advance_string(index_t i, const std::string& input) { + char quote = input[i]; + bool prev_backslash = false; + i++; + + while (i < input.size()) { + if (input[i] == quote && !prev_backslash) { + i++; + return i; + } + + prev_backslash = input[i] == '\\' && !prev_backslash; + i++; + } + + return i; +} + +TokenKind char2_to_kind(char a, char b) { + if ((a == '=' && b == '=') || (a == '<' && b == '=') + || (a == '>' && b == '=') || (a == '&' && b == '&') + || (a == '|' && b == '|') || (a == '<' && b == '<') + || (a == '>' && b == '>') || (a == ':' && b == ':')) { + return TokenKind::Punct; + } + + return TokenKind::Unknown; +} + +TokenKind char_to_kind(char c) { + switch (c) { + case '{': + return TokenKind::BraceL; + case '}': + return TokenKind::BraceR; + case '[': + return TokenKind::BracketL; + case ']': + return TokenKind::BracketR; + case '(': + return TokenKind::ParenL; + case ')': + return TokenKind::ParenR; + case ',': + return TokenKind::Comma; + case '<': + return TokenKind::AngleL; + case '>': + return TokenKind::AngleR; + case '+': + case '=': + case '-': + case '*': + case '/': + case '!': + case '~': + case '&': + case '|': + case '^': + case '%': + return TokenKind::Punct; + default: + // Remaining: .:;?@ + return TokenKind::Unknown; + } +} + +std::vector tokenize(const std::string& input) { + std::vector tokens; + + if (input.size() >= std::numeric_limits::max()) { + throw std::runtime_error("TODO"); + } + + index_t index = 0; + bool inside_directive = false; + + while (index < input.size()) { + index_t begin = index; + char c = input[index]; + char next = input[index + 1]; + TokenKind kind = TokenKind::Unknown; + + if (!inside_directive && c == '#') { + kind = TokenKind::DirectiveBegin; + index++; + inside_directive = true; + } else if (inside_directive && c == '\n') { + kind = TokenKind::DirectiveEnd; + index++; + inside_directive = false; + } else if (inside_directive && c == '\\' && next == '\n') { + index++; // skip backslash + index++; // skip newline + continue; + } else if (iswhitespace(c)) { + index++; + continue; + } else if (c == '/' && next == '/') { + index = advance_single_line_comment(index, input); + continue; + } else if (c == '/' && next == '*') { + index = advance_multi_line_comment(index, input); + continue; + } else if (isdigit(c)) { + index = advance_number(index, input); + kind = TokenKind::Number; + } else if (isident(c)) { + index = advance_ident(index, input); + kind = TokenKind::Ident; + } else if (c == '"' || c == '\'') { + index = advance_string(index, input); + kind = TokenKind::String; + } else if ((kind = char2_to_kind(c, next)) != TokenKind::Unknown) { + index++; + index++; + } else if ((kind = char_to_kind(c)) != TokenKind::Unknown) { + index++; + } else { + // Unknown character :( + kind = TokenKind::Unknown; + index++; + } + + tokens.emplace_back(begin, index, kind); + } + + if (inside_directive) { + tokens.emplace_back(index, index, TokenKind::DirectiveEnd); + } + + tokens.emplace_back(index, index, TokenKind::EndOfFile); + return tokens; +} + +TokenStream::TokenStream(std::string file, std::string input) : index_(0) { + auto tokens = tokenize(input); + impl_ = std::make_shared( + TokenStreamImpl {std::move(file), std::move(input), std::move(tokens)}); +} + +void TokenStream::reset(Token t) { + const auto& tokens = impl_->tokens_; + auto it = std::lower_bound( + tokens.begin(), + tokens.end(), + t, + [&](const auto& lhs, const auto& rhs) { + return lhs.begin < rhs.begin; + }); + + if (it == tokens.end() || *it != t) { + throw std::runtime_error("cannot reset to unknown token"); + } + + index_ = static_cast(it - tokens.begin()); +} + +bool TokenStream::has_next() const { + return index_ < impl_->tokens_.size(); +} + +Token TokenStream::next() { + Token t = peek(); + index_++; + return t; +} + +Token TokenStream::peek() { + const auto& tokens = impl_->tokens_; + + if (index_ >= tokens.size()) { + throw std::runtime_error("unexpected EOF while parsing"); + } + + return tokens[index_]; +} + +void TokenStream::prev() { + if (index_ > 0) { + index_--; + } +} + +bool TokenStream::matches(Token t, char c) const { + const auto& text = impl_->text_; + + if (t.begin + 1 != t.end || t.begin >= text.size()) { + return false; + } + + return text[t.begin] == c; +} + +bool TokenStream::matches(Token t, const char* needle) const { + const auto& text = impl_->text_; + + if (t.end > text.size()) { + return false; + } + + for (index_t i = t.begin; i < t.end; i++) { + if (*needle != text[i] || *needle == '\0') { + return false; + } + + needle++; + } + + return *needle == '\0'; +} + +std::pair TokenStream::extract_line_column(Token t) const { + const auto& text = impl_->text_; + int lineno = 1; + int colno = 1; + + for (size_t i = 0; i < text.size() && i < t.begin; i++) { + if (text[i] == '\n') { + lineno++; + colno = 1; + } else { + colno++; + } + } + + return {lineno, colno}; +} + +static std::string clean_string(const std::string& input) { + std::stringstream output; + + for (char c : input) { + if (isprint(c) != 0) { + output << c; + } else if (c == '\n') { + output << "\\n"; + } else if (c == '\0') { + output << "\\0"; + } else { + output << "?"; + } + } + + return output.str(); +} + +static std::string token_description(TokenKind k) { + switch (k) { + case TokenKind::String: + return "string"; + case TokenKind::Number: + return "integer"; + case TokenKind::ParenL: + return "'('"; + case TokenKind::ParenR: + return "')'"; + case TokenKind::BracketL: + return "'['"; + case TokenKind::BracketR: + return "']'"; + case TokenKind::BraceL: + return "'{'"; + case TokenKind::BraceR: + return "'}'"; + case TokenKind::AngleL: + return "'<'"; + case TokenKind::AngleR: + return "'>'"; + case TokenKind::DirectiveBegin: + return "'#'"; + case TokenKind::DirectiveEnd: + return ""; + case TokenKind::Ident: + return "identifier"; + case TokenKind::Comma: + return "','"; + case TokenKind::EndOfFile: + return ""; + default: + return "unknown token"; + } +} + +[[noreturn]] void +TokenStream::throw_expecting_token(Token t, TokenKind k) const { + throw_unexpected_token(t, "expecting token " + token_description(k)); +} + +[[noreturn]] void +TokenStream::throw_expecting_token(Token t, const char* c) const { + std::string reason = "expecting token \"" + clean_string(c) + "\""; + throw_unexpected_token(t, reason); +} + +void TokenStream::throw_unexpected_token(Token t, const std::string& reason) + const { + auto line_col = extract_line_column(t); + + std::stringstream msg; + msg << "error:" << impl_->file_ << ":" << line_col.first << ":" + << line_col.second << ": found invalid token \"" + << clean_string(span(t)) << "\""; + + if (!reason.empty()) { + msg << ", " << reason; + } + + throw std::runtime_error(msg.str()); +} + +std::string TokenStream::span(size_t begin, size_t end) const { + const auto& text = impl_->text_; + if (begin > end || end > text.size()) { + throw std::runtime_error("index out of bounds"); + } + + return text.substr(begin, end - begin); +} + +} // namespace internal +} // namespace kernel_launcher \ No newline at end of file diff --git a/tests/internal.cpp b/tests/internal.cpp new file mode 100644 index 0000000..3d8d0c3 --- /dev/null +++ b/tests/internal.cpp @@ -0,0 +1,142 @@ +#include + +#include "catch.hpp" +#include "kernel_launcher/internal/directives.h" +#include "kernel_launcher/internal/parser.h" +#include "kernel_launcher/internal/tokens.h" +#include "test_utils.h" + +using namespace kernel_launcher; + +TEST_CASE("tokenizer") { + using internal::TokenKind; + + std::string input = R"( + #include + + #pragma kernel_tuner test + void foo(int x) { + /* Test + multi-line * / + comment */ + + "a" == 1.2 // should not compile + } + )"; + + std::vector> expected = { + {"#", TokenKind::DirectiveBegin}, + {"include", TokenKind::Ident}, + {"<", TokenKind::AngleL}, + {"stdio", TokenKind::Ident}, + {">", TokenKind::AngleR}, + {"", TokenKind::DirectiveEnd}, + {"#", TokenKind::DirectiveBegin}, + {"pragma", TokenKind::Ident}, + {"kernel_tuner", TokenKind::Ident}, + {"test", TokenKind::Ident}, + {"", TokenKind::DirectiveEnd}, + {"void", TokenKind::Ident}, + {"foo", TokenKind::Ident}, + {"(", TokenKind::ParenL}, + {"int", TokenKind::Ident}, + {"x", TokenKind::Ident}, + {")", TokenKind::ParenR}, + {"{", TokenKind::BraceL}, + {"\"a\"", TokenKind::String}, + {"=", TokenKind::Punct}, + {"=", TokenKind::Punct}, + {"1.2", TokenKind::Number}, + {"}", TokenKind::BraceR}}; + + auto stream = internal::TokenStream("", input); + + for (size_t i = 0; i < expected.size(); i++) { + auto token = stream.next(); + + INFO("index=" << i); + CHECK(stream.span(token) == expected[i].first); + CHECK(token.kind == expected[i].second); + } +} + +TEST_CASE("parser") { + std::string input = R"( +// This is a comment +namespace foo { +namespace bar { + +#ifdef SOMECONSTANT +#endif + +#pragma kernel_tuner tune(block_size=32, 64, 128, 256) +#pragma kernel_tuner problem_size(n) +__global__ void baz(int n, const float* a) { + if (threadIdx.x < 10) { + return a[threadIdx.x]; + } +} +} // namespace bar + +#pragma kernel_tuner tune(block_size=32, 64, 128, 256) +#pragma kernel_tuner tune(tiling_factor=1,2,3,4) +#pragma kernel_tuner problem_size(n) \ + grid_divisor(tiling_factor * block_size) +template +__global__ void spaz(int n, const float* input, float* output) { + if (threadIdx.x < 10) { + return a[threadIdx.x]; + } +} +} // namespace foo + )"; + + auto stream = internal::TokenStream("", input); + auto kernels = parse_kernels(stream); + + REQUIRE(kernels.size() == 2); + + const auto& baz = kernels[0]; + CHECK(baz.qualified_name == "foo::bar::baz"); + REQUIRE(baz.fun_params.size() == 2); + CHECK(stream.span(baz.fun_params[0].name) == "n"); + CHECK(stream.span(baz.fun_params[1].name) == "a"); + REQUIRE(baz.template_params.size() == 0); + + const auto& spaz = kernels[1]; + CHECK(spaz.qualified_name == "foo::spaz"); + REQUIRE(spaz.fun_params.size() == 3); + CHECK(stream.span(spaz.fun_params[0].name) == "n"); + CHECK(stream.span(spaz.fun_params[1].name) == "input"); + CHECK(stream.span(spaz.fun_params[2].name) == "output"); + REQUIRE(spaz.template_params.size() == 1); + CHECK(stream.span(spaz.template_params[0].name) == "tiling_factor"); +} + +TEST_CASE("directives") { + std::string input = R"( + namespace bar { + namespace foo { + + #pragma kernel_tuner tune(block_size=32, 64, 128, 256) + #pragma kernel_tuner tune(tile_factor=1, 2, 3, 4) + #pragma kernel_tuner problem_size(n) + #pragma kernel_tuner grid_divisor(block_size * tile_factor) + template + __global__ void baz(int n, const T* a) { + if (threadIdx.x < 10) { + return a[threadIdx.x]; + } + } + } // namespace foo + } // namespace bar + )"; + + auto stream = internal::TokenStream("", input); + auto kernels = parse_kernels(stream); + + REQUIRE(kernels.size() == 1); + const auto& kernel = kernels[0]; + + KernelBuilder builder = process_kernel(stream, kernels[0], {"float"}); +} \ No newline at end of file From 57a2462f7ecfc26db607806136f7c23f74839896 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 2 Feb 2023 21:55:41 +0100 Subject: [PATCH 08/63] Fix bug in Value when dealing with NULL ptrs for strings --- include/kernel_launcher/value.h | 1 + src/value.cpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/kernel_launcher/value.h b/include/kernel_launcher/value.h index 3beb3a9..8503856 100644 --- a/include/kernel_launcher/value.h +++ b/include/kernel_launcher/value.h @@ -54,6 +54,7 @@ struct Value { static constexpr DataType type_bool = DataType::bool_; Value() = default; + Value(std::nullptr_t) : Value() {} Value(const Value& val) { *this = val; diff --git a/src/value.cpp b/src/value.cpp index 6bea2b4..e397336 100644 --- a/src/value.cpp +++ b/src/value.cpp @@ -46,6 +46,12 @@ static const char* data_type_name(Value::DataType dtype) { } const std::string& intern_string(const char* input) { + if (input == nullptr) { + throw std::runtime_error( + "cannot create kernel_launcher::Value " + "from NULL pointer as string"); + } + static constexpr size_t initial_capacity = 64; auto equal = [](const char* a, const char* b) { return strcmp(a, b) == 0; }; auto hash = [](const char* v) { return hash_string(v, ::strlen(v)); }; From d74dd9bcc9ff76b8930f0211d0fb1a48c47c89a0 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 2 Feb 2023 21:56:15 +0100 Subject: [PATCH 09/63] Change clang-format to never indent namespaces --- .clang-format | 2 +- include/kernel_launcher/expr.h | 90 ++++++++++++++++----------------- include/kernel_launcher/utils.h | 62 +++++++++++------------ 3 files changed, 77 insertions(+), 77 deletions(-) diff --git a/.clang-format b/.clang-format index d309bdc..0c0afcc 100644 --- a/.clang-format +++ b/.clang-format @@ -55,7 +55,7 @@ IndentWidth: 4 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MaxEmptyLinesToKeep: 1 -NamespaceIndentation: Inner +NamespaceIndentation: None PointerAlignment: Left ReflowComments: false SortIncludes: true diff --git a/include/kernel_launcher/expr.h b/include/kernel_launcher/expr.h index 6af0c4a..21e5cb1 100644 --- a/include/kernel_launcher/expr.h +++ b/include/kernel_launcher/expr.h @@ -83,56 +83,56 @@ struct ParamExpr: BaseExpr { }; namespace detail { - std::true_type is_expr_helper(const BaseExpr*); - std::false_type is_expr_helper(...); +std::true_type is_expr_helper(const BaseExpr*); +std::false_type is_expr_helper(...); - template - constexpr bool is_expr = decltype(detail::is_expr_helper( - std::declval::type*>()))::value; - - template - struct into_expr_helper; - - // TunableParam -> ParamExpr - template - struct into_expr_helper< - I, - T, - typename std::enable_if< - std::is_same::type, TunableParam>::value>:: - type> { - using type = ParamExpr; - - static type call(const TunableParam& p) { - return ParamExpr(std::move(p)); - } - }; +template +constexpr bool is_expr = decltype(detail::is_expr_helper( + std::declval::type*>()))::value; + +template +struct into_expr_helper; + +// TunableParam -> ParamExpr +template +struct into_expr_helper< + I, + T, + typename std::enable_if< + std::is_same::type, TunableParam>::value>:: + type> { + using type = ParamExpr; + + static type call(const TunableParam& p) { + return ParamExpr(std::move(p)); + } +}; - // TypedExpr -> TypedExpr - template - struct into_expr_helper< - E, - T, - typename std::enable_if>::type> { - using type = typename std::decay::type; +// TypedExpr -> TypedExpr +template +struct into_expr_helper< + E, + T, + typename std::enable_if>::type> { + using type = typename std::decay::type; - static type call(E&& expr) { - return std::forward(expr); - } - }; + static type call(E&& expr) { + return std::forward(expr); + } +}; - // R -> ScalarExpr (Where R is convertible to T) - template - struct into_expr_helper< - R, - T, - typename std::enable_if::value>::type> { - using type = ScalarExpr; +// R -> ScalarExpr (Where R is convertible to T) +template +struct into_expr_helper< + R, + T, + typename std::enable_if::value>::type> { + using type = ScalarExpr; - static ScalarExpr call(R&& value) { - return ScalarExpr(T(std::forward(value))); - } - }; + static ScalarExpr call(R&& value) { + return ScalarExpr(T(std::forward(value))); + } +}; } // namespace detail struct SharedExpr: BaseExpr { diff --git a/include/kernel_launcher/utils.h b/include/kernel_launcher/utils.h index c8e068e..ba579b3 100644 --- a/include/kernel_launcher/utils.h +++ b/include/kernel_launcher/utils.h @@ -25,38 +25,38 @@ std::ostream& log_warning(); std::string demangle_type_info(const std::type_info& type); namespace detail { - template - inline const std::string& demangle_type_info_for() { - static std::string result = demangle_type_info(typeid(T)); - return result; - } - - struct TypeInfoInternalImpl { - size_t alignment; - size_t size; - const std::type_info& type_info; - const std::string& (*name_fun)(); - const TypeInfoInternalImpl* remove_pointer_type; - const TypeInfoInternalImpl* remove_const; - const TypeInfoInternalImpl* add_const; - bool is_const; - bool is_empty; - bool is_trivial_copy; - }; +template +inline const std::string& demangle_type_info_for() { + static std::string result = demangle_type_info(typeid(T)); + return result; +} - template - static constexpr TypeInfoInternalImpl type_impl_for = { - alignof(T), - sizeof(T), - typeid(T), - demangle_type_info_for, - &type_impl_for::type>, - &type_impl_for::type>, - &type_impl_for::type>, - std::is_const::value, - std::is_empty::value, - std::is_trivially_copyable::value, - }; +struct TypeInfoInternalImpl { + size_t alignment; + size_t size; + const std::type_info& type_info; + const std::string& (*name_fun)(); + const TypeInfoInternalImpl* remove_pointer_type; + const TypeInfoInternalImpl* remove_const; + const TypeInfoInternalImpl* add_const; + bool is_const; + bool is_empty; + bool is_trivial_copy; +}; + +template +static constexpr TypeInfoInternalImpl type_impl_for = { + alignof(T), + sizeof(T), + typeid(T), + demangle_type_info_for, + &type_impl_for::type>, + &type_impl_for::type>, + &type_impl_for::type>, + std::is_const::value, + std::is_empty::value, + std::is_trivially_copyable::value, +}; } // namespace detail /** From ef7d04198604fffd8d5b08b2191b86d59d46dac1 Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 3 Feb 2023 19:46:25 +0100 Subject: [PATCH 10/63] Add `PragmaKernel` and `build_pragma_kernel` to public API --- ...{main_annotated.cu => kernel_annotated.cu} | 24 +++--- examples/vector_add/main.cu | 9 +- include/kernel_launcher/internal/directives.h | 1 + include/kernel_launcher/internal/tokens.h | 27 +++--- include/kernel_launcher/pragma.h | 32 +++++++ src/internal/directives.cpp | 30 ++++--- src/internal/parser.cpp | 45 +++++----- src/internal/tokens.cpp | 55 +++++------- src/pragma.cpp | 70 +++++++++++++++ tests/internal.cpp | 85 +++++++++++++------ 10 files changed, 257 insertions(+), 121 deletions(-) rename examples/vector_add/{main_annotated.cu => kernel_annotated.cu} (67%) create mode 100644 include/kernel_launcher/pragma.h create mode 100644 src/pragma.cpp diff --git a/examples/vector_add/main_annotated.cu b/examples/vector_add/kernel_annotated.cu similarity index 67% rename from examples/vector_add/main_annotated.cu rename to examples/vector_add/kernel_annotated.cu index bd233c8..993d383 100644 --- a/examples/vector_add/main_annotated.cu +++ b/examples/vector_add/kernel_annotated.cu @@ -1,16 +1,16 @@ -#pragma kernel_tuner tune(block_size) values(32, 64, 128, 256, 512, 1024) -#pragma kernel_tuner tune(elements_per_thread) values(1, 2, 3, 4) -#pragma kernel_tuner tune(tiling_strategy) values(0, 1, 2) -#pragma kernel_tuner template_args( \ - block_size, \ - elements_per_thread, \ - tiling_strategy) -#pragma kernel_tuner grid_divisor(block_size* elements_per_thread) -#pragma kernel_tuner restriction(block_size* elements_per_thread >= 64) +#pragma kernel_tuner tune(block_size = 32, 64, 128, 256, 512, 1024) +#pragma kernel_tuner tune(elements_per_thread = 1, 2, 3, 4) +#pragma kernel_tuner tune(tiling_strategy = 0, 1, 2) +#pragma kernel_tuner block_size(block_size) #pragma kernel_tuner problem_size(n) -#pragma kernel_tuner buffers(A[n], B[n], C[n]) -template -__global__ void vector_add(int n, float* C, const float* A, const float* B) { +#pragma kernel_tuner grid_divisor(block_size* elements_per_thread) +#pragma kernel_tuner restriction(block_size* elements_per_thread >= 32) +template< + typename T, + int block_size = 32, + int elements_per_thread = 2, + int tiling_strategy = 2> +__global__ void vector_add(int n, T* C, const T* A, const T* B) { static_assert( tiling_strategy >= 0 && tiling_strategy <= 2, "invalid tiling strategy"); diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 8435f83..c2cd9db 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -1,6 +1,7 @@ #include #include "kernel_launcher.h" +#include "kernel_launcher/pragma.h" namespace kl = kernel_launcher; @@ -11,14 +12,16 @@ void cuda_check(cudaError_t code) { } } -kl::KernelBuilder build_vector_add() { +std::string kernel_directory() { // Find kernel file std::string this_file = __FILE__; std::string this_directory = this_file.substr(0, this_file.rfind('/')); - std::string kernel_file = this_directory + "/kernel.cu"; + return this_directory + "/"; +} +kl::KernelBuilder build_vector_add() { // Tunable parameters - kl::KernelBuilder builder("vector_add", kernel_file); + kl::KernelBuilder builder("vector_add", kernel_directory() + "/kernel.cu"); auto threads_per_block = builder.tune("threads_per_block", {32, 64, 128, 256, 512, 1024}); auto blocks_per_sm = diff --git a/include/kernel_launcher/internal/directives.h b/include/kernel_launcher/internal/directives.h index 6d3abdf..7ce3ca1 100644 --- a/include/kernel_launcher/internal/directives.h +++ b/include/kernel_launcher/internal/directives.h @@ -2,6 +2,7 @@ #define KERNEL_LAUNCHER_DIRECTIVES_H #include "../builder.h" +#include "parser.h" #include "tokens.h" namespace kernel_launcher { diff --git a/include/kernel_launcher/internal/tokens.h b/include/kernel_launcher/internal/tokens.h index d423bd0..68d67a7 100644 --- a/include/kernel_launcher/internal/tokens.h +++ b/include/kernel_launcher/internal/tokens.h @@ -49,28 +49,19 @@ struct Token { } }; -struct TokenStream; -struct TokenStreamImpl { - friend TokenStream; - - std::string file_; - std::string text_; - std::vector tokens_; -}; - struct TokenStream { explicit TokenStream(const TokenStream&) = default; TokenStream(TokenStream&&) = default; TokenStream(std::string file, std::string input); - void reset(Token t); + void seek(Token t); bool has_next() const; Token next(); Token peek(); void prev(); bool matches(Token t, char c) const; - bool matches(Token t, const char* c) const; + bool matches(Token t, const char* needle) const; bool matches(Token t, const std::string& s) const { return matches(t, s.c_str()); @@ -128,9 +119,19 @@ struct TokenStream { [[noreturn]] void throw_unexpected_token(Token t, const std::string& reason = "") const; + const std::string& file() const { + return file_; + } + + const std::string& content() const { + return text_; + } + private: - size_t index_; - std::shared_ptr impl_; + std::string file_; + std::string text_; + size_t index_ = 0; + std::vector tokens_; }; } // namespace internal diff --git a/include/kernel_launcher/pragma.h b/include/kernel_launcher/pragma.h new file mode 100644 index 0000000..b325080 --- /dev/null +++ b/include/kernel_launcher/pragma.h @@ -0,0 +1,32 @@ +#ifndef KERNEL_LAUNCHER_PRAGMA_H +#define KERNEL_LAUNCHER_PRAGMA_H + +#include "kernel_launcher/registry.h" + +namespace kernel_launcher { + +KernelBuilder build_pragma_kernel( + const KernelSource& source, + const std::string& name, + const std::vector& template_args = {}, + const FileLoader& fs = DefaultLoader {}); + +struct PragmaKernel: IKernelDescriptor { + PragmaKernel( + std::string path, + std::string name, + std::vector template_args = {}); + + KernelBuilder build() const override; + bool equals(const IKernelDescriptor& that) const override; + hash_t hash() const override; + + private: + std::string kernel_name_; + std::string file_path_; + std::vector template_args_; +}; + +} // namespace kernel_launcher + +#endif //KERNEL_LAUNCHER_PRAGMA_H diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index a5d5e74..2fd52be 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -155,6 +155,14 @@ static Expr parse_expr(TokenStream& stream, const Context& ctx, int prec) { lhs = lhs < parse_expr(stream, ctx, 3); } else if (prec < 3 && stream.next_if('>')) { lhs = lhs > parse_expr(stream, ctx, 3); + } else if (prec < 3 && stream.next_if("<=")) { + lhs = lhs <= parse_expr(stream, ctx, 3); + } else if (prec < 3 && stream.next_if(">=")) { + lhs = lhs >= parse_expr(stream, ctx, 3); + } else if (prec < 3 && stream.next_if("!=")) { + lhs = lhs != parse_expr(stream, ctx, 3); + } else if (prec < 3 && stream.next_if("==")) { + lhs = lhs == parse_expr(stream, ctx, 3); } else { return lhs; } @@ -190,7 +198,7 @@ static std::array parse_expr_list3(TokenStream& stream, const Context& ctx) { auto list = parse_expr_list(stream, ctx, 3); return { - list.size() > 0 ? list[0] : 1, + list.size() > 0 ? list[0] : 1, // NOLINT list.size() > 1 ? list[1] : 1, list.size() > 2 ? list[2] : 1, }; @@ -207,11 +215,7 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { stream.consume("pragma"); stream.consume("kernel_tuner"); - while (true) { - if (stream.next_if(TokenKind::DirectiveEnd)) { - break; - } - + while (!stream.next_if(TokenKind::DirectiveEnd)) { Token t = stream.consume(TokenKind::Ident); std::string name = stream.span(t); @@ -222,11 +226,11 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { std::string var = stream.span(var_token); - if (ctx.config_args.count(var)) { + if (ctx.config_args.count(var) > 0) { stream.throw_unexpected_token(var_token, "variable redefined"); } - if (ctx.compile_args.count(var)) { + if (ctx.compile_args.count(var) > 0) { stream.throw_unexpected_token( var_token, "variable already passed as compile-time value"); @@ -259,6 +263,10 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { } else if (name == "problem_size") { auto l = parse_expr_list3(stream, ctx); builder.problem_size(l[0], l[1], l[2]); + } else if (name == "restriction") { + for (const auto& expr : parse_expr_list(stream, ctx)) { + builder.restriction(expr); + } } else { stream.throw_unexpected_token( t, @@ -271,8 +279,8 @@ KernelBuilder process_kernel( TokenStream& stream, const KernelDef& def, const std::vector& template_args) { - auto builder = - KernelBuilder(def.qualified_name, KernelSource("TODO", "TODO")); + auto source = KernelSource(stream.file(), stream.content()); + auto builder = KernelBuilder(def.qualified_name, source); Context ctx; @@ -295,7 +303,7 @@ KernelBuilder process_kernel( } for (const auto& directive : def.directives) { - stream.reset(directive); + stream.seek(directive); process_directive(stream, builder, ctx); } diff --git a/src/internal/parser.cpp b/src/internal/parser.cpp index 4dda812..5305bc5 100644 --- a/src/internal/parser.cpp +++ b/src/internal/parser.cpp @@ -1,9 +1,5 @@ #include "kernel_launcher/internal/parser.h" -#include - -#include "kernel_launcher/expr.h" - namespace kernel_launcher { namespace internal { @@ -84,16 +80,19 @@ parse_kernel(TokenStream& stream, const std::vector& namespaces) { } } + // check for 'template' '<' ... '>' if (stream.next_if("template")) { stream.consume(TokenKind::AngleL); template_params = parse_template_params(stream); stream.consume(TokenKind::AngleR); } + // check for '__global__' 'void' IDENT stream.consume("__global__"); stream.consume("void"); Token name = stream.consume(TokenKind::Ident); + // check for '(' ... ')' stream.consume(TokenKind::ParenL); auto fun_params = parse_kernel_params(stream); stream.consume(TokenKind::ParenR); @@ -121,11 +120,20 @@ enum struct Scope { Namespace, }; +// NOLINTNEXTLINE(readability-function-cognitive-complexity) std::vector parse_kernels(TokenStream& stream) { std::vector namespace_stack; std::vector scope_stack; std::vector kernels; + auto assert_pop_scope = [&](Token t, Scope scope, const char* msg) { + if (scope_stack.empty() || scope_stack.back() != scope) { + stream.throw_unexpected_token(t, msg); + } + + scope_stack.pop_back(); + }; + while (stream.has_next()) { Token t = stream.next(); @@ -138,38 +146,25 @@ std::vector parse_kernels(TokenStream& stream) { } else if (t.kind == TokenKind::BraceL) { scope_stack.push_back(Scope::Brace); } else if (t.kind == TokenKind::BraceR) { - if (scope_stack.empty()) { - stream.throw_unexpected_token(t, "no matching '{' found"); - } - - Scope s = scope_stack.back(); - scope_stack.pop_back(); - - if (s == Scope::Namespace) { + if (!scope_stack.empty() + && scope_stack.back() == Scope::Namespace) { namespace_stack.pop_back(); - } else if (s != Scope::Brace) { - stream.throw_unexpected_token(t, "no matching '{' found"); + scope_stack.back() = Scope::Brace; } + + assert_pop_scope(t, Scope::Brace, "no matching '{' found"); } else if (t.kind == TokenKind::ParenL) { scope_stack.push_back(Scope::Paren); } else if (t.kind == TokenKind::ParenR) { - if (scope_stack.empty() || scope_stack.back() != Scope::Paren) { - stream.throw_unexpected_token(t, "no matching '(' found"); - } - - scope_stack.pop_back(); + assert_pop_scope(t, Scope::Paren, "no matching '(' found"); } else if (t.kind == TokenKind::BracketL) { scope_stack.push_back(Scope::Bracket); } else if (t.kind == TokenKind::BracketR) { - if (scope_stack.empty() || scope_stack.back() != Scope::Bracket) { - stream.throw_unexpected_token(t, "no matching '[' found"); - } - - scope_stack.pop_back(); + assert_pop_scope(t, Scope::Bracket, "no matching '[' found"); } else if (t.kind == TokenKind::DirectiveBegin) { bool is_pragma = stream.next_if("pragma") && stream.next_if("kernel_tuner"); - stream.reset(t); + stream.seek(t); if (is_pragma) { kernels.push_back(parse_kernel(stream, namespace_stack)); diff --git a/src/internal/tokens.cpp b/src/internal/tokens.cpp index 53c77fe..71a439f 100644 --- a/src/internal/tokens.cpp +++ b/src/internal/tokens.cpp @@ -201,31 +201,31 @@ std::vector tokenize(const std::string& input) { return tokens; } -TokenStream::TokenStream(std::string file, std::string input) : index_(0) { - auto tokens = tokenize(input); - impl_ = std::make_shared( - TokenStreamImpl {std::move(file), std::move(input), std::move(tokens)}); +TokenStream::TokenStream(std::string file, std::string input) : + file_(std::move(file)), + text_(std::move(input)), + index_(0) { + tokens_ = tokenize(text_); } -void TokenStream::reset(Token t) { - const auto& tokens = impl_->tokens_; +void TokenStream::seek(Token t) { auto it = std::lower_bound( - tokens.begin(), - tokens.end(), + tokens_.begin(), + tokens_.end(), t, [&](const auto& lhs, const auto& rhs) { return lhs.begin < rhs.begin; }); - if (it == tokens.end() || *it != t) { + if (it == tokens_.end() || *it != t) { throw std::runtime_error("cannot reset to unknown token"); } - index_ = static_cast(it - tokens.begin()); + index_ = static_cast(it - tokens_.begin()); } bool TokenStream::has_next() const { - return index_ < impl_->tokens_.size(); + return index_ < tokens_.size(); } Token TokenStream::next() { @@ -235,13 +235,11 @@ Token TokenStream::next() { } Token TokenStream::peek() { - const auto& tokens = impl_->tokens_; - - if (index_ >= tokens.size()) { + if (index_ >= tokens_.size()) { throw std::runtime_error("unexpected EOF while parsing"); } - return tokens[index_]; + return tokens_[index_]; } void TokenStream::prev() { @@ -251,24 +249,20 @@ void TokenStream::prev() { } bool TokenStream::matches(Token t, char c) const { - const auto& text = impl_->text_; - - if (t.begin + 1 != t.end || t.begin >= text.size()) { + if (t.begin + 1 != t.end || t.begin >= text_.size()) { return false; } - return text[t.begin] == c; + return text_[t.begin] == c; } bool TokenStream::matches(Token t, const char* needle) const { - const auto& text = impl_->text_; - - if (t.end > text.size()) { + if (t.end > text_.size()) { return false; } for (index_t i = t.begin; i < t.end; i++) { - if (*needle != text[i] || *needle == '\0') { + if (*needle != text_[i] || *needle == '\0') { return false; } @@ -279,12 +273,11 @@ bool TokenStream::matches(Token t, const char* needle) const { } std::pair TokenStream::extract_line_column(Token t) const { - const auto& text = impl_->text_; int lineno = 1; int colno = 1; - for (size_t i = 0; i < text.size() && i < t.begin; i++) { - if (text[i] == '\n') { + for (size_t i = 0; i < text_.size() && i < t.begin; i++) { + if (text_[i] == '\n') { lineno++; colno = 1; } else { @@ -366,9 +359,8 @@ void TokenStream::throw_unexpected_token(Token t, const std::string& reason) auto line_col = extract_line_column(t); std::stringstream msg; - msg << "error:" << impl_->file_ << ":" << line_col.first << ":" - << line_col.second << ": found invalid token \"" - << clean_string(span(t)) << "\""; + msg << "error:" << file_ << ":" << line_col.first << ":" << line_col.second + << ": found invalid token \"" << clean_string(span(t)) << "\""; if (!reason.empty()) { msg << ", " << reason; @@ -378,12 +370,11 @@ void TokenStream::throw_unexpected_token(Token t, const std::string& reason) } std::string TokenStream::span(size_t begin, size_t end) const { - const auto& text = impl_->text_; - if (begin > end || end > text.size()) { + if (begin > end || end > text_.size()) { throw std::runtime_error("index out of bounds"); } - return text.substr(begin, end - begin); + return text_.substr(begin, end - begin); } } // namespace internal diff --git a/src/pragma.cpp b/src/pragma.cpp new file mode 100644 index 0000000..0d0803f --- /dev/null +++ b/src/pragma.cpp @@ -0,0 +1,70 @@ +#include "kernel_launcher/pragma.h" + +#include "kernel_launcher/internal/directives.h" + +namespace kernel_launcher { + +KernelBuilder build_pragma_kernel( + const KernelSource& source, + const std::string& name, + const std::vector& template_args, + const FileLoader& fs) { + std::string filename = source.file_name(); + std::string content = source.read(fs); + + internal::TokenStream stream(filename, content); + std::vector kernels = internal::parse_kernels(stream); + + for (const auto& kernel : kernels) { + if (stream.matches(kernel.name, name)) { + return internal::process_kernel(stream, kernel, template_args); + } + } + + throw std::runtime_error( + "kernel '" + name + "' was not found in file \'" + filename + "\'"); +} + +PragmaKernel::PragmaKernel( + std::string path, + std::string name, + std::vector template_args) : + kernel_name_(std::move(name)), + template_args_(std::move(template_args)) { + // Resolve absolute file path + const char* abs_path = realpath(path.c_str(), nullptr); + if (abs_path == nullptr) { + throw std::runtime_error("failed to resolve path: '" + path + "'"); + } + + file_path_ = abs_path; +} + +KernelBuilder PragmaKernel::build() const { + return build_pragma_kernel( + KernelSource(file_path_), + kernel_name_, + template_args_); +} + +bool PragmaKernel::equals(const IKernelDescriptor& that) const { + if (const auto* m = dynamic_cast(&that)) { + return m->kernel_name_ == this->kernel_name_ + && m->file_path_ == this->file_path_ + && m->template_args_ == this->template_args_; + } + + return false; +} + +hash_t PragmaKernel::hash() const { + hash_t h = hash_fields(kernel_name_, file_path_); + + for (const auto& v : template_args_) { + h = hash_combine(h, hash_fields(v)); + } + + return h; +} + +} // namespace kernel_launcher diff --git a/tests/internal.cpp b/tests/internal.cpp index 3d8d0c3..bcf1bd2 100644 --- a/tests/internal.cpp +++ b/tests/internal.cpp @@ -8,7 +8,7 @@ using namespace kernel_launcher; -TEST_CASE("tokenizer") { +TEST_CASE("tokenizer real") { using internal::TokenKind; std::string input = R"( @@ -20,34 +20,69 @@ TEST_CASE("tokenizer") { multi-line * / comment */ - "a" == 1.2 // should not compile + "a" == 1.2 // just a silly comment } )"; std::vector> expected = { - {"#", TokenKind::DirectiveBegin}, - {"include", TokenKind::Ident}, - {"<", TokenKind::AngleL}, - {"stdio", TokenKind::Ident}, - {">", TokenKind::AngleR}, - {"", TokenKind::DirectiveEnd}, - {"#", TokenKind::DirectiveBegin}, - {"pragma", TokenKind::Ident}, - {"kernel_tuner", TokenKind::Ident}, - {"test", TokenKind::Ident}, - {"", TokenKind::DirectiveEnd}, - {"void", TokenKind::Ident}, - {"foo", TokenKind::Ident}, - {"(", TokenKind::ParenL}, - {"int", TokenKind::Ident}, - {"x", TokenKind::Ident}, - {")", TokenKind::ParenR}, - {"{", TokenKind::BraceL}, - {"\"a\"", TokenKind::String}, - {"=", TokenKind::Punct}, - {"=", TokenKind::Punct}, - {"1.2", TokenKind::Number}, - {"}", TokenKind::BraceR}}; + {"#", TokenKind::DirectiveBegin}, {"include", TokenKind::Ident}, + {"<", TokenKind::AngleL}, {"stdio", TokenKind::Ident}, + {">", TokenKind::AngleR}, {"\n", TokenKind::DirectiveEnd}, + {"#", TokenKind::DirectiveBegin}, {"pragma", TokenKind::Ident}, + {"kernel_tuner", TokenKind::Ident}, {"test", TokenKind::Ident}, + {"\n", TokenKind::DirectiveEnd}, {"void", TokenKind::Ident}, + {"foo", TokenKind::Ident}, {"(", TokenKind::ParenL}, + {"int", TokenKind::Ident}, {"x", TokenKind::Ident}, + {")", TokenKind::ParenR}, {"{", TokenKind::BraceL}, + {"\"a\"", TokenKind::String}, {"==", TokenKind::Punct}, + {"1", TokenKind::Number}, {".", TokenKind::Unknown}, + {"2", TokenKind::Number}, {"}", TokenKind::BraceR}, + {"", TokenKind::EndOfFile}}; + + auto stream = internal::TokenStream("", input); + + for (size_t i = 0; i < expected.size(); i++) { + auto token = stream.next(); + + INFO("index=" << i); + CHECK(stream.span(token) == expected[i].first); + CHECK(token.kind == expected[i].second); + } +} + +TEST_CASE("tokenizer exhaustive") { + using internal::TokenKind; + + std::string input = R"( + == <= >= && || << >> :: + { } [ ] ( ) , < > + = - * / ! ~ & | ^ % + 123 4.5 + "test" "a\"b" 'c' + foo $foo foo_bar _bar _ bar1 + )"; + + std::vector> expected = { + {"==", TokenKind::Punct}, {"<=", TokenKind::Punct}, + {">=", TokenKind::Punct}, {"&&", TokenKind::Punct}, + {"||", TokenKind::Punct}, {"<<", TokenKind::Punct}, + {">>", TokenKind::Punct}, {"::", TokenKind::Punct}, + {"{", TokenKind::BraceL}, {"}", TokenKind::BraceR}, + {"[", TokenKind::BracketL}, {"]", TokenKind::BracketR}, + {"(", TokenKind::ParenL}, {")", TokenKind::ParenR}, + {",", TokenKind::Comma}, {"<", TokenKind::AngleL}, + {">", TokenKind::AngleR}, {"+", TokenKind::Punct}, + {"=", TokenKind::Punct}, {"-", TokenKind::Punct}, + {"*", TokenKind::Punct}, {"/", TokenKind::Punct}, + {"!", TokenKind::Punct}, {"~", TokenKind::Punct}, + {"&", TokenKind::Punct}, {"|", TokenKind::Punct}, + {"^", TokenKind::Punct}, {"%", TokenKind::Punct}, + {"123", TokenKind::Number}, {"4", TokenKind::Number}, + {".", TokenKind::Unknown}, {"5", TokenKind::Number}, + {R"("test")", TokenKind::String}, {R"("a\"b")", TokenKind::String}, + {"'c'", TokenKind::String}, {"foo", TokenKind::Ident}, + {"$foo", TokenKind::Ident}, {"foo_bar", TokenKind::Ident}, + {"_bar", TokenKind::Ident}, {"_", TokenKind::Ident}, + {"bar1", TokenKind::Ident}, {"", TokenKind::EndOfFile}}; auto stream = internal::TokenStream("", input); From 8564fc31af61f63c6c36a0684ca4a2bbdc993826 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 7 Feb 2023 10:46:55 +0100 Subject: [PATCH 11/63] While extracting kernel annotations, emit the preprocessed source code --- include/kernel_launcher/internal/directives.h | 5 +- include/kernel_launcher/internal/parser.h | 9 +- include/kernel_launcher/internal/tokens.h | 8 -- src/internal/directives.cpp | 11 +- src/internal/parser.cpp | 109 ++++++++++++------ src/pragma.cpp | 15 ++- tests/internal.cpp | 41 ++++++- 7 files changed, 139 insertions(+), 59 deletions(-) diff --git a/include/kernel_launcher/internal/directives.h b/include/kernel_launcher/internal/directives.h index 7ce3ca1..436300b 100644 --- a/include/kernel_launcher/internal/directives.h +++ b/include/kernel_launcher/internal/directives.h @@ -8,9 +8,10 @@ namespace kernel_launcher { namespace internal { -KernelBuilder process_kernel( +KernelBuilder builder_from_annotated_kernel( TokenStream& stream, - const KernelDef& def, + KernelSource source, + const AnnotatedKernelSpec& def, const std::vector& template_args); } diff --git a/include/kernel_launcher/internal/parser.h b/include/kernel_launcher/internal/parser.h index 6c116c0..36c570d 100644 --- a/include/kernel_launcher/internal/parser.h +++ b/include/kernel_launcher/internal/parser.h @@ -19,7 +19,7 @@ struct FunctionParam { Token name; }; -struct KernelDef { +struct AnnotatedKernelSpec { std::string qualified_name; Token name; std::vector directives; @@ -27,7 +27,12 @@ struct KernelDef { std::vector fun_params; }; -std::vector parse_kernels(TokenStream& stream); +struct AnnotatedDocument { + std::vector kernels; + std::string processed_source; +}; + +AnnotatedDocument extract_annotated_kernels(TokenStream& stream); } // namespace internal } // namespace kernel_launcher diff --git a/include/kernel_launcher/internal/tokens.h b/include/kernel_launcher/internal/tokens.h index 68d67a7..eeae28e 100644 --- a/include/kernel_launcher/internal/tokens.h +++ b/include/kernel_launcher/internal/tokens.h @@ -119,14 +119,6 @@ struct TokenStream { [[noreturn]] void throw_unexpected_token(Token t, const std::string& reason = "") const; - const std::string& file() const { - return file_; - } - - const std::string& content() const { - return text_; - } - private: std::string file_; std::string text_; diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 2fd52be..44e7928 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -1,6 +1,7 @@ #include "kernel_launcher/internal/directives.h" #include +#include #include #include "kernel_launcher/builder.h" @@ -275,12 +276,12 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { } } -KernelBuilder process_kernel( +KernelBuilder builder_from_annotated_kernel( TokenStream& stream, - const KernelDef& def, + KernelSource source, + const AnnotatedKernelSpec& def, const std::vector& template_args) { - auto source = KernelSource(stream.file(), stream.content()); - auto builder = KernelBuilder(def.qualified_name, source); + auto builder = KernelBuilder(def.qualified_name, std::move(source)); Context ctx; @@ -318,7 +319,7 @@ KernelBuilder process_kernel( } else { stream.throw_unexpected_token( param.name, - "this parameter is not defined, please add " + "this template parameter is not defined, please add " "`#pragma kernel_tuner tune(" + name + "=...)`"); } diff --git a/src/internal/parser.cpp b/src/internal/parser.cpp index 5305bc5..b9b76fc 100644 --- a/src/internal/parser.cpp +++ b/src/internal/parser.cpp @@ -17,9 +17,11 @@ static std::vector parse_template_params(TokenStream& stream) { Token name = stream.consume(TokenKind::Ident); Value default_value; - bool is_integral = ty != "typename" && ty != "class"; + // integral parameters do not start with "typename" or "class" + bool is_integral = !(ty == "typename" || ty == "class"); if (is_integral && stream.next_if('=')) { + // We only support numbers for now... Token v = stream.consume(TokenKind::Number); long l; @@ -64,21 +66,41 @@ static std::vector parse_kernel_params(TokenStream& stream) { return params; } -static KernelDef -parse_kernel(TokenStream& stream, const std::vector& namespaces) { - std::vector directives; - std::vector template_params; +static bool extract_kernel_tuner_directives( + TokenStream& stream, + std::vector& directives_out) { + static constexpr const char* PRAGMA_NAME = "kernel_tuner"; - // Advance the stream past all directives - while (stream.next_if(TokenKind::DirectiveBegin)) { - Token t = stream.next(); - directives.push_back(t); + // Check if directive starts with correct pragma. If not, this is + // not a relevant pragma and we do not need to parse it. + Token t = stream.peek(); + bool is_relevant = stream.next_if("pragma") && stream.next_if(PRAGMA_NAME); + stream.seek(t); + + if (!is_relevant) { + return false; + } + + // Parse all pragmas + do { + stream.consume("pragma"); + stream.consume(PRAGMA_NAME); + t = stream.next(); + directives_out.push_back(t); - // Find the directive end while (t.kind != TokenKind::DirectiveEnd) { t = stream.next(); } - } + } while (stream.next_if(TokenKind::DirectiveBegin)); + + return true; +} + +static AnnotatedKernelSpec parse_kernel( + TokenStream& stream, + const std::vector& namespaces, + std::vector directives) { + std::vector template_params; // check for 'template' '<' ... '>' if (stream.next_if("template")) { @@ -121,10 +143,12 @@ enum struct Scope { }; // NOLINTNEXTLINE(readability-function-cognitive-complexity) -std::vector parse_kernels(TokenStream& stream) { +AnnotatedDocument extract_annotated_kernels(TokenStream& stream) { std::vector namespace_stack; std::vector scope_stack; - std::vector kernels; + std::vector kernels; + std::vector directives; + std::string source; auto assert_pop_scope = [&](Token t, Scope scope, const char* msg) { if (scope_stack.empty() || scope_stack.back() != scope) { @@ -134,49 +158,62 @@ std::vector parse_kernels(TokenStream& stream) { scope_stack.pop_back(); }; + Token last {}; + Token cur; + while (stream.has_next()) { - Token t = stream.next(); + cur = stream.next(); - if (t.kind == TokenKind::Ident && stream.matches(t, "namespace")) { - t = stream.consume(TokenKind::Ident); - namespace_stack.push_back(stream.span(t)); + if (cur.kind == TokenKind::Ident && stream.matches(cur, "namespace")) { + cur = stream.consume(TokenKind::Ident); + namespace_stack.push_back(stream.span(cur)); stream.consume(TokenKind::BraceL); scope_stack.push_back(Scope::Namespace); - } else if (t.kind == TokenKind::BraceL) { + } else if (cur.kind == TokenKind::BraceL) { scope_stack.push_back(Scope::Brace); - } else if (t.kind == TokenKind::BraceR) { + } else if (cur.kind == TokenKind::BraceR) { if (!scope_stack.empty() && scope_stack.back() == Scope::Namespace) { namespace_stack.pop_back(); scope_stack.back() = Scope::Brace; } - assert_pop_scope(t, Scope::Brace, "no matching '{' found"); - } else if (t.kind == TokenKind::ParenL) { + assert_pop_scope(cur, Scope::Brace, "no matching '{' found"); + } else if (cur.kind == TokenKind::ParenL) { scope_stack.push_back(Scope::Paren); - } else if (t.kind == TokenKind::ParenR) { - assert_pop_scope(t, Scope::Paren, "no matching '(' found"); - } else if (t.kind == TokenKind::BracketL) { + } else if (cur.kind == TokenKind::ParenR) { + assert_pop_scope(cur, Scope::Paren, "no matching '(' found"); + } else if (cur.kind == TokenKind::BracketL) { scope_stack.push_back(Scope::Bracket); - } else if (t.kind == TokenKind::BracketR) { - assert_pop_scope(t, Scope::Bracket, "no matching '[' found"); - } else if (t.kind == TokenKind::DirectiveBegin) { - bool is_pragma = - stream.next_if("pragma") && stream.next_if("kernel_tuner"); - stream.seek(t); - - if (is_pragma) { - kernels.push_back(parse_kernel(stream, namespace_stack)); + } else if (cur.kind == TokenKind::BracketR) { + assert_pop_scope(cur, Scope::Bracket, "no matching '[' found"); + } else if (cur.kind == TokenKind::DirectiveBegin) { + if (extract_kernel_tuner_directives(stream, directives)) { + Token before_dir = cur; + Token after_dir = stream.peek(); + + source.append(stream.span(last.begin, before_dir.begin)); + source.append("/*"); + source.append(stream.span(before_dir.begin, after_dir.begin)); + source.append("*/"); + + kernels.push_back(parse_kernel( + stream, + namespace_stack, + std::move(directives))); + + last = after_dir; } else { - while (t.kind != TokenKind::DirectiveEnd) { - t = stream.next(); + while (cur.kind != TokenKind::DirectiveEnd) { + cur = stream.next(); } } } } - return kernels; + source.append(stream.span(last, cur)); + return AnnotatedDocument {kernels, source}; } } // namespace internal diff --git a/src/pragma.cpp b/src/pragma.cpp index 0d0803f..b4c044b 100644 --- a/src/pragma.cpp +++ b/src/pragma.cpp @@ -9,15 +9,24 @@ KernelBuilder build_pragma_kernel( const std::string& name, const std::vector& template_args, const FileLoader& fs) { + // Read file std::string filename = source.file_name(); std::string content = source.read(fs); + // Tokenize content of file internal::TokenStream stream(filename, content); - std::vector kernels = internal::parse_kernels(stream); - for (const auto& kernel : kernels) { + // Extract annotated kernels from token stream + auto result = internal::extract_annotated_kernels(stream); + auto processed_source = KernelSource(filename, result.processed_source); + + for (const auto& kernel : result.kernels) { if (stream.matches(kernel.name, name)) { - return internal::process_kernel(stream, kernel, template_args); + return internal::builder_from_annotated_kernel( + stream, + processed_source, + kernel, + template_args); } } diff --git a/tests/internal.cpp b/tests/internal.cpp index bcf1bd2..721488f 100644 --- a/tests/internal.cpp +++ b/tests/internal.cpp @@ -126,9 +126,41 @@ __global__ void spaz(int n, const float* input, float* output) { } // namespace foo )"; + std::string expected = R"( +// This is a comment +namespace foo { +namespace bar { + +#ifdef SOMECONSTANT +#endif + +/*#pragma kernel_tuner tune(block_size=32, 64, 128, 256) +#pragma kernel_tuner problem_size(n) +*/__global__ void baz(int n, const float* a) { + if (threadIdx.x < 10) { + return a[threadIdx.x]; + } +} +} // namespace bar + +/*#pragma kernel_tuner tune(block_size=32, 64, 128, 256) +#pragma kernel_tuner tune(tiling_factor=1,2,3,4) +#pragma kernel_tuner problem_size(n) \ + grid_divisor(tiling_factor * block_size) +*/template +__global__ void spaz(int n, const float* input, float* output) { + if (threadIdx.x < 10) { + return a[threadIdx.x]; + } +} +} // namespace foo + )"; + auto stream = internal::TokenStream("", input); - auto kernels = parse_kernels(stream); + auto result = extract_annotated_kernels(stream); + CHECK(result.processed_source == expected); + const auto& kernels = result.kernels; REQUIRE(kernels.size() == 2); const auto& baz = kernels[0]; @@ -168,10 +200,13 @@ TEST_CASE("directives") { )"; auto stream = internal::TokenStream("", input); - auto kernels = parse_kernels(stream); + auto result = extract_annotated_kernels(stream); + const auto& kernels = result.kernels; REQUIRE(kernels.size() == 1); const auto& kernel = kernels[0]; - KernelBuilder builder = process_kernel(stream, kernels[0], {"float"}); + KernelSource source("", result.processed_source); + KernelBuilder builder = + builder_from_annotated_kernel(stream, source, kernels[0], {"float"}); } \ No newline at end of file From 9e495efa4bdae2b79742fef1e2c654d49baa61ce Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 7 Feb 2023 10:49:47 +0100 Subject: [PATCH 12/63] Add support for `set` directives --- examples/vector_add/kernel_annotated.cu | 21 +++++++++++---------- src/internal/directives.cpp | 8 ++++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/examples/vector_add/kernel_annotated.cu b/examples/vector_add/kernel_annotated.cu index 993d383..f21c5ea 100644 --- a/examples/vector_add/kernel_annotated.cu +++ b/examples/vector_add/kernel_annotated.cu @@ -1,36 +1,37 @@ #pragma kernel_tuner tune(block_size = 32, 64, 128, 256, 512, 1024) -#pragma kernel_tuner tune(elements_per_thread = 1, 2, 3, 4) +#pragma kernel_tuner tune(items_per_thread = 1, 2, 3, 4) #pragma kernel_tuner tune(tiling_strategy = 0, 1, 2) +#pragma kernel_tuner set(items_per_block = block_size * items_per_thread) #pragma kernel_tuner block_size(block_size) #pragma kernel_tuner problem_size(n) -#pragma kernel_tuner grid_divisor(block_size* elements_per_thread) -#pragma kernel_tuner restriction(block_size* elements_per_thread >= 32) +#pragma kernel_tuner grid_divisor(items_per_block) +#pragma kernel_tuner restriction(items_per_block <= 1024) template< typename T, int block_size = 32, - int elements_per_thread = 2, + int items_per_thread = 2, int tiling_strategy = 2> __global__ void vector_add(int n, T* C, const T* A, const T* B) { static_assert( tiling_strategy >= 0 && tiling_strategy <= 2, "invalid tiling strategy"); - for (int k = 0; k < elements_per_thread; k++) { + for (int k = 0; k < items_per_thread; k++) { int i; - // contiguous. thread processes element i, i+1, i+2, ... + // contiguous. thread processes items i, i+1, i+2, ... if (tiling_strategy == 0) { - i = (blockIdx.x * block_size + threadIdx.x) * elements_per_thread + i = (blockIdx.x * block_size + threadIdx.x) * items_per_thread + k; } - // block-strided. thread processes elements i, i + block_size, i + 2*block_size + // block-strided. thread processes items i, i + block_size, i + 2*block_size else if (tiling_strategy == 1) { - i = blockIdx.x * elements_per_thread * block_size + threadIdx.x + i = blockIdx.x * items_per_thread * block_size + threadIdx.x + k * block_size; } - // grid-strided. thread processes elements i, i + grid_size, i + 2 * grid_size + // grid-strided. thread processes items i, i + grid_size, i + 2 * grid_size else if (tiling_strategy == 2) { i = blockIdx.x * block_size + threadIdx.x + k * (gridDim.x * block_size); diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 44e7928..496db1c 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -252,6 +252,14 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { auto param = builder.add(var, values, priors, values.front()); ctx.config_args.insert({var, param}); + } else if (name == "set") { + stream.consume(TokenKind::ParenL); + Token var_token = stream.consume(TokenKind::Ident); + stream.consume('='); + Value value = parse_expr(stream, ctx).eval(DummyEval {}); + + std::string var = stream.span(var_token); + ctx.compile_args.insert({var, value}); } else if (name == "grid_size") { auto l = parse_expr_list3(stream, ctx); builder.grid_size(l[0], l[1], l[2]); From ec1e4586f7e0b33adc73ca62e337705721c3fa24 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 7 Feb 2023 18:46:57 +0100 Subject: [PATCH 13/63] Add support for `buffer` directive --- examples/vector_add/kernel_annotated.cu | 4 +- examples/vector_add/main.cu | 6 +- include/kernel_launcher/expr.h | 14 +-- src/internal/directives.cpp | 109 ++++++++++++++++-------- 4 files changed, 86 insertions(+), 47 deletions(-) diff --git a/examples/vector_add/kernel_annotated.cu b/examples/vector_add/kernel_annotated.cu index f21c5ea..bf442b0 100644 --- a/examples/vector_add/kernel_annotated.cu +++ b/examples/vector_add/kernel_annotated.cu @@ -6,6 +6,7 @@ #pragma kernel_tuner problem_size(n) #pragma kernel_tuner grid_divisor(items_per_block) #pragma kernel_tuner restriction(items_per_block <= 1024) +#pragma kernel_tuner buffers(C[n], A[n], B[n]) template< typename T, int block_size = 32, @@ -21,8 +22,7 @@ __global__ void vector_add(int n, T* C, const T* A, const T* B) { // contiguous. thread processes items i, i+1, i+2, ... if (tiling_strategy == 0) { - i = (blockIdx.x * block_size + threadIdx.x) * items_per_thread - + k; + i = (blockIdx.x * block_size + threadIdx.x) * items_per_thread + k; } // block-strided. thread processes items i, i + block_size, i + 2*block_size diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index c2cd9db..6670f1a 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -86,7 +86,11 @@ int main(int argc, char* argv[]) { kl::WisdomKernel vector_add(build_vector_add()); // Call kernel - vector_add(n, C_dev, (const float*)A_dev, (const float*)B_dev); + vector_add( + n, + kl::cuda_span(C_dev, n), + kl::cuda_span(A_dev, n), + kl::cuda_span(B_dev, n)); // Copy results back cuda_check(cudaMemcpy( diff --git a/include/kernel_launcher/expr.h b/include/kernel_launcher/expr.h index 21e5cb1..9e7a8f8 100644 --- a/include/kernel_launcher/expr.h +++ b/include/kernel_launcher/expr.h @@ -289,14 +289,14 @@ inline ArgExpr arg(uint8_t i) { } namespace detail { - template - struct ArgsHelper; +template +struct ArgsHelper; - template - struct ArgsHelper> { - using type = std::tuple::max(), - ArgExpr>::type...>; +template +struct ArgsHelper> { + using type = std::tuple::max(), + ArgExpr>::type...>; static type call() { return {Is...}; diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 496db1c..6a2e992 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -211,55 +211,90 @@ struct DummyEval: Eval { } }; -static void -process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { - stream.consume("pragma"); - stream.consume("kernel_tuner"); +static void parse_buffer_directive( + TokenStream& stream, + KernelBuilder& builder, + Context& ctx) { + stream.consume(TokenKind::ParenL); + do { + Token var_token = stream.consume(TokenKind::Ident); + stream.consume(TokenKind::BracketL); + Expr length = parse_expr(stream, ctx); + stream.consume(TokenKind::BracketR); + + auto it = ctx.runtime_args.find(stream.span(var_token)); + if (it == ctx.runtime_args.end()) { + stream.throw_unexpected_token( + var_token, + "this is not the name of a kernel argument"); + } - while (!stream.next_if(TokenKind::DirectiveEnd)) { - Token t = stream.consume(TokenKind::Ident); - std::string name = stream.span(t); + builder.buffer_size(it->second, length); + } while (stream.next_if(TokenKind::Comma)); + stream.consume(TokenKind::ParenR); +} - if (name == "tune") { - stream.consume(TokenKind::ParenL); - Token var_token = stream.consume(TokenKind::Ident); - stream.consume('='); +static void parse_tune_directive( + TokenStream& stream, + KernelBuilder& builder, + Context& ctx) { + std::vector values; + std::vector priors; - std::string var = stream.span(var_token); + stream.consume(TokenKind::ParenL); + Token var_token = stream.consume(TokenKind::Ident); + stream.consume('='); - if (ctx.config_args.count(var) > 0) { - stream.throw_unexpected_token(var_token, "variable redefined"); - } + do { + Value v = parse_expr(stream, {}).eval(DummyEval {}); + values.push_back(v); + priors.push_back(1.0); + } while (stream.next_if(TokenKind::Comma)); - if (ctx.compile_args.count(var) > 0) { - stream.throw_unexpected_token( - var_token, - "variable already passed as compile-time value"); - } + stream.consume(TokenKind::ParenR); - std::vector values; - std::vector priors; + std::string var = stream.span(var_token); - do { - Value v = parse_expr(stream, {}).eval(DummyEval {}); + if (ctx.config_args.count(var) > 0) { + stream.throw_unexpected_token(var_token, "variable redefined"); + } - values.push_back(v); - priors.push_back(1.0); - } while (stream.next_if(TokenKind::Comma)); + if (ctx.compile_args.count(var) > 0) { + stream.throw_unexpected_token( + var_token, + "variable already passed as compile-time value"); + } - stream.consume(TokenKind::ParenR); + auto param = builder.add(var, values, priors, values.front()); + ctx.config_args.insert({var, param}); +} - auto param = builder.add(var, values, priors, values.front()); +static void parse_set_directive(TokenStream& stream, Context& ctx) { + stream.consume(TokenKind::ParenL); + Token var_token = stream.consume(TokenKind::Ident); + stream.consume('='); + Value value = parse_expr(stream, ctx).eval(DummyEval {}); + stream.consume(TokenKind::ParenR); - ctx.config_args.insert({var, param}); - } else if (name == "set") { - stream.consume(TokenKind::ParenL); - Token var_token = stream.consume(TokenKind::Ident); - stream.consume('='); - Value value = parse_expr(stream, ctx).eval(DummyEval {}); + std::string var = stream.span(var_token); + ctx.compile_args.insert({var, value}); +} + +static void +process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { + stream.consume("pragma"); + stream.consume("kernel_tuner"); - std::string var = stream.span(var_token); - ctx.compile_args.insert({var, value}); + while (!stream.next_if(TokenKind::DirectiveEnd)) { + Token t = stream.consume(TokenKind::Ident); + std::string name = stream.span(t); + + if (name == "tune") { + parse_tune_directive(stream, builder, ctx); + } else if (name == "set") { + parse_set_directive(stream, ctx); + } else if (name == "buffers" || name == "buffer") { + parse_buffer_directive(stream, builder, ctx); } else if (name == "grid_size") { auto l = parse_expr_list3(stream, ctx); builder.grid_size(l[0], l[1], l[2]); From e7981567218c3c22a81cc9da0d976a0af594b1f3 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 7 Feb 2023 19:34:42 +0100 Subject: [PATCH 14/63] Add documentation on pragma kernels --- docs/example.rst | 1 + docs/examples/pragma.rst | 90 +++++++++++++++++++++++++++ docs/examples/vector_add_annotated.cu | 20 ++++++ 3 files changed, 111 insertions(+) create mode 100644 docs/examples/pragma.rst create mode 100644 docs/examples/vector_add_annotated.cu diff --git a/docs/example.rst b/docs/example.rst index 9968966..5f4e9d7 100644 --- a/docs/example.rst +++ b/docs/example.rst @@ -9,4 +9,5 @@ Guides examples/basic examples/wisdom examples/registry + examples/pragma diff --git a/docs/examples/pragma.rst b/docs/examples/pragma.rst new file mode 100644 index 0000000..38b76d1 --- /dev/null +++ b/docs/examples/pragma.rst @@ -0,0 +1,90 @@ +Pragma Kernels +=========================== + +In the previous examples, we saw how it was possible to specify a tunable kernel by defining a ``KernelBuilder`` instance in the host-side code. +While this API offers flexibility, it is also somewhat cumbersome and it requires keeping the actual kernel code in CUDA in sync with the host-side code in C++. + +Kernel Launcher also offers a way to define the kernel specifications inside the actual CUDA code by annotating the kernel code with directives. +While this method is less flexible than the ``KernelBuilder`` API, it is a lot more convenient and should be usable for the majority of CUDA kernels. + + +Source Code +----------- + +Below shows the CUDA kernel code. +This is valid regular CUDA code since the ``#pragma`` will be ignored by the ``nvcc`` compiler (although they will emit a warning). + +.. literalinclude:: vector_add_annotated.cu + :lines: 1-20 + :lineno-start: 1 + + +Code Explanation +---------------- + +The kernel contains the following ``pragma`` directives: + +.. literalinclude:: vector_add_annotated.cu + :lines: 1-2 + :lineno-start: 1 + +The ``tune`` directives defines the tunable parameters. +In this case, there are two parameters: ``items_per_block`` and ``items_per_thread``. +Since ``items_per_thread`` is also the name of template parameter (line 8), it is passed to the kernel as compile-time constant to the kernel via this parameter. +The value of ``items_per_block`` is not passed to the kernel but is used by subsequent pragmas. + +.. literalinclude:: vector_add_annotated.cu + :lines: 3-3 + :lineno-start: 3 + +The ``set`` directives defines a constant. +In this case, the constant ``items_per_block`` is defined as the product of ``items_per_block`` and ``items_per_block``. + +.. literalinclude:: vector_add_annotated.cu + :lines: 4-6 + :lineno-start: 4 + +The above lines specify information required to launch the kernel. +The ``problem_size`` defines the problem size as discussed in :doc:`basic`. +The ``block_size`` specifies the thread block size and ``grid_divisors`` specifies how the problem size should be divided to obtain the thread grid size. +Alternatively, it is possible to specify the grid size directly using the ``grid_size`` directive. + +.. literalinclude:: vector_add_annotated.cu + :lines: 7-7 + :lineno-start: 7 + +The above line specifies that the kernel arguments ``A``, ``B``, and ``C`` are buffers each having ``n`` elements. +This is required since Kernel Launcher requires the size of each buffer to be known, but the kernel could be called with raw pointers for which no size information is available. +If the ``buffers`` pragma is not specified, Kernel Launcher can still be used but it is not possible to capture kernel launches. + +.. literalinclude:: vector_add_annotated.cu + :lines: 8-8 + :lineno-start: 8 + +The ``tuning_key`` pragma specifies the tuning key. +All arguments given to this pragma will be concatenated and these arguments can be either strings or variables. +In this example, the tuning key is ``"vector_add_" + T`` where ``T`` is the name of the type. + + +Host Code +--------- + +The below code shows how to call the kernel from the host in C++:: + + #include "kernel_launcher/pragma.h" + using namespace kl = kernel_launcher; + + void launch_vector_add(float* C, const float* A, const float* B) { + kl::launch( + kl::PragmaKernel("vector_add_annotated.cu", "vector_add", {"float"}), + n, C, A, B + ); + ); + + +The ``PragmaKernel`` class implements the ``IKernelDescriptor`` interface, as described in :doc:`registry`. +This class will read the specified file, extract the Kernel Launcher pragmas from the source code, and compile the kernel. + +The ``launch`` function launches the kernel and, as discussed in :doc:`registry`, it uses the default registry to cache kernel compilations. +This means that the kernel is only compiled once, even if the same kernel is called from different locations in the program. + diff --git a/docs/examples/vector_add_annotated.cu b/docs/examples/vector_add_annotated.cu new file mode 100644 index 0000000..ec29275 --- /dev/null +++ b/docs/examples/vector_add_annotated.cu @@ -0,0 +1,20 @@ +#pragma kernel_tuner tune(items_per_block=32, 64, 128, 256, 512, 1024) +#pragma kernel_tuner tune(items_per_thread=1, 2, 4, 8) +#pragma kernel_tuner set(items_per_block=items_per_block * items_per_thread) +#pragma kernel_tuner problem_size(n) +#pragma kernel_tuner block_size(items_per_block) +#pragma kernel_tuner grid_divisor(items_per_thread) +#pragma kernel_tuner buffers(C[n], A[n], B[n]) +#pragma kernel_tuner tuning_key("vector_add_", T) +template +__global__ +void vector_add(int n, T* C, const T* A, const T* B) { + for (int k = 0; k < items_per_thread; k++) { + int i = blockIdx.x * items_per_thread * blockDim.x + k * blockDim.x + threadIdx.x; + + if (i < n) { + C[i] = A[i] + B[i]; + } + } +} + From 14e19dff6846839af40c809d0f6310a4e1b4140d Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Feb 2023 09:49:38 +0100 Subject: [PATCH 15/63] Fix mistake in docs on pragma kernels --- docs/examples/pragma.rst | 8 ++-- docs/examples/vector_add_annotated.cu | 8 ++-- src/internal/directives.cpp | 58 +++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/docs/examples/pragma.rst b/docs/examples/pragma.rst index 38b76d1..7cca44b 100644 --- a/docs/examples/pragma.rst +++ b/docs/examples/pragma.rst @@ -29,16 +29,16 @@ The kernel contains the following ``pragma`` directives: :lineno-start: 1 The ``tune`` directives defines the tunable parameters. -In this case, there are two parameters: ``items_per_block`` and ``items_per_thread``. -Since ``items_per_thread`` is also the name of template parameter (line 8), it is passed to the kernel as compile-time constant to the kernel via this parameter. -The value of ``items_per_block`` is not passed to the kernel but is used by subsequent pragmas. +In this case, there are two parameters: ``threads_per_block`` and ``items_per_thread``. +Since ``items_per_thread`` is also the name of template parameter (line 9), it is passed to the kernel as compile-time constant to the kernel via this parameter. +The value of ``threads_per_block`` is not passed to the kernel but is used by subsequent pragmas. .. literalinclude:: vector_add_annotated.cu :lines: 3-3 :lineno-start: 3 The ``set`` directives defines a constant. -In this case, the constant ``items_per_block`` is defined as the product of ``items_per_block`` and ``items_per_block``. +In this case, the constant ``items_per_block`` is defined as the product of ``threads_per_block`` and ``items_per_thread``. .. literalinclude:: vector_add_annotated.cu :lines: 4-6 diff --git a/docs/examples/vector_add_annotated.cu b/docs/examples/vector_add_annotated.cu index ec29275..0c234a0 100644 --- a/docs/examples/vector_add_annotated.cu +++ b/docs/examples/vector_add_annotated.cu @@ -1,9 +1,9 @@ -#pragma kernel_tuner tune(items_per_block=32, 64, 128, 256, 512, 1024) +#pragma kernel_tuner tune(threads_per_block=32, 64, 128, 256, 512, 1024) #pragma kernel_tuner tune(items_per_thread=1, 2, 4, 8) -#pragma kernel_tuner set(items_per_block=items_per_block * items_per_thread) +#pragma kernel_tuner set(items_per_block=threads_per_block * items_per_thread) #pragma kernel_tuner problem_size(n) -#pragma kernel_tuner block_size(items_per_block) -#pragma kernel_tuner grid_divisor(items_per_thread) +#pragma kernel_tuner block_size(threads_per_block) +#pragma kernel_tuner grid_divisor(items_per_block) #pragma kernel_tuner buffers(C[n], A[n], B[n]) #pragma kernel_tuner tuning_key("vector_add_", T) template diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 6a2e992..4b1dbf2 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -107,6 +107,49 @@ static Expr parse_ident(Token t, TokenStream& stream, const Context& ctx) { stream.throw_unexpected_token(t, "unknown variable name"); } +static bool parse_string(const std::string& input, std::string& output) { + size_t n = input.size(); + if (n < 2 || input[0] != input[n - 1]) { + return false; + } + + bool prev_backslash = false; + + for (size_t i = 1; i < n - 1; i++) { + char c = input[i]; + if (prev_backslash) { + char x; + + switch (c) { + case 'n': + x = '\n'; + break; + case 't': + x = '\t'; + case 'r': + x = '\r'; + break; + case '"': + case '\'': + case '\\': + x = c; + break; + default: + return false; + } + + prev_backslash = false; + output += x; + } else if (c == '\\') { + prev_backslash = true; + } else { + output += c; + } + } + + return !prev_backslash; +} + static bool parse_long(const std::string& input, long& output) { char* endptr = nullptr; output = strtol(input.c_str(), &endptr, 10); @@ -122,6 +165,12 @@ static Expr parse_prim(TokenStream& stream, const Context& ctx) { Expr e = parse_expr(stream, ctx); stream.consume(TokenKind::ParenR); return e; + } else if (t.kind == TokenKind::String) { + std::string out; + if (!parse_string(stream.span(t), out)) { + stream.throw_unexpected_token(t, "failed to parse string"); + } + return ScalarExpr(out); } else if (t.kind == TokenKind::Number) { long l; if (!parse_long(stream.span(t), l)) { @@ -295,6 +344,15 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { parse_set_directive(stream, ctx); } else if (name == "buffers" || name == "buffer") { parse_buffer_directive(stream, builder, ctx); + } else if (name == "tuning_key") { + std::string key = ""; + + for (const auto& expr : parse_expr_list(stream, ctx)) { + key += expr.eval(DummyEval {}).to_string(); + } + + builder.tuning_key(std::move(key)); + } else if (name == "grid_size") { auto l = parse_expr_list3(stream, ctx); builder.grid_size(l[0], l[1], l[2]); From 61e568fa5ace1c50c815fe8dbb3c3cf257e4291b Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Feb 2023 10:59:27 +0100 Subject: [PATCH 16/63] Improve error message of `throw_unexpected_token` by underlining code snippet --- src/internal/tokens.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/internal/tokens.cpp b/src/internal/tokens.cpp index 71a439f..08e67e9 100644 --- a/src/internal/tokens.cpp +++ b/src/internal/tokens.cpp @@ -354,6 +354,41 @@ TokenStream::throw_expecting_token(Token t, const char* c) const { throw_unexpected_token(t, reason); } +static std::string +underlined_token(size_t begin, size_t end, const std::string& text) { + size_t begin_line = begin; + while (begin_line > 0 && text[begin_line - 1] != '\n') { + begin_line--; + } + + size_t end_line = begin_line; + while (end_line < text.size() && text[end_line] != '\n') { + end_line++; + } + + // In these cases, there is nothing to underline (empty token? empty line?) + if (begin >= end || begin_line >= end_line || begin >= end_line + || begin_line >= end) { + return ""; + } + + std::stringstream msg; + for (size_t i = begin_line; i < end_line; i++) { + msg << text[i]; + } + + msg << "\n"; + for (size_t i = begin_line; i < end_line; i++) { + if (i >= begin && i < end) { + msg << '^'; + } else { + msg << ' '; + } + } + + return msg.str(); +} + void TokenStream::throw_unexpected_token(Token t, const std::string& reason) const { auto line_col = extract_line_column(t); @@ -366,6 +401,11 @@ void TokenStream::throw_unexpected_token(Token t, const std::string& reason) msg << ", " << reason; } + std::string snippet = underlined_token(t.begin, t.end, text_); + if (!snippet.empty()) { + msg << "\n" << snippet; + } + throw std::runtime_error(msg.str()); } From 415db41ecf4fa2e8cfad2c794870bae1cd64d0e4 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Feb 2023 11:06:26 +0100 Subject: [PATCH 17/63] Fix bug in parsing of `set` directive --- src/internal/directives.cpp | 55 +++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 4b1dbf2..321613f 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -13,8 +13,8 @@ namespace internal { struct Context { std::unordered_map runtime_args; - std::unordered_map compile_args; - std::unordered_map config_args; + std::unordered_map compile_args; + std::unordered_map config_args; }; static Expr parse_expr(TokenStream& stream, const Context& ctx, int prec = 0); @@ -126,6 +126,7 @@ static bool parse_string(const std::string& input, std::string& output) { break; case 't': x = '\t'; + break; case 'r': x = '\r'; break; @@ -256,10 +257,22 @@ parse_expr_list3(TokenStream& stream, const Context& ctx) { struct DummyEval: Eval { bool lookup(const Variable& v, Value& out) const override { - throw std::runtime_error("internal error"); + return false; } }; +static Value parse_comptime_expr(TokenStream& stream, const Context& ctx) { + Expr e = parse_expr(stream, ctx); + + try { + return e.eval(DummyEval {}); + } catch (const std::exception& err) { + throw std::runtime_error( + "error while evaluating expression '" + e.to_string() + + "': " + err.what()); + } +} + static void parse_buffer_directive( TokenStream& stream, KernelBuilder& builder, @@ -295,7 +308,7 @@ static void parse_tune_directive( stream.consume('='); do { - Value v = parse_expr(stream, {}).eval(DummyEval {}); + Value v = parse_comptime_expr(stream, ctx); values.push_back(v); priors.push_back(1.0); } while (stream.next_if(TokenKind::Comma)); @@ -319,21 +332,34 @@ static void parse_tune_directive( } static void parse_set_directive(TokenStream& stream, Context& ctx) { + // '(' IDENT '=' EXPR ')' stream.consume(TokenKind::ParenL); Token var_token = stream.consume(TokenKind::Ident); stream.consume('='); - Value value = parse_expr(stream, ctx).eval(DummyEval {}); + Expr expr = parse_expr(stream, ctx); stream.consume(TokenKind::ParenR); - std::string var = stream.span(var_token); - ctx.compile_args.insert({var, value}); + + ctx.config_args.insert({var, expr}); +} + +static void parse_tuning_key_directive( + TokenStream& stream, + KernelBuilder& builder, + Context& ctx) { + std::string key; + + stream.consume(TokenKind::ParenL); + do { + key += parse_comptime_expr(stream, ctx).to_string(); + } while (stream.next_if(TokenKind::Comma)); + stream.consume(TokenKind::ParenR); + + builder.tuning_key(std::move(key)); } static void process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { - stream.consume("pragma"); - stream.consume("kernel_tuner"); - while (!stream.next_if(TokenKind::DirectiveEnd)) { Token t = stream.consume(TokenKind::Ident); std::string name = stream.span(t); @@ -345,14 +371,7 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { } else if (name == "buffers" || name == "buffer") { parse_buffer_directive(stream, builder, ctx); } else if (name == "tuning_key") { - std::string key = ""; - - for (const auto& expr : parse_expr_list(stream, ctx)) { - key += expr.eval(DummyEval {}).to_string(); - } - - builder.tuning_key(std::move(key)); - + parse_tuning_key_directive(stream, builder, ctx); } else if (name == "grid_size") { auto l = parse_expr_list3(stream, ctx); builder.grid_size(l[0], l[1], l[2]); From 4ff602bd6cf47d10db16d194a0c8e4bd6792cc9d Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Feb 2023 11:33:52 +0100 Subject: [PATCH 18/63] Improve error reporting in `ArgExpr` --- src/expr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/expr.cpp b/src/expr.cpp index 3da3c6f..7ddfbad 100644 --- a/src/expr.cpp +++ b/src/expr.cpp @@ -316,4 +316,4 @@ Expr BinaryExpr::resolve(const Eval& eval) const { return result; } -} // namespace kernel_launcher \ No newline at end of file +} // namespace kernel_launcher From 18776226007e6a234687d24385a2aa76bb4433b6 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Feb 2023 11:35:13 +0100 Subject: [PATCH 19/63] Change how parameters are stored when parsing directives --- include/kernel_launcher/internal/tokens.h | 9 +++- src/internal/directives.cpp | 64 ++++++++++++++--------- src/internal/tokens.cpp | 47 +++++++++-------- 3 files changed, 71 insertions(+), 49 deletions(-) diff --git a/include/kernel_launcher/internal/tokens.h b/include/kernel_launcher/internal/tokens.h index eeae28e..ba57867 100644 --- a/include/kernel_launcher/internal/tokens.h +++ b/include/kernel_launcher/internal/tokens.h @@ -114,10 +114,15 @@ struct TokenStream { return span(begin.begin, end.end); } - std::pair extract_line_column(Token t) const; + [[noreturn]] void throw_unexpected_token( + size_t begin, + size_t end, + const std::string& reason = "") const; [[noreturn]] void - throw_unexpected_token(Token t, const std::string& reason = "") const; + throw_unexpected_token(Token t, const std::string& reason = "") const { + throw_unexpected_token(t.begin, t.end, reason); + } private: std::string file_; diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 321613f..4d855d2 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -12,9 +12,17 @@ namespace kernel_launcher { namespace internal { struct Context { - std::unordered_map runtime_args; - std::unordered_map compile_args; - std::unordered_map config_args; + // Runtime arguments of the kernel + std::unordered_map kernel_args; + + // Parameters from configuration space + std::unordered_map config_args; + + // Compile-time arguments passed by the user + std::unordered_map comptime_args; + + // User-defined parameters using `#pragma set(foo=1+2)` + std::unordered_map user_args; }; static Expr parse_expr(TokenStream& stream, const Context& ctx, int prec = 0); @@ -90,16 +98,24 @@ static Expr parse_ident(Token t, TokenStream& stream, const Context& ctx) { // Is it a compile-time parameter? { - auto it = ctx.compile_args.find(name); - if (it != ctx.compile_args.end()) { + auto it = ctx.comptime_args.find(name); + if (it != ctx.comptime_args.end()) { + return it->second; + } + } + + // Is it a user-defined parameter? + { + auto it = ctx.user_args.find(name); + if (it != ctx.user_args.end()) { return it->second; } } // Is it a runtime parameter? { - auto it = ctx.runtime_args.find(name); - if (it != ctx.runtime_args.end()) { + auto it = ctx.kernel_args.find(name); + if (it != ctx.kernel_args.end()) { return it->second; } } @@ -262,14 +278,16 @@ struct DummyEval: Eval { }; static Value parse_comptime_expr(TokenStream& stream, const Context& ctx) { + Token before = stream.peek(); Expr e = parse_expr(stream, ctx); + Token after = stream.peek(); try { return e.eval(DummyEval {}); } catch (const std::exception& err) { - throw std::runtime_error( - "error while evaluating expression '" + e.to_string() - + "': " + err.what()); + auto msg = + std::string("error while evaluating expression: ") + err.what(); + stream.throw_unexpected_token(before.begin, after.begin, msg); } } @@ -284,8 +302,8 @@ static void parse_buffer_directive( Expr length = parse_expr(stream, ctx); stream.consume(TokenKind::BracketR); - auto it = ctx.runtime_args.find(stream.span(var_token)); - if (it == ctx.runtime_args.end()) { + auto it = ctx.kernel_args.find(stream.span(var_token)); + if (it == ctx.kernel_args.end()) { stream.throw_unexpected_token( var_token, "this is not the name of a kernel argument"); @@ -321,12 +339,6 @@ static void parse_tune_directive( stream.throw_unexpected_token(var_token, "variable redefined"); } - if (ctx.compile_args.count(var) > 0) { - stream.throw_unexpected_token( - var_token, - "variable already passed as compile-time value"); - } - auto param = builder.add(var, values, priors, values.front()); ctx.config_args.insert({var, param}); } @@ -336,11 +348,11 @@ static void parse_set_directive(TokenStream& stream, Context& ctx) { stream.consume(TokenKind::ParenL); Token var_token = stream.consume(TokenKind::Ident); stream.consume('='); - Expr expr = parse_expr(stream, ctx); + Expr expr = parse_expr(stream, ctx).resolve(DummyEval {}); stream.consume(TokenKind::ParenR); std::string var = stream.span(var_token); - ctx.config_args.insert({var, expr}); + ctx.user_args.insert({var, expr}); } static void parse_tuning_key_directive( @@ -407,7 +419,7 @@ KernelBuilder builder_from_annotated_kernel( for (size_t i = 0; i < def.fun_params.size(); i++) { std::string name = stream.span(def.fun_params[i].name); - ctx.runtime_args.insert({name, ArgExpr(uint8_t(i))}); + ctx.kernel_args.insert({name, ArgExpr(uint8_t(i))}); } if (template_args.size() > def.template_params.size()) { @@ -420,7 +432,7 @@ KernelBuilder builder_from_annotated_kernel( for (size_t i = 0; i < template_args.size(); i++) { std::string name = stream.span(def.template_params[i].name); - ctx.compile_args.insert({name, template_args[i]}); + ctx.comptime_args.insert({name, template_args[i]}); } for (const auto& directive : def.directives) { @@ -432,10 +444,12 @@ KernelBuilder builder_from_annotated_kernel( std::string name = stream.span(param.name); Expr e = nullptr; - if (ctx.compile_args.count(name) > 0) { - e = ctx.compile_args.at(name); - } else if (ctx.config_args.count(name) > 0) { + if (ctx.config_args.count(name) > 0) { e = ctx.config_args.at(name); + } else if (ctx.comptime_args.count(name) > 0) { + e = ctx.comptime_args.at(name); + } else if (ctx.user_args.count(name) > 0) { + e = ctx.user_args.at(name); } else { stream.throw_unexpected_token( param.name, diff --git a/src/internal/tokens.cpp b/src/internal/tokens.cpp index 08e67e9..b30eeb7 100644 --- a/src/internal/tokens.cpp +++ b/src/internal/tokens.cpp @@ -272,22 +272,6 @@ bool TokenStream::matches(Token t, const char* needle) const { return *needle == '\0'; } -std::pair TokenStream::extract_line_column(Token t) const { - int lineno = 1; - int colno = 1; - - for (size_t i = 0; i < text_.size() && i < t.begin; i++) { - if (text_[i] == '\n') { - lineno++; - colno = 1; - } else { - colno++; - } - } - - return {lineno, colno}; -} - static std::string clean_string(const std::string& input) { std::stringstream output; @@ -355,7 +339,7 @@ TokenStream::throw_expecting_token(Token t, const char* c) const { } static std::string -underlined_token(size_t begin, size_t end, const std::string& text) { +underlined_span(size_t begin, size_t end, const std::string& text) { size_t begin_line = begin; while (begin_line > 0 && text[begin_line - 1] != '\n') { begin_line--; @@ -389,19 +373,38 @@ underlined_token(size_t begin, size_t end, const std::string& text) { return msg.str(); } -void TokenStream::throw_unexpected_token(Token t, const std::string& reason) - const { - auto line_col = extract_line_column(t); +static std::pair +extract_line_column(size_t offset, const std::string& text) { + int lineno = 1; + int colno = 1; + + for (size_t i = 0; i < text.size() && i < offset; i++) { + if (text[i] == '\n') { + lineno++; + colno = 1; + } else { + colno++; + } + } + + return {lineno, colno}; +} + +void TokenStream::throw_unexpected_token( + size_t begin, + size_t end, + const std::string& reason) const { + auto line_col = extract_line_column(begin, text_); std::stringstream msg; msg << "error:" << file_ << ":" << line_col.first << ":" << line_col.second - << ": found invalid token \"" << clean_string(span(t)) << "\""; + << ": found invalid token \"" << clean_string(span(begin, end)) << "\""; if (!reason.empty()) { msg << ", " << reason; } - std::string snippet = underlined_token(t.begin, t.end, text_); + std::string snippet = underlined_span(begin, end, text_); if (!snippet.empty()) { msg << "\n" << snippet; } From f6fde5c19bec03f7fd86da823d5666534116f35f Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Feb 2023 13:09:36 +0100 Subject: [PATCH 20/63] Add annotation example --- examples/CMakeLists.txt | 1 + examples/vector_add/main.cu | 1 - examples/vector_add_annotated/CMakeLists.txt | 10 +++ .../kernel_annotated.cu | 10 ++- examples/vector_add_annotated/main.cu | 90 +++++++++++++++++++ 5 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 examples/vector_add_annotated/CMakeLists.txt rename examples/{vector_add => vector_add_annotated}/kernel_annotated.cu (81%) create mode 100644 examples/vector_add_annotated/main.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ed02466..12156a5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(vector_add) +add_subdirectory(vector_add_annotated) add_subdirectory(matmul) diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 6670f1a..9ced435 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -1,7 +1,6 @@ #include #include "kernel_launcher.h" -#include "kernel_launcher/pragma.h" namespace kl = kernel_launcher; diff --git a/examples/vector_add_annotated/CMakeLists.txt b/examples/vector_add_annotated/CMakeLists.txt new file mode 100644 index 0000000..1b8dca4 --- /dev/null +++ b/examples/vector_add_annotated/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.17) + +set (PROJECT_NAME kernel_launcher_vecadd_annotated) +project(${PROJECT_NAME} LANGUAGES CXX CUDA) + +set (CMAKE_CXX_STANDARD 17) + +add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu") +target_link_libraries(${PROJECT_NAME} kernel_launcher) + diff --git a/examples/vector_add/kernel_annotated.cu b/examples/vector_add_annotated/kernel_annotated.cu similarity index 81% rename from examples/vector_add/kernel_annotated.cu rename to examples/vector_add_annotated/kernel_annotated.cu index bf442b0..44e2db8 100644 --- a/examples/vector_add/kernel_annotated.cu +++ b/examples/vector_add_annotated/kernel_annotated.cu @@ -1,12 +1,14 @@ -#pragma kernel_tuner tune(block_size = 32, 64, 128, 256, 512, 1024) +#pragma kernel_tuner tune(threads_per_block = 32, 64, 128, 256, 512, 1024) #pragma kernel_tuner tune(items_per_thread = 1, 2, 3, 4) #pragma kernel_tuner tune(tiling_strategy = 0, 1, 2) -#pragma kernel_tuner set(items_per_block = block_size * items_per_thread) -#pragma kernel_tuner block_size(block_size) +#pragma kernel_tuner set(items_per_block = items_per_thread * threads_per_block) +#pragma kernel_tuner set(block_size = items_per_block) +#pragma kernel_tuner restriction(items_per_block <= 1024) #pragma kernel_tuner problem_size(n) +#pragma kernel_tuner block_size(threads_per_block) #pragma kernel_tuner grid_divisor(items_per_block) -#pragma kernel_tuner restriction(items_per_block <= 1024) #pragma kernel_tuner buffers(C[n], A[n], B[n]) +#pragma kernel_tuner tuning_key("vector_add_" + T) template< typename T, int block_size = 32, diff --git a/examples/vector_add_annotated/main.cu b/examples/vector_add_annotated/main.cu new file mode 100644 index 0000000..fc2ed02 --- /dev/null +++ b/examples/vector_add_annotated/main.cu @@ -0,0 +1,90 @@ +#include +#include + +// This is just to check that `kernel_annotated.cu` is still valid C++/CUDA code +#include "kernel_annotated.cu" + +#include "kernel_launcher.h" +#include "kernel_launcher/pragma.h" + +namespace kl = kernel_launcher; + +void cuda_check(cudaError_t code) { + if (code != cudaSuccess) { + throw std::runtime_error( + std::string("CUDA error: ") + cudaGetErrorString(code)); + } +} + +std::string kernel_directory() { + // Find kernel file + std::string this_file = __FILE__; + std::string this_directory = this_file.substr(0, this_file.rfind('/')); + return this_directory + "/"; +} + +int main(int argc, char* argv[]) { + chdir(kernel_directory().c_str()); + + // Parse the number of elements N + int n = 1'000'000; + + if (argc > 1) { + char* end = nullptr; + n = strtol(argv[1], &end, 10); + + if (strlen(end)) { + std::cerr << "usage: " << argv[0] << " n\n"; + return 1; + } + } + + // Initialize inputs + std::vector A(n), B(n), C_answer(n), C_result(n); + for (int i = 0; i < n; i++) { + A[i] = static_cast(i); + B[i] = 1.0f; + C_answer[i] = A[i] + B[i]; + } + + // Allocate GPU memory + float *A_dev, *B_dev, *C_dev; + cuda_check(cudaSetDevice(0)); + cuda_check(cudaMalloc(&A_dev, sizeof(float) * n)); + cuda_check(cudaMalloc(&B_dev, sizeof(float) * n)); + cuda_check(cudaMalloc(&C_dev, sizeof(float) * n)); + cuda_check( + cudaMemcpy(A_dev, A.data(), sizeof(float) * n, cudaMemcpyDefault)); + cuda_check( + cudaMemcpy(B_dev, B.data(), sizeof(float) * n, cudaMemcpyDefault)); + + // Call kernel + kl::launch( + kl::PragmaKernel("kernel_annotated.cu", "vector_add", {"float"}), + n, + C_dev, + (const float*)A_dev, + (const float*)B_dev); + + // Copy results back + cuda_check(cudaMemcpy( + C_result.data(), + C_dev, + sizeof(float) * n, + cudaMemcpyDefault)); + + // Check results + for (int i = 0; i < n; i++) { + float result = C_result[i]; + float answer = C_answer[i]; + + if (result != answer) { + std::cout << "error: index " << i << " is incorrect: " << result + << " != " << answer << "\n"; + return 1; + } + } + + std::cout << "result correct\n"; + return 0; +} From 712e1cc4f6fc8f76ac8cc29eea0c8d65707769f7 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Feb 2023 13:14:28 +0100 Subject: [PATCH 21/63] Typo in docs --- docs/examples/pragma.rst | 2 +- docs/examples/vector_add_annotated.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/pragma.rst b/docs/examples/pragma.rst index 7cca44b..64b2dd2 100644 --- a/docs/examples/pragma.rst +++ b/docs/examples/pragma.rst @@ -12,7 +12,7 @@ Source Code ----------- Below shows the CUDA kernel code. -This is valid regular CUDA code since the ``#pragma`` will be ignored by the ``nvcc`` compiler (although they will emit a warning). +This is valid regular CUDA code since the ``#pragma`` will be ignored by the ``nvcc`` compiler (although they might cause compiler warnings). .. literalinclude:: vector_add_annotated.cu :lines: 1-20 diff --git a/docs/examples/vector_add_annotated.cu b/docs/examples/vector_add_annotated.cu index 0c234a0..b08ba35 100644 --- a/docs/examples/vector_add_annotated.cu +++ b/docs/examples/vector_add_annotated.cu @@ -5,7 +5,7 @@ #pragma kernel_tuner block_size(threads_per_block) #pragma kernel_tuner grid_divisor(items_per_block) #pragma kernel_tuner buffers(C[n], A[n], B[n]) -#pragma kernel_tuner tuning_key("vector_add_", T) +#pragma kernel_tuner tuning_key("vector_add_" + T) template __global__ void vector_add(int n, T* C, const T* A, const T* B) { From 13b5ecc3ea3404c23be198642b1d0b6e05d0d40e Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 9 Feb 2023 15:01:47 +0100 Subject: [PATCH 22/63] Document pragma.h --- docs/build_api.py | 6 +++++- examples/vector_add_annotated/main.cu | 2 +- include/kernel_launcher/pragma.h | 27 +++++++++++++++++++++++++-- include/kernel_launcher/registry.h | 5 +++++ src/pragma.cpp | 11 ++++++----- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/docs/build_api.py b/docs/build_api.py index e9c17af..f2410a6 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -77,7 +77,7 @@ def build_index_page(groups): "KernelSource", "Kernel", ], - "Wisdom": [ + "Wisdom Kernels": [ "WisdomKernel", "WisdomSettings", "WisdomRecord", @@ -92,6 +92,10 @@ def build_index_page(groups): "export_capture_file", "capture_file_exists", ], + "Pragma Kernels": [ + "PragmaKernel", + "build_pragma_kernel" + ], "Registry": [ "KernelRegistry", "IKernelDescriptor", diff --git a/examples/vector_add_annotated/main.cu b/examples/vector_add_annotated/main.cu index fc2ed02..4951682 100644 --- a/examples/vector_add_annotated/main.cu +++ b/examples/vector_add_annotated/main.cu @@ -1,9 +1,9 @@ #include + #include // This is just to check that `kernel_annotated.cu` is still valid C++/CUDA code #include "kernel_annotated.cu" - #include "kernel_launcher.h" #include "kernel_launcher/pragma.h" diff --git a/include/kernel_launcher/pragma.h b/include/kernel_launcher/pragma.h index b325080..92e8372 100644 --- a/include/kernel_launcher/pragma.h +++ b/include/kernel_launcher/pragma.h @@ -5,16 +5,39 @@ namespace kernel_launcher { +/** + * Parses the given `KernelSource`, searches the source code for the kernel + * with the given `kernel_name`, extract the KernelLauncher-specific pragmas + * for that kernel, and returns `KernelBuilder`. + * + * @param source The source code. Can be either a filename (like `"kernel.cu"`) + * or a filename+content pair (like `{"kernel.cu", "#include ..."}`). + * @param kernel_name The name of the kernel in the source code. It may contain + * namespaces such as `mypackage::kernels::vector_add`. + * @param template_args Optional; template arguments passed to the kernel. + */ KernelBuilder build_pragma_kernel( const KernelSource& source, - const std::string& name, + const std::string& kernel_name, const std::vector& template_args = {}, const FileLoader& fs = DefaultLoader {}); +/** + * This is a `IKernelDescriptor` that uses `build_pragma_kernel` to construct + * a `KernelBuilder`. + */ struct PragmaKernel: IKernelDescriptor { + /** + * Construct `PragmaKernel`. + * + * @param path Filename of the source file. + * @param kernel_name The name of the kernel in the source code. It may + * contain namespaces such as `mypackage::kernels::vector_add`. + * @param template_args Optional; template arguments passed to the kernel. + */ PragmaKernel( std::string path, - std::string name, + std::string kernel_name, std::vector template_args = {}); KernelBuilder build() const override; diff --git a/include/kernel_launcher/registry.h b/include/kernel_launcher/registry.h index 355a861..c4c94a2 100644 --- a/include/kernel_launcher/registry.h +++ b/include/kernel_launcher/registry.h @@ -146,6 +146,11 @@ struct KernelRegistry { */ const KernelRegistry& default_registry(); +/** + * Launch the kernel given a `KernelDescriptor` using the global registry. + * This is a short-hand for + * `default_registry().lookup(descriptor).launch(args...)` + */ template void launch(KernelDescriptor descriptor, Args&&... args) { return default_registry().launch( diff --git a/src/pragma.cpp b/src/pragma.cpp index b4c044b..931bee9 100644 --- a/src/pragma.cpp +++ b/src/pragma.cpp @@ -6,7 +6,7 @@ namespace kernel_launcher { KernelBuilder build_pragma_kernel( const KernelSource& source, - const std::string& name, + const std::string& kernel_name, const std::vector& template_args, const FileLoader& fs) { // Read file @@ -21,7 +21,7 @@ KernelBuilder build_pragma_kernel( auto processed_source = KernelSource(filename, result.processed_source); for (const auto& kernel : result.kernels) { - if (stream.matches(kernel.name, name)) { + if (stream.matches(kernel.name, kernel_name)) { return internal::builder_from_annotated_kernel( stream, processed_source, @@ -31,14 +31,15 @@ KernelBuilder build_pragma_kernel( } throw std::runtime_error( - "kernel '" + name + "' was not found in file \'" + filename + "\'"); + "kernel '" + kernel_name + "' was not found in file \'" + filename + + "\'"); } PragmaKernel::PragmaKernel( std::string path, - std::string name, + std::string kernel_name, std::vector template_args) : - kernel_name_(std::move(name)), + kernel_name_(std::move(kernel_name)), template_args_(std::move(template_args)) { // Resolve absolute file path const char* abs_path = realpath(path.c_str(), nullptr); From 62f9811f7f9f430c9c4a66851176893e87410251 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 9 Feb 2023 16:53:48 +0100 Subject: [PATCH 23/63] Add missing `!=` in tokenizer --- src/internal/tokens.cpp | 9 +++++---- tests/internal.cpp | 45 +++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/internal/tokens.cpp b/src/internal/tokens.cpp index b30eeb7..eaa5a08 100644 --- a/src/internal/tokens.cpp +++ b/src/internal/tokens.cpp @@ -85,10 +85,11 @@ static index_t advance_string(index_t i, const std::string& input) { } TokenKind char2_to_kind(char a, char b) { - if ((a == '=' && b == '=') || (a == '<' && b == '=') - || (a == '>' && b == '=') || (a == '&' && b == '&') - || (a == '|' && b == '|') || (a == '<' && b == '<') - || (a == '>' && b == '>') || (a == ':' && b == ':')) { + if ((a == '=' && b == '=') || (a == '!' && b == '=') + || (a == '<' && b == '=') || (a == '>' && b == '=') + || (a == '&' && b == '&') || (a == '|' && b == '|') + || (a == '<' && b == '<') || (a == '>' && b == '>') + || (a == ':' && b == ':')) { return TokenKind::Punct; } diff --git a/tests/internal.cpp b/tests/internal.cpp index 721488f..f6260cc 100644 --- a/tests/internal.cpp +++ b/tests/internal.cpp @@ -54,7 +54,7 @@ TEST_CASE("tokenizer exhaustive") { using internal::TokenKind; std::string input = R"( - == <= >= && || << >> :: + == != <= >= && || << >> :: { } [ ] ( ) , < > + = - * / ! ~ & | ^ % 123 4.5 "test" "a\"b" 'c' @@ -62,27 +62,28 @@ TEST_CASE("tokenizer exhaustive") { )"; std::vector> expected = { - {"==", TokenKind::Punct}, {"<=", TokenKind::Punct}, - {">=", TokenKind::Punct}, {"&&", TokenKind::Punct}, - {"||", TokenKind::Punct}, {"<<", TokenKind::Punct}, - {">>", TokenKind::Punct}, {"::", TokenKind::Punct}, - {"{", TokenKind::BraceL}, {"}", TokenKind::BraceR}, - {"[", TokenKind::BracketL}, {"]", TokenKind::BracketR}, - {"(", TokenKind::ParenL}, {")", TokenKind::ParenR}, - {",", TokenKind::Comma}, {"<", TokenKind::AngleL}, - {">", TokenKind::AngleR}, {"+", TokenKind::Punct}, - {"=", TokenKind::Punct}, {"-", TokenKind::Punct}, - {"*", TokenKind::Punct}, {"/", TokenKind::Punct}, - {"!", TokenKind::Punct}, {"~", TokenKind::Punct}, - {"&", TokenKind::Punct}, {"|", TokenKind::Punct}, - {"^", TokenKind::Punct}, {"%", TokenKind::Punct}, - {"123", TokenKind::Number}, {"4", TokenKind::Number}, - {".", TokenKind::Unknown}, {"5", TokenKind::Number}, - {R"("test")", TokenKind::String}, {R"("a\"b")", TokenKind::String}, - {"'c'", TokenKind::String}, {"foo", TokenKind::Ident}, - {"$foo", TokenKind::Ident}, {"foo_bar", TokenKind::Ident}, - {"_bar", TokenKind::Ident}, {"_", TokenKind::Ident}, - {"bar1", TokenKind::Ident}, {"", TokenKind::EndOfFile}}; + {"==", TokenKind::Punct}, {"!=", TokenKind::Punct}, + {"<=", TokenKind::Punct}, {">=", TokenKind::Punct}, + {"&&", TokenKind::Punct}, {"||", TokenKind::Punct}, + {"<<", TokenKind::Punct}, {">>", TokenKind::Punct}, + {"::", TokenKind::Punct}, {"{", TokenKind::BraceL}, + {"}", TokenKind::BraceR}, {"[", TokenKind::BracketL}, + {"]", TokenKind::BracketR}, {"(", TokenKind::ParenL}, + {")", TokenKind::ParenR}, {",", TokenKind::Comma}, + {"<", TokenKind::AngleL}, {">", TokenKind::AngleR}, + {"+", TokenKind::Punct}, {"=", TokenKind::Punct}, + {"-", TokenKind::Punct}, {"*", TokenKind::Punct}, + {"/", TokenKind::Punct}, {"!", TokenKind::Punct}, + {"~", TokenKind::Punct}, {"&", TokenKind::Punct}, + {"|", TokenKind::Punct}, {"^", TokenKind::Punct}, + {"%", TokenKind::Punct}, {"123", TokenKind::Number}, + {"4", TokenKind::Number}, {".", TokenKind::Unknown}, + {"5", TokenKind::Number}, {R"("test")", TokenKind::String}, + {R"("a\"b")", TokenKind::String}, {"'c'", TokenKind::String}, + {"foo", TokenKind::Ident}, {"$foo", TokenKind::Ident}, + {"foo_bar", TokenKind::Ident}, {"_bar", TokenKind::Ident}, + {"_", TokenKind::Ident}, {"bar1", TokenKind::Ident}, + {"", TokenKind::EndOfFile}}; auto stream = internal::TokenStream("", input); From 6fd40b29f9815bad3eb5b260fca86dc099c2a5af Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 9 Feb 2023 16:56:35 +0100 Subject: [PATCH 24/63] Add `default` clause to `tune` directive --- src/internal/directives.cpp | 9 ++++++++- tests/internal.cpp | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 4d855d2..1a732cd 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -333,13 +333,20 @@ static void parse_tune_directive( stream.consume(TokenKind::ParenR); + Value default_value = values[0]; + if (stream.next_if("default")) { + stream.consume(TokenKind::ParenL); + default_value = parse_comptime_expr(stream, ctx); + stream.consume(TokenKind::ParenR); + } + std::string var = stream.span(var_token); if (ctx.config_args.count(var) > 0) { stream.throw_unexpected_token(var_token, "variable redefined"); } - auto param = builder.add(var, values, priors, values.front()); + auto param = builder.add(var, values, priors, default_value); ctx.config_args.insert({var, param}); } diff --git a/tests/internal.cpp b/tests/internal.cpp index f6260cc..3797a2a 100644 --- a/tests/internal.cpp +++ b/tests/internal.cpp @@ -105,7 +105,7 @@ namespace bar { #ifdef SOMECONSTANT #endif -#pragma kernel_tuner tune(block_size=32, 64, 128, 256) +#pragma kernel_tuner tune(block_size=32, 64, 128, 256) default(128) #pragma kernel_tuner problem_size(n) __global__ void baz(int n, const float* a) { if (threadIdx.x < 10) { @@ -135,7 +135,7 @@ namespace bar { #ifdef SOMECONSTANT #endif -/*#pragma kernel_tuner tune(block_size=32, 64, 128, 256) +/*#pragma kernel_tuner tune(block_size=32, 64, 128, 256) default(128) #pragma kernel_tuner problem_size(n) */__global__ void baz(int n, const float* a) { if (threadIdx.x < 10) { From 3ffd4dbb903de159ebff35a54370d0ec2c1ab4cf Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 10 Feb 2023 15:19:19 +0100 Subject: [PATCH 25/63] Support `&&`, `||`, and `?:` operators in directives --- src/internal/directives.cpp | 51 ++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 1a732cd..c10d121 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -25,7 +25,7 @@ struct Context { std::unordered_map user_args; }; -static Expr parse_expr(TokenStream& stream, const Context& ctx, int prec = 0); +static Expr parse_expr(TokenStream& stream, const Context& ctx); static Expr process_function_call( Token t, @@ -195,47 +195,68 @@ static Expr parse_prim(TokenStream& stream, const Context& ctx) { } return ScalarExpr(l); } else if (stream.matches(t, '-')) { - return -parse_expr(stream, ctx); + return -parse_prim(stream, ctx); } else if (stream.matches(t, '+')) { - return +parse_expr(stream, ctx); + return +parse_prim(stream, ctx); } else if (stream.matches(t, '!')) { - return !parse_expr(stream, ctx); + return !parse_prim(stream, ctx); } else { stream.throw_unexpected_token(t, "expecting expression"); } } -static Expr parse_expr(TokenStream& stream, const Context& ctx, int prec) { +static Expr parse_binop(TokenStream& stream, const Context& ctx, int prec) { // TODO: == != <= >= && || % Expr lhs = parse_prim(stream, ctx); while (true) { if (prec < 6 && stream.next_if('*')) { - lhs = lhs * parse_expr(stream, ctx, 6); + lhs = lhs * parse_binop(stream, ctx, 6); } else if (prec < 6 && stream.next_if('/')) { - lhs = lhs / parse_expr(stream, ctx, 6); + lhs = lhs / parse_binop(stream, ctx, 6); } else if (prec < 5 && stream.next_if('+')) { - lhs = lhs + parse_expr(stream, ctx, 5); + lhs = lhs + parse_binop(stream, ctx, 5); } else if (prec < 5 && stream.next_if('-')) { - lhs = lhs - parse_expr(stream, ctx, 5); + lhs = lhs - parse_binop(stream, ctx, 5); } else if (prec < 3 && stream.next_if('<')) { - lhs = lhs < parse_expr(stream, ctx, 3); + lhs = lhs < parse_binop(stream, ctx, 3); } else if (prec < 3 && stream.next_if('>')) { - lhs = lhs > parse_expr(stream, ctx, 3); + lhs = lhs > parse_binop(stream, ctx, 3); } else if (prec < 3 && stream.next_if("<=")) { - lhs = lhs <= parse_expr(stream, ctx, 3); + lhs = lhs <= parse_binop(stream, ctx, 3); } else if (prec < 3 && stream.next_if(">=")) { - lhs = lhs >= parse_expr(stream, ctx, 3); + lhs = lhs >= parse_binop(stream, ctx, 3); } else if (prec < 3 && stream.next_if("!=")) { - lhs = lhs != parse_expr(stream, ctx, 3); + lhs = lhs != parse_binop(stream, ctx, 3); } else if (prec < 3 && stream.next_if("==")) { - lhs = lhs == parse_expr(stream, ctx, 3); + lhs = lhs == parse_binop(stream, ctx, 3); + } else if (prec < 2 && stream.next_if("&&")) { + lhs = lhs && parse_binop(stream, ctx, 2); + } else if (prec < 1 && stream.next_if("||")) { + lhs = lhs || parse_binop(stream, ctx, 1); } else { return lhs; } } } +static Expr parse_ternary(TokenStream& stream, const Context& ctx) { + Expr cond = parse_binop(stream, ctx, 0); + + if (stream.next_if('?')) { + Expr if_true = parse_expr(stream, ctx); + stream.consume(':'); + Expr if_false = parse_expr(stream, ctx); + return ifelse(std::move(cond), std::move(if_true), std::move(if_false)); + } else { + return cond; + } +} + +static Expr parse_expr(TokenStream& stream, const Context& ctx) { + return parse_ternary(stream, ctx); +} + static std::vector parse_expr_list( TokenStream& stream, const Context& ctx, From a968f9f5354129928bdfe9cdc0cd1ae7c48ef151 Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 10 Feb 2023 15:40:37 +0100 Subject: [PATCH 26/63] Support `DEVICE_*` attributes in directives --- src/internal/directives.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index c10d121..d599491 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -61,6 +61,7 @@ static Expr process_function_call( } } +// NOLINTNEXTLINE(readability-function-cognitive-complexity) static Expr parse_ident(Token t, TokenStream& stream, const Context& ctx) { if (stream.next_if(TokenKind::ParenL)) { std::vector args; @@ -120,6 +121,13 @@ static Expr parse_ident(Token t, TokenStream& stream, const Context& ctx) { } } +#define CHECK_DEVICE_ATTRIBUTE(attr) \ + if (name == "DEVICE_" #attr) { \ + return DEVICE_##attr; \ + } + + KERNEL_LAUNCHER_DEVICE_ATTRIBUTES_FORALL(CHECK_DEVICE_ATTRIBUTE) + stream.throw_unexpected_token(t, "unknown variable name"); } @@ -205,6 +213,7 @@ static Expr parse_prim(TokenStream& stream, const Context& ctx) { } } +// NOLINTNEXTLINE(readability-function-cognitive-complexity) static Expr parse_binop(TokenStream& stream, const Context& ctx, int prec) { // TODO: == != <= >= && || % Expr lhs = parse_prim(stream, ctx); From bc8b1e3ebe41425158fbd7645fd1a0290f02192a Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 20 Mar 2023 14:44:14 +0100 Subject: [PATCH 27/63] Add list of pragma directives in documentation --- docs/examples/pragma.rst | 73 +++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/docs/examples/pragma.rst b/docs/examples/pragma.rst index 64b2dd2..2445872 100644 --- a/docs/examples/pragma.rst +++ b/docs/examples/pragma.rst @@ -1,18 +1,19 @@ Pragma Kernels =========================== -In the previous examples, we saw how it was possible to specify a tunable kernel by defining a ``KernelBuilder`` instance in the host-side code. -While this API offers flexibility, it is also somewhat cumbersome and it requires keeping the actual kernel code in CUDA in sync with the host-side code in C++. +In the previous examples, we demonstrated how a tunable kernel can be specified by defining a ``KernelBuilder`` instance in the host-side code. +While this API offers flexiblity, it can be cumbersome and requires keeping the kernel code in CUDA in sync with the host-side code in C++. -Kernel Launcher also offers a way to define the kernel specifications inside the actual CUDA code by annotating the kernel code with directives. -While this method is less flexible than the ``KernelBuilder`` API, it is a lot more convenient and should be usable for the majority of CUDA kernels. +Kernel Launcher also provides a way to define kernel specifications directly in the CUDA code by using pragma directives to annotate the kernel code. +Although this method is less flexible than the ``KernelBuilder`` API, it is much more convenient and suitable for most CUDA kernels. Source Code ----------- -Below shows the CUDA kernel code. -This is valid regular CUDA code since the ``#pragma`` will be ignored by the ``nvcc`` compiler (although they might cause compiler warnings). +The following code example shows valid CUDA kernel code containing pragma directives. +The ``#pragma`` annotations will be ignored by the ``nvcc`` compiler (but they may produce compiler warnings). + .. literalinclude:: vector_add_annotated.cu :lines: 1-20 @@ -28,9 +29,8 @@ The kernel contains the following ``pragma`` directives: :lines: 1-2 :lineno-start: 1 -The ``tune`` directives defines the tunable parameters. -In this case, there are two parameters: ``threads_per_block`` and ``items_per_thread``. -Since ``items_per_thread`` is also the name of template parameter (line 9), it is passed to the kernel as compile-time constant to the kernel via this parameter. +The tune directives specify the tunable parameters: ``threads_per_block`` and ``items_per_thread``. +Since ``items_per_thread`` is also the name of the template parameter, so it is passed to the kernel as a compile-time constant via this parameter. The value of ``threads_per_block`` is not passed to the kernel but is used by subsequent pragmas. .. literalinclude:: vector_add_annotated.cu @@ -44,26 +44,24 @@ In this case, the constant ``items_per_block`` is defined as the product of ``th :lines: 4-6 :lineno-start: 4 -The above lines specify information required to launch the kernel. -The ``problem_size`` defines the problem size as discussed in :doc:`basic`. -The ``block_size`` specifies the thread block size and ``grid_divisors`` specifies how the problem size should be divided to obtain the thread grid size. -Alternatively, it is possible to specify the grid size directly using the ``grid_size`` directive. +The ``problem_size`` directive defines the problem size (as discussed in as discussed in :doc:`basic`), ``block_size`` specifies the thread block size, and ``grid_divisor`` specifies how the problem size should be divided to obtain the thread grid size. +Alternatively, ``grid_size`` can be used to specify the grid size directly. + .. literalinclude:: vector_add_annotated.cu :lines: 7-7 :lineno-start: 7 -The above line specifies that the kernel arguments ``A``, ``B``, and ``C`` are buffers each having ``n`` elements. -This is required since Kernel Launcher requires the size of each buffer to be known, but the kernel could be called with raw pointers for which no size information is available. +The ``buffers`` directive specifies the size of each buffer (``A``, ``B``, and ``C``) as ``n`` elements to be known by Kernel Launcher. +This is necessary since raw pointers can be used for buffer arguments, for which size information may not be available. If the ``buffers`` pragma is not specified, Kernel Launcher can still be used but it is not possible to capture kernel launches. .. literalinclude:: vector_add_annotated.cu :lines: 8-8 :lineno-start: 8 -The ``tuning_key`` pragma specifies the tuning key. -All arguments given to this pragma will be concatenated and these arguments can be either strings or variables. -In this example, the tuning key is ``"vector_add_" + T`` where ``T`` is the name of the type. +The ``tuning_key`` directive specifies the tuning key, which can be a concatenation of strings or variables. +In this example, the tuning key is ``"vector_add_" + T``, where ``T`` is the name of the type. Host Code @@ -83,8 +81,45 @@ The below code shows how to call the kernel from the host in C++:: The ``PragmaKernel`` class implements the ``IKernelDescriptor`` interface, as described in :doc:`registry`. -This class will read the specified file, extract the Kernel Launcher pragmas from the source code, and compile the kernel. +This class reads the specified file, extracts the Kernel Launcher pragmas from the source code, and compiles the kernel. The ``launch`` function launches the kernel and, as discussed in :doc:`registry`, it uses the default registry to cache kernel compilations. This means that the kernel is only compiled once, even if the same kernel is called from different locations in the program. + +List of pragmas +--------------- + +The table below lists the valid directives. + +.. list-table:: + + * - Directive + - Description + + * - ``tune`` + - Add a new tunable variable. + + * - ``set`` + - Add a new variable. + + * - ``buffers`` + - Specify the size of buffer arguments. This directive may occur multiple times. + + * - ``tuning_key`` + - Specify the tuning key used to search for the corresponding wisdom file. + + * - ``problem_size`` + - An N-dimensional vector that indicates workload size. + + * - ``grid_size`` + - An N-dimensional vector that indicates the CUDA grid size. + + * - ``block_size`` + - An N-dimensional vector that indicates the CUDA thread block size. + + * - ``grid_divisor`` + - Alternative way of specifying the grid size. The problem size is divided by the grid divisors to obtain the grid dimensions. + + * - ``restriction`` + - Boolean expression that must evaluate to ``true`` for a kernel configuration to be valid. From e302791e572bb5b9f7023ef38c35a99ccbd40fce Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 20 Mar 2023 14:45:32 +0100 Subject: [PATCH 28/63] Allow use of synonyms in pragma directives --- src/internal/directives.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index d599491..9693d71 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -421,19 +421,19 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { parse_buffer_directive(stream, builder, ctx); } else if (name == "tuning_key") { parse_tuning_key_directive(stream, builder, ctx); - } else if (name == "grid_size") { + } else if (name == "grid_size" || name == "grid_dim") { auto l = parse_expr_list3(stream, ctx); builder.grid_size(l[0], l[1], l[2]); - } else if (name == "block_size") { + } else if (name == "block_size" || name == "block_dim") { auto l = parse_expr_list3(stream, ctx); builder.block_size(l[0], l[1], l[2]); - } else if (name == "grid_divisor") { + } else if (name == "grid_divisor" || name == "grid_divisors") { auto l = parse_expr_list3(stream, ctx); builder.grid_divisors(l[0], l[1], l[2]); - } else if (name == "problem_size") { + } else if (name == "problem_size" || name == "problem_dim") { auto l = parse_expr_list3(stream, ctx); builder.problem_size(l[0], l[1], l[2]); - } else if (name == "restriction") { + } else if (name == "restriction" || name == "restrictions") { for (const auto& expr : parse_expr_list(stream, ctx)) { builder.restriction(expr); } @@ -502,4 +502,4 @@ KernelBuilder builder_from_annotated_kernel( } } // namespace internal -} // namespace kernel_launcher \ No newline at end of file +} // namespace kernel_launcher From 403da298cbe560d81f333defad619ab61535bf7a Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 20 Mar 2023 15:01:46 +0100 Subject: [PATCH 29/63] Rename directives from `kernel_tuner` to just `kernel` --- docs/examples/vector_add_annotated.cu | 16 +++++++------- .../vector_add_annotated/kernel_annotated.cu | 22 +++++++++---------- include/kernel_launcher/internal/tokens.h | 21 +++++++++++++++++- src/internal/parser.cpp | 10 +++++---- 4 files changed, 45 insertions(+), 24 deletions(-) diff --git a/docs/examples/vector_add_annotated.cu b/docs/examples/vector_add_annotated.cu index b08ba35..003ab3c 100644 --- a/docs/examples/vector_add_annotated.cu +++ b/docs/examples/vector_add_annotated.cu @@ -1,11 +1,11 @@ -#pragma kernel_tuner tune(threads_per_block=32, 64, 128, 256, 512, 1024) -#pragma kernel_tuner tune(items_per_thread=1, 2, 4, 8) -#pragma kernel_tuner set(items_per_block=threads_per_block * items_per_thread) -#pragma kernel_tuner problem_size(n) -#pragma kernel_tuner block_size(threads_per_block) -#pragma kernel_tuner grid_divisor(items_per_block) -#pragma kernel_tuner buffers(C[n], A[n], B[n]) -#pragma kernel_tuner tuning_key("vector_add_" + T) +#pragma kernel tune(threads_per_block=32, 64, 128, 256, 512, 1024) +#pragma kernel tune(items_per_thread=1, 2, 4, 8) +#pragma kernel set(items_per_block=threads_per_block * items_per_thread) +#pragma kernel problem_size(n) +#pragma kernel block_size(threads_per_block) +#pragma kernel grid_divisor(items_per_block) +#pragma kernel buffers(C[n], A[n], B[n]) +#pragma kernel tuning_key("vector_add_" + T) template __global__ void vector_add(int n, T* C, const T* A, const T* B) { diff --git a/examples/vector_add_annotated/kernel_annotated.cu b/examples/vector_add_annotated/kernel_annotated.cu index 44e2db8..0b0fa4d 100644 --- a/examples/vector_add_annotated/kernel_annotated.cu +++ b/examples/vector_add_annotated/kernel_annotated.cu @@ -1,14 +1,14 @@ -#pragma kernel_tuner tune(threads_per_block = 32, 64, 128, 256, 512, 1024) -#pragma kernel_tuner tune(items_per_thread = 1, 2, 3, 4) -#pragma kernel_tuner tune(tiling_strategy = 0, 1, 2) -#pragma kernel_tuner set(items_per_block = items_per_thread * threads_per_block) -#pragma kernel_tuner set(block_size = items_per_block) -#pragma kernel_tuner restriction(items_per_block <= 1024) -#pragma kernel_tuner problem_size(n) -#pragma kernel_tuner block_size(threads_per_block) -#pragma kernel_tuner grid_divisor(items_per_block) -#pragma kernel_tuner buffers(C[n], A[n], B[n]) -#pragma kernel_tuner tuning_key("vector_add_" + T) +#pragma kernel tune(threads_per_block = 32, 64, 128, 256, 512, 1024) +#pragma kernel tune(items_per_thread = 1, 2, 3, 4) +#pragma kernel tune(tiling_strategy = 0, 1, 2) +#pragma kernel set(items_per_block = items_per_thread * threads_per_block) +#pragma kernel set(block_size = items_per_block) +#pragma kernel restriction(items_per_block <= 1024) +#pragma kernel problem_size(n) +#pragma kernel block_size(threads_per_block) +#pragma kernel grid_divisor(items_per_block) +#pragma kernel buffers(C[n], A[n], B[n]) +#pragma kernel tuning_key("vector_add_" + T) template< typename T, int block_size = 32, diff --git a/include/kernel_launcher/internal/tokens.h b/include/kernel_launcher/internal/tokens.h index ba57867..5cd9929 100644 --- a/include/kernel_launcher/internal/tokens.h +++ b/include/kernel_launcher/internal/tokens.h @@ -1,6 +1,7 @@ #ifndef KERNEL_LAUNCHER_TOKENIZER_H #define KERNEL_LAUNCHER_TOKENIZER_H +#include #include #include #include @@ -71,6 +72,17 @@ struct TokenStream { return t.kind == kind; } + template + bool matches(Token t, const std::array& options) const { + for (const auto& option : options) { + if (matches(t, option)) { + return true; + } + } + + return false; + } + template bool next_if(T&& pattern) { if (!matches(peek(), std::forward(pattern))) { @@ -94,6 +106,13 @@ struct TokenStream { throw_expecting_token(t, str); } + template + [[noreturn]] void + throw_expecting_token(Token t, const std::array& patterns) const { + static_assert(N > 0, "number of patterns cannot be zero"); + throw_expecting_token(t, patterns[0]); + } + template Token consume(const T& pattern) { Token t = next(); @@ -134,4 +153,4 @@ struct TokenStream { } // namespace internal } // namespace kernel_launcher -#endif \ No newline at end of file +#endif diff --git a/src/internal/parser.cpp b/src/internal/parser.cpp index b9b76fc..d16920d 100644 --- a/src/internal/parser.cpp +++ b/src/internal/parser.cpp @@ -69,12 +69,14 @@ static std::vector parse_kernel_params(TokenStream& stream) { static bool extract_kernel_tuner_directives( TokenStream& stream, std::vector& directives_out) { - static constexpr const char* PRAGMA_NAME = "kernel_tuner"; + static constexpr const std::array PRAGMA_NAMES = { + "kernel", + "kernel_tuner"}; // Check if directive starts with correct pragma. If not, this is // not a relevant pragma and we do not need to parse it. Token t = stream.peek(); - bool is_relevant = stream.next_if("pragma") && stream.next_if(PRAGMA_NAME); + bool is_relevant = stream.next_if("pragma") && stream.next_if(PRAGMA_NAMES); stream.seek(t); if (!is_relevant) { @@ -84,7 +86,7 @@ static bool extract_kernel_tuner_directives( // Parse all pragmas do { stream.consume("pragma"); - stream.consume(PRAGMA_NAME); + stream.consume(PRAGMA_NAMES); t = stream.next(); directives_out.push_back(t); @@ -217,4 +219,4 @@ AnnotatedDocument extract_annotated_kernels(TokenStream& stream) { } } // namespace internal -} // namespace kernel_launcher \ No newline at end of file +} // namespace kernel_launcher From 27c1d40b18d29c56b09dba3fd79e9d891f41d166 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 30 Mar 2023 10:55:08 +0200 Subject: [PATCH 30/63] Add argument name to argument expressions in directive parser --- src/internal/directives.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 9693d71..8a05cbb 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -456,7 +456,7 @@ KernelBuilder builder_from_annotated_kernel( for (size_t i = 0; i < def.fun_params.size(); i++) { std::string name = stream.span(def.fun_params[i].name); - ctx.kernel_args.insert({name, ArgExpr(uint8_t(i))}); + ctx.kernel_args.insert({name, ArgExpr(uint8_t(i), name)}); } if (template_args.size() > def.template_params.size()) { From fa916ec5e20a2690867054d1d7204562c96d3594 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 20:32:28 +0200 Subject: [PATCH 31/63] Allow multiple comma-seperated statements inside `#pragma set(...)` --- src/internal/directives.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index 8a05cbb..f9550e5 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -21,7 +21,7 @@ struct Context { // Compile-time arguments passed by the user std::unordered_map comptime_args; - // User-defined parameters using `#pragma set(foo=1+2)` + // User-defined parameters from `#pragma kernel set(foo=1+2)` std::unordered_map user_args; }; @@ -313,6 +313,8 @@ static Value parse_comptime_expr(TokenStream& stream, const Context& ctx) { Token after = stream.peek(); try { + // We can use `DummyEval` to evaluate the expression since compile-time + // expression should not contain any variables. return e.eval(DummyEval {}); } catch (const std::exception& err) { auto msg = @@ -383,13 +385,20 @@ static void parse_tune_directive( static void parse_set_directive(TokenStream& stream, Context& ctx) { // '(' IDENT '=' EXPR ')' stream.consume(TokenKind::ParenL); - Token var_token = stream.consume(TokenKind::Ident); - stream.consume('='); - Expr expr = parse_expr(stream, ctx).resolve(DummyEval {}); - stream.consume(TokenKind::ParenR); - std::string var = stream.span(var_token); + do { + Token var_token = stream.consume(TokenKind::Ident); + std::string var = stream.span(var_token); + if (ctx.comptime_args.count(var) > 0) { + stream.throw_unexpected_token(var_token, "variable redefined"); + } + + stream.consume('='); - ctx.user_args.insert({var, expr}); + Expr expr = parse_expr(stream, ctx); + ctx.user_args.insert({var, expr}); + } while (stream.next_if(TokenKind::Comma)); + + stream.consume(TokenKind::ParenR); } static void parse_tuning_key_directive( @@ -491,7 +500,7 @@ KernelBuilder builder_from_annotated_kernel( stream.throw_unexpected_token( param.name, "this template parameter is not defined, please add " - "`#pragma kernel_tuner tune(" + "`#pragma kernel tune(" + name + "=...)`"); } From 082008195c23c2f19a7d1cc4b6a07dfe92b19a52 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:13:30 +0200 Subject: [PATCH 32/63] Fix bug in parsing of `KERNEL_LAUNCHER_INCLUDE` --- src/fs.cpp | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/fs.cpp b/src/fs.cpp index eb32058..a9271b6 100644 --- a/src/fs.cpp +++ b/src/fs.cpp @@ -109,31 +109,32 @@ static void add_env_directories(std::vector& result) { return; } - while (true) { - if (paths[0] == '\0') { - break; - } - - if (paths[0] == ';') { - paths++; - continue; - } + size_t index = 0; + bool is_done = false; - size_t count = 0; + while (!is_done) { + size_t length = 0; while (true) { - char c = paths[count]; - if (c == '\0' || c == ';') { + char c = paths[index + length]; + if (c == '\0') { + is_done = true; break; } - count++; + + // allowed seperators are ';', ',' and ':' + if (c == ';' || c == ',' || c == ':') { + break; + } + + length++; } - if (count > 0) { - result.emplace_back(paths, count); + if (length > 0) { + result.emplace_back(&paths[index], length); } - paths += count; + index += length + 1; } } From aebf0276b8ebb18f9e002d470b7dd998b22b1b2e Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:15:18 +0200 Subject: [PATCH 33/63] make IntoKernelArg an empty class --- include/kernel_launcher/arg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kernel_launcher/arg.h b/include/kernel_launcher/arg.h index e4030ed..b625187 100644 --- a/include/kernel_launcher/arg.h +++ b/include/kernel_launcher/arg.h @@ -117,7 +117,7 @@ struct KernelArg { * See `into_kernel_arg(T&&)`. */ template -struct IntoKernelArg; +struct IntoKernelArg {}; template<> struct IntoKernelArg { From 81a831eb4b38230229743e552f0f1c65617c2bf0 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:16:22 +0200 Subject: [PATCH 34/63] Fix bug in parsing of `KERNEL_LAUNCHER_CAPTURE_FORCE` --- src/wisdom.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 8bb7d3d..52883ed 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -448,7 +448,7 @@ DefaultOracle DefaultOracle::from_env() { } patterns = value; - force = strstr(value, "FORCE") != nullptr; + force = strstr(key, "FORCE") != nullptr; } if (patterns == "1" || patterns == "true" || patterns == "TRUE") { @@ -469,8 +469,6 @@ DefaultOracle DefaultOracle::from_env() { // Print info message on which kernels will be tuned. if (!capture_patterns.empty()) { - std::stringstream ss; - log_info() << "capture enabled for the following kernels: " << string_comma_join(capture_patterns) << "\n"; } From 7d7df3a41e3c797eb933a0fd31f00e6378e3ffe0 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:57:32 +0200 Subject: [PATCH 35/63] Allow multiple delimiters for `string_split` --- include/kernel_launcher/utils.h | 2 ++ src/fs.cpp | 28 +++------------------------- src/utils.cpp | 21 +++++++++++++++++++-- tests/utils.cpp | 9 ++++++++- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/include/kernel_launcher/utils.h b/include/kernel_launcher/utils.h index c8e068e..9235dec 100644 --- a/include/kernel_launcher/utils.h +++ b/include/kernel_launcher/utils.h @@ -404,6 +404,8 @@ bool safe_int64_mul(int64_t lhs, int64_t rhs, int64_t& output); bool safe_int64_div(int64_t lhs, int64_t rhs, int64_t& output); bool string_match(const char* pattern, const char* input); +std::vector +string_split(const char* input, const std::vector& delims); std::vector string_split(const char* input, char delim); using hash_t = uint64_t; diff --git a/src/fs.cpp b/src/fs.cpp index a9271b6..a3b7fe4 100644 --- a/src/fs.cpp +++ b/src/fs.cpp @@ -109,32 +109,10 @@ static void add_env_directories(std::vector& result) { return; } - size_t index = 0; - bool is_done = false; - - while (!is_done) { - size_t length = 0; - - while (true) { - char c = paths[index + length]; - if (c == '\0') { - is_done = true; - break; - } - - // allowed seperators are ';', ',' and ':' - if (c == ';' || c == ',' || c == ':') { - break; - } - - length++; - } - - if (length > 0) { - result.emplace_back(&paths[index], length); + for (std::string path : string_split(paths, {':', ',', ';'})) { + if (!path.empty()) { + result.push_back(path); } - - index += length + 1; } } diff --git a/src/utils.cpp b/src/utils.cpp index 03ac230..31eeec1 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -218,14 +218,27 @@ bool string_match(const char* pattern, const char* input) { return false; } -std::vector string_split(const char* input, char delim) { +std::vector +string_split(const char* input, const std::vector& delims) { size_t start = 0; std::vector result; while (input[start] != '\0') { size_t end = start; - while (input[end] != '\0' && input[end] != delim) { + while (input[end] != '\0') { + bool is_delim = false; + + for (char delim : delims) { + if (input[end] == delim) { + is_delim = true; + } + } + + if (is_delim) { + break; + } + end++; } @@ -241,6 +254,10 @@ std::vector string_split(const char* input, char delim) { return result; } +std::vector string_split(const char* input, char delim) { + return string_split(input, std::vector {delim}); +} + hash_t hash_string(const char* buffer, size_t num_bytes) { // Simple FNV1a hash static constexpr hash_t prime = 0x100000001b3; diff --git a/tests/utils.cpp b/tests/utils.cpp index 71f01dd..bb13f15 100644 --- a/tests/utils.cpp +++ b/tests/utils.cpp @@ -118,11 +118,18 @@ TEST_CASE("test string_match") { TEST_CASE("test string_split") { using v = std::vector; CHECK(string_split("a,b,c", ',') == v {"a", "b", "c"}); + CHECK(string_split(",b,c", ',') == v {"", "b", "c"}); CHECK(string_split("a,,c", ',') == v {"a", "", "c"}); CHECK(string_split("a,b,", ',') == v {"a", "b", ""}); - CHECK(string_split(",b,c", ',') == v {"", "b", "c"}); + CHECK(string_split("a,,", ',') == v {"a", "", ""}); + CHECK(string_split(",b,", ',') == v {"", "b", ""}); + CHECK(string_split(",,c", ',') == v {"", "", "c"}); + CHECK(string_split(",,", ',') == v {"", "", ""}); + CHECK(string_split(",", ',') == v {"", ""}); CHECK(string_split("", ',') == v {""}); CHECK(string_split("aaaaa|bbbbb", '|') == v {"aaaaa", "bbbbb"}); + + CHECK(string_split("a|b,c:d", {'|', ',', ':'}) == v {"a", "b", "c", "d"}); } TEST_CASE("test ProblemSize") { From 93adae80908e39f8e432e8f6ee58e72cceeb6e35 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 15:03:34 +0200 Subject: [PATCH 36/63] Add `CaptureRule` object to replace strings that represent capture patterns --- include/kernel_launcher/wisdom.h | 31 +++++---- src/wisdom.cpp | 109 +++++++++++++++---------------- 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/include/kernel_launcher/wisdom.h b/include/kernel_launcher/wisdom.h index 4f56a5e..e0ccfeb 100644 --- a/include/kernel_launcher/wisdom.h +++ b/include/kernel_launcher/wisdom.h @@ -124,6 +124,16 @@ struct Oracle { const std::vector>& outputs) const = 0; }; +struct CaptureRule { + CaptureRule(std::string pattern, bool force = false) : + pattern(std::move(pattern)), + force(force) {} + CaptureRule(const char* pattern) : CaptureRule(std::string(pattern)) {} + + std::string pattern = ""; + bool force = false; +}; + struct DefaultOracle: Oracle { static DefaultOracle from_env(); @@ -131,8 +141,7 @@ struct DefaultOracle: Oracle { DefaultOracle( std::vector wisdom_dirs, std::string capture_dir, - std::vector capture_patterns = {}, - bool force_capture = false); + std::vector capture_rules = {}); virtual ~DefaultOracle() = default; @@ -173,19 +182,14 @@ struct DefaultOracle: Oracle { return capture_dir_; } - const std::vector& capture_patterns() const { - return capture_patterns_; - } - - bool is_capture_forced() const { - return force_capture_; + const std::vector& capture_rules() const { + return capture_rules_; } private: std::vector wisdom_dirs_; std::string capture_dir_; - std::vector capture_patterns_; - bool force_capture_; + std::vector capture_rules_; }; /** @@ -197,8 +201,7 @@ struct WisdomSettings { WisdomSettings( std::string wisdom_dir, std::string capture_dir, - std::vector capture_patterns = {}, - bool force_capture = false); + std::vector capture_rules = {}); WisdomSettings(std::shared_ptr oracle); template @@ -270,6 +273,8 @@ WisdomSettings default_wisdom_settings(); * returned by `default_wisdom_settings`. */ void append_global_wisdom_directory(std::string); + +// Deprecated void set_global_wisdom_directory(std::string); /** @@ -282,7 +287,7 @@ void set_global_capture_directory(std::string); * Add capture pattern to the `WisdomSettings` returned by * `default_wisdom_settings`. */ -void add_global_capture_pattern(std::string); +void add_global_capture_pattern(CaptureRule rule); } // namespace kernel_launcher diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 52883ed..0409c96 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -397,21 +397,19 @@ DefaultOracle::DefaultOracle() : DefaultOracle(*get_global_wisdom()) {} DefaultOracle::DefaultOracle( std::vector wisdom_dirs, std::string capture_dir, - std::vector capture_patterns, - bool force_capture) : + std::vector capture_rules) : wisdom_dirs_(std::move(wisdom_dirs)), capture_dir_(std::move(capture_dir)), - capture_patterns_(std::move(capture_patterns)), - force_capture_(force_capture) {} + capture_rules_(std::move(capture_rules)) {} DefaultOracle DefaultOracle::from_env() { std::vector wisdom_dirs = {"."}; std::string capture_dir = "."; - std::vector capture_patterns = {}; + std::vector capture_rules = {}; const char* value; if ((value = getenv("KERNEL_LAUNCHER_WISDOM")) != nullptr) { - for (std::string dir : string_split(value, ':')) { + for (std::string dir : string_split(value, {':', ';', ','})) { if (!dir.empty()) { wisdom_dirs.emplace_back(std::move(dir)); } @@ -426,58 +424,58 @@ DefaultOracle DefaultOracle::from_env() { capture_dir = value; } - std::string patterns; - bool force = false; - - // Try the following environment keys in order - const char* env_keys[4] = { + // Try the following environment keys + const char* env_keys[6] = { "KERNEL_LAUNCHER_CAPTURE_FORCE", "KERNEL_LAUNCHER_CAPTURE", + "KERNEL_LAUNCHER_FORCE_CAPTURE", "KERNEL_LAUNCHER_TUNE_FORCE", + "KERNEL_LAUNCHER_FORCE_TUNE", "KERNEL_LAUNCHER_TUNE", }; for (const char* key : env_keys) { + // Check if variable exists if ((value = getenv(key)) == nullptr) { continue; } - if (!patterns.empty()) { - log_warning() << "environment key " << key << " was ignored\n"; - continue; - } + std::string patterns = value; + bool force = strstr(key, "FORCE") != nullptr; - patterns = value; - force = strstr(key, "FORCE") != nullptr; - } + // Some patterns are special cased + if (patterns == "1" || patterns == "true" || patterns == "TRUE") { + patterns = "*"; + } - if (patterns == "1" || patterns == "true" || patterns == "TRUE") { - patterns = "*"; - } + if (patterns == "0" || patterns == "false" || patterns == "FALSE") { + patterns = ""; + } - if (patterns == "0" || patterns == "false" || patterns == "FALSE") { - patterns = ""; - } + for (auto pattern : string_split(patterns.c_str(), {',', '|', ';'})) { + if (pattern.empty()) { + continue; + } - for (std::string pattern : string_split(patterns.c_str(), ',')) { - if (pattern.empty()) { - continue; + capture_rules.emplace_back(std::move(pattern), force); } - - capture_patterns.push_back(std::move(pattern)); } // Print info message on which kernels will be tuned. - if (!capture_patterns.empty()) { - log_info() << "capture enabled for the following kernels: " - << string_comma_join(capture_patterns) << "\n"; + if (!capture_rules.empty() && log_info_enabled()) { + std::vector names; + for (const auto& p : capture_rules) { + names.push_back(p.pattern); + } + + log_info() << "the following kernels will be captured: " + << string_comma_join(names) << "\n"; } return DefaultOracle( std::move(wisdom_dirs), std::move(capture_dir), - std::move(capture_patterns), - force); + std::move(capture_rules)); } Config DefaultOracle::load_config( @@ -526,24 +524,27 @@ bool DefaultOracle::should_capture_kernel( ProblemSize problem_size, WisdomResult result) const { bool matches = false; + bool forced = false; - // If wisdom was found for this kernel and we do not force tuning, - // then there is no need to tune this kernel. - if (result == WisdomResult::Ok && !force_capture_) { - return false; - } - - for (const std::string& pattern : capture_patterns_) { - if (string_match(pattern.c_str(), tuning_key.c_str())) { + for (const auto& rule : capture_rules_) { + if (string_match(rule.pattern.c_str(), tuning_key.c_str())) { matches = true; + forced |= rule.force; break; } } + // No rule matches. We are done. if (!matches) { return false; } + // If wisdom was found for this kernel and we do not force tuning, + // then there is no need to capture this kernel. + if (result == WisdomResult::Ok && !forced) { + return false; + } + if (capture_file_exists(capture_dir_, tuning_key, problem_size)) { return false; } @@ -560,8 +561,7 @@ void append_global_wisdom_directory(std::string dir) { set_global_wisdom(DefaultOracle( std::move(dirs), wisdom->capture_directory(), - wisdom->capture_patterns(), - wisdom->is_capture_forced())); + wisdom->capture_rules())); } void set_global_wisdom_directory(std::string dir) { @@ -570,8 +570,7 @@ void set_global_wisdom_directory(std::string dir) { set_global_wisdom(DefaultOracle( std::vector {std::move(dir)}, wisdom->capture_directory(), - wisdom->capture_patterns(), - wisdom->is_capture_forced())); + wisdom->capture_rules())); } void set_global_capture_directory(std::string dir) { @@ -580,20 +579,18 @@ void set_global_capture_directory(std::string dir) { set_global_wisdom(DefaultOracle( wisdom->wisdom_directories(), std::move(dir), - wisdom->capture_patterns(), - wisdom->is_capture_forced())); + wisdom->capture_rules())); } -void add_global_capture_pattern(std::string pattern) { +void add_global_capture_pattern(CaptureRule rule) { auto wisdom = get_global_wisdom(); - std::vector patterns = wisdom->capture_patterns(); - patterns.push_back(std::move(pattern)); + std::vector rules = wisdom->capture_rules(); + rules.push_back(std::move(rule)); set_global_wisdom(DefaultOracle( wisdom->wisdom_directories(), wisdom->capture_directory(), - patterns, - wisdom->is_capture_forced())); + rules)); } WisdomSettings default_wisdom_settings() { @@ -612,12 +609,10 @@ WisdomSettings::WisdomSettings(std::shared_ptr oracle) : WisdomSettings::WisdomSettings( std::string wisdom_dir, std::string capture_dir, - std::vector capture_patterns, - bool force_capture) : + std::vector capture_rules) : WisdomSettings(std::make_shared( std::vector {std::move(wisdom_dir)}, std::move(capture_dir), - std::move(capture_patterns), - force_capture)) {} + std::move(capture_rules))) {} } // namespace kernel_launcher From 27fddb51f185d5607d4796858e7326c6b6e3e518 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 6 Apr 2023 11:19:54 +0200 Subject: [PATCH 37/63] Do not resolve path in `PragmaKernel` --- src/pragma.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/pragma.cpp b/src/pragma.cpp index 931bee9..362ffc0 100644 --- a/src/pragma.cpp +++ b/src/pragma.cpp @@ -41,13 +41,15 @@ PragmaKernel::PragmaKernel( std::vector template_args) : kernel_name_(std::move(kernel_name)), template_args_(std::move(template_args)) { - // Resolve absolute file path - const char* abs_path = realpath(path.c_str(), nullptr); - if (abs_path == nullptr) { - throw std::runtime_error("failed to resolve path: '" + path + "'"); - } - file_path_ = abs_path; + /* We cannot resolve the file path at this moment since we do not what + * type of `FileLoader` will be used during compilation. */ + //const char* abs_path = realpath(path.c_str(), nullptr); + //if (abs_path == nullptr) { + // throw std::runtime_error("failed to resolve path: '" + path + "'"); + //} + + file_path_ = path; } KernelBuilder PragmaKernel::build() const { From 773a8bb6db340cb1ed2549b83a4aabdf64683794 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:12:05 +0200 Subject: [PATCH 38/63] Print debug information in `KernelInstance::launch` --- include/kernel_launcher/arg.h | 2 ++ include/kernel_launcher/cuda.h | 11 ++++++++++- include/kernel_launcher/utils.h | 4 ++++ src/arg.cpp | 27 +++++++++++++++++++++++++++ src/builder.cpp | 29 +++++++++++++++++++++++++++++ src/compiler.cpp | 3 ++- src/cuda.cpp | 11 +++++++++-- src/utils.cpp | 23 ++++++++++++++++++++--- tests/arg.cpp | 30 ++++++++++++++++++++++++++++++ 9 files changed, 133 insertions(+), 7 deletions(-) diff --git a/include/kernel_launcher/arg.h b/include/kernel_launcher/arg.h index 8b146e4..e4030ed 100644 --- a/include/kernel_launcher/arg.h +++ b/include/kernel_launcher/arg.h @@ -98,6 +98,8 @@ struct KernelArg { std::vector to_bytes() const; void* as_void_ptr() const; + friend std::ostream& operator<<(std::ostream&, const KernelArg&); + private: TypeInfo type_; bool scalar_; diff --git a/include/kernel_launcher/cuda.h b/include/kernel_launcher/cuda.h index 3dd7050..1d08e11 100644 --- a/include/kernel_launcher/cuda.h +++ b/include/kernel_launcher/cuda.h @@ -38,7 +38,10 @@ void cuda_check(CUresult result, const char* msg); * Wrapper around `CUfunction` and the accompanying `CUmodule`. */ struct CudaModule { - CudaModule(const char* image, const char* fun_name); + CudaModule( + const char* image, + const char* lowered_name, + const char* human_name = nullptr); ~CudaModule(); CudaModule() = default; CudaModule(const CudaModule&) = delete; @@ -51,6 +54,7 @@ struct CudaModule { CudaModule& operator=(CudaModule&& that) noexcept { std::swap(that.module_, module_); std::swap(that.fun_ptr_, fun_ptr_); + std::swap(that.fun_name_, fun_name_); return *this; } @@ -61,6 +65,10 @@ struct CudaModule { uint32_t shared_mem, void** args) const; + const std::string& function_name() const { + return fun_name_; + } + CUfunction function() const { return fun_ptr_; } @@ -70,6 +78,7 @@ struct CudaModule { } private: + std::string fun_name_; CUfunction fun_ptr_ = nullptr; CUmodule module_ = nullptr; }; diff --git a/include/kernel_launcher/utils.h b/include/kernel_launcher/utils.h index ac9399f..c8e068e 100644 --- a/include/kernel_launcher/utils.h +++ b/include/kernel_launcher/utils.h @@ -14,6 +14,10 @@ namespace kernel_launcher { +bool log_debug_enabled(); +bool log_info_enabled(); +bool log_warning_enabled(); + std::ostream& log_debug(); std::ostream& log_info(); std::ostream& log_warning(); diff --git a/src/arg.cpp b/src/arg.cpp index 1f34c5c..a0bca93 100644 --- a/src/arg.cpp +++ b/src/arg.cpp @@ -207,4 +207,31 @@ void* KernelArg::as_void_ptr() const { } } +std::ostream& operator<<(std::ostream& os, const KernelArg& arg) { + // There are four possible representations: + // - pointer which is an array (length is known) + // - pointer which is not an array (length is unknown) + // - scalars convertible to `Value` + // - scalars without a representation + if (arg.type().is_pointer()) { + void* ptr; + ::memcpy(&ptr, arg.as_void_ptr(), sizeof(ptr)); + os << "array " << ptr; + + if (arg.is_array()) { + os << " of length " << arg.data_.array.nelements; + } + } else { + Value v = arg.to_value_or_empty(); + + if (!v.is_empty()) { + os << "scalar " << v; + } else { + os << "scalar <...>"; + } + } + + return os << " (type: " << arg.type_.name() << ")"; +} + } // namespace kernel_launcher \ No newline at end of file diff --git a/src/builder.cpp b/src/builder.cpp index 9ab14b7..602f546 100644 --- a/src/builder.cpp +++ b/src/builder.cpp @@ -102,6 +102,35 @@ void KernelInstance::launch( ptrs[i] = args[i].as_void_ptr(); } + if (log_debug_enabled()) { + auto p = problem_size; + auto b = block_size; + auto g = grid_size; + + log_debug() << "launching kernel " << module_.function_name() << "\n"; + log_debug() << " - device: " << CudaDevice::current().name() << "\n"; + log_debug() << " - problem size: [" // + << p.x << ", " << p.y << ", " << p.z << "]\n"; + log_debug() << " - grid size: [" // + << g.x << ", " << g.y << ", " << g.z << "]\n"; + log_debug() << " - block size: [" // + << b.x << ", " << b.y << ", " << b.z << "]\n"; + + if (smem > 0) { + log_debug() << " - shared memory: " << smem << " bytes\n"; + } + + if (stream != nullptr) { + log_debug() << " - stream: " << stream << "\n"; + } + + log_debug() << " - using " << args.size() << " arguments:\n"; + + for (const auto& arg : args) { + log_debug() << " - - " << arg << "\n"; + } + } + module_.launch(stream, grid_size, block_size, smem, ptrs.data()); } diff --git a/src/compiler.cpp b/src/compiler.cpp index fa362c7..5dc55c7 100644 --- a/src/compiler.cpp +++ b/src/compiler.cpp @@ -42,10 +42,11 @@ void KernelDef::add_compiler_option(std::string option) { } CudaModule ICompiler::compile(CudaContextHandle ctx, KernelDef def) const { + std::string human_name = def.name; std::string lowered_name; std::string ptx; compile_ptx(std::move(def), ctx.device().arch(), ptx, lowered_name); - return {ptx.c_str(), lowered_name.c_str()}; + return {ptx.c_str(), lowered_name.c_str(), human_name.c_str()}; } void Compiler::compile_ptx( diff --git a/src/cuda.cpp b/src/cuda.cpp index cc3241a..9247c6e 100644 --- a/src/cuda.cpp +++ b/src/cuda.cpp @@ -24,11 +24,18 @@ void cuda_check(CUresult result, const char* msg) { } } -CudaModule::CudaModule(const char* image, const char* fun_name) { +CudaModule::CudaModule( + const char* image, + const char* lowered_name, + const char* human_name) { + if (human_name != nullptr) { + fun_name_ = human_name; + } + KERNEL_LAUNCHER_CUDA_CHECK( cuModuleLoadDataEx(&module_, image, 0, nullptr, nullptr)); KERNEL_LAUNCHER_CUDA_CHECK( - cuModuleGetFunction(&fun_ptr_, module_, fun_name)); + cuModuleGetFunction(&fun_ptr_, module_, lowered_name)); } CudaModule::~CudaModule() { diff --git a/src/utils.cpp b/src/utils.cpp index 76b26ea..03ac230 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -46,9 +46,8 @@ struct DummyStream: std::ostream { DummyStream() = default; }; -static std::ostream& log_level(LogLevel level) { +bool log_enabled(LogLevel level) { static constexpr const char* ENV_KEY = "KERNEL_LAUNCHER_LOG"; - static DummyStream dummy_stream; static LogLevel min_level = LogLevel::Unknown; if (min_level == LogLevel::Unknown) { @@ -70,7 +69,25 @@ static std::ostream& log_level(LogLevel level) { } } - if (level < min_level) { + return level >= min_level; +} + +bool log_debug_enabled() { + return log_enabled(LogLevel::Debug); +} + +bool log_info_enabled() { + return log_enabled(LogLevel::Info); +} + +bool log_warning_enabled() { + return log_enabled(LogLevel::Warning); +} + +static std::ostream& log_level(LogLevel level) { + static DummyStream dummy_stream; + + if (!log_enabled(level)) { return dummy_stream; } diff --git a/tests/arg.cpp b/tests/arg.cpp index 9eeac20..5d4949a 100644 --- a/tests/arg.cpp +++ b/tests/arg.cpp @@ -1,9 +1,13 @@ +#include + #include "catch.hpp" #include "kernel_launcher/kernel.h" #include "test_utils.h" using namespace kernel_launcher; +struct MyObject {}; + TEST_CASE("test KernelArg") { SECTION("scalar int") { KernelArg v = into_kernel_arg(5); @@ -109,4 +113,30 @@ TEST_CASE("test KernelArg") { CHECK_THROWS( KernelArg::from_array(input.data(), input.size()).to_array(5)); } + + SECTION("operator<<") { + std::stringstream stream; + + SECTION("scalar primitive") { + stream << KernelArg::from_scalar(int(5)); + CHECK(stream.str() == "scalar 5 (type: int)"); + } + + SECTION("scalar arbitrary") { + stream << KernelArg::from_scalar(MyObject {}); + CHECK(stream.str() == "scalar <...> (type: MyObject)"); + } + + SECTION("scalar pointer") { + int* ptr = reinterpret_cast(0x123); + stream << KernelArg::from_scalar(ptr); + CHECK(stream.str() == "array 0x123 (type: int*)"); + } + + SECTION("array") { + int* ptr = reinterpret_cast(0x123); + stream << KernelArg::from_array(ptr, 5); + CHECK(stream.str() == "array 0x123 of length 5 (type: int*)"); + } + } } \ No newline at end of file From 73cedd7f7afc2091ecd50c601f2ebd961acb29c9 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:12:41 +0200 Subject: [PATCH 39/63] Fix incorrect device in `DeviceAttrEval` --- src/builder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/builder.cpp b/src/builder.cpp index 602f546..bb7868d 100644 --- a/src/builder.cpp +++ b/src/builder.cpp @@ -39,7 +39,7 @@ struct DeviceAttrEval: Eval { bool lookup(const Variable& v, Value& out) const override { if (const auto* that = dynamic_cast(&v)) { - out = CudaDevice::current().attribute(that->get()); + out = device_.attribute(that->get()); return true; } From 4353d3407e712bd9cd0cfaf7ecaa052fa26c0115 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:13:27 +0200 Subject: [PATCH 40/63] Check if config is valid before compiling in `KernelBuilder::compile` --- src/builder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/builder.cpp b/src/builder.cpp index bb7868d..ab3bcd2 100644 --- a/src/builder.cpp +++ b/src/builder.cpp @@ -446,7 +446,6 @@ KernelInstance KernelBuilder::compile( const ICompiler& compiler, CudaContextHandle ctx) const { DeviceAttrEval eval = {ctx.device(), config}; - CudaModule module = compiler.compile(ctx, build(eval, param_types)); if (!is_valid(eval)) { std::stringstream ss; @@ -466,6 +465,7 @@ KernelInstance KernelBuilder::compile( TypedExpr shared_mem = shared_mem_.resolve(eval); + CudaModule module = compiler.compile(ctx, build(eval, param_types)); return { std::move(module), std::move(block_size), From bdda4b7895088a1de2bd0ce1f915cceb2249e2ae Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:20:04 +0200 Subject: [PATCH 41/63] Fix incorrect grid size calculation in captures --- src/export.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/export.cpp b/src/export.cpp index 45f1b4e..b53a32f 100644 --- a/src/export.cpp +++ b/src/export.cpp @@ -215,9 +215,9 @@ struct KernelBuilderSerializerHack { builder.determine_block_size(2)}); result["grid_size"] = expr_list_to_json(std::array { - builder.determine_block_size(0), - builder.determine_block_size(1), - builder.determine_block_size(2)}); + builder.determine_grid_size(0), + builder.determine_grid_size(1), + builder.determine_grid_size(2)}); return result; } From 6ae71d0cf9810c559f2d18efdb3eafe51b1a002e Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Apr 2023 22:20:17 +0200 Subject: [PATCH 42/63] Add `make pretty` --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 87bb8da..fb6b000 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,10 @@ BUILD_DIR=build -fmt: +pretty: clang-format -i include/kernel_launcher/*.h src/*.cpp tests/*.cpp examples/*/*.cu +fmt: pretty + test: ${BUILD_DIR} cd ${BUILD_DIR} && make kernel_launcher_tests cd tests && KERNEL_LAUNCHER_LOG=debug ../${BUILD_DIR}/tests/kernel_launcher_tests ${TEST} @@ -11,7 +13,7 @@ ${BUILD_DIR}: mkdir ${BUILD_DIR} cd ${BUILD_DIR} && cmake -DKERNEL_LAUNCHER_BUILD_TEST=1 -DCMAKE_BUILD_TYPE=debug .. -all: fmt test +all: pretty test clean: -.PHONY: fmt test all clean +.PHONY: pretty fmt test all clean From 19f0df3706f1372eb5dc744430b4be9724bfb507 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:13:30 +0200 Subject: [PATCH 43/63] Fix bug in parsing of `KERNEL_LAUNCHER_INCLUDE` --- src/fs.cpp | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/fs.cpp b/src/fs.cpp index eb32058..a9271b6 100644 --- a/src/fs.cpp +++ b/src/fs.cpp @@ -109,31 +109,32 @@ static void add_env_directories(std::vector& result) { return; } - while (true) { - if (paths[0] == '\0') { - break; - } - - if (paths[0] == ';') { - paths++; - continue; - } + size_t index = 0; + bool is_done = false; - size_t count = 0; + while (!is_done) { + size_t length = 0; while (true) { - char c = paths[count]; - if (c == '\0' || c == ';') { + char c = paths[index + length]; + if (c == '\0') { + is_done = true; break; } - count++; + + // allowed seperators are ';', ',' and ':' + if (c == ';' || c == ',' || c == ':') { + break; + } + + length++; } - if (count > 0) { - result.emplace_back(paths, count); + if (length > 0) { + result.emplace_back(&paths[index], length); } - paths += count; + index += length + 1; } } From a5851fc8b4481e5a341cd5f56fac5404aa45e688 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:15:18 +0200 Subject: [PATCH 44/63] make IntoKernelArg an empty class --- include/kernel_launcher/arg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kernel_launcher/arg.h b/include/kernel_launcher/arg.h index e4030ed..b625187 100644 --- a/include/kernel_launcher/arg.h +++ b/include/kernel_launcher/arg.h @@ -117,7 +117,7 @@ struct KernelArg { * See `into_kernel_arg(T&&)`. */ template -struct IntoKernelArg; +struct IntoKernelArg {}; template<> struct IntoKernelArg { From fc4f94a4c36b6dcf20dc8dec8edf2161a0cfed79 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:16:22 +0200 Subject: [PATCH 45/63] Fix bug in parsing of `KERNEL_LAUNCHER_CAPTURE_FORCE` --- src/wisdom.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 8bb7d3d..52883ed 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -448,7 +448,7 @@ DefaultOracle DefaultOracle::from_env() { } patterns = value; - force = strstr(value, "FORCE") != nullptr; + force = strstr(key, "FORCE") != nullptr; } if (patterns == "1" || patterns == "true" || patterns == "TRUE") { @@ -469,8 +469,6 @@ DefaultOracle DefaultOracle::from_env() { // Print info message on which kernels will be tuned. if (!capture_patterns.empty()) { - std::stringstream ss; - log_info() << "capture enabled for the following kernels: " << string_comma_join(capture_patterns) << "\n"; } From ed113be5ce5e48b1ba328e08cd8d100707c3c575 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 14:57:32 +0200 Subject: [PATCH 46/63] Allow multiple delimiters for `string_split` --- include/kernel_launcher/utils.h | 2 ++ src/fs.cpp | 28 +++------------------------- src/utils.cpp | 21 +++++++++++++++++++-- tests/utils.cpp | 9 ++++++++- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/include/kernel_launcher/utils.h b/include/kernel_launcher/utils.h index c8e068e..9235dec 100644 --- a/include/kernel_launcher/utils.h +++ b/include/kernel_launcher/utils.h @@ -404,6 +404,8 @@ bool safe_int64_mul(int64_t lhs, int64_t rhs, int64_t& output); bool safe_int64_div(int64_t lhs, int64_t rhs, int64_t& output); bool string_match(const char* pattern, const char* input); +std::vector +string_split(const char* input, const std::vector& delims); std::vector string_split(const char* input, char delim); using hash_t = uint64_t; diff --git a/src/fs.cpp b/src/fs.cpp index a9271b6..a3b7fe4 100644 --- a/src/fs.cpp +++ b/src/fs.cpp @@ -109,32 +109,10 @@ static void add_env_directories(std::vector& result) { return; } - size_t index = 0; - bool is_done = false; - - while (!is_done) { - size_t length = 0; - - while (true) { - char c = paths[index + length]; - if (c == '\0') { - is_done = true; - break; - } - - // allowed seperators are ';', ',' and ':' - if (c == ';' || c == ',' || c == ':') { - break; - } - - length++; - } - - if (length > 0) { - result.emplace_back(&paths[index], length); + for (std::string path : string_split(paths, {':', ',', ';'})) { + if (!path.empty()) { + result.push_back(path); } - - index += length + 1; } } diff --git a/src/utils.cpp b/src/utils.cpp index 03ac230..31eeec1 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -218,14 +218,27 @@ bool string_match(const char* pattern, const char* input) { return false; } -std::vector string_split(const char* input, char delim) { +std::vector +string_split(const char* input, const std::vector& delims) { size_t start = 0; std::vector result; while (input[start] != '\0') { size_t end = start; - while (input[end] != '\0' && input[end] != delim) { + while (input[end] != '\0') { + bool is_delim = false; + + for (char delim : delims) { + if (input[end] == delim) { + is_delim = true; + } + } + + if (is_delim) { + break; + } + end++; } @@ -241,6 +254,10 @@ std::vector string_split(const char* input, char delim) { return result; } +std::vector string_split(const char* input, char delim) { + return string_split(input, std::vector {delim}); +} + hash_t hash_string(const char* buffer, size_t num_bytes) { // Simple FNV1a hash static constexpr hash_t prime = 0x100000001b3; diff --git a/tests/utils.cpp b/tests/utils.cpp index 71f01dd..bb13f15 100644 --- a/tests/utils.cpp +++ b/tests/utils.cpp @@ -118,11 +118,18 @@ TEST_CASE("test string_match") { TEST_CASE("test string_split") { using v = std::vector; CHECK(string_split("a,b,c", ',') == v {"a", "b", "c"}); + CHECK(string_split(",b,c", ',') == v {"", "b", "c"}); CHECK(string_split("a,,c", ',') == v {"a", "", "c"}); CHECK(string_split("a,b,", ',') == v {"a", "b", ""}); - CHECK(string_split(",b,c", ',') == v {"", "b", "c"}); + CHECK(string_split("a,,", ',') == v {"a", "", ""}); + CHECK(string_split(",b,", ',') == v {"", "b", ""}); + CHECK(string_split(",,c", ',') == v {"", "", "c"}); + CHECK(string_split(",,", ',') == v {"", "", ""}); + CHECK(string_split(",", ',') == v {"", ""}); CHECK(string_split("", ',') == v {""}); CHECK(string_split("aaaaa|bbbbb", '|') == v {"aaaaa", "bbbbb"}); + + CHECK(string_split("a|b,c:d", {'|', ',', ':'}) == v {"a", "b", "c", "d"}); } TEST_CASE("test ProblemSize") { From b44ce52e2cc5c25ea97040bc78152e977fa8edf1 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Apr 2023 15:03:34 +0200 Subject: [PATCH 47/63] Add `CaptureRule` object to replace strings that represent capture patterns --- include/kernel_launcher/wisdom.h | 31 +++++---- src/wisdom.cpp | 109 +++++++++++++++---------------- 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/include/kernel_launcher/wisdom.h b/include/kernel_launcher/wisdom.h index 4f56a5e..e0ccfeb 100644 --- a/include/kernel_launcher/wisdom.h +++ b/include/kernel_launcher/wisdom.h @@ -124,6 +124,16 @@ struct Oracle { const std::vector>& outputs) const = 0; }; +struct CaptureRule { + CaptureRule(std::string pattern, bool force = false) : + pattern(std::move(pattern)), + force(force) {} + CaptureRule(const char* pattern) : CaptureRule(std::string(pattern)) {} + + std::string pattern = ""; + bool force = false; +}; + struct DefaultOracle: Oracle { static DefaultOracle from_env(); @@ -131,8 +141,7 @@ struct DefaultOracle: Oracle { DefaultOracle( std::vector wisdom_dirs, std::string capture_dir, - std::vector capture_patterns = {}, - bool force_capture = false); + std::vector capture_rules = {}); virtual ~DefaultOracle() = default; @@ -173,19 +182,14 @@ struct DefaultOracle: Oracle { return capture_dir_; } - const std::vector& capture_patterns() const { - return capture_patterns_; - } - - bool is_capture_forced() const { - return force_capture_; + const std::vector& capture_rules() const { + return capture_rules_; } private: std::vector wisdom_dirs_; std::string capture_dir_; - std::vector capture_patterns_; - bool force_capture_; + std::vector capture_rules_; }; /** @@ -197,8 +201,7 @@ struct WisdomSettings { WisdomSettings( std::string wisdom_dir, std::string capture_dir, - std::vector capture_patterns = {}, - bool force_capture = false); + std::vector capture_rules = {}); WisdomSettings(std::shared_ptr oracle); template @@ -270,6 +273,8 @@ WisdomSettings default_wisdom_settings(); * returned by `default_wisdom_settings`. */ void append_global_wisdom_directory(std::string); + +// Deprecated void set_global_wisdom_directory(std::string); /** @@ -282,7 +287,7 @@ void set_global_capture_directory(std::string); * Add capture pattern to the `WisdomSettings` returned by * `default_wisdom_settings`. */ -void add_global_capture_pattern(std::string); +void add_global_capture_pattern(CaptureRule rule); } // namespace kernel_launcher diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 52883ed..0409c96 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -397,21 +397,19 @@ DefaultOracle::DefaultOracle() : DefaultOracle(*get_global_wisdom()) {} DefaultOracle::DefaultOracle( std::vector wisdom_dirs, std::string capture_dir, - std::vector capture_patterns, - bool force_capture) : + std::vector capture_rules) : wisdom_dirs_(std::move(wisdom_dirs)), capture_dir_(std::move(capture_dir)), - capture_patterns_(std::move(capture_patterns)), - force_capture_(force_capture) {} + capture_rules_(std::move(capture_rules)) {} DefaultOracle DefaultOracle::from_env() { std::vector wisdom_dirs = {"."}; std::string capture_dir = "."; - std::vector capture_patterns = {}; + std::vector capture_rules = {}; const char* value; if ((value = getenv("KERNEL_LAUNCHER_WISDOM")) != nullptr) { - for (std::string dir : string_split(value, ':')) { + for (std::string dir : string_split(value, {':', ';', ','})) { if (!dir.empty()) { wisdom_dirs.emplace_back(std::move(dir)); } @@ -426,58 +424,58 @@ DefaultOracle DefaultOracle::from_env() { capture_dir = value; } - std::string patterns; - bool force = false; - - // Try the following environment keys in order - const char* env_keys[4] = { + // Try the following environment keys + const char* env_keys[6] = { "KERNEL_LAUNCHER_CAPTURE_FORCE", "KERNEL_LAUNCHER_CAPTURE", + "KERNEL_LAUNCHER_FORCE_CAPTURE", "KERNEL_LAUNCHER_TUNE_FORCE", + "KERNEL_LAUNCHER_FORCE_TUNE", "KERNEL_LAUNCHER_TUNE", }; for (const char* key : env_keys) { + // Check if variable exists if ((value = getenv(key)) == nullptr) { continue; } - if (!patterns.empty()) { - log_warning() << "environment key " << key << " was ignored\n"; - continue; - } + std::string patterns = value; + bool force = strstr(key, "FORCE") != nullptr; - patterns = value; - force = strstr(key, "FORCE") != nullptr; - } + // Some patterns are special cased + if (patterns == "1" || patterns == "true" || patterns == "TRUE") { + patterns = "*"; + } - if (patterns == "1" || patterns == "true" || patterns == "TRUE") { - patterns = "*"; - } + if (patterns == "0" || patterns == "false" || patterns == "FALSE") { + patterns = ""; + } - if (patterns == "0" || patterns == "false" || patterns == "FALSE") { - patterns = ""; - } + for (auto pattern : string_split(patterns.c_str(), {',', '|', ';'})) { + if (pattern.empty()) { + continue; + } - for (std::string pattern : string_split(patterns.c_str(), ',')) { - if (pattern.empty()) { - continue; + capture_rules.emplace_back(std::move(pattern), force); } - - capture_patterns.push_back(std::move(pattern)); } // Print info message on which kernels will be tuned. - if (!capture_patterns.empty()) { - log_info() << "capture enabled for the following kernels: " - << string_comma_join(capture_patterns) << "\n"; + if (!capture_rules.empty() && log_info_enabled()) { + std::vector names; + for (const auto& p : capture_rules) { + names.push_back(p.pattern); + } + + log_info() << "the following kernels will be captured: " + << string_comma_join(names) << "\n"; } return DefaultOracle( std::move(wisdom_dirs), std::move(capture_dir), - std::move(capture_patterns), - force); + std::move(capture_rules)); } Config DefaultOracle::load_config( @@ -526,24 +524,27 @@ bool DefaultOracle::should_capture_kernel( ProblemSize problem_size, WisdomResult result) const { bool matches = false; + bool forced = false; - // If wisdom was found for this kernel and we do not force tuning, - // then there is no need to tune this kernel. - if (result == WisdomResult::Ok && !force_capture_) { - return false; - } - - for (const std::string& pattern : capture_patterns_) { - if (string_match(pattern.c_str(), tuning_key.c_str())) { + for (const auto& rule : capture_rules_) { + if (string_match(rule.pattern.c_str(), tuning_key.c_str())) { matches = true; + forced |= rule.force; break; } } + // No rule matches. We are done. if (!matches) { return false; } + // If wisdom was found for this kernel and we do not force tuning, + // then there is no need to capture this kernel. + if (result == WisdomResult::Ok && !forced) { + return false; + } + if (capture_file_exists(capture_dir_, tuning_key, problem_size)) { return false; } @@ -560,8 +561,7 @@ void append_global_wisdom_directory(std::string dir) { set_global_wisdom(DefaultOracle( std::move(dirs), wisdom->capture_directory(), - wisdom->capture_patterns(), - wisdom->is_capture_forced())); + wisdom->capture_rules())); } void set_global_wisdom_directory(std::string dir) { @@ -570,8 +570,7 @@ void set_global_wisdom_directory(std::string dir) { set_global_wisdom(DefaultOracle( std::vector {std::move(dir)}, wisdom->capture_directory(), - wisdom->capture_patterns(), - wisdom->is_capture_forced())); + wisdom->capture_rules())); } void set_global_capture_directory(std::string dir) { @@ -580,20 +579,18 @@ void set_global_capture_directory(std::string dir) { set_global_wisdom(DefaultOracle( wisdom->wisdom_directories(), std::move(dir), - wisdom->capture_patterns(), - wisdom->is_capture_forced())); + wisdom->capture_rules())); } -void add_global_capture_pattern(std::string pattern) { +void add_global_capture_pattern(CaptureRule rule) { auto wisdom = get_global_wisdom(); - std::vector patterns = wisdom->capture_patterns(); - patterns.push_back(std::move(pattern)); + std::vector rules = wisdom->capture_rules(); + rules.push_back(std::move(rule)); set_global_wisdom(DefaultOracle( wisdom->wisdom_directories(), wisdom->capture_directory(), - patterns, - wisdom->is_capture_forced())); + rules)); } WisdomSettings default_wisdom_settings() { @@ -612,12 +609,10 @@ WisdomSettings::WisdomSettings(std::shared_ptr oracle) : WisdomSettings::WisdomSettings( std::string wisdom_dir, std::string capture_dir, - std::vector capture_patterns, - bool force_capture) : + std::vector capture_rules) : WisdomSettings(std::make_shared( std::vector {std::move(wisdom_dir)}, std::move(capture_dir), - std::move(capture_patterns), - force_capture)) {} + std::move(capture_rules))) {} } // namespace kernel_launcher From 35d6cb56928ec50f9947ac8aa823070fa3a4f8c8 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 6 Apr 2023 19:32:45 +0200 Subject: [PATCH 48/63] Add support for capturing kernel that use run-time arguments in their specifications --- include/kernel_launcher/arg.h | 3 +- include/kernel_launcher/export.h | 6 +- include/kernel_launcher/wisdom.h | 32 ++--- src/arg.cpp | 49 +++---- src/export.cpp | 183 ++++++++++++++++---------- src/kernel.cpp | 92 +++++++------ src/wisdom.cpp | 12 +- tests/arg.cpp | 39 ++++++ tests/assets/vector_add_key_1024.json | 98 ++------------ tests/export.cpp | 62 +++++---- tests/test_utils.h | 2 +- 11 files changed, 307 insertions(+), 271 deletions(-) diff --git a/include/kernel_launcher/arg.h b/include/kernel_launcher/arg.h index b625187..594abb6 100644 --- a/include/kernel_launcher/arg.h +++ b/include/kernel_launcher/arg.h @@ -95,8 +95,9 @@ struct KernelArg { void assert_type_matches(TypeInfo t) const; bool is_scalar() const; bool is_array() const; - std::vector to_bytes() const; + std::vector copy_array() const; void* as_void_ptr() const; + std::vector to_bytes() const; friend std::ostream& operator<<(std::ostream&, const KernelArg&); diff --git a/include/kernel_launcher/export.h b/include/kernel_launcher/export.h index 42d73dd..0f81e44 100644 --- a/include/kernel_launcher/export.h +++ b/include/kernel_launcher/export.h @@ -28,9 +28,9 @@ void export_capture_file( const std::string& tuning_key, const KernelBuilder& builder, ProblemSize problem_size, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs = {}); + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays = {}); } // namespace kernel_launcher diff --git a/include/kernel_launcher/wisdom.h b/include/kernel_launcher/wisdom.h index e0ccfeb..320137d 100644 --- a/include/kernel_launcher/wisdom.h +++ b/include/kernel_launcher/wisdom.h @@ -119,9 +119,9 @@ struct Oracle { const std::string& tuning_key, const KernelBuilder& builder, ProblemSize problem_size, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs) const = 0; + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays) const = 0; }; struct CaptureRule { @@ -130,7 +130,7 @@ struct CaptureRule { force(force) {} CaptureRule(const char* pattern) : CaptureRule(std::string(pattern)) {} - std::string pattern = ""; + std::string pattern; bool force = false; }; @@ -143,22 +143,22 @@ struct DefaultOracle: Oracle { std::string capture_dir, std::vector capture_rules = {}); - virtual ~DefaultOracle() = default; + ~DefaultOracle() override = default; - virtual Config load_config( + Config load_config( const std::string& tuning_key, const ConfigSpace& space, ProblemSize problem_size, CudaDevice device, bool* should_capture_out) const override; - virtual void capture_kernel( + void capture_kernel( const std::string& tuning_key, const KernelBuilder& builder, ProblemSize problem_size, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs) const override; + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays) const override; virtual bool should_capture_kernel( const std::string& tuning_key, @@ -247,16 +247,16 @@ struct WisdomSettings { const std::string& tuning_key, const KernelBuilder& builder, ProblemSize problem_size, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs) const { + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays) const { return impl_->capture_kernel( tuning_key, builder, problem_size, - param_types, - inputs, - outputs); + arguments, + input_arrays, + output_arrays); } private: diff --git a/src/arg.cpp b/src/arg.cpp index a0bca93..bb21242 100644 --- a/src/arg.cpp +++ b/src/arg.cpp @@ -163,35 +163,40 @@ TypeInfo KernelArg::type() const { } std::vector KernelArg::to_bytes() const { - std::vector result; + size_t nbytes = type_.size(); + std::vector result(nbytes); + ::memcpy(result.data(), as_void_ptr(), nbytes); + return result; +} +std::vector KernelArg::copy_array() const { if (is_array()) { - result.resize(type_.remove_pointer().size() * data_.array.nelements); - KERNEL_LAUNCHER_CUDA_CHECK(cuMemcpy( - reinterpret_cast(result.data()), - reinterpret_cast(data_.array.ptr), - result.size())); - } else { - // If the type is a pointer, exporting it to bytes will return - // the memory address of the pointer and not the data of the buffer it - // points to This is likely a bug on the user side. Todo: find a better - // way f handling this error (maybe already in KernelArg ctor?). - if (type_.is_pointer()) { - throw std::runtime_error("a raw pointer type was provided as " - "kernel argument (" + type_.name() + ") which cannot be " - "exported since the corresponding buffer size is unknown"); + size_t nbytes = type_.remove_pointer().size() * data_.array.nelements; + std::vector result(nbytes); + + if (nbytes > 0) { + KERNEL_LAUNCHER_CUDA_CHECK(cuMemcpy( + reinterpret_cast(result.data()), + reinterpret_cast(data_.array.ptr), + nbytes)); } - result.resize(type_.size()); + return result; + } - if (is_inline_scalar(type_)) { - ::memcpy(result.data(), data_.small_scalar.data(), type_.size()); - } else { - ::memcpy(result.data(), data_.large_scalar, type_.size()); - } + std::string msg; + + if (type_.is_pointer()) { + msg = "a raw pointer of type " + type_.name() + " was provided as " + "kernel argument which cannot be exported since the " + "corresponding buffer size is unknown"; + } else { + msg = "a scalar of type " + type_.name() + " was provided as " + "kernel argument which cannot be exported since it is not " + "an array"; } - return result; + throw std::runtime_error(msg); } KernelArg::KernelArg() : type_(type_of()), scalar_(true) {} diff --git a/src/export.cpp b/src/export.cpp index b53a32f..a90477d 100644 --- a/src/export.cpp +++ b/src/export.cpp @@ -51,13 +51,13 @@ static json value_to_json(const Value& expr) { //struct SelectExpr: BaseExpr { -static json expr_to_json(const BaseExpr& expr) { +static json expr_to_json(const BaseExpr& expr, const Eval& eval) { if (const ScalarExpr* v = dynamic_cast(&expr)) { return value_to_json(v->value()); } if (const SharedExpr* v = dynamic_cast(&expr)) { - return expr_to_json(v->inner()); + return expr_to_json(v->inner(), eval); } std::string op; @@ -68,17 +68,17 @@ static json expr_to_json(const BaseExpr& expr) { operands.emplace_back(pe->parameter().name()); } else if (const auto* ue = dynamic_cast(&expr)) { op = ue->op_name(); - operands.emplace_back(expr_to_json(ue->operand())); + operands.emplace_back(expr_to_json(ue->operand(), eval)); } else if (const auto* be = dynamic_cast(&expr)) { op = be->op_name(); - operands.emplace_back(expr_to_json(be->left_operand())); - operands.emplace_back(expr_to_json(be->right_operand())); + operands.emplace_back(expr_to_json(be->left_operand(), eval)); + operands.emplace_back(expr_to_json(be->right_operand(), eval)); } else if (const auto* se = dynamic_cast(&expr)) { op = "select"; - operands.emplace_back(expr_to_json(se->condition())); + operands.emplace_back(expr_to_json(se->condition(), eval)); for (const auto& p : se->operands()) { - operands.push_back(expr_to_json(p)); + operands.push_back(expr_to_json(p, eval)); } } else if ( const auto* de = dynamic_cast(&expr)) { @@ -87,6 +87,14 @@ static json expr_to_json(const BaseExpr& expr) { } else if (const auto* pse = dynamic_cast(&expr)) { op = "problem_size"; operands.emplace_back(pse->axis()); + } else if (const auto* var = dynamic_cast(&expr)) { + Value value; + if (eval.lookup(*var, value)) { + return value_to_json(value); + } + + throw std::runtime_error( + "could not serialize unknown variable: " + expr.to_string()); } else { throw std::runtime_error( "could not serialize expression: " + expr.to_string()); @@ -96,11 +104,11 @@ static json expr_to_json(const BaseExpr& expr) { } template -static std::vector expr_list_to_json(C collection) { +static std::vector expr_list_to_json(C collection, const Eval& eval) { std::vector result; for (const auto& entry : collection) { - result.push_back(expr_to_json(entry)); + result.push_back(expr_to_json(entry, eval)); } return result; @@ -156,12 +164,13 @@ static json tunable_param_to_json(const TunableParam& param) { } struct KernelBuilderSerializerHack { - static json config_space_to_json(const KernelBuilder& builder) { + static json + config_space_to_json(const KernelBuilder& builder, const Eval& eval) { std::vector restrictions; - for (auto e : expr_list_to_json(builder.restrictions_)) { + for (auto e : expr_list_to_json(builder.restrictions_, eval)) { restrictions.emplace_back(std::move(e)); } - for (auto e : expr_list_to_json(builder.assertions_)) { + for (auto e : expr_list_to_json(builder.assertions_, eval)) { restrictions.emplace_back(std::move(e)); } @@ -175,7 +184,8 @@ struct KernelBuilderSerializerHack { {"restrictions", std::move(restrictions)}}; } - static json builder_to_json(const KernelBuilder& builder) { + static json + builder_to_json(const KernelBuilder& builder, const Eval& eval) { std::vector headers; for (const auto& source : builder.preheaders_) { json content = nullptr; @@ -191,9 +201,19 @@ struct KernelBuilderSerializerHack { json defines = json::object(); for (const auto& p : builder.defines_) { - defines[p.first] = expr_to_json(p.second); + defines[p.first] = expr_to_json(p.second, eval); } + std::array block_size = { + builder.determine_block_size(0), + builder.determine_block_size(1), + builder.determine_block_size(2)}; + + std::array grid_size = { + builder.determine_grid_size(0), + builder.determine_grid_size(1), + builder.determine_grid_size(2)}; + json result; const std::string* content = builder.kernel_source_.content(); if (content != nullptr) { @@ -203,21 +223,16 @@ struct KernelBuilderSerializerHack { } result["name"] = builder.kernel_name_; - result["compile_flags"] = expr_list_to_json(builder.compile_flags_); - result["shared_memory"] = expr_to_json(builder.shared_mem_); - result["template_args"] = expr_list_to_json(builder.template_args_); - result["defines"] = std::move(defines); + result["compile_flags"] = + expr_list_to_json(builder.compile_flags_, eval); + result["shared_memory"] = expr_to_json(builder.shared_mem_, eval); + result["template_args"] = + expr_list_to_json(builder.template_args_, eval); result["headers"] = std::move(headers); + result["defines"] = std::move(defines); - result["block_size"] = expr_list_to_json(std::array { - builder.determine_block_size(0), - builder.determine_block_size(1), - builder.determine_block_size(2)}); - - result["grid_size"] = expr_list_to_json(std::array { - builder.determine_grid_size(0), - builder.determine_grid_size(1), - builder.determine_grid_size(2)}); + result["block_size"] = expr_list_to_json(block_size, eval); + result["grid_size"] = expr_list_to_json(grid_size, eval); return result; } @@ -329,51 +344,47 @@ static const DataFile& write_kernel_arg( static json kernel_args_to_json( const std::string& tuning_key, const std::string& data_dir, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs) { + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays) { std::vector previous_files; std::vector result; - size_t nargs = param_types.size(); - if (inputs.size() != nargs - || (!outputs.empty() && outputs.size() != nargs)) { - throw std::invalid_argument("invalid number of arguments"); - } - - for (size_t i = 0; i < nargs; i++) { - TypeInfo dtype = param_types[i]; + size_t array_index = 0; + for (const auto& argument : arguments) { json entry; - entry["type"] = dtype.name(); + entry["type"] = argument.type().name(); - if (dtype.is_pointer()) { + if (argument.is_array()) { // Data type of primitive scalar type for pointer - TypeInfo prim_type = dtype.remove_pointer(); + TypeInfo prim_type = argument.type().remove_pointer(); - if (inputs[i].size() % prim_type.size() != 0) { - throw std::invalid_argument("invalid input argument"); + if (input_arrays.at(array_index).size() % prim_type.size() != 0) { + throw std::invalid_argument("invalid input array size"); } const DataFile& input_file = write_kernel_arg( tuning_key, data_dir, - inputs[i], + input_arrays[array_index], previous_files); entry["kind"] = "array"; entry["hash"] = input_file.hash; entry["file"] = input_file.file_name; - if (!outputs.empty() && !outputs[i].empty()) { - if (inputs[i].size() != outputs[i].size()) { - throw std::invalid_argument("invalid output argument"); + if (!output_arrays.empty() + && !output_arrays.at(array_index).empty()) { + if (input_arrays[array_index].size() + != output_arrays[array_index].size()) { + throw std::invalid_argument("invalid output array size"); } const DataFile& output_file = write_kernel_arg( tuning_key, data_dir, - outputs[i], + output_arrays[array_index], previous_files); // Only add the output reference file if it does not match the @@ -383,13 +394,11 @@ static json kernel_args_to_json( entry["reference_hash"] = output_file.hash; } } - } else { - if (inputs[i].size() != dtype.size()) { - throw std::invalid_argument("invalid argument"); - } + array_index++; + } else { entry["kind"] = "scalar"; - entry["data"] = inputs[i]; + entry["data"] = argument.to_bytes(); } result.emplace_back(std::move(entry)); @@ -398,22 +407,64 @@ static json kernel_args_to_json( return result; } +struct ArgsEval: Eval { + explicit ArgsEval( + ProblemSize problem_size, + const std::vector& args) : + args_(args), + problem_size_(problem_size) {} + + bool lookup(const Variable& v, Value& out) const override { + if (const auto* that = dynamic_cast(&v)) { + size_t i = that->get(); + + if (i < args_.size()) { + out = args_[i].to_value_or_empty(); + + if (!out.is_empty()) { + return true; + } + } + } + + if (const auto* that = dynamic_cast(&v)) { + if (that->axis() < problem_size_.size()) { + out = problem_size_[that->axis()]; + return true; + } + } + + return false; + } + + private: + const std::vector& args_; + const ProblemSize problem_size_; +}; + static json kernel_to_json( const std::string& tuning_key, const KernelBuilder& builder, const std::string& data_dir, ProblemSize problem_size, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs) { + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays) { + ArgsEval eval(problem_size, arguments); + json result; result["key"] = tuning_key; result["environment"] = environment_json(); result["config_space"] = - KernelBuilderSerializerHack::config_space_to_json(builder); - result["kernel"] = KernelBuilderSerializerHack::builder_to_json(builder); - result["arguments"] = - kernel_args_to_json(tuning_key, data_dir, param_types, inputs, outputs); + KernelBuilderSerializerHack::config_space_to_json(builder, eval); + result["kernel"] = + KernelBuilderSerializerHack::builder_to_json(builder, eval); + result["arguments"] = kernel_args_to_json( + tuning_key, + data_dir, + arguments, + input_arrays, + output_arrays); result["problem_size"] = std::vector { problem_size.x, problem_size.y, @@ -459,9 +510,9 @@ void export_capture_file( const std::string& tuning_key, const KernelBuilder& builder, ProblemSize problem_size, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs) { + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays) { std::string file_name = tuning_key_to_file_name(directory, tuning_key, problem_size); @@ -471,9 +522,9 @@ void export_capture_file( builder, directory, problem_size, - param_types, - inputs, - outputs); + arguments, + input_arrays, + output_arrays); log_info() << "writing capture to " << file_name << " for kernel " << tuning_key << std::endl; diff --git a/src/kernel.cpp b/src/kernel.cpp index af98bf8..e598154 100644 --- a/src/kernel.cpp +++ b/src/kernel.cpp @@ -130,14 +130,59 @@ static void assert_types_equal( throw std::runtime_error(msg); } +static void launch_captured_impl( + WisdomKernelImpl* impl_, + cudaStream_t stream, + ProblemSize problem_size, + const std::vector& args) { + const std::string& tuning_key = impl_->builder_.tuning_key(); + std::vector> inputs; + std::vector> outputs; + + KERNEL_LAUNCHER_CUDA_CHECK(cuStreamSynchronize(stream)); + KERNEL_LAUNCHER_CUDA_CHECK(cuCtxSynchronize()); + + for (const KernelArg& arg : args) { + if (arg.is_array()) { + inputs.emplace_back(arg.copy_array()); + } + } + + impl_->instance_.launch(stream, problem_size, args); + + KERNEL_LAUNCHER_CUDA_CHECK(cuStreamSynchronize(stream)); + KERNEL_LAUNCHER_CUDA_CHECK(cuCtxSynchronize()); + + for (const KernelArg& arg : args) { + if (arg.is_array()) { + outputs.emplace_back(arg.copy_array()); + } + } + + try { + impl_->settings_.capture_kernel( + tuning_key, + impl_->builder_, + problem_size, + args, + inputs, + outputs); + } catch (const std::exception& err) { + log_warning() << "error ignored while writing tuning file for \"" + << tuning_key << "\": " << err.what(); + } +} + void WisdomKernel::launch(cudaStream_t stream, std::vector args) { if (!impl_) { throw std::runtime_error("WisdomKernel has not been initialized"); } + std::lock_guard guard(impl_->mutex_); + ProblemSize problem_size = impl_->problem_processor_(args); + bool should_capture = false; - std::lock_guard guard(impl_->mutex_); if (!impl_->compiled_) { const std::string& tuning_key = impl_->builder_.tuning_key(); @@ -146,7 +191,6 @@ void WisdomKernel::launch(cudaStream_t stream, std::vector args) { param_types.push_back(arg.type()); } - bool should_capture = false; compile_impl( impl_.get(), tuning_key, @@ -154,47 +198,15 @@ void WisdomKernel::launch(cudaStream_t stream, std::vector args) { CudaContextHandle::current(), param_types, &should_capture); - - if (should_capture) { - std::vector> inputs; - std::vector> outputs; - - KERNEL_LAUNCHER_CUDA_CHECK(cuStreamSynchronize(stream)); - KERNEL_LAUNCHER_CUDA_CHECK(cuCtxSynchronize()); - - for (const KernelArg& arg : args) { - inputs.push_back(arg.to_bytes()); - } - - impl_->instance_.launch(stream, problem_size, args); - - KERNEL_LAUNCHER_CUDA_CHECK(cuStreamSynchronize(stream)); - KERNEL_LAUNCHER_CUDA_CHECK(cuCtxSynchronize()); - - for (const KernelArg& arg : args) { - outputs.push_back(arg.to_bytes()); - } - - try { - impl_->settings_.capture_kernel( - tuning_key, - impl_->builder_, - problem_size, - param_types, - inputs, - outputs); - } catch (const std::exception& err) { - log_warning() - << "error ignored while writing tuning file for \"" - << tuning_key << "\": " << err.what(); - } - - return; - } } assert_types_equal(args, impl_->param_types_); - impl_->instance_.launch(stream, problem_size, args); + + if (should_capture) { + launch_captured_impl(impl_.get(), stream, problem_size, args); + } else { + impl_->instance_.launch(stream, problem_size, args); + } } } // namespace kernel_launcher \ No newline at end of file diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 0409c96..406de7e 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -506,17 +506,17 @@ void DefaultOracle::capture_kernel( const std::string& tuning_key, const KernelBuilder& builder, ProblemSize problem_size, - const std::vector& param_types, - const std::vector>& inputs, - const std::vector>& outputs) const { + const std::vector& arguments, + const std::vector>& input_arrays, + const std::vector>& output_arrays) const { export_capture_file( capture_dir_, tuning_key, builder, problem_size, - param_types, - inputs, - outputs); + arguments, + input_arrays, + output_arrays); } bool DefaultOracle::should_capture_kernel( diff --git a/tests/arg.cpp b/tests/arg.cpp index 5d4949a..6a9e65c 100644 --- a/tests/arg.cpp +++ b/tests/arg.cpp @@ -139,4 +139,43 @@ TEST_CASE("test KernelArg") { CHECK(stream.str() == "array 0x123 of length 5 (type: int*)"); } } +} + +// These tests are seperate since they require CUDA +TEST_CASE("test KernelArg::copy_array", "[CUDA]") { + CUcontext ctx; + KERNEL_LAUNCHER_CUDA_CHECK(cuInit(0)); + KERNEL_LAUNCHER_CUDA_CHECK(cuCtxCreate(&ctx, 0, 0)); + + SECTION("scalar int") { + KernelArg v = KernelArg::from_scalar((int)123); + CHECK(v.to_bytes() == std::vector {123, 0, 0, 0}); + CHECK_THROWS(v.copy_array()); + } + + SECTION("array int*") { + std::vector array {1, 2, 3}; + + std::vector ptr_bytes(sizeof(int*)); + int* array_ptr = array.data(); + ::memcpy(ptr_bytes.data(), (uint8_t*)&array_ptr, sizeof(int*)); + + KernelArg v = KernelArg::from_array(array.data(), array.size()); + CHECK(v.to_bytes() == ptr_bytes); + CHECK( + v.copy_array() + == std::vector {1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}); + } + + SECTION("array NULL") { + KernelArg v = into_kernel_arg((int*)nullptr); + CHECK(v.to_bytes() == std::vector {0, 0, 0, 0, 0, 0, 0, 0}); + CHECK_THROWS(v.copy_array()); + } + + SECTION("array nullptr") { + KernelArg v = into_kernel_arg(nullptr); + CHECK(v.to_bytes() == std::vector {0, 0, 0, 0, 0, 0, 0, 0}); + CHECK_THROWS(v.copy_array()); + } } \ No newline at end of file diff --git a/tests/assets/vector_add_key_1024.json b/tests/assets/vector_add_key_1024.json index 39ce69a..7984917 100644 --- a/tests/assets/vector_add_key_1024.json +++ b/tests/assets/vector_add_key_1024.json @@ -98,6 +98,7 @@ "source": "\n template \n __global__\n void vector_add(int n, int *c, const int* a, const int* b) {\n for (int k = 0; k < ELEMENTS_PER_THREAD; k++) {\n int index = (blockIdx.x * ELEMENTS_PER_THREAD + k) * blockDim.x + threadIdx.x;\n\n if (index < n) {\n c[index] = a[index] + b[index];\n }\n }\n }\n ", "name": "vector_add", "compile_flags": [], + "headers": [], "block_size": [ { "operator": "parameter", @@ -115,12 +116,7 @@ { "operator": "/", "operands": [ - { - "operator": "problem_size", - "operands": [ - 0 - ] - }, + 1024, { "operator": "*", "operands": [ @@ -146,12 +142,7 @@ { "operator": "%", "operands": [ - { - "operator": "problem_size", - "operands": [ - 0 - ] - }, + 1024, { "operator": "*", "operands": [ @@ -176,76 +167,8 @@ } ] }, - { - "operator": "+", - "operands": [ - { - "operator": "/", - "operands": [ - { - "operator": "problem_size", - "operands": [ - 1 - ] - }, - 1 - ] - }, - { - "operator": "!=", - "operands": [ - { - "operator": "%", - "operands": [ - { - "operator": "problem_size", - "operands": [ - 1 - ] - }, - 1 - ] - }, - 0 - ] - } - ] - }, - { - "operator": "+", - "operands": [ - { - "operator": "/", - "operands": [ - { - "operator": "problem_size", - "operands": [ - 2 - ] - }, - 1 - ] - }, - { - "operator": "!=", - "operands": [ - { - "operator": "%", - "operands": [ - { - "operator": "problem_size", - "operands": [ - 2 - ] - }, - 1 - ] - }, - 0 - ] - } - ] - } + 1, + 1 ], "shared_memory": 0, "template_args": [ @@ -275,21 +198,21 @@ "type": "float*", "kind": "array", "hash": "1ceaf73df40e531df3bfb26b4fb7cd95fb7bff1d", - "file": "vector_add_key_qcmwx175.bin", - "reference_file": "vector_add_key_970yyzfz.bin", + "file": "vector_add_key_3g6deeqn.bin.gz", + "reference_file": "vector_add_key_9ehr6p5i.bin", "reference_hash": "14c1b905d46846f68a71f5bca482cb38171b43b9" }, { "type": "float const*", "kind": "array", "hash": "81afb94d38ce9521dfbbfaa36a5fe609f802bb03", - "file": "vector_add_key_gss24clw.bin" + "file": "vector_add_key_pe2mtp0a.bin" }, { "type": "float const*", "kind": "array", "hash": "6e61fc0e7da493354e9f995796462cf2cc15ac20", - "file": "vector_add_key_i0g41210.bin" + "file": "vector_add_key_up6lg48m.bin" } ], "problem_size": [ @@ -297,4 +220,5 @@ 1, 1 ] -} \ No newline at end of file +} + diff --git a/tests/export.cpp b/tests/export.cpp index 4c46a16..93fe6e4 100644 --- a/tests/export.cpp +++ b/tests/export.cpp @@ -38,6 +38,13 @@ void compare_exports( } } +template +std::vector to_bytes(const std::vector& array) { + std::vector result(array.size() * sizeof(T)); + ::memcpy(result.data(), array.data(), result.size()); + return result; +} + TEST_CASE("test export_tuning_file", "[CUDA]") { CUcontext ctx; KERNEL_LAUNCHER_CUDA_CHECK(cuInit(0)); @@ -48,12 +55,15 @@ TEST_CASE("test export_tuning_file", "[CUDA]") { // Create temporary directory and clear its contents std::filesystem::create_directory(tmp_dir); - for (const auto& entry : std::filesystem::directory_iterator(tmp_dir)) { - if (!entry.is_regular_file()) { - continue; - } - std::filesystem::remove(entry); + SECTION("clean up") { + for (const auto& entry : std::filesystem::directory_iterator(tmp_dir)) { + if (!entry.is_regular_file()) { + continue; + } + + std::filesystem::remove(entry); + } } SECTION("vector add") { @@ -71,23 +81,20 @@ TEST_CASE("test export_tuning_file", "[CUDA]") { c_ref[i] = a[i] + b[i]; } + std::vector arguments = { + KernelArg::from_scalar(int(n)), + KernelArg::from_array((float*)c.data(), c.size()), + KernelArg::from_array((const float*)a.data(), a.size()), + KernelArg::from_array((const float*)b.data(), b.size())}; + export_capture_file( tmp_dir, "vector_add_key", builder, {uint32_t(n)}, - {type_of(), - type_of(), - type_of(), - type_of()}, - {KernelArg::from_scalar(int(n)).to_bytes(), - KernelArg::from_array(c.data(), c.size()).to_bytes(), - KernelArg::from_array(a.data(), a.size()).to_bytes(), - KernelArg::from_array(b.data(), b.size()).to_bytes()}, - {KernelArg::from_scalar(int(n)).to_bytes(), - KernelArg::from_array(c_ref.data(), c_ref.size()).to_bytes(), - KernelArg::from_array(a.data(), a.size()).to_bytes(), - KernelArg::from_array(b.data(), b.size()).to_bytes()}); + arguments, + {to_bytes(c), to_bytes(a), to_bytes(b)}, + {to_bytes(c_ref), to_bytes(a), to_bytes(b)}); compare_exports("vector_add_key_1024", tmp_dir, assets_dir); } @@ -123,23 +130,20 @@ TEST_CASE("test export_tuning_file", "[CUDA]") { } } + std::vector arguments = { + KernelArg::from_scalar(int(n)), + KernelArg::from_array((float*)c.data(), c.size()), + KernelArg::from_array((const float*)a.data(), a.size()), + KernelArg::from_array((const float*)b.data(), b.size())}; + export_capture_file( tmp_dir, "matmul_key", builder, {uint32_t(n), uint32_t(n)}, - {type_of(), - type_of(), - type_of(), - type_of()}, - {KernelArg::from_scalar(int(n)).to_bytes(), - KernelArg::from_array(c.data(), c.size()).to_bytes(), - KernelArg::from_array(a.data(), a.size()).to_bytes(), - KernelArg::from_array(b.data(), b.size()).to_bytes()}, - {KernelArg::from_scalar(int(n)).to_bytes(), - KernelArg::from_array(c_ref.data(), c_ref.size()).to_bytes(), - KernelArg::from_array(a.data(), a.size()).to_bytes(), - KernelArg::from_array(b.data(), b.size()).to_bytes()}); + arguments, + {to_bytes(c), to_bytes(a), to_bytes(b)}, + {to_bytes(c_ref), to_bytes(a), to_bytes(b)}); compare_exports("matmul_key_1024x1024", tmp_dir, assets_dir); } diff --git a/tests/test_utils.h b/tests/test_utils.h index d60a90b..1b075cd 100644 --- a/tests/test_utils.h +++ b/tests/test_utils.h @@ -63,7 +63,7 @@ inline kernel_launcher::KernelBuilder build_vector_add_kernel() { .define("ELEMENTS_PER_THREAD", et) .template_type() .block_size(tb) - .grid_divisors(eb); + .grid_size(div_ceil(arg0, eb)); return builder; } From da2b9571e99f0cb1dd0ade002a343aae4f033e68 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 6 Apr 2023 19:36:42 +0200 Subject: [PATCH 49/63] Reduce cognitive complexity of `DefaultOracle::from_env` --- src/wisdom.cpp | 54 ++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 406de7e..8ab9b2c 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -402,27 +402,9 @@ DefaultOracle::DefaultOracle( capture_dir_(std::move(capture_dir)), capture_rules_(std::move(capture_rules)) {} -DefaultOracle DefaultOracle::from_env() { - std::vector wisdom_dirs = {"."}; - std::string capture_dir = "."; - std::vector capture_rules = {}; +static std::vector determine_capture_rules() { const char* value; - - if ((value = getenv("KERNEL_LAUNCHER_WISDOM")) != nullptr) { - for (std::string dir : string_split(value, {':', ';', ','})) { - if (!dir.empty()) { - wisdom_dirs.emplace_back(std::move(dir)); - } - } - - if (!wisdom_dirs.empty()) { - capture_dir = wisdom_dirs[0]; - } - } - - if ((value = getenv("KERNEL_LAUNCHER_DIR")) != nullptr) { - capture_dir = value; - } + std::vector result = {}; // Try the following environment keys const char* env_keys[6] = { @@ -440,8 +422,8 @@ DefaultOracle DefaultOracle::from_env() { continue; } - std::string patterns = value; bool force = strstr(key, "FORCE") != nullptr; + std::string patterns = value; // Some patterns are special cased if (patterns == "1" || patterns == "true" || patterns == "TRUE") { @@ -453,14 +435,38 @@ DefaultOracle DefaultOracle::from_env() { } for (auto pattern : string_split(patterns.c_str(), {',', '|', ';'})) { - if (pattern.empty()) { - continue; + if (!pattern.empty()) { + result.emplace_back(std::move(pattern), force); } + } + } - capture_rules.emplace_back(std::move(pattern), force); + return result; +} + +DefaultOracle DefaultOracle::from_env() { + std::vector wisdom_dirs = {"."}; + std::string capture_dir = "."; + const char* value; + + if ((value = getenv("KERNEL_LAUNCHER_WISDOM")) != nullptr) { + for (std::string dir : string_split(value, {':', ';', ','})) { + if (!dir.empty()) { + wisdom_dirs.emplace_back(std::move(dir)); + } + } + + if (!wisdom_dirs.empty()) { + capture_dir = wisdom_dirs[0]; } } + if ((value = getenv("KERNEL_LAUNCHER_DIR")) != nullptr) { + capture_dir = value; + } + + auto capture_rules = determine_capture_rules(); + // Print info message on which kernels will be tuned. if (!capture_rules.empty() && log_info_enabled()) { std::vector names; From c24a8780b924571ad07bc6fbd7f4bb3ad5f2490b Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 6 Apr 2023 19:38:50 +0200 Subject: [PATCH 50/63] Rename `Oracle` to `IWisdomSettings` --- include/kernel_launcher/wisdom.h | 20 ++++++++--------- src/wisdom.cpp | 38 +++++++++++++++++--------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/include/kernel_launcher/wisdom.h b/include/kernel_launcher/wisdom.h index 320137d..55b96dd 100644 --- a/include/kernel_launcher/wisdom.h +++ b/include/kernel_launcher/wisdom.h @@ -105,8 +105,8 @@ inline Config load_best_config( result); } -struct Oracle { - virtual ~Oracle() = default; +struct IWisdomSettings { + virtual ~IWisdomSettings() = default; virtual Config load_config( const std::string& tuning_key, @@ -134,16 +134,16 @@ struct CaptureRule { bool force = false; }; -struct DefaultOracle: Oracle { - static DefaultOracle from_env(); +struct DefaultWisdomSettings: IWisdomSettings { + static DefaultWisdomSettings from_env(); - DefaultOracle(); - DefaultOracle( + DefaultWisdomSettings(); + DefaultWisdomSettings( std::vector wisdom_dirs, std::string capture_dir, std::vector capture_rules = {}); - ~DefaultOracle() override = default; + ~DefaultWisdomSettings() override = default; Config load_config( const std::string& tuning_key, @@ -202,11 +202,11 @@ struct WisdomSettings { std::string wisdom_dir, std::string capture_dir, std::vector capture_rules = {}); - WisdomSettings(std::shared_ptr oracle); + WisdomSettings(std::shared_ptr oracle); template WisdomSettings(std::shared_ptr ptr) : - WisdomSettings(std::shared_ptr {std::move(ptr)}) {} + WisdomSettings(std::shared_ptr {std::move(ptr)}) {} WisdomSettings(const WisdomSettings&) = default; @@ -260,7 +260,7 @@ struct WisdomSettings { } private: - std::shared_ptr impl_; + std::shared_ptr impl_; }; /** diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 8ab9b2c..6d15481 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -374,27 +374,29 @@ Config load_best_config( result_out); } -std::shared_ptr global_default_wisdom = nullptr; +std::shared_ptr global_default_wisdom = nullptr; -static std::shared_ptr set_global_wisdom(DefaultOracle oracle) { - auto ptr = std::make_shared(std::move(oracle)); +static std::shared_ptr +set_global_wisdom(DefaultWisdomSettings oracle) { + auto ptr = std::make_shared(std::move(oracle)); atomic_store(&global_default_wisdom, ptr); return ptr; } -static std::shared_ptr get_global_wisdom() { +static std::shared_ptr get_global_wisdom() { auto ptr = atomic_load(&global_default_wisdom); if (!ptr) { - ptr = set_global_wisdom(DefaultOracle::from_env()); + ptr = set_global_wisdom(DefaultWisdomSettings::DefaultWisdomSettings()); } return ptr; } -DefaultOracle::DefaultOracle() : DefaultOracle(*get_global_wisdom()) {} +DefaultWisdomSettings::DefaultWisdomSettings() : + DefaultWisdomSettings(*get_global_wisdom()) {} -DefaultOracle::DefaultOracle( +DefaultWisdomSettings::DefaultWisdomSettings( std::vector wisdom_dirs, std::string capture_dir, std::vector capture_rules) : @@ -444,7 +446,7 @@ static std::vector determine_capture_rules() { return result; } -DefaultOracle DefaultOracle::from_env() { +DefaultWisdomSettings DefaultWisdomSettings::from_env() { std::vector wisdom_dirs = {"."}; std::string capture_dir = "."; const char* value; @@ -478,13 +480,13 @@ DefaultOracle DefaultOracle::from_env() { << string_comma_join(names) << "\n"; } - return DefaultOracle( + return DefaultWisdomSettings( std::move(wisdom_dirs), std::move(capture_dir), std::move(capture_rules)); } -Config DefaultOracle::load_config( +Config DefaultWisdomSettings::load_config( const std::string& tuning_key, const ConfigSpace& space, ProblemSize problem_size, @@ -508,7 +510,7 @@ Config DefaultOracle::load_config( return config; } -void DefaultOracle::capture_kernel( +void DefaultWisdomSettings::capture_kernel( const std::string& tuning_key, const KernelBuilder& builder, ProblemSize problem_size, @@ -525,7 +527,7 @@ void DefaultOracle::capture_kernel( output_arrays); } -bool DefaultOracle::should_capture_kernel( +bool DefaultWisdomSettings::should_capture_kernel( const std::string& tuning_key, ProblemSize problem_size, WisdomResult result) const { @@ -564,7 +566,7 @@ void append_global_wisdom_directory(std::string dir) { auto dirs = wisdom->wisdom_directories(); dirs.push_back(std::move(dir)); - set_global_wisdom(DefaultOracle( + set_global_wisdom(DefaultWisdomSettings( std::move(dirs), wisdom->capture_directory(), wisdom->capture_rules())); @@ -573,7 +575,7 @@ void append_global_wisdom_directory(std::string dir) { void set_global_wisdom_directory(std::string dir) { auto wisdom = get_global_wisdom(); - set_global_wisdom(DefaultOracle( + set_global_wisdom(DefaultWisdomSettings( std::vector {std::move(dir)}, wisdom->capture_directory(), wisdom->capture_rules())); @@ -582,7 +584,7 @@ void set_global_wisdom_directory(std::string dir) { void set_global_capture_directory(std::string dir) { auto wisdom = get_global_wisdom(); - set_global_wisdom(DefaultOracle( + set_global_wisdom(DefaultWisdomSettings( wisdom->wisdom_directories(), std::move(dir), wisdom->capture_rules())); @@ -593,7 +595,7 @@ void add_global_capture_pattern(CaptureRule rule) { std::vector rules = wisdom->capture_rules(); rules.push_back(std::move(rule)); - set_global_wisdom(DefaultOracle( + set_global_wisdom(DefaultWisdomSettings( wisdom->wisdom_directories(), wisdom->capture_directory(), rules)); @@ -605,7 +607,7 @@ WisdomSettings default_wisdom_settings() { WisdomSettings::WisdomSettings() : WisdomSettings(get_global_wisdom()) {} -WisdomSettings::WisdomSettings(std::shared_ptr oracle) : +WisdomSettings::WisdomSettings(std::shared_ptr oracle) : impl_(std::move(oracle)) { if (!impl_) { throw std::runtime_error("Oracle cannot be null"); @@ -616,7 +618,7 @@ WisdomSettings::WisdomSettings( std::string wisdom_dir, std::string capture_dir, std::vector capture_rules) : - WisdomSettings(std::make_shared( + WisdomSettings(std::make_shared( std::vector {std::move(wisdom_dir)}, std::move(capture_dir), std::move(capture_rules))) {} From a64ee4ccb8b3e5281bd21c305ffb6040210fa123 Mon Sep 17 00:00:00 2001 From: stijn Date: Fri, 7 Apr 2023 09:52:05 +0200 Subject: [PATCH 51/63] Add support for `KERNEL_LAUNCHER_CAPTURE_SKIP` --- docs/env_vars.rst | 10 ++++- include/kernel_launcher/config.h | 2 +- include/kernel_launcher/kernel.h | 47 +++++++++++++++----- include/kernel_launcher/wisdom.h | 32 +++++++++----- src/builder.cpp | 3 +- src/kernel.cpp | 73 +++++++++++++++++++++++--------- src/wisdom.cpp | 53 +++++++++++++++++++---- 7 files changed, 166 insertions(+), 54 deletions(-) diff --git a/docs/env_vars.rst b/docs/env_vars.rst index 3b7308c..b892e32 100644 --- a/docs/env_vars.rst +++ b/docs/env_vars.rst @@ -12,7 +12,7 @@ Environment Variables * - **KERNEL_LAUNCHER_CAPTURE** - ``_`` - - Kernels for which a tuning specification will be exported on the first call to the kernel. + - Kernels for which a tuning specification will be captured. The value should a comma-seperated list of kernel names. Additionally, an ``*`` can be used as a wild card. @@ -30,6 +30,14 @@ Environment Variables (i.e., a wisdom file was found), the ``KERNEL_LAUNCHER_CAPTURE_FORCE`` will force to always capture kernels regardless of whether wisdom files are available. + * - **KERNEL_LAUNCHER_CAPTURE _SKIP** + - ``0`` + - Set the number of kernel launches to skip before capturing a particular kernel. + For example, if you set the value to ``3``, only the fourth launch will be captured since the + first three launches will be skipped. + + Note that this option is applied on a `per-kernel basis`, which means that each individual kernel keeps its own skip counter. + * - **KERNEL_LAUNCHER_LOG** - ``info`` - Controls how much logging information is printed to stderr. There are three possible options: diff --git a/include/kernel_launcher/config.h b/include/kernel_launcher/config.h index f967796..7bb1f33 100644 --- a/include/kernel_launcher/config.h +++ b/include/kernel_launcher/config.h @@ -14,7 +14,7 @@ struct ConfigSpace; /** * A particular configuration from a `ConfigSpace`. This class is essentially - * a table that maps `TunableParam`s to `Value`s. + * a lookup table that maps `TunableParam`s to `Value`s. */ struct Config: Eval { using const_iterator = typename TunableMap::const_iterator; diff --git a/include/kernel_launcher/kernel.h b/include/kernel_launcher/kernel.h index 290cc5c..3b1f193 100644 --- a/include/kernel_launcher/kernel.h +++ b/include/kernel_launcher/kernel.h @@ -45,9 +45,8 @@ struct Kernel { /** * Launch this kernel onto the given stream with the given arguments. */ - void launch(cudaStream_t stream, Args&&... args) { - std::vector kargs = { - into_kernel_arg(std::forward(args))...}; + void launch(cudaStream_t stream, Args... args) { + std::vector kargs = {into_kernel_arg(std::move(args))...}; ProblemSize problem_size = problem_processor_(kargs); instance_.launch(stream, problem_size, kargs); } @@ -107,6 +106,19 @@ struct WisdomKernel { Compiler compiler = default_compiler(), WisdomSettings settings = default_wisdom_settings()); + /** + * Explicitly compile this kernel for the given problem size and parameter + * types. + * + * @param problem_size Use to find the configuration from the wisdom file. + * @param param_types Types of kernel parameters. + * @param context CUDA context to use for compilation. + */ + void compile( + ProblemSize problem_size, + std::vector param_types, + CudaContextHandle context = CudaContextHandle::current()); + /** * Explicitly compile this kernel for the given problem size and parameter * types. @@ -116,16 +128,25 @@ struct WisdomKernel { * @param context CUDA context to use for compilation. */ void compile( - ProblemSize problem_size, - std::vector param_types, + std::vector args, CudaContextHandle context = CudaContextHandle::current()); + /** + * Sets an internal flag that enables the next launch of this kernel to + * be captured. + * + * @param skip Optionally set the number of kernel launches to skip_launches before + * capturing a kernel launch. For example, if `skip_launches=5` then the next + * 6th kernel launch will be captured. + */ + void capture_next_launch(int skip_launches = 0); + /** * Delete this kernel and reset its contents. */ void clear(); - void launch(cudaStream_t stream, std::vector args); + void launch_args(cudaStream_t stream, std::vector args); /** * Launch this kernel onto the given stream with the given arguments. @@ -133,7 +154,9 @@ struct WisdomKernel { */ template void launch(cudaStream_t stream, Args&&... args) { - return launch(stream, {into_kernel_arg(std::forward(args))...}); + return launch_args( + stream, + {into_kernel_arg(std::forward(args))...}); } /** @@ -142,19 +165,23 @@ struct WisdomKernel { */ template void launch(Args&&... args) { - return launch( + return launch_args( cudaStream_t(nullptr), {into_kernel_arg(std::forward(args))...}); } template void operator()(cudaStream_t stream, Args&&... args) { - return launch(stream, std::forward(args)...); + return launch_args( + stream, + {into_kernel_arg(std::forward(args))...}); } template void operator()(Args&&... args) { - return launch(std::forward(args)...); + return launch_args( + cudaStream_t(nullptr), + {into_kernel_arg(std::forward(args))...}); } private: diff --git a/include/kernel_launcher/wisdom.h b/include/kernel_launcher/wisdom.h index 55b96dd..62005ea 100644 --- a/include/kernel_launcher/wisdom.h +++ b/include/kernel_launcher/wisdom.h @@ -113,7 +113,7 @@ struct IWisdomSettings { const ConfigSpace& space, ProblemSize problem_size, CudaDevice device, - bool* should_capture_out) const = 0; + int* capture_skip_out = nullptr) const = 0; virtual void capture_kernel( const std::string& tuning_key, @@ -125,13 +125,18 @@ struct IWisdomSettings { }; struct CaptureRule { - CaptureRule(std::string pattern, bool force = false) : + CaptureRule( + std::string pattern, + bool force = false, + int skip_launches = 0) : pattern(std::move(pattern)), - force(force) {} + force(force), + skip_launches(skip_launches) {} CaptureRule(const char* pattern) : CaptureRule(std::string(pattern)) {} std::string pattern; bool force = false; + int skip_launches = 0; }; struct DefaultWisdomSettings: IWisdomSettings { @@ -150,7 +155,7 @@ struct DefaultWisdomSettings: IWisdomSettings { const ConfigSpace& space, ProblemSize problem_size, CudaDevice device, - bool* should_capture_out) const override; + int* capture_skip_launches_out) const override; void capture_kernel( const std::string& tuning_key, @@ -163,15 +168,18 @@ struct DefaultWisdomSettings: IWisdomSettings { virtual bool should_capture_kernel( const std::string& tuning_key, ProblemSize problem_size, - WisdomResult result) const; + WisdomResult result, + int& capture_skip_launches_out) const; - bool should_capture_kernel( + int should_capture_kernel( const std::string& tuning_key, - ProblemSize problem_size) const { + ProblemSize problem_size, + int& capture_skip_launches_out) const { return should_capture_kernel( tuning_key, problem_size, - WisdomResult::NotFound); + WisdomResult::NotFound, + capture_skip_launches_out); } const std::vector& wisdom_directories() const { @@ -217,20 +225,22 @@ struct WisdomSettings { * @param space The configuration space of the kernel. * @param problem_size The current problem size. * @param device The current device. - * @param should_capture_out Optional. Indicates if kernel must be captured. + * @param capture_skip_out Optional, indicates if the kernel should be + * captured. If negative, the kernel will not be captured. Otherwise, + * the kernel will be captured after the `capture_skip_out` kernel launches. */ Config load_config( const std::string& tuning_key, const ConfigSpace& space, ProblemSize problem_size, CudaDevice device, - bool* should_capture_out = nullptr) const { + int* capture_skip_out = nullptr) const { return impl_->load_config( tuning_key, space, problem_size, device, - should_capture_out); + capture_skip_out); } /** diff --git a/src/builder.cpp b/src/builder.cpp index ab3bcd2..6a3e945 100644 --- a/src/builder.cpp +++ b/src/builder.cpp @@ -156,8 +156,7 @@ KernelBuilder::KernelBuilder( KernelBuilder& KernelBuilder::argument_processor(ArgumentsProcessor f) { if (!f) { throw std::runtime_error( - "null pointer given in " - "`KernelBuilder::argument_processor(...)`"); + "null pointer given in `KernelBuilder::argument_processor(...)`"); } args_processors_.push_back(std::move(f)); diff --git a/src/kernel.cpp b/src/kernel.cpp index e598154..eb2ef7e 100644 --- a/src/kernel.cpp +++ b/src/kernel.cpp @@ -16,6 +16,8 @@ struct WisdomKernelImpl { Compiler compiler_; std::vector param_types_; WisdomSettings settings_; + bool capture_required_; + int capture_skip_; }; WisdomKernel::WisdomKernel() = default; @@ -35,7 +37,9 @@ void WisdomKernel::initialize( std::move(problem_processor), std::move(compiler), std::vector {}, - std::move(settings)}); + std::move(settings), + false, + 0}); } void WisdomKernel::clear() { @@ -47,17 +51,18 @@ void WisdomKernel::clear() { void compile_impl( WisdomKernelImpl* impl, - const std::string& tuning_key, ProblemSize problem_size, CudaContextHandle context, - std::vector param_types, - bool* should_capture = nullptr) { + std::vector param_types) { + const std::string& tuning_key = impl->builder_.tuning_key(); + int capture_skip = 0; + Config config = impl->settings_.load_config( tuning_key, impl->builder_, problem_size, context.device(), - should_capture); + &capture_skip); // Assign result to temporary variable since compile may throw auto instance = @@ -67,6 +72,8 @@ void compile_impl( impl->instance_ = std::move(instance); impl->param_types_ = std::move(param_types); impl->compiled_ = true; + impl->capture_required_ = capture_skip >= 0; + impl->capture_skip_ = capture_skip; } void WisdomKernel::compile( @@ -77,16 +84,26 @@ void WisdomKernel::compile( throw std::runtime_error("WisdomKernel has not been initialized"); } - const std::string& tuning_key = impl_->builder_.tuning_key(); + std::lock_guard guard(impl_->mutex_); + compile_impl(impl_.get(), problem_size, context, std::move(param_types)); +} + +void WisdomKernel::compile( + std::vector args, + CudaContextHandle context) { + if (!impl_) { + throw std::runtime_error("WisdomKernel has not been initialized"); + } std::lock_guard guard(impl_->mutex_); - compile_impl( - impl_.get(), - tuning_key, - problem_size, - context, - std::move(param_types), - nullptr); + ProblemSize problem_size = impl_->problem_processor_(args); + + std::vector param_types; + for (const KernelArg& arg : args) { + param_types.push_back(arg.type()); + } + + compile_impl(impl_.get(), problem_size, context, std::move(param_types)); } static void assert_types_equal( @@ -173,19 +190,27 @@ static void launch_captured_impl( } } -void WisdomKernel::launch(cudaStream_t stream, std::vector args) { +void WisdomKernel::capture_next_launch(int skip_launches) { if (!impl_) { throw std::runtime_error("WisdomKernel has not been initialized"); } std::lock_guard guard(impl_->mutex_); + impl_->capture_required_ = true; + impl_->capture_skip_ = skip_launches; +} + +void WisdomKernel::launch_args( + cudaStream_t stream, + std::vector args) { + if (!impl_) { + throw std::runtime_error("WisdomKernel has not been initialized"); + } + std::lock_guard guard(impl_->mutex_); ProblemSize problem_size = impl_->problem_processor_(args); - bool should_capture = false; if (!impl_->compiled_) { - const std::string& tuning_key = impl_->builder_.tuning_key(); - std::vector param_types; for (const KernelArg& arg : args) { param_types.push_back(arg.type()); @@ -193,15 +218,23 @@ void WisdomKernel::launch(cudaStream_t stream, std::vector args) { compile_impl( impl_.get(), - tuning_key, problem_size, CudaContextHandle::current(), - param_types, - &should_capture); + param_types); } assert_types_equal(args, impl_->param_types_); + bool should_capture = false; + if (impl_->capture_required_) { + if (impl_->capture_skip_ <= 0) { + impl_->capture_required_ = false; + should_capture = true; + } else { + impl_->capture_skip_ -= 1; + } + } + if (should_capture) { launch_captured_impl(impl_.get(), stream, problem_size, args); } else { diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 6d15481..5c101ec 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -387,7 +387,7 @@ static std::shared_ptr get_global_wisdom() { auto ptr = atomic_load(&global_default_wisdom); if (!ptr) { - ptr = set_global_wisdom(DefaultWisdomSettings::DefaultWisdomSettings()); + ptr = set_global_wisdom(DefaultWisdomSettings::from_env()); } return ptr; @@ -406,6 +406,26 @@ DefaultWisdomSettings::DefaultWisdomSettings( static std::vector determine_capture_rules() { const char* value; + int skip = 0; + + if ((value = getenv("KERNEL_LAUNCHER_CAPTURE_SKIP")) != nullptr + || (value = getenv("KERNEL_LAUNCHER_SKIP")) != nullptr) { + bool valid; + + try { + skip = std::stoi(value); + valid = skip >= 0; + } catch (const std::exception& e) { + valid = false; + } + + if (!valid) { + log_warning() << "failed to parse KERNEL_LAUNCHER_CAPTURE_SKIP, " + "kernel capturing is now be disabled"; + return {}; + } + } + std::vector result = {}; // Try the following environment keys @@ -438,7 +458,7 @@ static std::vector determine_capture_rules() { for (auto pattern : string_split(patterns.c_str(), {',', '|', ';'})) { if (!pattern.empty()) { - result.emplace_back(std::move(pattern), force); + result.emplace_back(std::move(pattern), force, skip); } } } @@ -463,7 +483,9 @@ DefaultWisdomSettings DefaultWisdomSettings::from_env() { } } - if ((value = getenv("KERNEL_LAUNCHER_DIR")) != nullptr) { + // capture directory + if ((value = getenv("KERNEL_LAUNCHER_CAPTURE_DIR")) != nullptr + || (value = getenv("KERNEL_LAUNCHER_DIR")) != nullptr) { capture_dir = value; } @@ -491,7 +513,7 @@ Config DefaultWisdomSettings::load_config( const ConfigSpace& space, ProblemSize problem_size, CudaDevice device, - bool* should_capture_out) const { + int* capture_skip_launches_out) const { WisdomResult result = WisdomResult::Ok; Config config = load_best_config( wisdom_dirs_, @@ -502,9 +524,18 @@ Config DefaultWisdomSettings::load_config( problem_size, &result); - if (should_capture_out != nullptr) { - *should_capture_out = - this->should_capture_kernel(tuning_key, problem_size, result); + if (capture_skip_launches_out != nullptr) { + int skip; + + if (!this->should_capture_kernel( + tuning_key, + problem_size, + result, + skip)) { + skip = -1; + } + + *capture_skip_launches_out = skip; } return config; @@ -530,14 +561,17 @@ void DefaultWisdomSettings::capture_kernel( bool DefaultWisdomSettings::should_capture_kernel( const std::string& tuning_key, ProblemSize problem_size, - WisdomResult result) const { + WisdomResult result, + int& skip_launches_out) const { bool matches = false; bool forced = false; + int skip = std::numeric_limits::max(); for (const auto& rule : capture_rules_) { if (string_match(rule.pattern.c_str(), tuning_key.c_str())) { matches = true; forced |= rule.force; + skip = std::min(skip, rule.skip_launches); break; } } @@ -547,7 +581,7 @@ bool DefaultWisdomSettings::should_capture_kernel( return false; } - // If wisdom was found for this kernel and we do not force tuning, + // If wisdom was found for this kernel, and we do not force tuning, // then there is no need to capture this kernel. if (result == WisdomResult::Ok && !forced) { return false; @@ -557,6 +591,7 @@ bool DefaultWisdomSettings::should_capture_kernel( return false; } + skip_launches_out = skip; return true; } From 799cbf629777b231a737abeac599885442ee5a8b Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 15:05:09 +0200 Subject: [PATCH 52/63] Fix bug in `write_file` where it did not handle `overwrite=false` properly --- include/kernel_launcher/fs.h | 19 +++++++++- src/fs.cpp | 31 ++++++++++----- tests/fs.cpp | 73 ++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 11 deletions(-) diff --git a/include/kernel_launcher/fs.h b/include/kernel_launcher/fs.h index 8219096..0964053 100644 --- a/include/kernel_launcher/fs.h +++ b/include/kernel_launcher/fs.h @@ -19,18 +19,33 @@ bool write_file( size_t nbytes, bool overwrite = false); +inline bool write_file( + const std::string& path, + const std::vector& content, + bool overwrite = false) { + return write_file(path, content.data(), content.size(), overwrite); +} + inline bool write_file( const std::string& path, const std::vector& content, bool overwrite = false) { - return write_file(path, (char*)content.data(), content.size(), overwrite); + return write_file( + path, + (const char*)content.data(), + content.size(), + overwrite); } inline bool write_file( const std::string& path, const std::vector& content, bool overwrite = false) { - return write_file(path, (char*)content.data(), content.size(), overwrite); + return write_file( + path, + (const char*)content.data(), + content.size(), + overwrite); } inline bool write_file( diff --git a/src/fs.cpp b/src/fs.cpp index a3b7fe4..0d55dec 100644 --- a/src/fs.cpp +++ b/src/fs.cpp @@ -82,22 +82,35 @@ bool write_file( const char* content, size_t nbytes, bool overwrite) { - std::ofstream stream(path, std::ios::ate); + std::fstream stream; - if (stream) { - if (stream.tellp() < 0 || (stream.tellp() > 0 && !overwrite)) { + // Check if the file already exists. Note that there exists a race condition here where it is possible that the file + // is created between the check and opening for writing. However, there is no portable way to perform this check + // atomically until `std::ios::noreplace` is stable. + if (!overwrite) { + stream.open(path, std::ios::in); + bool exists = (bool)stream; + stream.close(); + + if (exists) { return false; } + } - stream.write(content, std::streamsize(nbytes)); + // Open file for writing. + stream.open(path, std::ios::out | std::ios::binary); + if (!stream) { + return false; + } - // Check if the stream is still valid after writing - if (stream) { - return true; - } + // Write data + stream.write(content, static_cast(nbytes)); + if (!stream) { + return false; } - return false; + stream.close(); + return true; } static void add_env_directories(std::vector& result) { diff --git a/tests/fs.cpp b/tests/fs.cpp index 8edf261..16fb308 100644 --- a/tests/fs.cpp +++ b/tests/fs.cpp @@ -1,6 +1,9 @@ #include "kernel_launcher/fs.h" +#include + #include "catch.hpp" +#include "test_utils.h" using namespace kernel_launcher; @@ -37,4 +40,74 @@ TEST_CASE("test fs") { CHECK(path_join("/a/", "/") == "/"); CHECK(path_join("//", "/b") == "/b"); } +} + +TEST_CASE("test read_file/write_file") { + std::string filename = assets_directory() + "/temporary_file.txt"; + + // delete file if, for example, previous test crashed + std::filesystem::remove(filename); + + SECTION("write/read simple") { + // write 3 bytes + std::vector expected = {1, 2, 3}; + CHECK(write_file(filename, expected)); + + // check if file exists + CHECK(std::filesystem::exists(filename)); + + // read data into buffer + std::vector data; + CHECK(read_file(filename, data)); + CHECK(data == expected); + } + + SECTION("write/read empty") { + // write 0 bytes + std::vector expected = {}; + CHECK(write_file(filename, expected)); + + // check if file exists + CHECK(std::filesystem::exists(filename)); + + // read empty data into buffer + std::vector data; + CHECK(read_file(filename, data) == true); + CHECK(data == expected); + } + + SECTION("read non-existing file") { + std::vector data; + CHECK(read_file(filename, data) == false); + CHECK(data.empty()); + } + + SECTION("write existing file") { + // write 3 bytes + std::vector expected = {1, 2, 3}; + CHECK(write_file(filename, expected)); + + // check if file exists + CHECK(std::filesystem::exists(filename)); + + expected = {4, 5}; + + SECTION("overwrite=false") { + // this should fail, we cannot overwrite it + CHECK(write_file(filename, expected, false) == false); + } + + SECTION("overwrite=true") { + // this should succeed since we set the `overwrite` flag + CHECK(write_file(filename, expected, true)); + + // check if the data was overwritten + std::vector data; + CHECK(read_file(filename, data)); + CHECK(expected == data); + } + } + + // cleanup + std::filesystem::remove(filename); } \ No newline at end of file From 73f72b056a95aacede58e84a43cefc52ed39bbba Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 15:26:27 +0200 Subject: [PATCH 53/63] Add test for exporting `NULL` as kernel argument --- src/arg.cpp | 32 +++- tests/assets/vector_add_key_0.json | 237 +++++++++++++++++++++++++++++ tests/export.cpp | 23 ++- 3 files changed, 283 insertions(+), 9 deletions(-) create mode 100644 tests/assets/vector_add_key_0.json diff --git a/src/arg.cpp b/src/arg.cpp index bb21242..dad794b 100644 --- a/src/arg.cpp +++ b/src/arg.cpp @@ -100,8 +100,8 @@ KernelArg KernelArg::to_array(size_t nelements) const { if (is_array()) { if (nelements > data_.array.nelements) { throw std::runtime_error( - "array of type " + type_.remove_pointer().name() - + " cannot be be resized to " + std::to_string(nelements) + "array of type `" + type_.remove_pointer().name() + + "` cannot be be resized to " + std::to_string(nelements) + " elements, it only has " + std::to_string(data_.array.nelements) + " elements"); } @@ -110,8 +110,8 @@ KernelArg KernelArg::to_array(size_t nelements) const { } else { if (!type_.is_pointer()) { throw std::runtime_error( - "argument is not a pointer type and cannot be converted into an array: " - + type_.name()); + "argument of type `" + type_.name() + "` is not a pointer type " + "and thus cannot be converted into an array"); } return {type_, *(void**)as_void_ptr(), nelements}; @@ -123,8 +123,8 @@ Value KernelArg::to_value() const { if (v.is_empty()) { throw std::runtime_error( - "cannot convert value of type \"" + type_.name() - + "\" instance of kernel_launcher::Value"); + "cannot convert value of type `" + type_.name() + + "` instance of kernel_launcher::Value"); } return v; @@ -163,6 +163,22 @@ TypeInfo KernelArg::type() const { } std::vector KernelArg::to_bytes() const { + // If this type is pointer, we check if it is a NULL pointer. It makes no + // sense to export non-NULL pointer since the address will be invalid + // when reading the exported pointer. + if (type_.is_pointer()) { + void* ptr; + ::memcpy(&ptr, as_void_ptr(), sizeof(void*)); + + if (ptr != nullptr) { + std::string msg = "a raw pointer of type `" + type_.name() + "` " + "was provided as kernel argument which cannot be exported " + "since the corresponding buffer size is unknown"; + + throw std::runtime_error(msg); + } + } + size_t nbytes = type_.size(); std::vector result(nbytes); ::memcpy(result.data(), as_void_ptr(), nbytes); @@ -187,11 +203,11 @@ std::vector KernelArg::copy_array() const { std::string msg; if (type_.is_pointer()) { - msg = "a raw pointer of type " + type_.name() + " was provided as " + msg = "a raw pointer of type `" + type_.name() + "` was provided as " "kernel argument which cannot be exported since the " "corresponding buffer size is unknown"; } else { - msg = "a scalar of type " + type_.name() + " was provided as " + msg = "a scalar of type `" + type_.name() + "` was provided as " "kernel argument which cannot be exported since it is not " "an array"; } diff --git a/tests/assets/vector_add_key_0.json b/tests/assets/vector_add_key_0.json new file mode 100644 index 0000000..42d1e01 --- /dev/null +++ b/tests/assets/vector_add_key_0.json @@ -0,0 +1,237 @@ +{ + "key": "vector_add_key", + "environment": { + "host_name": "node026", + "date": "2023-04-19T15:25:10+0200", + "runtime_version": 11070, + "driver_version": 11070, + "nvrtc_version": 11070 + }, + "config_space": { + "parameters": [ + { + "name": "threads_per_block", + "values": [ + 1, + 32, + 128, + 256 + ], + "priors": [ + 1.0, + 1.0, + 1.0, + 1.0 + ], + "default": 256 + }, + { + "name": "elements_per_thread", + "values": [ + 1, + 2, + 4 + ], + "priors": [ + 1.0, + 1.0, + 1.0 + ], + "default": 1 + } + ], + "restrictions": [ + { + "operator": "&&", + "operands": [ + { + "operator": ">=", + "operands": [ + { + "operator": "*", + "operands": [ + { + "operator": "parameter", + "operands": [ + "elements_per_thread" + ] + }, + { + "operator": "parameter", + "operands": [ + "threads_per_block" + ] + } + ] + }, + 32 + ] + }, + { + "operator": "<=", + "operands": [ + { + "operator": "*", + "operands": [ + { + "operator": "parameter", + "operands": [ + "elements_per_thread" + ] + }, + { + "operator": "parameter", + "operands": [ + "threads_per_block" + ] + } + ] + }, + 1024 + ] + } + ] + } + ] + }, + "kernel": { + "source": "\n template \n __global__\n void vector_add(int n, int *c, const int* a, const int* b) {\n for (int k = 0; k < ELEMENTS_PER_THREAD; k++) {\n int index = (blockIdx.x * ELEMENTS_PER_THREAD + k) * blockDim.x + threadIdx.x;\n\n if (index < n) {\n c[index] = a[index] + b[index];\n }\n }\n }\n ", + "name": "vector_add", + "compile_flags": [], + "shared_memory": 0, + "template_args": [ + "int" + ], + "headers": [], + "defines": { + "ELEMENTS_PER_THREAD": { + "operator": "parameter", + "operands": [ + "elements_per_thread" + ] + } + }, + "block_size": [ + { + "operator": "parameter", + "operands": [ + "threads_per_block" + ] + }, + 1, + 1 + ], + "grid_size": [ + { + "operator": "+", + "operands": [ + { + "operator": "/", + "operands": [ + 0, + { + "operator": "*", + "operands": [ + { + "operator": "parameter", + "operands": [ + "elements_per_thread" + ] + }, + { + "operator": "parameter", + "operands": [ + "threads_per_block" + ] + } + ] + } + ] + }, + { + "operator": "!=", + "operands": [ + { + "operator": "%", + "operands": [ + 0, + { + "operator": "*", + "operands": [ + { + "operator": "parameter", + "operands": [ + "elements_per_thread" + ] + }, + { + "operator": "parameter", + "operands": [ + "threads_per_block" + ] + } + ] + } + ] + }, + 0 + ] + } + ] + }, + 1, + 1 + ] + }, + "arguments": [ + { + "type": "int", + "kind": "scalar", + "data": [ + 0, + 0, + 0, + 0 + ] + }, + { + "type": "decltype(nullptr)", + "kind": "scalar", + "data": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }, + { + "type": "float const*", + "kind": "scalar", + "data": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }, + { + "type": "float const*", + "kind": "array", + "hash": "da39a3ee5e6b4b0d3255bfef95601890afd80709", + "file": "vector_add_key_i1j63muv.bin" + } + ], + "problem_size": [ + 0, + 1, + 1 + ] +} \ No newline at end of file diff --git a/tests/export.cpp b/tests/export.cpp index 93fe6e4..3454529 100644 --- a/tests/export.cpp +++ b/tests/export.cpp @@ -99,7 +99,28 @@ TEST_CASE("test export_tuning_file", "[CUDA]") { compare_exports("vector_add_key_1024", tmp_dir, assets_dir); } - SECTION("matmul") { + SECTION("vector_add n=0") { + KernelBuilder builder = build_vector_add_kernel(); + size_t n = 0; + + std::vector arguments = { + KernelArg::from_scalar(int(n)), + KernelArg::from_scalar(nullptr), + KernelArg::from_scalar((const float*)nullptr), + KernelArg::from_array((const float*)nullptr, 0)}; + + export_capture_file( + tmp_dir, + "vector_add_key", + builder, + {uint32_t(n)}, + arguments, + {{}, {}, {}}); + + compare_exports("vector_add_key_0", tmp_dir, assets_dir); + } + + SECTION("matmul n=1024") { KernelBuilder builder = build_matmul_kernel(); size_t n = 1024; std::vector a(n * n); From a41654d8897fbd385d1417fd211511e1e88334da Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 2 May 2023 20:01:12 +0200 Subject: [PATCH 54/63] Fix syntax error in `export.cpp` --- src/export.cpp | 51 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/export.cpp b/src/export.cpp index d461a43..2361307 100644 --- a/src/export.cpp +++ b/src/export.cpp @@ -186,23 +186,41 @@ struct KernelBuilderSerializerHack { static json builder_to_json(const KernelBuilder& builder, const Eval& eval) { + json result; + const std::string* content = builder.kernel_source_.content(); + if (content != nullptr) { + result["source"] = *content; + } else { + result["file"] = builder.kernel_source_.file_name(); + } + + result["name"] = builder.kernel_name_; + result["compile_flags"] = + expr_list_to_json(builder.compile_flags_, eval); + result["shared_memory"] = // + expr_to_json(builder.shared_mem_, eval); + result["template_args"] = + expr_list_to_json(builder.template_args_, eval); + std::vector headers; for (const auto& source : builder.preheaders_) { - json content = nullptr; + json header_content = nullptr; if (source.content() != nullptr) { - content = *source.content(); + header_content = *source.content(); } headers.push_back({ {"file", source.file_name()}, - {"content", std::move(content)}, + {"content", std::move(header_content)}, }); } + result["headers"] = std::move(headers); json defines = json::object(); for (const auto& p : builder.defines_) { defines[p.first] = expr_to_json(p.second, eval); } + result["defines"] = std::move(defines); std::array block_size = { builder.determine_block_size(0), @@ -214,36 +232,9 @@ struct KernelBuilderSerializerHack { builder.determine_grid_size(1), builder.determine_grid_size(2)}; - json result; - const std::string* content = builder.kernel_source_.content(); - if (content != nullptr) { - result["source"] = *content; - } else { - result["file"] = builder.kernel_source_.file_name(); - } - - result["name"] = builder.kernel_name_; - result["compile_flags"] = - expr_list_to_json(builder.compile_flags_, eval); - result["shared_memory"] = expr_to_json(builder.shared_mem_, eval); - result["template_args"] = - expr_list_to_json(builder.template_args_, eval); - result["headers"] = std::move(headers); - result["defines"] = std::move(defines); - result["block_size"] = expr_list_to_json(block_size, eval); result["grid_size"] = expr_list_to_json(grid_size, eval); - result["block_size"] = expr_list_to_json(std::array { - builder.determine_block_size(0), - builder.determine_block_size(1), - builder.determine_block_size(2)}); - - result["grid_size"] = expr_list_to_json(std::array { - builder.determine_grid_size(0), - builder.determine_grid_size(1), - builder.determine_grid_size(2)}); - return result; } }; From 5b70ceac265db78634d4d3082001a8d9a6d12e14 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 2 May 2023 20:01:35 +0200 Subject: [PATCH 55/63] Fix test for `KernelArg::to_bytes` --- tests/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/arg.cpp b/tests/arg.cpp index a4ab7cf..4b47557 100644 --- a/tests/arg.cpp +++ b/tests/arg.cpp @@ -161,7 +161,7 @@ TEST_CASE("test KernelArg::copy_array", "[CUDA]") { ::memcpy(ptr_bytes.data(), (uint8_t*)&array_ptr, sizeof(int*)); KernelArg v = KernelArg::from_array(array.data(), array.size()); - CHECK(v.to_bytes() == ptr_bytes); + CHECK_THROWS(v.to_bytes()); // pointers cannot be exported CHECK( v.copy_array() == std::vector {1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}); From 17e94499444fe0550b03546c6f0f83e6723e5be6 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 2 May 2023 20:28:08 +0200 Subject: [PATCH 56/63] Fix `build_pragma_kernel` only recognizing `#pragma kernel` and not the synonyms --- include/kernel_launcher/internal/tokens.h | 7 ------- src/internal/directives.cpp | 2 +- src/internal/parser.cpp | 24 ++++++++++++++++++----- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/include/kernel_launcher/internal/tokens.h b/include/kernel_launcher/internal/tokens.h index 5cd9929..207efb0 100644 --- a/include/kernel_launcher/internal/tokens.h +++ b/include/kernel_launcher/internal/tokens.h @@ -106,13 +106,6 @@ struct TokenStream { throw_expecting_token(t, str); } - template - [[noreturn]] void - throw_expecting_token(Token t, const std::array& patterns) const { - static_assert(N > 0, "number of patterns cannot be zero"); - throw_expecting_token(t, patterns[0]); - } - template Token consume(const T& pattern) { Token t = next(); diff --git a/src/internal/directives.cpp b/src/internal/directives.cpp index f9550e5..7b3a0df 100644 --- a/src/internal/directives.cpp +++ b/src/internal/directives.cpp @@ -449,7 +449,7 @@ process_directive(TokenStream& stream, KernelBuilder& builder, Context& ctx) { } else { stream.throw_unexpected_token( t, - "this is not a supported action in kernel_launcher"); + "this directive is not supported by kernel_launcher"); } } } diff --git a/src/internal/parser.cpp b/src/internal/parser.cpp index d16920d..f1706c0 100644 --- a/src/internal/parser.cpp +++ b/src/internal/parser.cpp @@ -69,24 +69,38 @@ static std::vector parse_kernel_params(TokenStream& stream) { static bool extract_kernel_tuner_directives( TokenStream& stream, std::vector& directives_out) { - static constexpr const std::array PRAGMA_NAMES = { + static constexpr const std::array PRAGMA_NAMES = { "kernel", + "kernel_launcher", "kernel_tuner"}; // Check if directive starts with correct pragma. If not, this is - // not a relevant pragma and we do not need to parse it. + // not a relevant pragma, and we do not need to parse it. Token t = stream.peek(); - bool is_relevant = stream.next_if("pragma") && stream.next_if(PRAGMA_NAMES); + if (!stream.next_if("pragma")) { + stream.seek(t); + return false; + } + + std::string pragma_name = stream.span(stream.consume(TokenKind::Ident)); stream.seek(t); - if (!is_relevant) { + // Check if `pragma_name` is one of `PRAGMA_NAMES` + bool is_valid_name = false; + for (const auto& valid_name : PRAGMA_NAMES) { + if (pragma_name == valid_name) { + is_valid_name = true; + } + } + + if (!is_valid_name) { return false; } // Parse all pragmas do { stream.consume("pragma"); - stream.consume(PRAGMA_NAMES); + stream.consume(pragma_name); t = stream.next(); directives_out.push_back(t); From c9653770dc820b0433a4ff28ed4b1e005931b269 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 2 May 2023 20:28:46 +0200 Subject: [PATCH 57/63] Add `pragma.h` to `kernel_launcher.h` --- include/kernel_launcher.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/kernel_launcher.h b/include/kernel_launcher.h index 70c4f14..dbfc4ec 100644 --- a/include/kernel_launcher.h +++ b/include/kernel_launcher.h @@ -1,12 +1,15 @@ #pragma once #include "kernel_launcher/arg.h" +#include "kernel_launcher/builder.h" #include "kernel_launcher/compiler.h" #include "kernel_launcher/config.h" #include "kernel_launcher/cuda.h" +#include "kernel_launcher/export.h" #include "kernel_launcher/expr.h" #include "kernel_launcher/fs.h" #include "kernel_launcher/kernel.h" +#include "kernel_launcher/pragma.h" #include "kernel_launcher/registry.h" #include "kernel_launcher/utils.h" #include "kernel_launcher/value.h" From ed0c89b833b65317612215f65950be7c3ecfc2f2 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 2 May 2023 20:29:10 +0200 Subject: [PATCH 58/63] Add documentation in `wisdom.h` --- include/kernel_launcher/wisdom.h | 91 ++++++++++++++++++++++++-------- src/wisdom.cpp | 1 - 2 files changed, 69 insertions(+), 23 deletions(-) diff --git a/include/kernel_launcher/wisdom.h b/include/kernel_launcher/wisdom.h index 62005ea..7b23a4f 100644 --- a/include/kernel_launcher/wisdom.h +++ b/include/kernel_launcher/wisdom.h @@ -15,8 +15,9 @@ namespace kernel_launcher { struct WisdomRecordImpl; /** - * Represents a record read from a wisdom file. Use methods such as - * ``problem_size()`` and ``device_name()`` to retrieve fields of this record. + * Use by ``process_wisdom_file``. Represents a record read from a wisdom + * file. Use methods such as ``problem_size()`` and ``device_name()`` to + * retrieve fields of this record. */ struct WisdomRecord { WisdomRecord(const WisdomRecordImpl& impl) : impl_(impl) {} @@ -105,6 +106,13 @@ inline Config load_best_config( result); } +/** +* The interface that describes how to load a configuration and how to + * capture a kernel launch for a ``WisdomKernel``. + * + * If you want to implement your own ``IWisdomSettings``, it is best to + * extend ``DefaultWisdomSettings`` and override the necessary methods. + */ struct IWisdomSettings { virtual ~IWisdomSettings() = default; @@ -124,6 +132,9 @@ struct IWisdomSettings { const std::vector>& output_arrays) const = 0; }; +/** + * A rule that describes which kernels should be captured. + */ struct CaptureRule { CaptureRule( std::string pattern, @@ -132,6 +143,7 @@ struct CaptureRule { pattern(std::move(pattern)), force(force), skip_launches(skip_launches) {} + CaptureRule(const char* pattern) : CaptureRule(std::string(pattern)) {} std::string pattern; @@ -139,6 +151,10 @@ struct CaptureRule { int skip_launches = 0; }; +/** + * The interface that describes how to load a configuration and how to + * capture a kernel launch for a ``WisdomKernel``. + */ struct DefaultWisdomSettings: IWisdomSettings { static DefaultWisdomSettings from_env(); @@ -150,6 +166,17 @@ struct DefaultWisdomSettings: IWisdomSettings { ~DefaultWisdomSettings() override = default; + /** + * Loads a configuration for a given kernel instance. + * + * @param tuning_key The tuning key of the kernel instance. + * @param space The configuration space of the kernel instance. + * @param problem_size The current problem size. + * @param device The current device. + * @param capture_skip_out Optional, indicates if the kernel should be + * captured. If negative, the kernel will not be captured. Otherwise, + * the kernel will be captured after the `capture_skip_out` kernel launches. + */ Config load_config( const std::string& tuning_key, const ConfigSpace& space, @@ -157,6 +184,16 @@ struct DefaultWisdomSettings: IWisdomSettings { CudaDevice device, int* capture_skip_launches_out) const override; + /** + * Called to export a captured kernel launch to a file. + * + * @param tuning_key The tuning key of the kernel instance. + * @param builder The builder of the kernel instance. + * @param problem_size The current problem size. + * @param arguments The kernel arguments. + * @param input_arrays The input arrays associated with the arguments. + * @param output_arrays The output arrays associated with the arguments. + */ void capture_kernel( const std::string& tuning_key, const KernelBuilder& builder, @@ -165,31 +202,41 @@ struct DefaultWisdomSettings: IWisdomSettings { const std::vector>& input_arrays, const std::vector>& output_arrays) const override; + /** + * Returns ``true`` if the given kernel instance should be captured in the + * future. This method is called after ``load_config`` loads a + * configuration for a kernel. + * + * @param tuning_key The tuning key of the kernel instance. + * @param problem_size The problem size of the kernel instance. + * @param result The result from ``load_best_config``. + * @param capture_skip_launches_out Out parameter. If set to `n`, the + * first `n` kernel launches will be skipped and the `n+1`-th kernel launch + * will be captured. + */ virtual bool should_capture_kernel( const std::string& tuning_key, ProblemSize problem_size, WisdomResult result, int& capture_skip_launches_out) const; - int should_capture_kernel( - const std::string& tuning_key, - ProblemSize problem_size, - int& capture_skip_launches_out) const { - return should_capture_kernel( - tuning_key, - problem_size, - WisdomResult::NotFound, - capture_skip_launches_out); - } - + /** + * Returns the directories that will be searched to find wisdom files. + */ const std::vector& wisdom_directories() const { return wisdom_dirs_; } + /** + * Returns the directory where kernel captures will be stored. + */ const std::string& capture_directory() const { return capture_dir_; } + /** + * Returns the active capture rules. + */ const std::vector& capture_rules() const { return capture_rules_; } @@ -221,8 +268,8 @@ struct WisdomSettings { /** * Load the configuration for the given parameters. * - * @param tuning_key The tuning key of the kernel. - * @param space The configuration space of the kernel. + * @param tuning_key The tuning key of the kernel instance. + * @param space The configuration space of the kernel instance. * @param problem_size The current problem size. * @param device The current device. * @param capture_skip_out Optional, indicates if the kernel should be @@ -244,14 +291,14 @@ struct WisdomSettings { } /** + * Called to export a captured kernel launch to a file. * - * - * @param tuning_key - * @param builder - * @param problem_size - * @param param_types - * @param inputs - * @param outputs + * @param tuning_key The tuning key of the kernel instance. + * @param builder The builder of the kernel instance. + * @param problem_size The current problem size. + * @param arguments The kernel arguments. + * @param input_arrays The input arrays associated with the arguments. + * @param output_arrays The output arrays associated with the arguments. */ void capture_kernel( const std::string& tuning_key, diff --git a/src/wisdom.cpp b/src/wisdom.cpp index 5c101ec..d80eb85 100644 --- a/src/wisdom.cpp +++ b/src/wisdom.cpp @@ -572,7 +572,6 @@ bool DefaultWisdomSettings::should_capture_kernel( matches = true; forced |= rule.force; skip = std::min(skip, rule.skip_launches); - break; } } From 9bf667f5683ea6099deebd7132a88a3c7e123a9e Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 2 May 2023 20:51:45 +0200 Subject: [PATCH 59/63] Update README with new example of the pragma-based API --- README.md | 85 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8f7f3df..0ccb201 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,68 @@ Recommended installation is using CMake. See the [installation guide](https://ke ## Example -See the documentation for [examples](https://kerneltuner.github.io/kernel_launcher/example.html) or check out the [examples](https://github.com/KernelTuner/kernel_launcher/tree/master/examples) directory. +There are many ways of using Kernel Launcher. See the documentation for [examples](https://kerneltuner.github.io/kernel_launcher/example.html) or check out the [examples](https://github.com/KernelTuner/kernel_launcher/tree/master/examples) directory. + +### Pragma-based API +Below shows an example of using the pragma-based API, which allows existing CUDA kernels to be annotated with Kernel-Launcher-specific directives. + +**kernel.cu** +```cpp +#pragma kernel tune(threads_per_block=32, 64, 128, 256, 512, 1024) +#pragma kernel block_size(threads_per_block) +#pragma kernel problem_size(n) +#pragma kernel buffers(A[n], B[n], C[n]) +template +__global__ void vector_add(int n, T *C, const T *A, const T *B) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + C[i] = A[i] + B[i]; + } +} +``` + +**main.cpp** +```cpp +#include "kernel_launcher.h" + +int main() { + // Initialize CUDA memory. This is outside the scope of kernel_launcher. + unsigned int n = 1000000; + float *dev_A, *dev_B, *dev_C; + /* cudaMalloc, cudaMemcpy, ... */ + + // Namespace alias. + namespace kl = kernel_launcher; + + // Launch the kernel! Again, the grid size and block size do not need to + // be specified, they are calculated from the kernel specifications and + // run-time arguments. + kl::launch( + kl::PragmaKernel("kernel.cu", "vector_add", {"float"}), + n, dev_C, dev_A, dev_B + ); +} + +``` + + +### Builder-based API +Below shows an example of the `KernelBuilder`-based API. +This offers more flexiblity than the pragma-based API, but is also more verbose: + +**kernel.cu** +```cpp +template +__global__ void vector_add(int n, T *C, const T *A, const T *B) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + C[i] = A[i] + B[i]; + } +} +``` + +**main.cpp** ```cpp #include "kernel_launcher.h" @@ -33,22 +93,19 @@ int main() { namespace kl = kernel_launcher; // Create a kernel builder - kl::KernelBuilder builder("vector_add", "vector_add_kernel.cu"); + auto builder = kl::KernelBuilder("vector_add", "vector_add_kernel.cu"); // Define the variables that can be tuned for this kernel. auto threads_per_block = builder.tune("block_size", {32, 64, 128, 256, 512, 1024}); - auto elements_per_thread = builder.tune("elements_per_thread", {1, 2, 4, 8}); // Set kernel properties such as block size, grid divisor, template arguments, etc. builder - .problem_size(kl::arg0) - .block_size(threads_per_block) - .grid_divisors(threads_per_block * elements_per_thread) .template_args(kl::type_of()) - .define("ELEMENTS_PER_THREAD", elements_per_thread); + .problem_size(kl::arg0) + .block_size(threads_per_block); // Define the kernel - kl::WisdomKernel vector_add_kernel(builder); + auto vector_add_kernel = kl::WisdomKernel(builder); // Initialize CUDA memory. This is outside the scope of kernel_launcher. unsigned int n = 1000000; @@ -60,16 +117,24 @@ int main() { // derived from the kernel specifications and run-time arguments. vector_add_kernel(n, dev_C, dev_A, dev_B); } - ``` + + ## License Licensed under Apache 2.0. See [LICENSE](https://github.com/KernelTuner/kernel_launcher/blob/master/LICENSE). + ## Citation -``` +If you use Kernel Launcher in your work, please cite the following publication: + +> S. Heldens, B. van Werkhoven (2023), "Kernel Launcher: C++ Library for Optimal-Performance Portable CUDA Applications", The Eighteenth International Workshop on Automatic Performance Tuning (iWAPT2023) co-located with IPDPS 2023 + +As BibTeX: + +```Latex @article{heldens2023kernellauncher, title={Kernel Launcher: C++ Library for Optimal-Performance Portable CUDA Applications}, author={Heldens, Stijn and van Werkhoven, Ben}, From d11c0405ca8438538e72e6859f9a030383d67fd3 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 4 May 2023 09:24:59 +0200 Subject: [PATCH 60/63] Swap order of argumentsfor `buid_pragma_kernel` to be consistent with `KernelBuilder` --- README.md | 2 +- examples/vector_add_annotated/main.cu | 2 +- include/kernel_launcher/pragma.h | 8 ++++---- src/pragma.cpp | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0ccb201..9501e52 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ int main() { // be specified, they are calculated from the kernel specifications and // run-time arguments. kl::launch( - kl::PragmaKernel("kernel.cu", "vector_add", {"float"}), + kl::PragmaKernel("vector_add", "kernel.cu", {"float"}), n, dev_C, dev_A, dev_B ); } diff --git a/examples/vector_add_annotated/main.cu b/examples/vector_add_annotated/main.cu index 4951682..e90b581 100644 --- a/examples/vector_add_annotated/main.cu +++ b/examples/vector_add_annotated/main.cu @@ -60,7 +60,7 @@ int main(int argc, char* argv[]) { // Call kernel kl::launch( - kl::PragmaKernel("kernel_annotated.cu", "vector_add", {"float"}), + kl::PragmaKernel("vector_add", "kernel_annotated.cu", {"float"}), n, C_dev, (const float*)A_dev, diff --git a/include/kernel_launcher/pragma.h b/include/kernel_launcher/pragma.h index 92e8372..eeb3f6e 100644 --- a/include/kernel_launcher/pragma.h +++ b/include/kernel_launcher/pragma.h @@ -10,15 +10,15 @@ namespace kernel_launcher { * with the given `kernel_name`, extract the KernelLauncher-specific pragmas * for that kernel, and returns `KernelBuilder`. * - * @param source The source code. Can be either a filename (like `"kernel.cu"`) - * or a filename+content pair (like `{"kernel.cu", "#include ..."}`). * @param kernel_name The name of the kernel in the source code. It may contain * namespaces such as `mypackage::kernels::vector_add`. + * @param source The source code. Can be either a filename (like `"kernel.cu"`) + * or a filename+content pair (like `{"kernel.cu", "#include ..."}`). * @param template_args Optional; template arguments passed to the kernel. */ KernelBuilder build_pragma_kernel( - const KernelSource& source, const std::string& kernel_name, + const KernelSource& source, const std::vector& template_args = {}, const FileLoader& fs = DefaultLoader {}); @@ -36,8 +36,8 @@ struct PragmaKernel: IKernelDescriptor { * @param template_args Optional; template arguments passed to the kernel. */ PragmaKernel( - std::string path, std::string kernel_name, + std::string path, std::vector template_args = {}); KernelBuilder build() const override; diff --git a/src/pragma.cpp b/src/pragma.cpp index a1c0dcf..2fff3b3 100644 --- a/src/pragma.cpp +++ b/src/pragma.cpp @@ -5,8 +5,8 @@ namespace kernel_launcher { KernelBuilder build_pragma_kernel( - const KernelSource& source, const std::string& kernel_name, + const KernelSource& source, const std::vector& template_args, const FileLoader& fs) { // Read file @@ -36,8 +36,8 @@ KernelBuilder build_pragma_kernel( } PragmaKernel::PragmaKernel( - std::string path, std::string kernel_name, + std::string path, std::vector template_args) : kernel_name_(std::move(kernel_name)), template_args_(std::move(template_args)) { @@ -53,8 +53,8 @@ PragmaKernel::PragmaKernel( KernelBuilder PragmaKernel::build() const { return build_pragma_kernel( - KernelSource(file_path_), kernel_name_, + KernelSource(file_path_), template_args_); } From 9bf7466e3296b79b50d603a252a1d055a2eb167c Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 4 May 2023 09:25:51 +0200 Subject: [PATCH 61/63] Fix small error in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9501e52..19139db 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ int main() { namespace kl = kernel_launcher; // Create a kernel builder - auto builder = kl::KernelBuilder("vector_add", "vector_add_kernel.cu"); + auto builder = kl::KernelBuilder("vector_add", "kernel.cu"); // Define the variables that can be tuned for this kernel. auto threads_per_block = builder.tune("block_size", {32, 64, 128, 256, 512, 1024}); From 6298d82b6be0e23e20917050246f1035d9182c7b Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 4 May 2023 09:29:39 +0200 Subject: [PATCH 62/63] Change example in READMe to use `ConfigSpace` object --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 19139db..c8681cb 100644 --- a/README.md +++ b/README.md @@ -92,13 +92,13 @@ int main() { // Namespace alias. namespace kl = kernel_launcher; - // Create a kernel builder - auto builder = kl::KernelBuilder("vector_add", "kernel.cu"); - // Define the variables that can be tuned for this kernel. - auto threads_per_block = builder.tune("block_size", {32, 64, 128, 256, 512, 1024}); + auto space = kl::ConfigSpace(); + auto threads_per_block = space.tune("block_size", {32, 64, 128, 256, 512, 1024}); - // Set kernel properties such as block size, grid divisor, template arguments, etc. + // Create a kernel builder and set kernel properties such as block size, + // grid divisor, template arguments, etc. + auto builder = kl::KernelBuilder("vector_add", "kernel.cu", space); builder .template_args(kl::type_of()) .problem_size(kl::arg0) From b33e35b88b9474d5b01bc74c3e09fb7cfdffd4c0 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 4 May 2023 09:41:53 +0200 Subject: [PATCH 63/63] Some text improvements in README --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c8681cb..4378a44 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,10 @@ -_Kernel Launcher_ is a C++ library that makes it easy to dynamically compile _CUDA_ kernels at run time (using [NVRTC](https://docs.nvidia.com/cuda/nvrtc/index.html)) and call them in an easy type-safe way using C++ magic. -Additionally, _Kernel Launcher_ supports exporting kernel specifications, to enable tuning by [Kernel Tuner](https://github.com/KernelTuner/kernel_tuner), and importing the tuning results, known as _wisdom_ files, back into the application. +_Kernel Launcher_ is a C++ library that enables dynamic compilation _CUDA_ kernels at run time (using [NVRTC](https://docs.nvidia.com/cuda/nvrtc/index.html)) and launching them in an easy type-safe way using C++ magic. +On top of that, Kernel Launcher supports _capturing_ kernel launches, to enable tuning by [Kernel Tuner](https://github.com/KernelTuner/kernel_tuner), and importing the tuning results, known as _wisdom_ files, back into the application. +The result: highly efficient GPU applications with maximum portability. +