Skip to content

Commit

Permalink
optimizes string matching by allowing memcmp like functionality (even…
Browse files Browse the repository at this point in the history
… on utf8 sequences)

reference: #147
comparison: https://compiler-explorer.com/z/Tz3KhG
  • Loading branch information
Andersama committed Dec 28, 2020
1 parent 9a37e55 commit 09e933a
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 4 deletions.
49 changes: 45 additions & 4 deletions include/ctre/evaluation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,52 @@ template <typename CharT, typename Iterator, typename EndIterator> constexpr CTR
return false;
}

template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {

bool same = (compare_character(String, current, end) && ... && true);
#if __cpp_char8_t >= 201811
// Byte-wise comparison of an already UTF-8 encoded literal against the bytes a
// utf8_iterator is positioned on.
//
//  current - iterator to match at; its `ptr` / `end` members are read directly.
//  end     - either another iterator exposing `ptr`, or utf8_iterator::sentinel,
//            in which case the limit is taken from `current.end` instead.
//  buffer  - the N bytes of the encoded literal.
//  Idx...  - indices used to expand the per-byte comparison.
//
// Returns an iterator advanced by min(readable bytes, N) - also when the
// comparison fails - paired with true only if at least N bytes are readable
// and every compared byte is equal.
template <size_t N, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_utf8_string(Iterator current, [[maybe_unused]] const EndIterator end, char8_t (&buffer)[N], std::index_sequence<Idx...>) noexcept {
	// depends on utf8_iterator exposing its raw byte pointers
	size_t available = 0;
	if constexpr (std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>) {
		// a sentinel carries no position of its own
		available = current.end - current.ptr;
	} else {
		available = end.ptr - current.ptr;
	}

	// never step further than the bytes that are actually there
	const size_t advance = (available < N) ? available : N;

	// the length test comes first, so the bytes are only read when all N are readable;
	// equal bytes are counted and the total is compared with the expected number
	const bool matched = (available >= N) && (((current.ptr[Idx] == buffer[Idx]) + ... + 0) == sizeof...(Idx));

	return { Iterator{current.ptr + advance, current.end}, matched };
}
#endif

return {current, same};
// Matches the literal made of the code points `String...` starting at `current`.
//
//  current - position to match at.
//  end     - end iterator or sentinel of the input.
//  Idx...  - one index per element of `String...`, used by the indexed fast path.
//
// Returns the resulting position together with a flag telling whether the whole
// literal matched. The strategy is picked at compile time:
//
//  * utf8_iterator input (only when char8_t is available): the literal is encoded
//    to UTF-8 once and compared byte-wise by evaluate_match_utf8_string.
//  * iterators reported as random access by is_random_accessible, with an end of
//    the same type: one length check followed by indexed comparisons.
//  * everything else: one compare_character call per code point.
//
// In the first two strategies the returned position is advanced by
// min(remaining, literal length) even when the match fails.
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
#if __cpp_char8_t >= 201811
	if constexpr (std::is_same_v<::std::remove_const_t<Iterator>, utf8_iterator> && (std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>> || std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>)) {
		constexpr size_t str_length = (utf8_codepoint_length(String) + ... + 0ULL);
		// Encode String... into its UTF-8 representation.
		// The buffer is value-initialised so this function stays usable in constant
		// evaluation on compilers that do not yet accept uninitialised locals in
		// constexpr functions; every byte is overwritten by the encoder below.
		char8_t utf8_sequence[str_length]{};
		char8_t* ptr = utf8_sequence;
		((ptr = utf32_codepoint_to_utf8_codepoint(String, ptr)), ...);
		// run the comparison on raw bytes
		return evaluate_match_utf8_string(current, end, utf8_sequence, std::make_index_sequence<str_length>());
	} else
#endif
	if constexpr (is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>>) {
		// number of elements left in the input
		const size_t count = static_cast<size_t>(end - current);
		// only move the iterator a distance that is known to be in range
		const size_t bump = (count < sizeof...(String)) ? count : sizeof...(String);
		// the length test comes first, so elements are only indexed when enough remain;
		// equal positions are counted and the total is compared with the literal length
		return { current + bump, (count >= sizeof...(String)) && (((current[Idx] == String) + ... + 0) == sizeof...(String)) };
	} else {
		// generic path: compare one code point at a time, stopping at the first mismatch
		bool same = (compare_character(String, current, end) && ... && true);
		return { current, same };
	}
}

template <typename R, typename Iterator, typename EndIterator, auto... String, typename... Tail>
Expand Down
38 changes: 38 additions & 0 deletions include/ctre/utf8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,44 @@
#include <iterator>

namespace ctre {
// Writes the UTF-8 encoding of `code` to `ptr` and returns the position one past
// the last byte written.
//
//  code - value to encode; selects a 1, 2, 3 or 4 byte sequence by magnitude.
//  ptr  - destination; the caller must provide room for up to 4 bytes.
//
// Values of 0x200000 and above cannot be expressed in 4 bytes and are written as
// the single byte 0xff. Only the magnitude is inspected: no other validation
// (for example of the surrogate range) is performed.
constexpr char8_t* utf32_codepoint_to_utf8_codepoint(uint32_t code, char8_t *ptr) {
	if (code < 0x80) {
		// 0xxxxxxx
		*ptr = static_cast<char8_t>(code);
		return ptr + 1;
	}
	if (code < 0x800) {
		// 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx
		ptr[0] = static_cast<char8_t>(0xC0 | (code >> 6));
		ptr[1] = static_cast<char8_t>(0x80 | (code & 0x3f));
		return ptr + 2;
	}
	if (code < 0x10000) {
		// zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
		ptr[0] = static_cast<char8_t>(0xE0 | (code >> 12));
		ptr[1] = static_cast<char8_t>(0x80 | ((code >> 6) & 0x3f));
		ptr[2] = static_cast<char8_t>(0x80 | (code & 0x3f));
		return ptr + 3;
	}
	if (code < 0x200000) {
		// 000uuuuu zzzzyyyy yyxxxxxx -> 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
		ptr[0] = static_cast<char8_t>(0xF0 | (code >> 18));
		ptr[1] = static_cast<char8_t>(0x80 | ((code >> 12) & 0x3f));
		ptr[2] = static_cast<char8_t>(0x80 | ((code >> 6) & 0x3f));
		ptr[3] = static_cast<char8_t>(0x80 | (code & 0x3f));
		return ptr + 4;
	}
	// too large for any sequence: emit a byte that is never a valid start byte
	*ptr = static_cast<char8_t>(0xff);
	return ptr + 1;
}

// Number of bytes utf32_codepoint_to_utf8_codepoint writes for `code`:
// 1 below 0x80, 2 below 0x800, 3 below 0x10000, 4 below 0x200000, and 1 again
// for anything larger (the single-byte invalid marker).
constexpr uint32_t utf8_codepoint_length(uint32_t code) {
	// beyond the 4-byte range only the one-byte marker is produced
	if (code >= 0x200000) {
		return 1;
	}
	// start at one byte and add one for every size threshold the value reaches
	return 1u + (code >= 0x80 ? 1u : 0u) + (code >= 0x800 ? 1u : 0u) + (code >= 0x10000 ? 1u : 0u);
}

struct utf8_iterator {
using self_type = utf8_iterator;
Expand Down

0 comments on commit 09e933a

Please sign in to comment.