Skip to content

Commit

Permalink
Support Already Converted Llava Tokenizers
Browse files Browse the repository at this point in the history
  • Loading branch information
apaniukov committed Dec 3, 2024
1 parent f79bccb commit 925747f
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
3 changes: 2 additions & 1 deletion — src/regex_normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ std::string reformat_replace_pattern(std::string replace_pattern) {

const std::map<std::string, std::string> search_pattern_rewrites = {
{R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))", R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))"},
{R"((^)(.))", R"((^)([\s\S]))"}
{R"((^)(.))", R"((^)([\s\S]))"},
{R"((^)(.+))", R"((^)([\s\S]))"}
};

/**
Expand Down
8 changes: 8 additions & 0 deletions — tests/layer_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled
replace_term=r"▁\2",
)
),
( # test backward compatibility with old regex
"\n",
"▁\n",
RegexNormalizationStep(
regex_search_pattern=r"(^)(.+)",
replace_term=r"▁$2",
)
),
]
)
def test_regex_normalization(test_string, expected, layer):
Expand Down

0 comments on commit 925747f

Please sign in to comment.