import re
import string

import hypothesis
import hypothesis.strategies
import pytest

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import get_lang_class

# Only include languages with no external dependencies
# "is" seems to confuse importlib, so we're also excluding it for now
# excluded: ja, ru, th, uk, vi, zh, is
LANGUAGES = [
    pytest.param("fr", marks=pytest.mark.slow()),
    pytest.param("af", marks=pytest.mark.slow()),
    pytest.param("ar", marks=pytest.mark.slow()),
    pytest.param("bg", marks=pytest.mark.slow()),
    "bn",
    pytest.param("ca", marks=pytest.mark.slow()),
    pytest.param("cs", marks=pytest.mark.slow()),
    pytest.param("da", marks=pytest.mark.slow()),
    pytest.param("de", marks=pytest.mark.slow()),
    "el",
    "en",
    pytest.param("es", marks=pytest.mark.slow()),
    pytest.param("et", marks=pytest.mark.slow()),
    pytest.param("fa", marks=pytest.mark.slow()),
    pytest.param("fi", marks=pytest.mark.slow()),
    "fr",
    pytest.param("ga", marks=pytest.mark.slow()),
    pytest.param("he", marks=pytest.mark.slow()),
    pytest.param("hi", marks=pytest.mark.slow()),
    pytest.param("hr", marks=pytest.mark.slow()),
    "hu",
    pytest.param("id", marks=pytest.mark.slow()),
    pytest.param("it", marks=pytest.mark.slow()),
    pytest.param("kn", marks=pytest.mark.slow()),
    pytest.param("lb", marks=pytest.mark.slow()),
    pytest.param("lt", marks=pytest.mark.slow()),
    pytest.param("lv", marks=pytest.mark.slow()),
    pytest.param("nb", marks=pytest.mark.slow()),
    pytest.param("nl", marks=pytest.mark.slow()),
    "pl",
    pytest.param("pt", marks=pytest.mark.slow()),
    pytest.param("ro", marks=pytest.mark.slow()),
    pytest.param("si", marks=pytest.mark.slow()),
    pytest.param("sk", marks=pytest.mark.slow()),
    pytest.param("sl", marks=pytest.mark.slow()),
    pytest.param("sq", marks=pytest.mark.slow()),
    pytest.param("sr", marks=pytest.mark.slow()),
    pytest.param("sv", marks=pytest.mark.slow()),
    pytest.param("ta", marks=pytest.mark.slow()),
    pytest.param("te", marks=pytest.mark.slow()),
    pytest.param("tl", marks=pytest.mark.slow()),
    pytest.param("tr", marks=pytest.mark.slow()),
    pytest.param("tt", marks=pytest.mark.slow()),
    pytest.param("ur", marks=pytest.mark.slow()),
]
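
# Note (editorial): most entries above are wrapped in pytest.param(..., marks=pytest.mark.slow()),
# so they are only exercised when slow tests are enabled in the test run (assumed to be handled by
# the surrounding conftest); the bare language codes run in the default, fast test suite.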


@pytest.mark.parametrize("lang", LANGUAGES)
def test_tokenizer_explain(lang):
    tokenizer = get_lang_class(lang)().tokenizer
    examples = pytest.importorskip(f"spacy.lang.{lang}.examples")
    for sentence in examples.sentences:
        tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
        debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
        assert tokens == debug_tokens
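
# For reference: Tokenizer.explain() returns (rule, substring) pairs describing which rule
# produced each token, which is why the comparison above only takes index 1 of each pair.
# A rough illustration (not asserted by this test; exact labels depend on the language's rules):
#
#     nlp = spacy.blank("en")
#     nlp.tokenizer.explain("(don't)")
#     # -> roughly [('PREFIX', '('), ('SPECIAL-1', 'do'), ('SPECIAL-2', "n't"), ('SUFFIX', ')')]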


def test_tokenizer_explain_special_matcher(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    infix_re = re.compile(r"[/]")
    rules = {"a.": [{"ORTH": "a."}]}
    tokenizer = Tokenizer(
        en_vocab,
        rules=rules,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens
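
# The rules dict above registers "a." as a tokenizer special case, while the infix pattern
# splits on "/" and the suffix pattern would otherwise split off a trailing ".". The
# assertion checks that explain() applies the special-case matcher the same way the
# tokenizer itself does, so both yield the identical token sequence for "a/a.".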


@hypothesis.strategies.composite
def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
    """
    Composite strategy for fuzzily generating a sentence with varying punctuation.
    draw (hypothesis.strategies.DrawFn): Protocol for the drawing function that allows fuzzily picking from
        hypothesis' strategies.
    max_n_words (int): Max. number of words in the generated sentence.
    RETURNS (str): Fuzzily generated sentence.
    """
    punctuation_and_space_regex = "|".join(
        [*[re.escape(p) for p in string.punctuation], r"\s"]
    )
    sentence = [
        [
            draw(hypothesis.strategies.text(min_size=1)),
            draw(hypothesis.strategies.from_regex(punctuation_and_space_regex)),
        ]
        for _ in range(
            draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words))
        )
    ]

    return " ".join([token for token_pair in sentence for token in token_pair])


@pytest.mark.xfail
@pytest.mark.parametrize("lang", LANGUAGES)
@hypothesis.given(sentence=sentence_strategy())
def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
    """
    Tests whether the output of tokenizer.explain() matches the tokenizer output. Input is generated by hypothesis.
    lang (str): Language to test.
    sentence (str): Fuzzily generated sentence to tokenize.
    """
    tokenizer: Tokenizer = spacy.blank(lang).tokenizer
    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
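
# Note: the fuzzy test above is marked xfail, i.e. explain() is not yet expected to match the
# tokenizer's output on every hypothesis-generated input; the assertion message includes the
# offending sentence to make divergences easy to reproduce.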