diff --git a/AUTHORS.md b/AUTHORS.md
index bcafe6e0..c8ea3051 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -24,4 +24,5 @@
 [Heng Yu](https://github.com/GNEHUY)
+[Tianyun Ji](https://github.com/KINGNEWBLUSH)
 
 The stared contributors are the corresponding authors.
diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py
index 45bd96a6..b5559b60 100644
--- a/EduNLP/SIF/tokenization/text/tokenization.py
+++ b/EduNLP/SIF/tokenization/text/tokenization.py
@@ -2,7 +2,14 @@
 # 2021/5/18 @ tongshiwei
 import logging
 import jieba
+from nltk.tokenize import word_tokenize
+import nltk
+import spacy
+import tokenizers as huggingface_tokenizer
+from tokenizers.trainers import BpeTrainer
 from .stopwords import DEFAULT_STOPWORDS
+from tokenizers import Tokenizer as HGTokenizer
+
 
 jieba.setLogLevel(logging.INFO)
 
@@ -15,7 +22,13 @@ def is_chinese(word):
     return True
 
 
-def tokenize(text, granularity="word", stopwords="default"):
+def tokenize(text,
+             granularity="word",
+             stopwords="default",
+             tokenizer="jieba",
+             tok_model="en_core_web_sm",
+             bpe_json='bpe.tokenizer.json',
+             bpe_trainfile=None):
     """
     Using jieba library to tokenize item by word or char.
 
@@ -37,17 +50,68 @@
     """
     stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords
     stopwords = stopwords if stopwords is not None else {}
-    if granularity == "word":
-        return [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
-    elif granularity == "char":
-        jieba_tokens = [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
-        # Use jieba_tokens to hangle sentence with mixed chinese and english.
-        split_tokens = []
-        for token in jieba_tokens:
-            if is_chinese(token):
-                split_tokens.extend(list(token))
-            else:
-                split_tokens.append(token)
-        return split_tokens
+
+    if (tokenizer == 'jieba'):
+        if granularity == "word":
+            return [
+                token for token in jieba.cut(text)
+                if token not in stopwords and token.strip()
+            ]
+        elif granularity == "char":
+            jieba_tokens = [
+                token for token in jieba.cut(text)
+                if token not in stopwords and token.strip()
+            ]
+            # Use jieba_tokens to handle sentences with mixed Chinese and English.
+            split_tokens = []
+            for token in jieba_tokens:
+                if is_chinese(token):
+                    split_tokens.extend(list(token))
+                else:
+                    split_tokens.append(token)
+            return split_tokens
+        else:
+            raise TypeError("Unknown granularity %s" % granularity)
+
+    elif (tokenizer == 'nltk'):
+        try:
+            return [
+                token for token in word_tokenize(text)
+                if token not in stopwords and token.strip()
+            ]
+        except LookupError:
+            nltk.download('punkt')
+            return [
+                token for token in word_tokenize(text)
+                if token not in stopwords and token.strip()
+            ]
+
+    elif (tokenizer == 'spacy'):
+        try:
+            spacy_tokenizer = spacy.load(tok_model)
+        except OSError:
+            spacy.cli.download(tok_model)
+            spacy_tokenizer = spacy.load(tok_model)
+        output = spacy_tokenizer(str(text))
+        return [
+            token.text for token in output
+            if token.text not in stopwords
+        ]
+
+    elif (tokenizer == 'bpe'):
+        try:
+            tokenizer = HGTokenizer.from_file(bpe_json)
+        except Exception:
+            tokenizer = huggingface_tokenizer.Tokenizer(
+                huggingface_tokenizer.models.BPE())
+            if (bpe_trainfile is None):
+                raise LookupError("bpe train file not found: %s"
+                                  % bpe_trainfile)
+            trainer = BpeTrainer(
+                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+            tokenizer.train(files=[bpe_trainfile], trainer=trainer)
+            tokenizer.save(bpe_json, pretty=True)
+        output = tokenizer.encode(text)
+        output = output.tokens
+        return output[0]
     else:
-        raise TypeError("Unknown granularity %s" % granularity)
+        raise TypeError("Invalid Splitter: %s" % tokenizer)
diff --git a/setup.py b/setup.py
index c5cc9e21..d6c02460 100644
--- a/setup.py
+++ b/setup.py
@@ -61,6 +61,9 @@
         'networkx',
         'numpy>=1.17.0',
         'jieba',
+        'nltk',
+        'spacy',
+        'tokenizers',
         'js2py',
         'EduData>=0.0.16',
         'PyBaize>=0.0.3'
diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py
index e9471e39..44b4b58a 100644
--- a/tests/test_tokenizer/test_tokenizer.py
+++ b/tests/test_tokenizer/test_tokenizer.py
@@ -4,6 +4,7 @@
 import pytest
 from EduNLP.Tokenizer import get_tokenizer
 from EduNLP.Pretrain import DisenQTokenizer
+from EduNLP.utils import abs_current_dir, path_append
 
 
 def test_tokenizer():
@@ -50,6 +51,49 @@ def test_CharTokenizer():
     assert ret == ans
 
 
+def test_TokenizerNLTK():
+    items = ["The stationery store has 600 exercise books, and after selling\
+        some, there are still 4 packs left, 25 each, how many are sold?"]
+    ans = [
+        'The', 'stationery', 'store', 'has', '600', 'exercise',
+        'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still',
+        '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
+    ]
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'nltk', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
+def test_TokenizerSpacy():
+    items = ["The stationery store has 600 exercise books, and after selling\
+        some, there are still 4 packs left, 25 each, how many are sold?"]
+    ans = [
+        'The', 'stationery', 'store', 'has', '600', 'exercise',
+        'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still',
+        '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
+    ]
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'spacy', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
+def test_TokenizerBPE():
+    items = ['The stationery store has $600$ exercise books, and after selling some,\
+        there are still $4$ packs left, $25$ each, how many are sold?']
+    ans = ['h', '600', ' ', '4', ' ', '25', ' ']
+    data_path = path_append(abs_current_dir(__file__),
+                            "../../static/test_data/standard_luna_data.json", to_str=True)
+    tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"),
+                                                        "bpe_trainfile": data_path})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
 def test_SpaceTokenizer():
     items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?']
     tokenizer = get_tokenizer("space", stop_words=[])
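
For reference, a minimal usage sketch of the new text-tokenizer backends, based on the tests above. The item text and the bpe_trainfile path are illustrative; NLTK's punkt data and the spaCy model named by tok_model are downloaded on first use.

    from EduNLP.Tokenizer import get_tokenizer

    items = ["The stationery store has 600 exercise books, how many are sold?"]

    # Select the backend via text_params["tokenizer"]: 'jieba' (default), 'nltk', 'spacy' or 'bpe'.
    tokenizer = get_tokenizer("pure_text",
                              text_params={"tokenizer": "nltk", "stopwords": set(",?")})
    print(next(tokenizer(items)))

    # The 'bpe' backend loads bpe.tokenizer.json if present; otherwise it trains a
    # Hugging Face BPE tokenizer on "bpe_trainfile" and saves it for reuse.
    bpe_tokenizer = get_tokenizer("pure_text",
                                  text_params={"tokenizer": "bpe", "stopwords": set(",?"),
                                               "bpe_trainfile": "static/test_data/standard_luna_data.json"})
    print(next(bpe_tokenizer(items)))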