Tokenizer.py
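"""
Tokenizer.py

Tokenizes Chinese sentences either locally with jieba or remotely via the
DeepSeek chat API, then filters the tokens down to clean Chinese words
(length-bounded, no stop words, digits, Latin letters or punctuation).
API credentials come from environment variables when running in GitHub
Actions, otherwise from config.json next to this file.
"""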
import os
import jieba
import re
import json
import aiohttp
import asyncio
from typing import List, Set
from logger_config import setup_logger, inspect_trace
# Configure the logging system
logger = setup_logger()
# Read the configuration file
config_path = os.path.join(os.path.dirname(__file__), 'config.json')
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)
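# Expected config.json layout (illustrative placeholder values; the endpoint
# URL below is an assumption, use whatever endpoint the key belongs to):
# {
#     "LLM_API_URL": "https://api.deepseek.com/chat/completions",
#     "LLM_API_KEY": "sk-..."
# }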
if os.getenv('GITHUB_ACTIONS'):
    logger.info("Running in GitHub Actions environment")
    LLM_API_URL = os.getenv('LLM_API_URL')
    LLM_API_KEY = os.getenv('LLM_API_KEY')
    if not LLM_API_KEY:
        logger.error("LLM_API_KEY is empty in GitHub Actions, please set it in the workflow environment")
        exit(-1)
    if not isinstance(LLM_API_URL, str):
        raise ValueError("LLM_API_URL must be a string")
    if not isinstance(LLM_API_KEY, str):
        raise ValueError("LLM_API_KEY must be a string")
    logger.info(f"LLM_API_URL: {LLM_API_URL}")
    logger.info("Read API URL and key from GitHub Actions successfully!")
else:
    LLM_API_URL = config.get('LLM_API_URL')
    LLM_API_KEY = config.get('LLM_API_KEY')
    if not LLM_API_KEY:
        logger.error("LLM_API_KEY is empty, please set it in config.json")
        exit(-1)

# Stop words: any word containing one of these is dropped by filter_chinese_words
exclude_words = [
    '的', '了', '和', '或', '与', '在', '更', '这', '是', '不']
"""
unicode_list = [
("逗号", ",", "\uFF0C"),
("句号", "。", "\u3002"),
("问号", "?", "\uFF1F"),
("感叹号", "!", "\uFF01"),
("冒号", ":", "\uFF1A"),
("分号", ";", "\uFF1B"),
("引号", "“”、‘’", "\u201C\u201D\u3001\u2018\u2019"),
("括号", "()、【】、《》", "\uFF08\uFF09\u3001\u3010\u3011\u300A\u300B"),
("省略号", "……", "\u2026\u2026"),
("顿号", "、", "\u3001"),
("连接号", "—", "\u2014"),
("破折号", "——", "\u2014\u2014"),
("书名号", "《》", "\u300A\u300B")
]
"""
async def jieba_tokenizer(sentence: str) -> List[str]:
    """
    Tokenize a sentence with jieba.
    :param sentence: input sentence
    :return: list of tokens
    """
    return jieba.lcut(sentence)
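# Note: jieba.lcut is synchronous; the coroutine wrapper only exists so that
# jieba_tokenizer and deepseek_tokenizer share the same await-based interface
# expected by tokenize_and_filter.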
async def deepseek_tokenizer(sentence: str, session: aiohttp.ClientSession) -> List[str]:
    """
    Tokenize a sentence with the DeepSeek API.
    :param sentence: input sentence
    :param session: aiohttp client session
    :return: list of tokens
    """
    # Request headers
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LLM_API_KEY}"
    }
    # Request payload: a one-shot Chinese prompt showing the expected
    # input/output format, followed by the sentence to split
    data = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user",
             "content": f"样例输入 \"2.5亿美元打造游戏史首个变性黑人!揭秘《星鸣特攻》究竟是如何“正确”地走向暴死的\" 样例输出 ['2.5亿美元', '打造', '游戏史', '首个', '变性', '黑人', '揭秘', '星鸣特攻', '究竟', '如何', '正确地', '走向', '暴死'] 参考输入输出对后面这句话进行拆分 {sentence}"}
        ],
        "stream": False
    }
    try:
        # Send the request
        async with session.post(LLM_API_URL, headers=headers, data=json.dumps(data)) as response:
            response.raise_for_status()  # Raise if the request failed
            # Parse the response body
            response_data = await response.json()
            # Extract the tokenized phrases from the model's list-style reply
            tokenized_phrases = response_data['choices'][0]['message']['content'].strip("[]").split(", ")
            return [phrase.strip("'") for phrase in tokenized_phrases]
    except aiohttp.ClientError as e:
        logger.error(f"Request failed: {e}")
        return []
    except KeyError as e:
        logger.error(f"Failed to parse response data: {e}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        inspect_trace()
        return []
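# Note: the parsing above assumes the model replies with exactly a Python-style
# list literal such as ['2.5亿美元', '打造', ...] (the format shown in the prompt);
# any surrounding prose would leak into the returned phrases.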
def filter_chinese_words(words: List[str], min_length: int = 2, max_length: int = 8) -> Set[str]:
    """
    Keep only Chinese words that satisfy the length requirements.
    :param words: list of tokens
    :param min_length: minimum word length
    :param max_length: maximum word length
    :return: set of Chinese words that pass the filters
    """
    filtered_words = set()
    # Regex matching full-width Chinese punctuation
    unicode_pattern = re.compile(
        r"[\uFF0C\u3002\uFF1F\uFF01\uFF1A\uFF1B\u201C\u201D\u3001\u2018\u2019\uFF08\uFF09\u3010\u3011\u300A\u300B\u2026\u3001\u2014\u300A\u300B]"
    )
    for word in words:
        # Strip interpuncts ('·') and spaces inside and around the word
        if "·" in word:
            word = word.replace("·", "")
        if " " in word:
            word = word.replace(" ", "")
        if re.match(r"^[\u4e00-\u9fa5\s]+$", word) and min_length <= len(word) <= max_length:
            add_word = True
            # Drop words containing any of the excluded stop words
            for i in exclude_words:
                if i in word:
                    add_word = False
                    break
            # Drop words starting with "一"
            if word.startswith("一"):
                add_word = False
            # Drop words that contain digits
            if re.search(r"\d", word):
                add_word = False
            # Drop words that contain Latin letters
            if re.search(r"[a-zA-Z]", word):
                add_word = False
            # Drop words that contain Chinese punctuation
            if unicode_pattern.search(word):
                add_word = False
            if add_word:
                filtered_words.add(word)
    return filtered_words
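# Illustrative call (example values, not part of the original file):
#   filter_chinese_words(['揭秘', '星鸣特攻', '《', '是', '2.5亿美元', '走向'])
#   returns {'揭秘', '星鸣特攻', '走向'}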
async def tokenize_and_filter(sentence: str, tokenizer_func, *args, min_length: int = 2, max_length: int = 8) -> Set[str]:
    """
    Generic tokenize-and-filter interface.
    :param sentence: input sentence
    :param tokenizer_func: tokenizer coroutine
    :param args: extra arguments passed to the tokenizer
    :param min_length: minimum word length
    :param max_length: maximum word length
    :return: set of Chinese words that pass the filters
    """
    words = await tokenizer_func(sentence, *args)
    return filter_chinese_words(words, min_length, max_length)
async def LLM_Split_words(sentence_list: List[str]) -> Set[str]:
    """
    Tokenize a batch of sentences with the DeepSeek API concurrently and
    merge the filtered results into a single set.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [tokenize_and_filter(sentence, deepseek_tokenizer, session) for sentence in sentence_list]
        results = await asyncio.gather(*tasks)
        return set.union(*results)
async def main():
    sentence_list = [
        "2.5亿美元打造游戏史首个变性黑人!揭秘《星鸣特攻》究竟是如何“正确”地走向暴死的",
        "1k出头!锐度报表!全画幅75mm F2,质感满满的国货之光,铭匠光学75 F2到底咋样?",
        "日本平成时代所有被停播的违规广告&具体封禁理由",
        "26岁的日本京都大学热血男:为了挽救即将被废除的泡澡社团,他作出的决定是……"
    ]
    for sentence in sentence_list:
        split_words = await tokenize_and_filter(sentence, jieba_tokenizer)
        logger.info(f"jieba tokenization result: {split_words}")
    async with aiohttp.ClientSession() as session:
        tasks = [tokenize_and_filter(sentence, deepseek_tokenizer, session) for sentence in sentence_list]
        results = await asyncio.gather(*tasks)
        for i, result in enumerate(results):
            logger.info(f"Tokenization result for sentence {i + 1}: {result}")


# Example usage
if __name__ == "__main__":
    asyncio.run(main())