-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
127 lines (103 loc) · 3.09 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# coding=utf-8
import re
from bs4 import BeautifulSoup
import os
import pickle
import codecs
from tqdm import tqdm
from urllib.request import urlretrieve
import zipfile
class DLProgress(tqdm):
    """
    tqdm progress bar whose ``hook`` method can be handed to ``urlretrieve``
    as its progress callback.
    """

    # Block number reported by the most recent ``hook`` call.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """
        Advance the bar by the amount transferred since the previous call.

        :param block_num: number of blocks transferred so far
        :param block_size: size of a single block
        :param total_size: total size of the transfer; stored as the bar's total
        """
        self.total = total_size
        new_blocks = block_num - self.last_block
        self.update(new_blocks * block_size)
        self.last_block = block_num
def load_text8():
    """
    Download (if needed), unpack (if needed) and load the Text8 dataset.

    The archive is fetched to ``text8.zip`` in the working directory and
    unpacked into ``data/``; the extracted ``data/text8`` file is then read
    with ``load_data``.

    :return: whatever ``load_data`` returns for the extracted file
    """
    dataset_folder_path = 'data'
    dataset_filename = 'text8.zip'
    dataset_name = 'Text8 Dataset'
    extracted_path = os.path.join(dataset_folder_path, dataset_filename.split('.')[0])

    if not os.path.isfile(dataset_filename):
        with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
            urlretrieve(
                'http://mattmahoney.net/dc/text8.zip',
                dataset_filename,
                pbar.hook)

    # Decide on the extracted file itself rather than on the folder: a
    # pre-existing ``data`` directory (e.g. one holding other datasets) must
    # not cause the unpack step to be skipped.
    if not os.path.isfile(extracted_path):
        with zipfile.ZipFile(dataset_filename) as zip_ref:
            zip_ref.extractall(dataset_folder_path)

    return load_data(extracted_path)
def load_data(path, encoding=None):
    """
    Load a dataset from a text file.

    :param path: path of an existing regular file
    :param encoding: text encoding used to decode the file; ``None`` (the
        default) keeps the previous behaviour of using the platform default
    :return: the whole file content as one string
    :raises AssertionError: if ``path`` does not exist or is not a regular
        file (these checks are skipped when Python runs with ``-O``)
    """
    assert os.path.exists(path)
    assert os.path.isfile(path)
    # ``codecs.open(path, "r")`` without an encoding just delegates to the
    # builtin ``open``, so calling ``open`` directly is equivalent for the
    # default case while letting callers pick an explicit encoding.
    with open(path, "r", encoding=encoding) as f:
        return f.read()
def load_preprocess():
    """
    Load the preprocessed training data from ``preprocess.p``.

    :return: the object that was pickled into ``preprocess.p``
    """
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project produced itself.
    with open('preprocess.p', mode='rb') as f:
        return pickle.load(f)
def save_params(params):
    """
    Save parameters to ``params.p`` in the working directory.

    :param params: any picklable object
    """
    # The context manager guarantees the data is flushed and the handle is
    # closed before returning, instead of relying on garbage collection.
    with open('params.p', 'wb') as f:
        pickle.dump(params, f)
def load_params():
    """
    Load parameters from ``params.p`` in the working directory.

    :return: the object that was pickled into ``params.p``
    """
    # NOTE: unpickling can execute arbitrary code; only load files this
    # project produced itself.
    with open('params.p', mode='rb') as f:
        return pickle.load(f)
def isFullEnglish(sentence):
    """
    Check how much of ``sentence`` is left once ASCII letters are removed.

    The items of ``sentence`` are joined with single spaces, every run of
    ``a-z``/``A-Z`` is deleted, and the count of remaining characters is
    divided by ``len(sentence)``.

    :param sentence: a sequence of strings (or a single string)
    :return: True when that ratio is below 0.9, otherwise False
    :raises ZeroDivisionError: when ``sentence`` is empty
    """
    joined = ' '.join(sentence)
    leftover = re.sub(r'[a-zA-Z]+', '', joined)
    # NOTE(review): the joining spaces are counted as leftover characters,
    # so ten or more purely alphabetic items give a ratio >= 0.9 and the
    # result is False -- confirm whether that is intended.
    leftover_ratio = len(leftover) * 1.0 / len(sentence)
    return leftover_ratio < 0.9
def token_lookup(tokenize=True):
    """
    Build the punctuation replacement table.

    Each ASCII punctuation mark and its CJK counterpart share one
    placeholder token.

    :param tokenize: when true, map every punctuation mark to its
        space-padded ``||name||`` token; when false, map every mark to the
        empty string (i.e. strip it)
    :return: a new dict from punctuation string to replacement string
    """
    # The full-width marks are written as escapes so they cannot be silently
    # folded into their ASCII twins; identical keys in a dict literal do not
    # raise, the later entry simply overrides the earlier one.
    tokens = {
        '.': ' ||period|| ',
        '。': ' ||period|| ',
        ',': ' ||comma|| ',
        '\uff0c': ' ||comma|| ',               # full-width comma
        '"': ' ||quotation_mark|| ',
        ';': ' ||semicolon|| ',
        '\uff1b': ' ||semicolon|| ',           # full-width semicolon
        '!': ' ||exclamation_mark|| ',
        '\uff01': ' ||exclamation_mark|| ',    # full-width exclamation mark
        '?': ' ||question_mark|| ',
        '\uff1f': ' ||question_mark|| ',       # full-width question mark
        '(': ' ||left_parentheses|| ',
        '\uff08': ' ||left_parentheses|| ',    # full-width left parenthesis
        ')': ' ||right_parentheses|| ',
        '\uff09': ' ||right_parentheses|| ',   # full-width right parenthesis
        '--': ' ||dash|| ',
        '——': ' ||dash|| ',
        '\n': ' ||return|| ',
    }
    if tokenize:
        return tokens
    # Same keys, in the same order, each mapped to the empty string.
    return {mark: '' for mark in tokens}
def flatten(l):
    """
    Flatten one level of nesting.

    :param l: an iterable of iterables
    :return: a new list with the items of every inner iterable, in order
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
def clean_html(sentences):
    """
    Strip HTML markup and a few leftover artefacts from a piece of text.

    :param sentences: markup string handed to BeautifulSoup
    :return: the document text with any remaining tag-like spans,
        "<digit>、" markers, one leading newline and non-breaking spaces
        removed
    """
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed and emits a warning, so results could differ
    # between machines. ``html.parser`` ships with Python.
    soup = BeautifulSoup(sentences, 'html.parser')
    cleaned = re.sub('<[^<]+?>|([0-9]、)|^\n|(\xa0)', '', soup.get_text())
    return cleaned