forked from 1024er/cbert_aug
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
executable file
·147 lines (127 loc) · 4.5 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from __future__ import division
from __future__ import print_function
import collections
import io
import json
import os
import numpy as np
import progressbar
import chainer
from chainer import cuda
from chainer.dataset import convert
class Outer(list):
    """A list with a file-like ``write`` method that collects written items.

    Every string passed to :meth:`write` that contains no newline character
    is stripped of surrounding whitespace and appended to the list; strings
    containing a newline are discarded.
    """

    def __init__(self):
        super(Outer, self).__init__()

    def write(self, item):
        """Append ``item.strip()`` unless ``item`` contains a newline.

        Args:
            item: The string being written.
        """
        if '\n' in item:
            # Anything carrying a line break is dropped, not stored.
            return
        self.append(item.strip())
class UnkDropout(chainer.dataset.DatasetMixin):
    """Dataset wrapper that randomly replaces interior tokens with ``unk``.

    Each example of the wrapped dataset is an ``(x, y)`` pair. While
    ``chainer.config.train`` is true, every element of ``x`` except the
    first and the last is independently replaced by ``unk`` with
    probability ``ratio``. Outside training the example is returned as is.

    Args:
        dataset: Indexable dataset yielding ``(x, y)`` pairs.
        unk: Value written into the dropped positions.
        ratio: Per-position replacement probability.
    """

    def __init__(self, dataset, unk, ratio=0.01):
        self.ratio = ratio
        self.unk = unk
        self.dataset = dataset

    def __len__(self):
        """Return the length of the wrapped dataset."""
        return len(self.dataset)

    def get_example(self, i):
        """Return example ``i``, with unk-dropout applied in training mode.

        Args:
            i: Index into the wrapped dataset.

        Returns:
            The tuple ``(x, y)``; in training mode ``x`` is a modified copy
            and the stored array is left untouched.
        """
        x, y = self.dataset[i]
        if not chainer.config.train:
            return (x, y)

        # One random draw per interior position; the two end positions are
        # kept (the original code describes them as bos and eos).
        drop_mask = np.random.rand(x.size - 2) < self.ratio
        noised = x.copy()
        noised[1:-1] = np.where(drop_mask, self.unk, noised[1:-1])
        return (noised, y)
def convert_xt_batch_seq(xt_batch_seq, gpu):
    """Split a sequence of (x, t) batches into per-example sequences.

    The input is converted to an int32 array laid out as
    ``(seq_len, batch, 2)`` (the original code labels the first axis
    "bproplen"), sent to the device ``gpu``, and then regrouped so that
    each batch element gets its own contiguous sequence.

    Args:
        xt_batch_seq: Nested sequence indexed as ``[step][example][0 or 1]``,
            where index 0 holds the input and index 1 the target.
        gpu: Device specifier forwarded to ``convert.to_device``.

    Returns:
        A pair ``(x_seq_batch, t_seq_batch)``; each is a list of
        ``batch`` arrays of length ``seq_len``.
    """
    n_batch = len(xt_batch_seq[0])
    n_steps = len(xt_batch_seq)
    ## (bproplen, batch, 2)
    on_device = convert.to_device(gpu, np.array(xt_batch_seq, 'i'))
    xp = cuda.get_array_module(on_device)

    def per_example(channel):
        # Transpose to (batch, seq_len), flatten, then cut one chunk per
        # batch element.
        flat = on_device[:, :, channel].T.reshape(n_batch * n_steps)
        return xp.split(flat, n_batch, axis=0)

    x_seq_batch = per_example(0)
    t_seq_batch = per_example(1)
    return x_seq_batch, t_seq_batch
def count_words_from_file(counts, file_path):
    """Add whitespace-separated word frequencies from a file to ``counts``.

    The file is read as UTF-8, line by line, behind a progress bar.

    Args:
        counts: Mutable mapping updated in place via ``counts[word] += 1``;
            it must supply a default for unseen words (for example
            ``collections.defaultdict(int)``).
        file_path: Path of the text file to read.

    Returns:
        The same ``counts`` object that was passed in.
    """
    bar = progressbar.ProgressBar()
    # The context manager guarantees the handle is closed even if counting
    # fails part-way through.
    with io.open(file_path, encoding='utf-8') as f:
        for line in bar(f):
            ## TODO: parallel
            # split() with no arguments already ignores surrounding
            # whitespace and yields nothing for blank lines.
            for word in line.split():
                counts[word] += 1
    return counts
def count_words(dataset, alpha=0.4):
    """Return smoothed relative frequencies of integer word ids.

    Occurrences of each id are counted, normalised to sum to one (in
    float32), and raised to the power ``alpha``.

    The result has one entry per id from ``0`` up to the largest id seen,
    so ``result[i]`` always refers to id ``i``. Ids that never occur get
    ``0``. (Sizing the result by the number of *distinct* ids would drop
    the counts of high ids whenever the ids are not contiguous.)

    Args:
        dataset: Iterable of non-negative integer word ids.
        alpha: Exponent applied to the normalised frequencies.

    Returns:
        A list of Python floats; empty when ``dataset`` is empty.

    Raises:
        ValueError: If ``dataset`` contains a negative id.
    """
    tally = collections.Counter(dataset)
    if not tally:
        return []
    if min(tally) < 0:
        raise ValueError('word ids must be non-negative')

    freqs = np.zeros(int(max(tally)) + 1, 'f')
    for word_id, n in tally.items():
        freqs[int(word_id)] += n
    freqs /= freqs.sum()
    return (freqs ** alpha).tolist()
def make_chain_dataset(file_path, vocab=None, update_vocab=False,
                       chain_length=2):
    """Build a dataset of sentence chains from a blank-line separated file.

    The file is read as UTF-8. Every non-blank line is one sentence
    (whitespace-tokenised, with ``'<eos>'`` appended). Consecutive
    sentences form a chain; a blank line or the end of the file closes the
    current chain. Chains with fewer than ``chain_length`` sentences are
    discarded, and their words are not added to the vocabulary.

    Args:
        file_path: Path of the text file to read.
        vocab: Mapping from word to integer id; must contain ``'<unk>'``.
            When omitted, a fresh ``{'<eos>': 0, '<unk>': 1}`` is used.
        update_vocab: If true, unseen words in kept chains are added to
            ``vocab`` (in place) with id ``len(vocab)``; otherwise they map
            to the ``'<unk>'`` id.
        chain_length: Minimum number of sentences for a chain to be kept.

    Returns:
        A pair ``(dataset, vocab)``. ``dataset`` is a list of chains, each
        chain a list of int32 arrays (one per sentence).

    Raises:
        KeyError: If a supplied ``vocab`` has no ``'<unk>'`` entry.
    """
    if vocab is None:
        # A new dict per call avoids sharing state through the default.
        vocab = {'<eos>': 0, '<unk>': 1}
    unk_id = vocab['<unk>']
    dataset = []

    def make_array(chain):
        """Convert a chain of word lists to a list of int32 id arrays."""
        array_chain = []
        for words in chain:
            if update_vocab:
                for word in words:
                    if word not in vocab:
                        vocab[word] = len(vocab)
            ids = [vocab.get(word, unk_id) for word in words]
            array_chain.append(np.array(ids, 'i'))
        return array_chain

    def flush(chain):
        """Store ``chain`` in the dataset if it is long enough."""
        if len(chain) >= chain_length:
            dataset.append(make_array(chain))

    chain = []
    # The context manager closes the handle once reading is finished.
    with io.open(file_path, encoding='utf-8') as f:
        for line in f:
            words = line.split()
            if not words:
                # A blank line ends the current chain.
                flush(chain)
                chain = []
                continue
            chain.append(words + ['<eos>'])
    # The last chain may not be followed by a blank line.
    flush(chain)
    return dataset, vocab
def tokenize_text(file_path, vocab=None, update_vocab=False):
    """Convert a text file into a flat list of word ids.

    The file is read as UTF-8. Every line is whitespace-tokenised and
    ``'<eos>'`` is appended to it, so a blank line contributes a single
    ``'<eos>'`` token.

    Args:
        file_path: Path of the text file to read.
        vocab: Mapping from word to integer id; must contain ``'<unk>'``.
            When omitted, a fresh ``{'<eos>': 0, '<unk>': 1}`` is used.
        update_vocab: If true, unseen words are added to ``vocab`` (in
            place) with id ``len(vocab)``; otherwise they map to the
            ``'<unk>'`` id.

    Returns:
        A pair ``(tokens, vocab)`` where ``tokens`` is a list of ids.

    Raises:
        KeyError: If a supplied ``vocab`` has no ``'<unk>'`` entry.
    """
    if vocab is None:
        # A new dict per call avoids sharing state through the default.
        vocab = {'<eos>': 0, '<unk>': 1}
    unk_id = vocab['<unk>']
    tokens = []
    with io.open(file_path, encoding='utf-8') as f:
        for line in f:
            for word in line.split() + ['<eos>']:
                if update_vocab and word not in vocab:
                    vocab[word] = len(vocab)
                tokens.append(vocab.get(word, unk_id))
    return tokens, vocab
def get_wikitext_words_and_vocab(
        name='wikitext-2', base_dir='datasets', vocab=None):
    """Load the tokenised WikiText splits and their vocabulary.

    Data lives in ``<base_dir>/<name>``. If ``preprocessed_data.json``
    exists there and no ``vocab`` is given, the cached result is loaded.
    Otherwise ``wiki.train.tokens``, ``wiki.valid.tokens`` and
    ``wiki.test.tokens`` are tokenised with ``tokenize_text`` and the
    result is written to ``preprocessed_data.json``.

    Args:
        name: Either ``'wikitext-2'`` or ``'wikitext-103'``.
        base_dir: Directory containing the dataset directory.
        vocab: Optional prepared word-to-id mapping. When given it is used
            as is (no new words are added) and the cache is not read.

    Returns:
        The tuple ``(train, valid, test, vocab)``; the first three are
        lists of word ids.

    Raises:
        AssertionError: If ``name`` is not one of the supported datasets.
    """
    assert(name in ['wikitext-2', 'wikitext-103'])
    base_dir2 = os.path.join(base_dir, name)
    predata_path = os.path.join(base_dir2, 'preprocessed_data.json')
    if os.path.exists(predata_path) and vocab is None:
        with open(predata_path) as f:
            train, valid, test, vocab = json.load(f)
    else:
        prepared_vocab = (vocab is not None)
        if not prepared_vocab:
            vocab = {'<eos>': 0, '<unk>': 1}
        # Only the training split may extend the vocabulary, and only when
        # the caller did not supply one.
        train, vocab = tokenize_text(
            os.path.join(base_dir2, 'wiki.train.tokens'),
            vocab, update_vocab=not prepared_vocab)
        valid, _ = tokenize_text(
            os.path.join(base_dir2, 'wiki.valid.tokens'),
            vocab, update_vocab=False)
        test, _ = tokenize_text(
            os.path.join(base_dir2, 'wiki.test.tokens'),
            vocab, update_vocab=False)
        # NOTE(review): this also runs when a prepared vocab was passed, so
        # the cache is then overwritten with data encoded by that vocab --
        # confirm this is intended.
        with open(predata_path, 'w') as f:
            json.dump([train, valid, test, vocab], f)
    return train, valid, test, vocab