"""
Utility methods that are used for training and evaluation of the tasks.
"""
import os
import operator
import numpy as np
import random
from collections import namedtuple
from sklearn import svm
from sklearn.metrics import accuracy_score
import data_utils
from constants import POS_ID, NEG_ID, SENTIMENT, POS, POS_BILSTM, PARSING,\
BAYES_OPT
from simpletagger import StructuredPerceptron
from bist_parser.bmstparser.src import mstlstm
from bist_parser.bmstparser.src.utils import vocab_conll, write_conll,\
write_original_conll
from bilstm_tagger.src.simplebilty import SimpleBiltyTagger, load
NUM_EPOCHS = 50
PATIENCE = 2
def get_data_subsets(feature_vals, feature_weights, train_data, train_labels,
                     task, num_train_examples):
    """
    Given the feature values and the feature weights, return the stratified
    subset of the training data with the highest feature scores.
    :param feature_vals: a numpy array of shape (num_train_data, num_features)
                         containing the feature values
    :param feature_weights: a numpy array of shape (num_features, ) containing
                            the weight for each feature
    :param train_data: a sparse numpy array of shape (num_train_data, vocab_size)
                       containing the training data (or a list of examples)
    :param train_labels: a numpy array of shape (num_train_data) containing the
                         training labels
    :param task: the task; this determines whether we use stratification
    :param num_train_examples: the number of training examples for the
                               respective task
    :return: subsets of the training data and their labels as a tuple of two
             numpy arrays
    """
    # calculate the scores as the dot product between feature values and weights
    scores = feature_vals.dot(np.transpose(feature_weights))
    # sort the indices by their scores
    sorted_index_score_pairs = sorted(zip(range(len(scores)), scores),
                                      key=operator.itemgetter(1), reverse=True)
    # get the top indices
    top_indices, _ = zip(*sorted_index_score_pairs)
    if task == SENTIMENT:
        # for sentiment, rather than taking the top n indices, we still want to
        # have a stratified training set, so we take the top n/2 positive and
        # top n/2 negative indices
        top_pos_indices = [idx for idx in top_indices if train_labels[idx] ==
                           POS_ID][:int(num_train_examples/2)]
        top_neg_indices = [idx for idx in top_indices if train_labels[idx] ==
                           NEG_ID][:int(num_train_examples/2)]
        top_indices = top_pos_indices + top_neg_indices
    elif task in [POS, POS_BILSTM, PARSING]:
        # for POS tagging and parsing, we don't need a stratified train set
        top_indices = list(top_indices[:num_train_examples])
    else:
        raise ValueError('Top index retrieval not implemented for %s.' % task)
    if isinstance(train_data, list):
        # numpy indexing does not work if train_data is a list
        return [train_data[idx] for idx in top_indices], \
            train_labels[top_indices]
    # we get the corresponding subsets of the training data and the labels
    return train_data[top_indices], train_labels[top_indices]

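# A minimal usage sketch (hypothetical toy values, not part of the actual
# experiments): score 100 examples with a 3-dimensional feature weighting and
# keep a stratified subset of 20 for the sentiment task.
#
#   feature_vals = np.random.rand(100, 3)
#   feature_weights = np.array([0.5, -0.2, 1.0])
#   train_data = np.random.rand(100, 5000)  # stand-in for the sparse term matrix
#   train_labels = np.array([POS_ID, NEG_ID] * 50)
#   subset, subset_labels = get_data_subsets(
#       feature_vals, feature_weights, train_data, train_labels,
#       SENTIMENT, num_train_examples=20)
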
def task2train_and_evaluate_func(task):
    """Return the train_and_evaluate function for a task."""
    if task == SENTIMENT:
        return train_and_evaluate_sentiment
    if task == POS:
        return train_and_evaluate_pos
    if task == POS_BILSTM:
        return train_and_evaluate_pos_bilstm
    if task == PARSING:
        return train_and_evaluate_parsing
    raise ValueError('Train_and_evaluate is not implemented for %s.' % task)

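# Dispatch sketch (variable names are illustrative): all train_and_evaluate_*
# functions share the same signature, so callers can stay task-agnostic.
#
#   train_and_evaluate = task2train_and_evaluate_func(SENTIMENT)
#   val_acc, test_acc = train_and_evaluate(train_subset, labels_subset,
#                                          X_val, y_val, X_test, y_test)
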
def train_and_evaluate_sentiment(train_data, train_labels, val_data, val_labels,
                                 test_data=None, test_labels=None,
                                 parser_output_path=None, perl_script_path=None):
    """
    Trains an SVM on the provided training data. Calculates accuracy on the
    validation set and (optionally) on the test set.
    :param train_data: the training data; a sparse numpy matrix of shape
                       (num_examples, max_vocab_size)
    :param train_labels: the training labels; a numpy array of shape
                         (num_examples)
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :param parser_output_path: only necessary for parsing; is ignored here
    :param perl_script_path: only necessary for parsing; is ignored here
    :return: the validation accuracy and (optionally) the test accuracy;
             otherwise None
    """
    print('Training the SVM on %d examples...' % train_data.shape[0])
    clf = svm.SVC()
    clf.fit(train_data, train_labels)
    # validate the configuration on the validation and test set (if provided)
    val_predictions = clf.predict(val_data)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    print('Val acc: %.5f' % val_accuracy)
    test_accuracy = None
    if test_data is not None and test_labels is not None:
        test_predictions = clf.predict(test_data)
        test_accuracy = accuracy_score(test_labels, test_predictions)
        print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy

def train_and_evaluate_pos(train_data, train_labels, val_data, val_labels,
                           test_data=None, test_labels=None,
                           parser_output_path=None, perl_script_path=None):
    """
    Trains the tagger on the provided training data. Calculates accuracy on the
    validation set and (optionally) on the test set.
    :param train_data: the training data; a list of lists of shape
                       (num_examples, sequence_length)
    :param train_labels: the training labels; a list of lists of tags
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :param parser_output_path: only necessary for parsing; is ignored here
    :param perl_script_path: only necessary for parsing; is ignored here
    :return: the validation accuracy and (optionally) the test acc; else None
    """
    print('Training the tagger on %d examples...' % len(train_data))
    sp = StructuredPerceptron()
    tr_data = [(words, tags) for words, tags in zip(train_data, train_labels)]
    pos_iterations, pos_learning_rate = 5, 0.2
    sp.fit(tr_data, iterations=pos_iterations, learning_rate=pos_learning_rate)
    # validate the configuration on the validation and test set (if provided)
    val_predictions = sp.predict(val_data)
    val_accuracy = pos_accuracy_score(val_labels, val_predictions)
    print('Val acc: %.5f' % val_accuracy)
    test_accuracy = None
    if test_data is not None and test_labels is not None:
        test_predictions = sp.predict(test_data)
        test_accuracy = pos_accuracy_score(test_labels, test_predictions)
        print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy

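# The structured perceptron tagger expects parallel token/tag sequences, e.g.
# (hypothetical toy data; any consistent tag set works):
#
#   train_data   = [['I', 'liked', 'it'], ['Great', 'phone']]
#   train_labels = [['PRON', 'VERB', 'PRON'], ['ADJ', 'NOUN']]
#   val_acc, _ = train_and_evaluate_pos(train_data, train_labels,
#                                       val_data, val_labels)
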
def train_and_evaluate_pos_bilstm(train_data, train_labels, val_data, val_labels,
                                  test_data=None, test_labels=None,
                                  parser_output_path=None, perl_script_path=None):
    """
    Trains the tagger on the provided training data. Calculates accuracy on the
    validation set and (optionally) on the test set.
    :param train_data: the training data; a list of lists of shape
                       (num_examples, sequence_length)
    :param train_labels: the training labels; a list of lists of tags
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :return: the validation accuracy and (optionally) the test accuracy;
             else None
    """
    print('Training the BiLSTM tagger on %d examples...' % len(train_data))
    in_dim = 64
    h_dim = 100
    c_in_dim = 100
    h_layers = 1
    trainer = "adam"
    # temporary file used to restore best model; random number is used to avoid
    # name clash in parallel runs
    model_path = '/tmp/bilstm_tagger_model_%d' % random.randint(0, 1000000)
    tagger = SimpleBiltyTagger(in_dim, h_dim, c_in_dim, h_layers,
                               embeds_file=None)
    train_X, train_Y = tagger.get_train_data_from_instances(train_data,
                                                            train_labels)
    val_X, val_Y = tagger.get_data_as_indices_from_instances(val_data,
                                                             val_labels)
    # train the model with early stopping
    tagger.fit(train_X, train_Y, NUM_EPOCHS, trainer, val_X=val_X, val_Y=val_Y,
               patience=PATIENCE, model_path=model_path)
    # load the best model and remove the model files
    tagger = load(model_path)
    os.unlink(model_path)
    os.unlink(model_path + '.pickle')  # file used to save the parameters
    val_correct, val_total = tagger.evaluate(val_X, val_Y)
    val_accuracy = val_correct / val_total
    print('Val acc: %.5f' % val_accuracy)
    test_accuracy = None
    if test_data is not None and test_labels is not None:
        test_X, test_Y = tagger.get_data_as_indices_from_instances(test_data,
                                                                   test_labels)
        test_correct, test_total = tagger.evaluate(test_X, test_Y)
        test_accuracy = test_correct / test_total
        print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy

def train_and_evaluate_parsing(train_data, train_labels, val_data, val_labels,
                               test_data=None, test_labels=None,
                               parser_output_path=None, perl_script_path=None):
    """
    Trains the parser on the provided training data. Calculates LAS on the
    validation set and (optionally) on the test set.
    :param train_data: the training data; a list of CoNLL entries
    :param train_labels: pseudo-labels; not used, as the labels are contained
                         in train_data
    :param val_data: the validation data; same format as the training data
    :param val_labels: pseudo-labels; not used, as contained in val_data
    :param test_data: the test data
    :param test_labels: pseudo-labels; not used, as contained in test_data
    :return: the validation LAS and (optionally) the test LAS; else None
    """
    print('Training the parser on %d examples...' % len(train_data))
    if test_data is not None:
        # incorporate the test data as some POS tags (e.g. XX) might only
        # appear in the target domain
        words, w2i, pos, rels = vocab_conll(
            np.hstack([train_data, val_data, test_data]))
    else:
        words, w2i, pos, rels = vocab_conll(np.hstack([train_data, val_data]))
    # set the variables used for initializing the parser and initialize the
    # parser
    ParserOptions = namedtuple('parser_options',
                               'activation, blstmFlag, labelsFlag, costaugFlag,'
                               ' bibiFlag, lstm_dims, wembedding_dims, '
                               'pembedding_dims, rembedding_dims, lstm_layers, '
                               'external_embedding, hidden_units, '
                               'hidden2_units, epochs')
    parser_options = ParserOptions(
        epochs=NUM_EPOCHS,
        activation='tanh',
        blstmFlag=True,
        labelsFlag=True,
        costaugFlag=True,
        bibiFlag=False,
        lstm_dims=125,
        wembedding_dims=100,
        pembedding_dims=25,
        rembedding_dims=25,
        lstm_layers=2,
        external_embedding=None,
        hidden_units=100,
        hidden2_units=0
    )
    parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)
    # write the dev data to a file
    dev_data_path = os.path.join(parser_output_path, 'dev.conll')
    write_original_conll(dev_data_path, val_data)
    # set the variables used for tracking training progress for early stopping
    best_dev_las, epochs_no_improvement = 0., 0
    best_model_path = os.path.join(parser_output_path, 'parser')
    print('Training model for %d max epochs with early stopping with patience '
          '%d...' % (NUM_EPOCHS, PATIENCE))
    for epoch in range(parser_options.epochs):
        print('Starting epoch', epoch)
        parser.TrainOnEntries(train_data)
        # write the predictions to a file
        pred_path = os.path.join(parser_output_path,
                                 'dev_pred_epoch_' + str(epoch + 1) + '.conll')
        write_conll(pred_path, parser.PredictOnEntries(val_data))
        eval_path = pred_path + '.eval'
        perl_script_command = ('perl %s -g %s -s %s > %s' % (
            perl_script_path, dev_data_path, pred_path, eval_path))
        print('Evaluating with %s...' % perl_script_command)
        os.system(perl_script_command)
        las, uas, acc = data_utils.read_parsing_evaluation(eval_path)
        # remove the predictions and the evaluation file
        if os.path.exists(pred_path):
            os.unlink(pred_path)
        if os.path.exists(eval_path):
            os.unlink(eval_path)
        if las > best_dev_las:
            print('LAS %.2f is better than best dev LAS %.2f.'
                  % (las, best_dev_las))
            best_dev_las = las
            epochs_no_improvement = 0
            parser.Save(best_model_path)
        else:
            print('LAS %.2f is worse than best dev LAS %.2f.'
                  % (las, best_dev_las))
            epochs_no_improvement += 1
        if epochs_no_improvement == PATIENCE:
            print('No improvement for %d epochs. Early stopping...'
                  % epochs_no_improvement)
            print('Best dev LAS:', best_dev_las)
            break
    test_las = None
    if test_data is not None:
        # load the best model
        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)
        parser.Load(best_model_path)
        # first write the test data to a file
        test_data_path = os.path.join(parser_output_path, 'test.conll')
        write_original_conll(test_data_path, test_data)
        # then write the predictions to another file
        pred_path = os.path.join(parser_output_path, 'test_pred.conll')
        write_conll(pred_path, parser.PredictOnEntries(test_data))
        eval_path = pred_path + '.eval'
        perl_script_command = ('perl %s -g %s -s %s > %s' % (
            perl_script_path, test_data_path, pred_path, eval_path))
        print('Evaluating with %s...' % perl_script_command)
        os.system(perl_script_command)
        test_las, test_uas, test_acc = data_utils.read_parsing_evaluation(
            eval_path)
        print('Test LAS:', test_las, 'test UAS:', test_uas,
              'test acc:', test_acc)
    # remove the saved parser
    if os.path.exists(best_model_path):
        os.unlink(best_model_path)
    return best_dev_las, test_las

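# Example invocation (paths and variable names are placeholders; the perl
# script is assumed to be a CoNLL evaluation script whose output
# data_utils.read_parsing_evaluation can parse). The label arguments are
# unused, as the annotations are part of the CoNLL entries themselves.
#
#   dev_las, test_las = train_and_evaluate_parsing(
#       train_entries, None, dev_entries, None, test_entries, None,
#       parser_output_path='/tmp/parser_out', perl_script_path='eval.pl')
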
def train_pretrained_weights(feature_values, X_train, y_train, train_domains,
                             num_train_examples, X_val, y_val, X_test, y_test,
                             trg_domain, args, feature_names,
                             parser_output_path, perl_script_path):
    """
    Train a model using pre-trained data selection weights (which could have
    been trained on another model/domain/task).
    :param feature_values: a numpy array of shape (num_examples, num_features)
    :param X_train: the training data
    :param y_train: the training labels
    :param train_domains: a list of training domains, only used for counting
    :param num_train_examples: the number of examples used for training
    :param X_val: the validation data
    :param y_val: the validation labels
    :param X_test: the test data
    :param y_test: the test labels
    :param trg_domain: the target domain
    :param args: the arguments used for calling the script; used for logging
    :param feature_names: a list of the feature names
    :param parser_output_path: the output path of the parser
    :param perl_script_path: the path to the perl script
    :return:
    """
    for feat_weights_domain, feat_weights_feats, feature_weights in \
            data_utils.read_feature_weights_file(args.feature_weights_file):
        assert len(feature_weights) == len(feature_names)
        assert set(args.feature_sets) == set(feat_weights_feats.split(' '))
        if trg_domain != feat_weights_domain:
            continue
        # count how many examples belong to each source domain
        train_domain_subset, _ = get_data_subsets(
            feature_values, feature_weights, train_domains, y_train, args.task,
            num_train_examples)
        for subset_domain in set(train_domain_subset):
            print('# of %s in train data for trg domain %s: %d'
                  % (subset_domain, trg_domain,
                     train_domain_subset.count(subset_domain)))
        # get the train subset with the highest scores and train
        train_subset, labels_subset = get_data_subsets(
            feature_values, feature_weights, X_train, y_train, args.task,
            num_train_examples)
        val_accuracy, test_accuracy = task2train_and_evaluate_func(args.task)(
            train_subset, labels_subset, X_val, y_val, X_test, y_test,
            parser_output_path=parser_output_path,
            perl_script_path=perl_script_path)
        dict_key = ('%s-X-domain-%s-%s' % (BAYES_OPT, feat_weights_domain,
                                           feat_weights_feats))
        # log the result to the log file
        data_utils.log_to_file(args.log_file, {dict_key: [(
            val_accuracy, test_accuracy, feature_weights)]}, trg_domain, args)

def pos_accuracy_score(gold, predicted):
    """
    Calculate the accuracy for POS tagging.
    :param gold: a list of lists of gold tags
    :param predicted: a list of lists of predicted tags
    :return: the accuracy score
    """
    tags_correct = np.sum([1 for gold_tags, pred_tags in zip(gold, predicted)
                           for g, p in zip(gold_tags, pred_tags) if g == p])
    tags_total = len([t for g in gold for t in g])  # ravel the list of lists
    return tags_correct / float(tags_total)

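if __name__ == '__main__':
    # Small self-contained sanity check (toy tags, illustrative only): two of
    # the three tag pairs below match, so the expected accuracy is 2/3.
    toy_gold = [['DET', 'NOUN'], ['VERB']]
    toy_pred = [['DET', 'VERB'], ['VERB']]
    print('Toy POS accuracy: %.3f' % pos_accuracy_score(toy_gold, toy_pred))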