-
Notifications
You must be signed in to change notification settings - Fork 5
/
tf_classify.py
143 lines (120 loc) · 6.91 KB
/
tf_classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# from tf_custom_models import LRClassifierL2, SoftmaxClassifier
from utility import train_and_eval_auc
import os
from os.path import join as pjoin
from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
import json
from hs_model import HateSpeechSystem
import tf_data
logging.basicConfig(level=logging.INFO)
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string("model_type", "classic", "classic / hb_embed / hb_append")
tf.app.flags.DEFINE_boolean("bidirectional", False, "Bidirectionality of LSTM")
tf.app.flags.DEFINE_string("scoring", "auc", "auc / f1_macro")
tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")
tf.app.flags.DEFINE_float("max_gradient_norm", 10.0, "Clip gradients to this norm.")
tf.app.flags.DEFINE_float("input_dropout", 0.35, "Fraction of input units randomly dropped.")
tf.app.flags.DEFINE_float("output_dropout", 0.35, "Fraction of output units randomly dropped.")
tf.app.flags.DEFINE_float("state_dropout", 0.2, "Fraction of units randomly dropped on recurrent connections.")
tf.app.flags.DEFINE_float("embedding_dropout", 0.3, "Fraction of embeddings randomly dropped on lookup.")
tf.app.flags.DEFINE_string("optimizer", "adam", "adam / sgd")
tf.app.flags.DEFINE_integer("batch_size", 16, "Batch size to use during training.")
tf.app.flags.DEFINE_integer("epochs", 1000, "Number of epochs to train.")
tf.app.flags.DEFINE_integer("state_size", 100, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 2, "Number of encoder layers.")
tf.app.flags.DEFINE_integer("tweet_size", 32, "The length of a tweet (example).")
tf.app.flags.DEFINE_integer("output_size", 2, "The output size, aka number of classes.")
tf.app.flags.DEFINE_integer("embedding_size", 100, "Size of the pretrained vocabulary. (default 100)")
tf.app.flags.DEFINE_integer("hatebase_size", 7, "Number of hatebase features. (default 8)")
tf.app.flags.DEFINE_string("data_dir", "data/twitter_davidson", "SQuAD directory (default ./data/twitter_davidson)")
tf.app.flags.DEFINE_string("train_dir", "train", "Training directory (default: ./train).")
tf.app.flags.DEFINE_string("log_dir", "log", "Path to store log and flag files (default: ./log)")
tf.app.flags.DEFINE_integer("print_every", 100, "How many iterations to do per print.")
tf.app.flags.DEFINE_integer("save_every", 1200, "How many iterations to do per save checkpoint and evaluate.")
tf.app.flags.DEFINE_string("vocab_path", "data/twitter_davidson/vocab.dat", "Path to vocab file (default: ./data/twitter_davidson/vocab.dat)")
tf.app.flags.DEFINE_string("vocab_stemmed", False, "Whether to use stemmed vocabulary (False for GloVe, True for others)")
tf.app.flags.DEFINE_string("embed_path", "", "Path to the GLoVe embedding (default: ./data/twitter_davidson/embeddings.{embedding_size}d.dat)")
tf.app.flags.DEFINE_string("embed_trainable", False, "Whether to train embeddings (False for GloVe, True for others)")
def initialize_model(session, model, train_dir, seed=42):
tf.set_random_seed(seed)
np.random.seed(seed)
ckpt = tf.train.get_checkpoint_state(train_dir)
v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
logging.info(str(vars(FLAGS)))
if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
logging.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
model.saver.restore(session, ckpt.model_checkpoint_path)
else:
logging.info("Created model with fresh parameters.")
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
session.run(init_op)
logging.info('Num params: %d' % sum(v.get_shape().num_elements() for v in tf.trainable_variables()))
return model
def initialize_vocab(vocab_path):
if tf.gfile.Exists(vocab_path):
rev_vocab = []
with tf.gfile.GFile(vocab_path, mode="rb") as f:
rev_vocab.extend(f.readlines())
# index to word
rev_vocab = [line.strip('\n') for line in rev_vocab]
# word to index
vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
return vocab, rev_vocab
else:
raise ValueError("Vocabulary file %s not found.", vocab_path)
# def load_dataset(x_file, y_file):
# with open(x_file, 'rb') as x_file, open(y_file, 'rb') as y_file:
# data_x = pd.read_csv( x_file, header = None, quoting = 0, dtype = np.float64 )
# data_y = pd.read_csv( y_file, header = 0, quoting = 0, usecols = ['hate_speech'], dtype = np.int32)
# return (data_x.values, data_y.values.ravel())
def load_dataset(x_file, y_file):
with open(x_file, 'rb') as x_file, open(y_file, 'rb') as y_file:
data_x = np.loadtxt( x_file, delimiter = ' ', dtype = int)
if FLAGS.output_size == 2:
data_y = pd.read_csv( y_file, header = 0, quoting = 0, usecols = ['hate_speech'], dtype = int)
elif FLAGS.output_size == 3:
data_y = pd.read_csv( y_file, header = 0, quoting = 0, usecols = ['class'], dtype = int)
data_y = data_y.values.ravel()
# for i in range(len(data_x)/batch_size):
# start = batch_size*i
# end = batch_size*(i+1)
# dataset.append((data_x[start:end], data_y[start:end]))
return data_x, data_y
def main(_):
if FLAGS.embed_path:
FLAGS.embed_path = FLAGS.embed_path.format(FLAGS.embedding_size)
else:
FLAGS.embed_path = pjoin("data", "twitter_davidson", "embeddings.{}d.dat".format(FLAGS.embedding_size))
if not os.path.exists(FLAGS.log_dir):
os.makedirs(FLAGS.log_dir)
file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
logging.getLogger().addHandler(file_handler)
# print(vars(FLAGS))
with open(pjoin(FLAGS.log_dir, "flags.json"), 'w') as fout:
json.dump(FLAGS.__flags, fout)
hb = 'hb.' if FLAGS.model_type == 'hb_append' else ''
stemmed = 'stemmed.' if FLAGS.vocab_stemmed else ''
train_x_path = pjoin(FLAGS.data_dir, "train.ids.{}d.{}{}vec".format(FLAGS.tweet_size, hb, stemmed))
test_x_path = pjoin(FLAGS.data_dir, "test.ids.{}d.{}{}vec".format(FLAGS.tweet_size, hb, stemmed))
print train_x_path, test_x_path
train_y_path = pjoin(FLAGS.data_dir, "train.y")
test_y_path = pjoin(FLAGS.data_dir, "test.y")
train_x, train_y = load_dataset(train_x_path, train_y_path)
test_x, test_y = load_dataset(test_x_path, test_y_path) #FLAGS.batch_size
vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path) # one is list and one is dict
FLAGS.vocab_len = len(vocab)
#lr = SoftmaxClassifier(max_iter=FLAGS.epochs)
#lr = LRClassifierL2(C=100)
# train_and_eval_auc( train_x, train_y, test_x, test_y, model=lr )
hs = HateSpeechSystem( FLAGS, train_x, train_y )
with tf.Session() as sess:
initialize_model(sess, hs, FLAGS.train_dir)
hs.train(sess, train_x, train_y, test_x, test_y)
# get predictions
# write predictions
if __name__ == "__main__":
tf.app.run()