"""Three-stage document classification pipeline: download PubMed documents,
then train or run the stage-one Naive Bayes classifier and the stage-two and
stage-three language-model classifiers."""
import subprocess
import argparse
import json

from torch import cuda

from pytorch_document_classifier import *


def get_gpu_utilization():
    """Return per-GPU utilization percentages reported by nvidia-smi, or None."""
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=utilization.gpu',
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True)
        gpu_utilizations = [int(utilization)
                            for utilization in result.stdout.strip().split('\n')]
        return gpu_utilizations
    except Exception as e:
        print(f'Error getting GPU utilization: {e}')
        return None
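

# With '--format=csv,noheader,nounits', nvidia-smi emits one integer per GPU,
# one per line (e.g. "0\n37"), which the list comprehension above maps to [0, 37].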


def choose_least_utilized_gpu():
    """Return the index of the least-utilized GPU, or None if none is detected."""
    gpu_utilizations = get_gpu_utilization()
    if gpu_utilizations is not None:
        # Find the GPU index with the lowest utilization
        min_utilized_gpu = min(gpu_utilizations)
        print(f'GPU is at {min_utilized_gpu}% capacity')
        # If even the least-busy GPU is in use, bail out rather than contend for it
        if min_utilized_gpu > 0:
            raise Exception('GPU might be too busy')
        min_utilization_index = gpu_utilizations.index(min_utilized_gpu)
        return min_utilization_index
    else:
        return None
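

# Example on a hypothetical two-GPU machine: get_gpu_utilization() -> [0, 37];
# the minimum is 0, so no exception is raised and choose_least_utilized_gpu() -> 0.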


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Process input for data download')
    parser.add_argument('--topic',
                        type=str, help='Topic name')  # Must specify
    parser.add_argument('--download_docs', '-d',
                        action='store_true', default=False, help='Download new data')
    parser.add_argument('--num_ontopic_topic_docs', type=int,  # Must specify
                        help='Max number of documents in each ontopic category')
    parser.add_argument('--num_offtopic_docs', type=int,
                        help='Number of offtopic documents')  # Must specify
    parser.add_argument('--num_unlabeled_docs', type=int,
                        help='Number of unlabeled documents')  # Must specify
    parser.add_argument('--train_stage_one_classifier', action='store_true',
                        default=False, help='Train the Naive Bayes classifier')
    parser.add_argument('--train_stage_two_classifier', action='store_true',
                        default=False, help='Train the stage-two language model')
    parser.add_argument('--train_stage_three_classifier', action='store_true',
                        default=False, help='Train the stage-three language model')
    parser.add_argument('--run_stage_one_classifier', action='store_true',
                        default=False, help='Run the Naive Bayes classifier')
    parser.add_argument('--run_stage_two_classifier', action='store_true',
                        default=False, help='Run the language model classifier')
    parser.add_argument('--model_name', type=str,
                        help='Name of the HuggingFace model', default='biolink')
    parser.add_argument('--use_original_for_stage_two_training', '-use_entire',
                        action='store_true', default=False)  # Pick this or...
    parser.add_argument('--use_stage_one_predictions', '-use_s1_preds',
                        action='store_true', default=False)  # ...this.
    parser.add_argument('--epochs', type=int, default=5,
                        help='Number of training epochs')
    parser.add_argument('--max_pmid', '-max',
                        type=int, default=38000000,
                        help='Maximum PMID number')
    parser.add_argument('--min_pmid', '-min',
                        type=int, default=0,
                        help='Minimum PMID number of offtopic documents')
    parser.add_argument('--min_pmid_unlabeled', '-min_un',
                        type=int, default=37000000,
                        help='Minimum PMID number of unlabeled documents')
    parser.add_argument('--batch_size', '-b_s',
                        type=int, default=16, help='Batch size for the stage 2 LM classifier')
    parser.add_argument('--test_size_stage_1', '-ts1',
                        type=float, default=0.50, help='Test split ratio for stage 1, NBC')
    parser.add_argument('--test_size_stage_2', '-ts2',
                        type=float, default=0.50, help='Test split ratio for stage 2, LM')
    args = parser.parse_args()

    topic = args.topic
    download = args.download_docs
    num_ontopic_docs = args.num_ontopic_topic_docs
    num_offtopic_docs = args.num_offtopic_docs
    num_unlabeled_docs = args.num_unlabeled_docs
    min_pmid = args.min_pmid
    max_pmid = args.max_pmid
    min_pmid_unlabeled = args.min_pmid_unlabeled
    batch_size = args.batch_size
    test_size_stage_1 = args.test_size_stage_1
    test_size_stage_2 = args.test_size_stage_2
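
    # The download and stage flags are designed to be run one phase at a time;
    # see the sketch of a typical end-to-end sequence at the bottom of this file.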

    ########################
    ## Document Download ##
    ########################
    if download:
        ### Download ontopic documents ###
        print('*'*50, '\nobtaining labeled on-topic documents\n', '*'*50, '\n')
        categories_path = f'input/{topic.split("_")[0]}_tree_numbers.json'
        cats_of_pmid_path = f'output/{topic}/category_of_pmids_{topic}.csv'
        pmid_to_cat_path = f'output/{topic}/pmid_to_category_{topic}.json'
        ft_mtrx_path = f'output/{topic}/feature_matrix_{topic}.csv'
        pubmed_doc_cmd = [
            'python', 'get_pubmed_docs.py',
            '--topic', topic,
            '--download_mesh_tree',
            '--get_docs_on_pubmed',
            '--get_pmids_via_mesh',
            '--categories', categories_path,
            '--cats_of_pmids', cats_of_pmid_path,
            '--pmid_to_cat', pmid_to_cat_path,
            '--ft_mtrx_pth', ft_mtrx_path,
            '--max_num_docs', str(num_ontopic_docs)]
        subprocess.run(pubmed_doc_cmd, check=True)
        ### Download offtopic documents ###
        print('\n', '*'*50, '\nobtaining labeled off-topic documents\n', '*'*50, '\n')
        pubmed_offtopic_cmd = [
            'python', 'get_offtopic_or_unlabeled_docs.py',
            '--topic', topic,
            '--num_of_pmids', str(num_offtopic_docs),
            '--get_offtopic_docs',
            '--min_pmid', str(min_pmid),
            '--max_pmid', str(max_pmid),
            '-m2',]
        subprocess.run(pubmed_offtopic_cmd, check=True)
        ### Download unlabeled documents ###
        print('\n', '*'*50, '\nobtaining unlabeled documents\n', '*'*50, '\n')
        pubmed_unlabeled_cmd = [
            'python', 'get_offtopic_or_unlabeled_docs.py',
            '--topic', topic,
            '--num_of_pmids', str(num_unlabeled_docs),
            '--get_unlabeled_docs',
            '--min_pmid', str(min_pmid_unlabeled),]
        subprocess.run(pubmed_unlabeled_cmd, check=True)

    ##########################
    ## Stage One Classifier ##
    ##########################
    # Number of on-topic categories; the off-topic class is one additional label
    with open(f'input/{topic}_tree_numbers.json') as fin:
        off_topic_class_num = len(json.load(fin))

    # Train Model
    if args.train_stage_one_classifier:
        # Load Training Data: the .txt is a one-line pointer file containing
        # the path of the labeled feature-matrix CSV
        path = f'output/{topic}/{topic}_original_feature_matrix_path.txt'
        path_to_labeled_feature_matrix_path = path
        with open(path_to_labeled_feature_matrix_path, 'r') as fin:
            labeled_feature_matrix_path = fin.readlines()[0].strip()

        # Train the Naive Bayes classifier
        print('\n', '*'*50, '\nTraining Naive Bayes classifier', '\n', '*'*50, '\n')
        naive_bayes_cmd = [
            'python', './NBC/NBC.py',
            '--run_mode', 'train_test',
            '--input_path', labeled_feature_matrix_path,
            '--off_topic_class', str(off_topic_class_num),
            '--topic', topic,
            '--out_path', f'output/{topic}',
            '--test_size_stage_one', str(test_size_stage_1),
        ]
        subprocess.run(naive_bayes_cmd, check=True)
    # Inference Time / Deploy Model
    elif args.run_stage_one_classifier:
        # Load unlabeled data: again a one-line pointer file to the feature matrix
        path = f'output/{topic}/{num_unlabeled_docs}_unlabeled_docs_feature_matrix_path.csv'
        path_to_unlabeled_feature_matrix_path = path
        with open(path_to_unlabeled_feature_matrix_path, 'r') as fin:
            unlabeled_feature_matrix_path = fin.readlines()[0].strip()
        naive_bayes_cmd = [
            'python', './NBC/NBC.py',
            '--run_mode', 'predict_unlabeled',
            '--topic', topic,
            '--unlabeled_docs_path', unlabeled_feature_matrix_path,
            '--out_path', f'output/{topic}',
        ]
        subprocess.run(naive_bayes_cmd, check=True)

    ##########################
    ## Stage Two Classifier ##
    ##########################
    ### Pick the GPU to use ###
    # Check for CUDA capability
    if cuda.is_available():
        print('CUDA is available. Using GPU.')
        chosen_gpu_id = choose_least_utilized_gpu()
        if chosen_gpu_id is not None:
            print(f'Chosen GPU: {chosen_gpu_id}')
            # Make the chosen index the default device for subsequent CUDA allocations
            cuda.set_device(chosen_gpu_id)
    else:
        print('!!!! NOTE: CUDA is not available. Using CPU !!!!')
    # Train Model
    direct = f'output/{topic}'
    if args.train_stage_two_classifier:
        mode = 'train'
        stage_2 = True
        if args.use_original_for_stage_two_training:  # Original PubMed-downloaded data
            path = f'{direct}/{topic}_original_feature_matrix_path.txt'
            path_to_feature_matrix_path = path
        else:  # args.use_stage_one_predictions: docs NB predicted on-topic (with ground-truth labels)
            feature_matrix_path = f'{direct}/NBC_test_docs_pred_ontopic_{topic}.csv'
    # Inference Time / Deploy Model
    elif args.run_stage_two_classifier:
        mode = 'inference'
        stage_2 = True
        # Classify the documents labeled on-topic by Naive Bayes
        if args.use_stage_one_predictions:
            path = f'{direct}/{topic}_stage_one_predicted_postive_feature_matrix_path.txt'
            path_to_feature_matrix_path = path
        # Classify the unpredicted original dataset
        else:
            path = f'{direct}/{num_unlabeled_docs}_unlabeled_docs_feature_matrix_path.csv'
            path_to_feature_matrix_path = path
    else:
        stage_2 = False
        # exit('Note: Stage Two Classifier Not Run')
    if stage_2:
        # Load the feature matrix: either read the pointer file chosen above,
        # or fall back to a feature_matrix_path that was set directly
        try:
            with open(path_to_feature_matrix_path, 'r') as fin:
                feature_matrix_path = fin.readlines()[0].strip()
        except (NameError, FileNotFoundError):
            if 'feature_matrix_path' not in locals():
                raise Exception('No feature matrix / training set for stage 2')

        # Process data
        if feature_matrix_path:
            train_test_data = prepare_feature_matrix(feature_matrix_path, test_size_stage_2)
            num_labels = len(set(train_test_data['train']['labels']))
            assert off_topic_class_num == num_labels - 1

            # Run model
            DC_2 = DocumentClassifier(dataset=train_test_data, topic=topic, stage_num='2')
            DC_2.classify_documents(model_name=args.model_name,
                                    epochs=args.epochs,
                                    num_labels=num_labels,
                                    batch_size=args.batch_size,
                                    model_name_suffix=topic + '_' + mode,
                                    lr=3e-5,
                                    save_model=True)

    ############################
    ## Stage Three Classifier ##
    ############################
    # Train Model
    direct = f'output/{topic}'
    if args.train_stage_three_classifier:
        feature_matrix_path = f'{direct}/stage_2_test_docs_pred_ontopic_{topic}.csv'
    else:
        exit('Note: Stage Three Classifier Not Run')

    train_test_data = prepare_feature_matrix(feature_matrix_path)
    num_labels = len(set(train_test_data['train']['labels']))
    assert off_topic_class_num == num_labels - 1

    # Run model
    mode = 'train'
    DC_3 = DocumentClassifier(dataset=train_test_data, topic=topic, stage_num='3')
    DC_3.classify_documents(model_name=args.model_name,
                            epochs=args.epochs,
                            num_labels=num_labels,
                            batch_size=args.batch_size,
                            model_name_suffix=topic + '_' + mode + '_stage_3',
                            lr=3e-5,
                            save_model=True)
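
    # A sketch of a typical end-to-end sequence (hypothetical topic name and
    # document counts, shown for illustration only):
    #   python run_document_classifier.py --topic skin_cancer -d \
    #       --num_ontopic_topic_docs 1000 --num_offtopic_docs 5000 --num_unlabeled_docs 10000
    #   python run_document_classifier.py --topic skin_cancer --train_stage_one_classifier
    #   python run_document_classifier.py --topic skin_cancer --num_unlabeled_docs 10000 \
    #       --run_stage_one_classifier
    #   python run_document_classifier.py --topic skin_cancer --train_stage_two_classifier -use_s1_preds
    #   python run_document_classifier.py --topic skin_cancer --train_stage_three_classifier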