-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
59 lines (47 loc) · 1.54 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from entity_recognition_datasets.src import stratified_split
from entity_recognition_datasets.src import utils
class DataSource:
    """
    In-memory collection of the items that ``utils.read_conll`` returns for
    the WNUT17 and BTC corpora of a single split.

    Supports ``len()`` and indexing by delegating to the underlying
    ``self.data`` list.
    """

    def __init__(self, split, n=None):
        """
        Load the requested split into ``self.data``.

        :param split: should either be "dev", "test", or "train"
        :param n: when not None, only ``data[:n]`` is kept after loading
        """
        # WNUT17 is always loaded; BTC has no dev set, so it is skipped there.
        path_templates = ["WNUT17-{}"]
        if split != "dev":
            path_templates.append("BTC-{}")

        loaded = []
        for template in path_templates:
            loaded.extend(utils.read_conll(template.format(split)))

        # slice the data as appropriate
        self.data = loaded if n is None else loaded[:n]

    def __len__(self):
        """
        :return: number of items currently held in ``self.data``
        """
        item_count = len(self.data)
        return item_count

    def __getitem__(self, key):
        """
        :param key: index or slice forwarded unchanged to ``self.data``
        :return: the matching item (or sub-list for a slice)
        """
        selected = self.data[key]
        return selected

    def print_dist(self):
        """
        Prints information about the distribution of labels in the sets.

        The counts come from ``utils.get_NER_tagcounts`` for the corpus names
        "WNUT17" and "BTC"; ``self.data`` is not consulted.

        :return: dict where key 0 holds the summed count of the "O" tag and
            key 1 holds the summed count of every other tag
        """
        total = dict.fromkeys((0, 1), 0)
        for corpus_name in ("WNUT17", "BTC"):
            tag_counts = utils.get_NER_tagcounts(corpus_name)
            for tag in tag_counts:
                # bucket 0 collects "O", bucket 1 collects everything else
                bucket = 0 if tag == "O" else 1
                total[bucket] += tag_counts[tag]
        print(total)
        return total
def create_btc_split():
    """
    Ask ``stratified_split.write_new_split`` to write a new split for the
    "BTC" corpus, using the file name "btc" inside the BTC CONLL-format
    ``data_generated`` directory.

    NOTE(review): the positional ``1000`` is presumably the size of the new
    split -- confirm against ``write_new_split``.
    """
    output_dir = "entity_recognition_datasets/data/BTC/CONLL-format/data_generated"
    stratified_split.write_new_split(
        "BTC",
        1000,
        filedir=output_dir,
        filename="btc",
    )