-
Notifications
You must be signed in to change notification settings - Fork 0
/
load-names.py
87 lines (69 loc) · 2.97 KB
/
load-names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from __future__ import unicode_literals, print_function
import requests
from requests.adapters import HTTPAdapter
# Ordered (group name, WDQ query URL) pairs. The order matters to callers
# that iterate over GROUPS directly.
GROUPS = (
    ('fictional-character', 'https://wdq.wmflabs.org/api?q=claim[31:95074]'),
    ('philosophers', 'https://wdq.wmflabs.org/api?q=claim[106:4964182]'),
    ('warriors', 'https://wdq.wmflabs.org/api?q=claim[106:1250916]'),
    ('explorers', 'https://wdq.wmflabs.org/api?q=claim[106:11900058]'),
    ('monarchs', 'https://wdq.wmflabs.org/api?q=claim[39:116]'),
    # Disabled groups. NOTE(review): unlike the active entries, these use
    # Q-prefixed item ids in the claim -- confirm the syntax before enabling.
    #('engineers', 'https://wdq.wmflabs.org/api?q=claim[106:Q81096]'),
    #('aerospace-engineers', 'https://wdq.wmflabs.org/api?q=claim[106:Q15895020]'),
)

# The same pairs keyed by group name, for looking up one group's query URL.
GROUPS_DICT = {group: query_url for group, query_url in GROUPS}
class Loader(object):
    """Common base for objects that supply item ids and their labels.

    Concrete loaders implement ``get_ids`` and ``get_label_by_id``. The base
    versions raise ``NotImplementedError`` so that a subclass which forgets
    one of them fails with a clear message instead of an ``AttributeError``.
    """

    def get_ids(self, groupname):
        """Return the item ids belonging to the group called *groupname*.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError(
            "{}.get_ids is not implemented".format(type(self).__name__))

    def get_label_by_id(self, item_id):
        """Return the label for *item_id*.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError(
            "{}.get_label_by_id is not implemented".format(type(self).__name__))
class WikipediaLoader(Loader):
    """Loader that queries the network: WDQ for item ids, Wikidata for labels.

    A single ``requests`` session is shared by all calls. Both hosts are
    mounted with an ``HTTPAdapter`` configured with ``max_retries=42``.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.mount('https://www.wikidata.org', HTTPAdapter(max_retries=42))
        self.session.mount('https://wdq.wmflabs.org', HTTPAdapter(max_retries=42))

    def get_ids(self, groupname):
        """Return the ``items`` list of the WDQ reply for *groupname*.

        Raises:
            KeyError: if *groupname* is not a key of ``GROUPS_DICT`` or the
                reply has no ``items`` field.
            requests.HTTPError: if the server answers with an error status.
        """
        response = self.session.get(GROUPS_DICT[groupname])
        # Fail on 4xx/5xx here rather than while parsing an error page as JSON.
        response.raise_for_status()
        return response.json()['items']

    def get_label_by_id(self, item_id):
        """Return the English label of *item_id*, or None when there is none.

        An integer id is turned into the ``Q<number>`` form first; any other
        value is placed into the request URL as given.

        Raises:
            KeyError: if the reply has no ``entities`` field.
            requests.HTTPError: if the server answers with an error status.
        """
        if isinstance(item_id, int):
            item_id = "Q{}".format(item_id)
        url = "https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&languages=en&props=labels&ids={}".format(item_id)
        response = self.session.get(url)
        response.raise_for_status()
        # A reply that does not list the requested id is treated like an
        # entity without an English label (None) instead of a KeyError.
        entity = response.json()['entities'].get(item_id) or {}
        return entity.get('labels', {}).get('en', {}).get('value')
class CacheLoader(Loader):
    """Loader that reads ids and labels from a local cache file.

    Each line is split on ``:``. Field 0 is the group name and field 1 the
    item id. Field 2 is ignored (presumably the simple/complex marker written
    by the commented-out dump line in ``main`` -- verify against the file).
    Everything from field 3 onwards is the label, re-joined with ``:`` so
    labels containing colons survive. The file is decoded as UTF-8.

    ``self.cache`` maps each group name to its list of ids, plus the special
    key ``'_global'`` mapping every id to its label.
    """

    def __init__(self, filename):
        """Parse *filename* into the in-memory cache.

        Lines with fewer than two fields (for example blank lines) are
        skipped. A line without a label field gets an empty label.
        """
        # '_global' always exists, so an empty file still gives a usable cache.
        cache = {'_global': {}}
        # Binary mode plus an explicit decode behaves the same on Python 2
        # and 3; the context manager closes the file when parsing is done.
        with open(filename, 'rb') as handle:
            for raw_line in handle:
                parts = raw_line.strip().decode('utf-8').split(':')
                if len(parts) < 2:
                    continue
                groupname = parts[0]
                item_id = parts[1]
                label = ':'.join(parts[3:])
                cache.setdefault(groupname, []).append(item_id)
                cache['_global'][item_id] = label
        self.cache = cache

    def get_ids(self, groupname):
        """Return the ids cached for *groupname*.

        Raises:
            KeyError: if the cache file had no line for *groupname*.
        """
        return self.cache[groupname]

    def get_label_by_id(self, item_id):
        """Return the cached label for *item_id* (looked up as a string).

        Raises:
            KeyError: if *item_id* is not in the cache.
        """
        return self.cache['_global'][str(item_id)]
import re
# Matches any single character that is not an ASCII letter, an ASCII digit,
# a hyphen or a space. Despite the name, ASCII punctuation such as "." or "'"
# matches as well.
NON_ASCII = re.compile(r"[^A-Za-z0-9\- ]")
def filter_name(name):
    """Tell whether *name* contains no character matched by ``NON_ASCII``.

    Returns True when every character is an ASCII letter, digit, hyphen or
    space (an empty string also passes), and False otherwise.
    """
    rejected = NON_ASCII.search(name)
    return rejected is None
def is_simple_name(label):
    """Return True when *label* looks like a plain two-word name.

    A label qualifies when all of the following hold:

    * no character is matched by ``NON_ASCII`` (only ASCII letters, digits,
      hyphens and spaces are present);
    * it is longer than four characters;
    * splitting it on single spaces yields exactly two parts.

    The result is always a bool; callers that only test truthiness are
    unaffected by no longer getting ``None`` for non-matching labels.
    """
    return (
        NON_ASCII.search(label) is None
        and len(label) > 4
        and len(label.split(' ')) == 2
    )
def main():
    """Print each distinct lower-cased name part found in the cached labels.

    Walks every group in ``GROUPS``, keeps labels accepted by
    ``is_simple_name``, splits them on spaces and prints each part longer
    than three characters the first time it is seen.
    """
    # Use WikipediaLoader() here instead to fetch live data:
    #loader = WikipediaLoader()
    loader = CacheLoader('names.txt')
    seen = set()
    for title, _ in GROUPS:
        for item in loader.get_ids(groupname=title):
            label = loader.get_label_by_id(item)
            if not label or not is_simple_name(label):
                continue
            for word in label.split(' '):
                if len(word) <= 3:
                    continue
                word = word.lower()
                if word in seen:
                    continue
                print(word)
                seen.add(word)
            # Disabled dump of one "group:id:kind:label" record per item;
            # this appears to be the format CacheLoader parses.
            #print("{}:{}:{}:{}".format(title, item, b'simple' if is_simple_name(label) else b'complex', label).encode('utf-8'))
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()