-
Notifications
You must be signed in to change notification settings - Fork 0
/
thes2lcsh.py
executable file
·76 lines (58 loc) · 1.79 KB
/
thes2lcsh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Cristian Consonni <[email protected]>
# Inspired by this gist by atomotic:
# https://gist.github.com/atomotic/7229203
#
# The code is released with an MIT license
# please see the LICENSE file for details.
import sys
import csv
import requests
import urlparse
import StringIO
import pickle
from produce_enwiki_titles import PICKLE_FILE
# Column names of the '|'-delimited input line given on the command line.
FIELDNAMES_WIKIMAP = (
    'LC_head',
    'relation',
    'enwiki',
)

# Column names of each row appended to OUTFILE.
FIELDNAMES_THES2LSCH = (
    'thes_id',
    'relation',
    'lc_head_id',
    'wikidata',
)

# Base URL of the id.loc.gov label lookup; the LC heading is appended to it.
LOCH_BASEURL = 'http://id.loc.gov/authorities/label/'

# Output file; rows are appended to it on every run.
OUTFILE = 'thes2lcsh.map'
# Load the pickled lookup table whose path is exported by
# produce_enwiki_titles. It is later queried by enwiki title and each
# entry is read through its 'thes_id' and 'wikidata' keys.
with open(PICKLE_FILE, 'r') as pickled_titles:
    enwiki_titles = pickle.load(pickled_titles)
f = StringIO.StringIO(sys.argv[1])
csvin = csv.DictReader(
filter(lambda row: row[0]!='#', f),
FIELDNAMES_WIKIMAP,
delimiter='|'
)
wikimap = [line for line in csvin]
if len(wikimap) == 1:
line = wikimap[0]
print "Process line: ", line
elif len(wikimap) == 0:
print "Discard comments or empty lines", wikimap
exit(0)
else:
print "Error! Line too long: ", wikimap
exit(-1)
finalout = open(OUTFILE, 'a+')
writer = csv.DictWriter(finalout, FIELDNAMES_THES2LSCH)
enwiki = line['enwiki']
if enwiki in enwiki_titles:
resolv = enwiki_titles[enwiki]
req = requests.get(LOCH_BASEURL+line['LC_head'])
if req.ok:
urlpath = urlparse.urlparse(req.url).path.split('/')[-1]
lc_head_no = urlpath.replace('.html', '')
fields = (resolv['thes_id'].strip().strip('"'),
line['relation'],
lc_head_no,
resolv['wikidata']
)
diz =dict(zip(FIELDNAMES_THES2LSCH, fields))
print "Writing: ", diz
writer.writerow(diz)
else:
print "Error with request: ", line
finalout.close()