-
Notifications
You must be signed in to change notification settings - Fork 1
/
gs_preprocessor2.py
99 lines (96 loc) · 4.18 KB
/
gs_preprocessor2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#This script processes both the tagged and untagged files in the second half of the gold standard corpus.
#It strips everything but the tokens and part of speech tags from the tagged files
#and produces text files listing both the tagged and untagged files
#in the order they were processed.
#It finally concatenates every processed file into a single file for the tagged
#and another file for the untagged files.
import os
import re
#these are the directories holding the second half of the gold standard corpus:
taggeddir='C:/users/brendan/documents/library and information science/masters_project/gold-standard/second_half_tagged'
untaggeddir='C:/users/brendan/documents/library and information science/masters_project/gold-standard/second_half_untagged'


def list_corpus_files(directory, exclude=()):
    """Return the names of the files directly inside *directory*.

    Only the top level of *directory* is listed (sub-directories such as
    "processed" are not descended into). Any name found in *exclude* is
    left out, so output files from an earlier run are not re-processed.
    The order is whatever os.walk reports.

    Raises StopIteration if *directory* cannot be listed, because os.walk
    then yields nothing.
    """
    filenames = next(os.walk(directory))[2]
    return [name for name in filenames if name not in exclude]


def strip_tagged_row(row):
    """Reduce one row of a tagged file to "token<TAB>tag" plus a newline.

    A row that is empty or only whitespace becomes a bare newline (a blank
    row with no spaces or tabs). Any other row is split on tabs and only
    the first and fourth elements are kept, i.e. the token and the part of
    speech tag.

    The result always ends in exactly one newline, even when the row has
    more than four columns or is the last row of a file without a trailing
    newline; otherwise consecutive rows (or files) would run together.

    Raises ValueError if a non-blank row has fewer than four columns.
    """
    if not row.strip():
        return "\n"
    fields = row.rstrip("\n").split("\t")
    if len(fields) < 4:
        raise ValueError(
            "expected at least 4 tab-separated columns, found %d in row %r"
            % (len(fields), row))
    return fields[0] + "\t" + fields[3] + "\n"


def process_corpus(directory, filenames, suffix, combined_name, names_name,
                   transform=None):
    """Process every file of one corpus half and write the results.

    For each name in *filenames* (looked up inside *directory*) the file is
    read, each row is passed through *transform* (rows are copied unchanged
    when *transform* is None), and the result is written to
    "<directory>/processed/<name up to its first dot><suffix>".

    Afterwards two more files are written to "<directory>/processed":
    *names_name*, listing the full path of every input file in the order it
    was processed (one per line), and *combined_name*, the concatenation of
    every processed file in that same order.

    The "processed" directory is created if it does not exist yet.

    Returns the path of the combined file.
    """
    outdir = directory + "/processed"
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    processed_paths = []
    combined_parts = []
    for ofilename in filenames:
        filename = directory + "/" + ofilename
        with open(filename, 'r') as f:
            #this adds the name of each file to a list of processed files as it is accessed:
            if filename not in processed_paths:
                processed_paths.append(filename)
            print("Reading rows from %s..." % filename)
            if transform is None:
                text = f.read()
            else:
                text = "".join(transform(row) for row in f)
        combined_parts.append(text)
        #here I write each processed file to its own new file:
        newfilename = outdir + "/" + ofilename.split(".")[0] + suffix
        with open(newfilename, 'w') as nf:
            nf.write(text)
        print("File written to %s" % newfilename)

    #here I write a list of the files in the order they were processed to a text file:
    names = outdir + "/" + names_name
    with open(names, 'w') as namesfile:
        namesfile.write("".join(path + "\n" for path in processed_paths))

    #here I write every file from that half to a single file:
    endpath = outdir + "/" + combined_name
    with open(endpath, 'w') as f:
        f.write("".join(combined_parts))
    return endpath


def main():
    """Process the tagged half and then the untagged half of the corpus."""
    #here I access all of the file names in the directory for the second half of the tagged gold standard corpus:
    secondhalfgst = list_corpus_files(
        taggeddir, exclude=('shgstagged.txt', 'shgstaggednames.txt'))
    #this does the same for the untagged files:
    secondhalfgsu = list_corpus_files(
        untaggeddir, exclude=('shgsuntagged.txt', 'shgsuntaggednames.txt'))

    #here I strip everything but the tokens and tags from every file in the tagged second half:
    endpath = process_corpus(
        taggeddir, secondhalfgst, "_tagged.txt",
        "shgstagged.txt", "shgstaggednames.txt",
        transform=strip_tagged_row)
    print("All tagged files written to %s" % endpath)

    #here I deal with the untagged files, which are copied through unchanged:
    endpath = process_corpus(
        untaggeddir, secondhalfgsu, "_untagged.txt",
        "shgsuntagged.txt", "shgsuntaggednames.txt")
    print("All untagged files written to %s" % endpath)


#only run the processing when this file is executed as a script:
if __name__ == "__main__":
    main()