forked from seomoz/dragnet_data
-
Notifications
You must be signed in to change notification settings - Fork 1
/
cetr_to_dragnet.sh
executable file
·122 lines (99 loc) · 2.62 KB
/
cetr_to_dragnet.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash
# Convert a CETR data directory into the dragnet directory layout.
# Takes a single argument: the root directory of the data.

# Abort on the first command that fails.
set -e

# Without a root directory there is nothing to do: show the usage and stop.
[[ -n "$1" ]] || {
    printf '%s\n' "Convert CETR to dragnet format" "" "Usage: $0 rootdir"
    exit 1
}

ROOTDIR=$1
# Convert every CETR data set under $ROOTDIR into the dragnet layout. Each data
# set directory ends up with:
#   HTML/<id>.html                     the original pages
#   Corrected/<id>.html.corrected.txt  the gold-standard text
#   block_corrected/                   created empty
# Only ids that exist in both HTML/ and Corrected/ are kept.
: "${ROOTDIR:?root directory is not set}"

# File ids are purely numeric; anything else is left alone.
numeric_id='^[0-9]+$'

for D in en zh bbc freep myriad nypost nytimes reuters suntimes techweb tribune
do
    # "en" and "zh" live under cleaneval/, everything else under news/
    if [[ "$D" == "en" || "$D" == "zh" ]]
    then
        CLEANEVAL=1
        DATADIR=$ROOTDIR/cleaneval/$D/
    else
        CLEANEVAL=0
        DATADIR=$ROOTDIR/news/$D/
    fi
    echo "Processing $D"

    # for news, CETR has a slightly different directory structure
    # (news/gold/<source> and news/original/<source>), so first rearrange it
    # into <source>/gold and <source>/original
    if [[ $CLEANEVAL == 0 ]]
    then
        mkdir -- "$DATADIR"
        mkdir -- "$DATADIR/gold"
        mkdir -- "$DATADIR/original"
        mv -- "$ROOTDIR/news/gold/$D/"*.txt "$DATADIR/gold"
        mv -- "$ROOTDIR/news/original/$D/"*.html "$DATADIR/original"
    fi

    # Everything below uses relative paths and deletes directories, so never
    # continue if the data set directory could not be entered.
    pushd "$DATADIR" || exit 1

    # rename original -> HTML, keeping only the .html files
    mkdir HTML
    mv -- original/*.html HTML
    rm -r original

    # rename/move the corrected files: gold/<id>.txt becomes
    # Corrected/<id>.html.corrected.txt
    mkdir Corrected
    for gold_file in gold/*.txt
    do
        file_name=${gold_file##*/}
        file_id=${file_name%.txt}
        # also skips the unexpanded pattern when gold/ has no .txt files
        [[ "$file_id" =~ $numeric_id ]] || continue
        cp -- "$gold_file" "Corrected/$file_id.html.corrected.txt"
    done
    rm -r gold

    # block_corrected
    mkdir block_corrected

    # remove all files in HTML that are not in Corrected
    for html_file in HTML/*.html
    do
        file_name=${html_file##*/}
        file_id=${file_name%.html}
        [[ "$file_id" =~ $numeric_id ]] || continue
        if [[ ! -e "Corrected/$file_id.html.corrected.txt" ]]
        then
            rm -- "$html_file"
        fi
    done

    # and vice versa
    for corrected_file in Corrected/*.html.corrected.txt
    do
        file_name=${corrected_file##*/}
        file_id=${file_name%.html.corrected.txt}
        [[ "$file_id" =~ $numeric_id ]] || continue
        if [[ ! -e "HTML/$file_id.html" ]]
        then
            rm -- "$corrected_file"
        fi
    done

    # now need to massage the formatting of the files
    if [[ $CLEANEVAL == 1 ]]
    then
        # need to remove the first line of Corrected file
        # this contains the URL, and we don't need it
        echo "Cleaning corrected files"
        # scratch file in the data set directory; the process id keeps the
        # name from colliding with real data
        scratch_file=.strip_first_line.$$
        for corrected_file in Corrected/*
        do
            [[ -f "$corrected_file" ]] || continue
            # keep everything from line 2 onwards
            tail -n +2 "$corrected_file" > "$scratch_file"
            mv -- "$scratch_file" "$corrected_file"
        done
    fi

    # clean up some things; -f because these directories may not exist
    rm -rf vips
    rm -rf stripped

    popd
done
# The per-source loop has moved the *.txt and *.html files out of news/gold and
# news/original; remove those trees, together with news/vips.
# ${ROOTDIR:?} aborts instead of deleting from "/news/..." if the root is empty.
rm -r -- "${ROOTDIR:?}/news/gold"
rm -r -- "${ROOTDIR:?}/news/original"
rm -r -- "${ROOTDIR:?}/news/vips"