This repository has been archived by the owner on May 11, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetchbilltext.pl
740 lines (619 loc) · 23.1 KB
/
fetchbilltext.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
#!/usr/bin/perl
# This program uses pdftotext.
require "general.pl";
require "billdiff.pl";
require "db.pl";
# Maps the bill-type token used in GPO FDsys package names (e.g. "hres")
# to the shorter bill-type code used in this project's file names.
my %fdsys_to_gt_billtype = (
    'hr' => 'h', 'hres' => 'hr', 'hjres' => 'hj', 'hconres' => 'hc',
    's'  => 's', 'sres' => 'sr', 'sjres' => 'sj', 'sconres' => 'sc',
);

# Bill text status codes.  GetBillStatusList() concatenates these lists
# and other routines walk the result from the end to find the latest
# available version, so the order of the entries matters.
#
# The codes are quoted strings rather than barewords so that they keep
# working under "use strict" and can never be mistaken for a call to a
# sub that happens to share the name.

my @statuslist_h = (
    'ih',    # Introduced in House
    'ihr',   # Introduced in House-Reprint
    'ih_s',  # Introduced in House (No.) Star Print
    'rih',   # Referral Instructions House
    'rfh',   # Referred in House
    'rfhr',  # Referred in House-Reprint
    'rfh_s', # Referred in House (No.) Star Print
    'rth',   # Referred to Committee House
    'rah',   # Referred w/Amendments House
    'rch',   # Reference Change House
    'rh',    # Reported in House
    'rhr',   # Reported in House-Reprint
    'rh_s',  # Reported in House (No.) Star Print
    'rdh',   # Received in House
    'ash',   # Additional Sponsors House
    'sc',    # Sponsor Change House
    'cdh',   # Committee Discharged House
    'hdh',   # Held at Desk House
    'iph',   # Indefinitely Postponed in House
    'lth',   # Laid on Table in House
    'oph',   # Ordered to be Printed House
    'pch',   # Placed on Calendar House
    'ah',    # Amendment in House (never seen this but have seen 'as'/'as2' on HR 1, 111)
    'ah2',   # Amendment in House (see above)
    'fah',   # Failed Amendment House
    'ath',   # Agreed to House
    'cph',   # Considered and Passed House
    'eh',    # Engrossed in House
    'ehr',   # Engrossed in House-Reprint
    'eh_s',  # Engrossed in House (No.) Star Print [*]
);

my @statuslist_h2 = (
    'eah',   # Engrossed Amendment House
    'reah',  # Re-engrossed Amendment House
);

my @statuslist_s = (
    'is',    # Introduced in Senate
    'isr',   # Introduced in Senate-Reprint
    'is_s',  # Introduced in Senate (No.) Star Print
    'ris',   # Referral Instructions Senate
    'rfs',   # Referred in Senate
    'rfsr',  # Referred in Senate-Reprint
    'rfs_s', # Referred in Senate (No.) Star Print
    'rts',   # Referred to Committee Senate
    'ras',   # Referred w/Amendments Senate
    'rcs',   # Reference Change Senate
    'rs',    # Reported in Senate
    'rsr',   # Reported in Senate-Reprint
    'rs_s',  # Reported in Senate (No.) Star Print
    'rds',   # Received in Senate
    'sas',   # Additional Sponsors Senate
    'cds',   # Committee Discharged Senate
    'hds',   # Held at Desk Senate
    'ips',   # Indefinitely Postponed in Senate
    'lts',   # Laid on Table in Senate
    'ops',   # Ordered to be Printed Senate
    'pcs',   # Placed on Calendar Senate
    'as',    # Amendment in Senate
    'as2',   # Amendment in Senate (again?)
    'ats',   # Agreed to Senate
    'cps',   # Considered and Passed Senate
    'fps',   # Failed Passage Senate
    'es',    # Engrossed in Senate
    'esr',   # Engrossed in Senate-Reprint
    'es_s',  # Engrossed in Senate (No.) Star Print
);

my @statuslist_s2 = (
    'eas',   # Engrossed Amendment Senate
    'res',   # Re-engrossed Amendment Senate
);

my @statuslist_all = (
    're',    # Reprint of an Amendment
    's_p',   # Star (No.) Print of an Amendment
    # 'pp',  # Public Print
    'enr',   # Enrolled Bill
    'renr',  # Re-enrolled
);
# Shared HTML parser used by FetchBillTextHTML2.  Recover mode lets it
# accept the malformed markup found in the fetched pages.
$HTMLPARSER = XML::LibXML->new();
$HTMLPARSER->recover(1);

# Command-line dispatch.  The first argument selects the operation and
# the remaining arguments are passed to it unchanged.  Exactly one
# command runs per invocation: the previous chain of independent "if"
# tests re-examined $ARGV[0] after shifting, so a handler's first
# argument could be mistaken for a second command.  @ARGV is left
# untouched when the first argument is not one of our commands.
my %command_handlers = (
    'FULLTEXT' => \&GetBillFullText,
    'GENERATE' => \&CreateGeneratedBillTexts,
    'SIMHASH'  => \&ComputeSimHashes,
    'FINDSIMS' => \&FindSimilarBills,
);
if (defined($ARGV[0]) && exists($command_handlers{$ARGV[0]})) {
    my $handler = $command_handlers{ shift(@ARGV) };
    $handler->(@ARGV);
}

1;
# GetBillFullText($session, $nopdfs)
#
# Fetches bill text for one Congress.
#
#   $session - Congress number; used in data paths and to filter the
#              GPO sitemap entries.
#   $nopdfs  - when true, only the HTML text is fetched (no PDFs).
#
# For every calendar year of the session the GPO BILLS sitemap is
# downloaded and each bill version listed for this session is passed to
# FetchBillTextPDF and FetchBillTextHTML.  For sessions up to the 102nd
# the sitemap cannot be used, so every known status code is tried for
# every bill file found in the bill data directory.
#
# Returns nothing.  Everything after the bare "return;" (textifying
# PDFs, thumbnails, symlinks) is unreachable; it is kept in place as the
# original source had it.
sub GetBillFullText {
    my ($session, $nopdfs) = @_;

    my $billdir = "../data/us/$session/bills";
    my $textdir = "../data/us/bills.text/$session";

    mkdir $textdir;

    # The current year cannot change meaningfully during a run, so work
    # it out once instead of on every loop iteration, and keep it
    # lexical rather than leaking a package global.
    my $now_year = 1900 + (localtime(time))[5];

    for my $year (YearFromYMD(StartOfSessionYMD($session)) .. YearFromYMD(EndOfSessionYMD($session))) {
        # skip if year is in the future - sitemap file won't exist yet
        next if ($year > $now_year);

        # file may contain bills from a different congress because it is
        # by calendar year, but we are filtering in the regex properly.
        my $response = $UA->get("https://www.gpo.gov/smap/fdsys/sitemap_$year/${year}_BILLS_sitemap.xml");
        if (!$response->is_success) { warn "Could not fetch bill list for $year"; next; }
        my $content = $response->content;
        $HTTP_BYTES_FETCHED += length($content);

        # Dots are escaped and $session is quoted so that only literal
        # package URLs for this session match.
        while ($content =~ m|https://www\.gpo\.gov/fdsys/pkg/BILLS-\Q$session\E([a-z]+)(\d+)([a-z]\w*)/content-detail|g) {
            # Copy the captures before calling anything else.
            my ($fdstype, $number, $status) = ($1, $2, $3);

            # An unmapped type would otherwise be passed on as undef and
            # end up in file paths and URLs.
            my $type = $fdsys_to_gt_billtype{$fdstype};
            if (!defined($type)) {
                warn "Unrecognized bill type '$fdstype' in the $year sitemap; skipping $fdstype$number$status";
                next;
            }

            FetchBillTextPDF($session, $fdstype, $number, $status) if (!$nopdfs);
            FetchBillTextHTML($session, $type, $number, $status);
        }
    }

    if ($session <= 102) {
        # Can't use bill text list from GPO.
        if (opendir(my $bills_dh, $billdir)) {
            foreach my $bill (readdir($bills_dh)) {
                if ($bill !~ /([hsrcj]+)(\d+)\.xml/) { next; }
                my ($type, $number) = ($1, $2);
                foreach my $status (GetBillStatusList($type)) {
                    # It's harder to figure out what statuses are actually available. Just try all.
                    # This might not pick up something like an ats2. Oh well.
                    FetchBillTextHTML($session, $type, $number, $status);
                }
            }
            closedir $bills_dh;
        } else {
            warn "Could not read bill directory $billdir: $!";
        }
    }

    return;

    # ------------------------------------------------------------------
    # UNREACHABLE: nothing below runs because of the return above.
    # ------------------------------------------------------------------

    # Download XML files
    #FetchBillXml($session, $textdir) if ($session >= 108);

    # Textify bills
    if (-e "/usr/bin/pdftotext") {
        foreach my $type (keys(%BillTypePrefix)) {
            opendir BILLS, "$textdir/$type";
            foreach my $bill (readdir(BILLS)) {
                if ($bill !~ /$type(\d+)([a-z]+)\.pdf/) { next; }
                my ($number, $status) = ($1, $2);
                if (-e "$textdir/$type/$type$number$status.txt") { next; }
                print "Textifying $bill\n" if (!$OUTPUT_ERRORS_ONLY);
                system("pdftotext -layout -nopgbrk -enc UTF-8 $textdir/$type/$bill");
                if (!-e "$textdir/$type/$type$number$status.txt") {
                    # PDF-to-text failed. It should have printed something.
                    unlink "$textdir/$type/$bill"; # fetch again next time
                    #open TEXT, ">$textdir/$type$number$status.txt";
                    #print TEXT "There was an error creating the text version of
                    #this bill. Please use the PDF version instead.\n";
                    #close TEXT;
                } else {
                    system("perl billtextlinefixup.pl $textdir/$type/$type$number$status.txt");
                }
            }
            closedir BILLS;
        }
    }

    # Generate thumbnails
    if (-e "/usr/bin/pdftoppm" && $session >= 112) {
        foreach my $type (keys(%BillTypePrefix)) {
            opendir BILLS, "$textdir/$type";
            foreach my $bill (readdir(BILLS)) {
                if ($bill !~ /$type(\d+)([a-z]+)\.pdf/) { next; }
                my ($number, $status) = ($1, $2);
                my $of = "$textdir/$type/$type$number$status-thumb200.png";
                if (-e $of) { next; }
                print "Generating image thumbnail $bill\n" if (!$OUTPUT_ERRORS_ONLY);
                # Facebook requires min size of 200x200, so specify 260 for the long
                # edge so that the short edge is about 200px.
                system("pdftoppm -f 1 -l 1 -scale-to 260 -png $textdir/$type/$bill > $of");
            }
            closedir BILLS;
        }
    }

    # Symlink the latest version to the unstatused files.
    opendir BILLS, "$billdir";
    foreach my $bill (readdir(BILLS)) {
        if ($bill !~ /([hsrcj]+)(\d+)\.xml/) { next; }
        my ($type, $number) = ($1, $2);
        my @stz = GetBillStatusList($type);
        for my $ext ('.pdf', '.txt', '.html', '.xml', ".mods.xml", "-thumb200.png") {
            unlink "$textdir/$type/$type$number$ext";
            for (my $sli = scalar(@stz)-1; $sli>=0; $sli--) {
                my $file = "$type$number$stz[$sli]";
                if (-e "$textdir/$type/$file$ext") {
                    symlink "$file$ext", "$textdir/$type/$type$number$ext";
                    last;
                }
            }
        }
    }
    closedir BILLS;
}
# FetchBillTextPDF($session, $fdstype, $number, $status)
#
# Downloads the PDF of one bill version from GPO FDsys into
# ../data/congress-bill-text-legacy/$session/<type>/.
#
#   $session - Congress number.
#   $fdstype - FDsys bill type token (a key of %fdsys_to_gt_billtype).
#   $number  - bill number.
#   $status  - bill text status code, as it appears in the package name.
#
# An existing PDF is left alone unless the FORCE environment variable is
# set and $session >= 113.  The MODS download that follows is switched
# off by the "&& 0" in its condition and is kept only for reference.
#
# Returns nothing.  On a failed download a warning is issued and the sub
# returns.  (It used to call "next", which, with no loop in this sub,
# jumps to the next iteration of the CALLER's loop and so skipped
# whatever the caller intended to do afterwards for the same bill.)
sub FetchBillTextPDF {
    my ($session, $fdstype, $number, $status) = @_;

    my $type = $fdsys_to_gt_billtype{$fdstype};
    my $basedir = "../data/congress-bill-text-legacy/$session";

    # PDF
    my $pdf_url = "https://www.gpo.gov/fdsys/pkg/BILLS-$session$fdstype$number$status/pdf/BILLS-$session$fdstype$number$status.pdf";
    my $pdf_file = "$basedir/$type/$type$number$status.pdf";
    if (!-e $pdf_file || ($ENV{FORCE} && $session >= 113)) {
        print "Bill Text PDF: $session/$type$number/$status\n" if (!$OUTPUT_ERRORS_ONLY);
        #sleep(1);
        my $response = $UA->get($pdf_url);
        if (!$response->is_success) {
            warn "Could not fetch bill text at $pdf_url: " .
                $response->code . " " .
                $response->message;
            return;
        }
        $HTTP_BYTES_FETCHED += length($response->content);

        mkdir $basedir;
        mkdir "$basedir/$type";
        _WriteFetchedContent($pdf_file, $response->content);
    }

    # MODS
    my $mods_file = "$basedir/$type/$type$number$status.mods.xml";
    if ((!-e $mods_file || $ENV{FORCE}) && 0) {
        # DISABLED - now done by new scrapers
        print "Bill Text MODS: $session/$type$number/$status\n" if (!$OUTPUT_ERRORS_ONLY);
        #sleep(1);

        # Statuses on FDSYS are generally capitalized, but not always, and it seems to be random.
        my $status2 = uc($status);
        my $mods_url = "https://www.gpo.gov/fdsys/pkg/BILLS-${session}${fdstype}${number}${status2}/mods.xml";
        my $response = $UA->get($mods_url);
        if (!$response->is_success || $response->content =~ /Error Detected|nocontent.htm/) {
            $status2 = lc($status);
            $mods_url = "https://www.gpo.gov/fdsys/pkg/BILLS-${session}${fdstype}${number}${status2}/mods.xml";
            $response = $UA->get($mods_url);
        }
        if (!$response->is_success || $response->content =~ /Error Detected|nocontent.htm/) {
            warn "Could not fetch bill text at $mods_url (tried capital/lowercase status): " .
                $response->code . " " .
                $response->message;
            return;
        }
        $HTTP_BYTES_FETCHED += length($response->content);

        mkdir $basedir;
        mkdir "$basedir/$type";
        _WriteFetchedContent($mods_file, $response->content);
    }

    return;
}

# _WriteFetchedContent($file, $content)
#
# Writes downloaded bytes to $file.  Returns 1 on success; on failure it
# warns and returns 0.  A file that could not be written completely is
# removed so that the "file already exists" test does not prevent a
# later run from fetching it again.
sub _WriteFetchedContent {
    my ($file, $content) = @_;

    my $fh;
    if (!open($fh, '>', $file)) {
        warn "Could not open $file for writing: $!";
        return 0;
    }
    binmode($fh);    # downloaded content is raw bytes
    print $fh $content;
    if (!close($fh)) {
        warn "Could not finish writing $file: $!";
        unlink $file;
        return 0;
    }
    return 1;
}
# FetchBillXml($session, $textdir)
#
# Downloads the XML bill texts listed on the THOMAS index page for
# $session into $textdir/<type>/<type><number><status>.xml.  A file that
# already exists is skipped unless the FORCE environment variable is
# set.
#
# Dies if the index or any bill text cannot be fetched, or if a
# downloaded file cannot be written (the write used to go unchecked, so
# a failure silently produced no file).
sub FetchBillXml {
    my ($session, $textdir) = @_;

    #sleep(1);
    print "Retreiving House XML Bill List... \n" if (!$OUTPUT_ERRORS_ONLY);

    my $list_url = "http://thomas.loc.gov/home/gpoxmlc$session/";
    my $list_response = $UA->get($list_url);
    if (!$list_response->is_success) {
        die "Could not fetch XML bill list at $list_url: " .
            $list_response->code . " " .
            $list_response->message;
    }
    $HTTP_BYTES_FETCHED += length($list_response->content);
    my $list = $list_response->content;

    # Every quoted "<type><number>_<status>.xml" name in the index is
    # one bill text.
    while ($list =~ /"([hs][cjr]?)(\d+)_(\w+)\.xml"/g) {
        my ($type, $num, $st) = ($1, $2, $3);

        my $file = "$textdir/$type/$type$num$st.xml";
        next if (-e $file && !$ENV{FORCE});

        #sleep(1);
        print "Bill Text XML: $session/$type$num/$st\n" if (!$OUTPUT_ERRORS_ONLY);
        my $bill_url = "http://thomas.loc.gov/home/gpoxmlc$session/$type$num" . "_$st.xml";
        my $bill_response = $UA->get($bill_url);
        if (!$bill_response->is_success) {
            die "Could not fetch XML bill text at $bill_url: " .
                $bill_response->code . " " .
                $bill_response->message;
        }
        $HTTP_BYTES_FETCHED += length($bill_response->content);

        mkdir "$textdir/$type";
        open(my $xml_fh, '>', $file)
            or die "Could not open $file for writing: $!";
        print $xml_fh $bill_response->content;
        close($xml_fh)
            or die "Could not finish writing $file: $!";
    }

    return;
}
# FetchBillTextHTML($session, $type, $number, $status)
#
# Fetches the THOMAS page for one bill version and hands it to
# FetchBillTextHTML2, which locates the printer-friendly text and saves
# it.  Nothing is fetched when the target .html file already exists,
# unless the FORCE environment variable is set.
#
#   $session - Congress number.
#   $type    - this project's bill type code; "hr" is sent to THOMAS as
#              "hres".
#   $number  - bill number.
#   $status  - bill text status code.
#
# Returns nothing; a failed request produces a warning.
sub FetchBillTextHTML {
    my ($session, $type, $number, $status) = @_;

    my $type2 = $type;
    if ($type2 eq "hr") { $type2 = "hres"; }

    my $file = "../data/congress-bill-text-legacy/$session/$type/$type$number$status.html";
    # "return", not "next": there is no loop in this sub, and "next"
    # would reach into whatever loop the caller is running.
    if (-e $file && !$ENV{FORCE}) { return; }

    print "Bill Text HTML: $session/$type$number/$status\n" if (!$OUTPUT_ERRORS_ONLY);

    # THOMAS started generating pages w/o the temp link if you don't specify Mozilla in the UA
    # NOTE(review): the "from" value looks like an obfuscated e-mail
    # placeholder, and inside double quotes "@protected" is interpolated
    # as an array -- confirm the intended address and escape the "@".
    my $UA = LWP::UserAgent->new(keep_alive => 2, timeout => 30, agent => "Mozilla/4.0 (GovTrack.us scraper)", from => "[email protected]");

    my $URL2 = "http://thomas.loc.gov/cgi-bin/query/z?c$session:$type2$number.$status:";
    my $response = $UA->get($URL2);
    if (!$response->is_success) {
        # Report the URL that was requested; the message used to
        # interpolate $URL, which is never set in this sub.
        warn "Could not fetch bill text at $URL2: " . $response->code . " " . $response->message;
        return;
    }
    my $htmltext = $response->content;
    $HTTP_BYTES_FETCHED += length($htmltext);

    FetchBillTextHTML2($session, $type, $number, $status, $htmltext);
    return;
}
# FetchBillTextHTML2($session, $type, $number, $status, $htmlpage)
#
# Given the THOMAS page for a bill version ($htmlpage), follows its
# "Printer Friendly" link, cuts the bill text out of the returned page,
# cleans up the markup, and writes the result to
# ../data/congress-bill-text-legacy/$session/$type/$type$number$status.html.
#
# Steps:
#   1. find and fetch the printer-friendly page (uses the global $UA);
#   2. skip pages whose status is ea[sh] and whose header marks them as
#      an amendment;
#   3. drop everything before the status line and after the END marker;
#   4. repair markup that confuses the HTML parser, parse it, and let
#      FixBillTextHtml restructure the tree;
#   5. replace backtick/apostrophe quoting with typographic quotes and
#      save the serialized <body>.
#
# Returns nothing.  Warns and returns when the link, the page, the start
# of the text, or the output file is unavailable; dies when the end of
# the text or the parsed <body> cannot be found.
sub FetchBillTextHTML2 {
    my ($session, $type, $number, $status, $htmlpage) = @_;

    my $file = "../data/congress-bill-text-legacy/$session/$type/$type$number$status.html";
    #if (-e $file) { return; }

    mkdir "../data/congress-bill-text-legacy/$session/$type";

    # move to printer friendly page
    if ($htmlpage !~ /<a href="(\/cgi-bin\/query\/C\?[^"]+)"[^>]*>(<em>)?Printer Friendly/i) {
        warn "Could not find the link to the printer friendly display in $session/$type$number/$status";
        return;
    }
    my $URL = "http://thomas.loc.gov" . $1;

    #sleep(1);
    my $response = $UA->get($URL);
    if (!$response->is_success) {
        warn "Could not fetch bill text at $URL: " . $response->code . " " . $response->message;
        return;
    }
    $htmlpage = $response->content;
    $HTTP_BYTES_FETCHED += length($htmlpage);

    if ($status =~ /ea[sh]/ && $htmlpage =~ /^[\w\W]*?<p>([HRESCONJ\.]+ \?\? EA[SH][\n\r])/i) {
        warn "Bill text for $session/$type$number/$status appears to be an amendment. Skipping.";
        return;
    }

    # chop off everything before the status line
    # sometimes IH appears here as RIH
    # sometimes the wrong status code shows up (EH instead of ENR)
    # Each alternative is a substitution that is tried in turn; the
    # first one that succeeds has already trimmed the page.
    if ($htmlpage !~ s/^[\w\W]*?<p>(<em>.<\/em>)?\s*([HRESCONJ\.]+ *$number R?($status|[A-Z]{2,3})(\dS)?(\/PP)?[\n\r])/$2/i
        && $htmlpage !~ s/^[\w\W]*?\n\s*([HRESCONJ\.]+ *$number ?R?($status|[A-Z]{2,3}|IHIS|)(\dS)?<p>[\n\r])/$1/i
        && ($status ne 'enr' || $htmlpage !~ s/^[\w\W]*?<p>\s*([HRESCONJ\.]+ ?$number[\n\r])/$1/i)
        && $htmlpage !~ s/^[\w\W]*?<p>(<h3><b>Suspend the Rules and Pass the Bill)/$1/i
        && (($status ne 'as' && $status ne 'as2') || $htmlpage !~ s/^[\w\W]*?(<p>AMENDMENT NO. <b>\d+<\/b>\s*<p><i39>Purpose: In the nature of a substitute.<\/i39>)/$1/i)
        ) {
        warn "Could not find start of bill text for $session/$type$number/$status";
        return;
    }

    # chop off everything after the end
    if ($htmlpage !~ s/[\n\r]+<p\/><em>END<\/em>[\w\W]*$//) {
        die "Could not find end of bill text for $session/$type$number/$status";
    }

    # make some corrections that trick the HTML parser
    $htmlpage =~ s/<\/?[tb]title>//g;
    $htmlpage =~ s/<\/?b>//g;

    # put <p> tags within the <ul> tags
    $htmlpage =~ s/<p>((<ul>)*)/$1<p>/g;

    # merge paragraphs at common indentation levels into one <ul>
    while ($htmlpage =~ s/<\/ul>(\s*)<ul>/$1/gi) { }

    # there are unescaped ampersands; although 'recover' mode
    # will interpret them OK, we can get rid of some warnings.
    # (The replacement used to be identical to the match, so nothing
    # was actually escaped.)
    $htmlpage =~ s/ \& / \&amp; /g;

    # there are unescaped brackets too (same fix as above)
    $htmlpage =~ s/ < / \&lt; /g;

    $htmlpage = ToUTF8($htmlpage);

    my $doc = $HTMLPARSER->parse_html_string($htmlpage);
    ($doc) = $doc->findnodes('html/body');
    if (!defined($doc)) {
        die "No body node in parsed HTML document for $session/$type$number/$status";
    }

    # This routine does two things:
    #   Making sure <center> elements contain only elements, and not text
    #   directly.
    #   Indenting insertions, which is important because some would otherwise
    #   appear as top-level text, which is confusing for section headings
    #   that get rendered in bold.
    FixBillTextHtml($doc);

    my $html = $doc->toString(1);

    # correct some characters; can't do this earlier exactly because
    # we look for `'s to insert blockquote elements
    # NOTE(review): the replacements are literal non-ASCII characters and
    # no "use utf8" is visible in this file -- confirm they reach the
    # output file in the intended encoding.
    $html =~ s/\``/\“/g;
    $html =~ s/\''/\”/g;
    $html =~ s/\`/\‘/g;
    $html =~ s/\'/\’/g;

    # clean up some spaces in paragraphs
    $html =~ s/(<p>)\s*/$1/g;
    $html =~ s/\s*(<\/p>)/$1/g;

    my $out_fh;
    if (!open($out_fh, '>', $file)) {
        warn "Could not open $file for writing: $!";
        return;
    }
    binmode($out_fh, "utf8");
    print $out_fh $html;
    close($out_fh) or warn "Could not finish writing $file: $!";
    return;
}
# FixBillTextHtml($node, $alreadyinbq)
#
# Walks the children of $node and restructures the tree in place:
#   * inside every <center> child, text nodes and <em> elements are
#     wrapped in <p> elements;
#   * when not already inside a blockquote, a run of siblings that
#     starts at an element whose text begins with a backtick is moved
#     into a new <blockquote>.
# The sub recurses into children; $alreadyinbq is passed as 1 for nodes
# that were moved into a blockquote so that no nested blockquote is
# created for them.  No explicit value is returned.
sub FixBillTextHtml {
my $node = shift;
my $alreadyinbq = shift;
my $child = $node->firstChild;
while ($child) {
if ($child->nodeName eq 'center') {
# Any text/<em> elements inside <center> are wrapped in <p> tags.
my $c = $child->firstChild;
# $lastp is the <p> currently being filled; adjacent text/<em>
# nodes share it, and any other node ends the run.
my $lastp;
while ($c) {
if (ref($c) eq 'XML::LibXML::Text' || $c->nodeName eq 'em') {
if (!$lastp) {
# Start a new <p> at the position of the first wrapped node.
$lastp = $node->ownerDocument->createElement('p');
$child->insertBefore($lastp, $c);
}
$child->removeChild($c);
if (ref($c) eq 'XML::LibXML::Text') {
# Text is copied into the <p>; the detached text node is dropped.
$lastp->appendText($c->textContent);
} else {
$lastp->appendChild($c);
}
# Continue from the <p>, since $c is no longer a child of <center>.
$c = $lastp;
} else {
undef $lastp;
}
$c = $c->nextSibling;
}
}
# Note that a <center> child handled above still goes through the
# checks below.
if (ref($child) eq 'XML::LibXML::Element'
&& $child->textContent =~ /^\s*\`/
&& !$alreadyinbq) {
# Turn "`..." into blockquotes.
my $bq = $node->ownerDocument->createElement('blockquote');
$node->insertBefore($bq, $child);
# The blockquote sits just before $child, so the first sibling moved
# is $child itself.  Keep absorbing siblings while they begin with a
# backtick or are plain text nodes.
while ($bq->nextSibling && ($bq->nextSibling->textContent =~ /^\s*\`/ || ref($bq->nextSibling) eq 'XML::LibXML::Text')) {
my $s = $bq->nextSibling;
$node->removeChild($s);
$bq->appendChild($s);
FixBillTextHtml($s, 1);
}
# Everything moved has already been processed; resume after the
# blockquote.
$child = $bq->nextSibling;
next;
}
FixBillTextHtml($child, $alreadyinbq);
$child = $child->nextSibling;
}
}
# CreateGeneratedBillTexts($session, $onlythisbill)
#
# For each bill file in ../data/us/$session/bills:
#   1. writes a marked-up copy (<name>.gen.html) of every available
#      bill text version, with a "status" attribute on the root element,
#      id attributes added by AddIdAttributesToBillText, and the
#      reference markup produced by BillTextMarkup;
#   2. writes a comparison file for every pair of versions that both
#      have a .gen.html file, using ComputeBillTextChanges.
#
#   $session      - Congress number.
#   $onlythisbill - "" or undef: process every bill, skipping outputs
#                   that already exist;
#                   "ALL": process every bill, skipping outputs modified
#                   less than two days ago;
#                   "<type><number>": process only that bill and always
#                   regenerate its outputs.
#
# Returns nothing.  Files that cannot be parsed or written are reported
# with a warning and skipped (writes used to go unchecked).
sub CreateGeneratedBillTexts {
    my ($session, $onlythisbill) = @_;
    # An omitted argument means the same as "".
    $onlythisbill = "" if (!defined($onlythisbill));

    print "Generating bill diffs and enhanced HTML...\n" if (!$OUTPUT_ERRORS_ONLY);

    my $billdir = "../data/us/$session/bills";
    my $textdir = "../data/us/bills.text/$session";
    my $cmpdir = "../data/us/bills.text.cmp/$session";

    # This isn't working and creates weird directories probably because
    # it's executed with sh and not bash, so the braces are treated
    # literally.
    #system("mkdir -p {$textdir,$cmpdir}/{h,s,hr,sr,hj,sj,hc,sc}");

    my $bills_dh;
    if (!opendir($bills_dh, $billdir)) {
        warn "Could not read bill directory $billdir: $!";
        return;
    }
    my @bill_files = sort(readdir($bills_dh));
    closedir $bills_dh;

    foreach my $bill (@bill_files) {
        if ($bill !~ /([a-z]+)(\d+)\.xml/) { next; }
        my ($type, $number) = ($1, $2);

        if ($onlythisbill ne "" && $onlythisbill ne "ALL" && $onlythisbill ne "$type$number") { next; }

        my @statuses = GetBillStatusList($type);

        # Create a revised XML HTML version that marks up certain
        # things.
        foreach my $status (@statuses) {
            my $infile = "$textdir/$type/$type$number$status.html";
            if (!-e $infile) { next; }

            my $genfile = "$textdir/$type/$type$number$status.gen.html";
            if ($onlythisbill eq "" && -e $genfile) { next; }
            if ($onlythisbill eq "ALL" && -e $genfile && (-M $genfile) < 2) { next; }

            print "$genfile\n";

            my $file;
            eval {
                $file = $XMLPARSER->parse_file("$infile");
            };
            if ($@) {
                warn "$infile: $@";
                next;
            }

            $file->documentElement->setAttribute('status', $status);
            AddIdAttributesToBillText($file, "t0:" . $status);

            my $g = $file->toString;
            $g = BillTextMarkup($g);

            my $gen_fh;
            if (!open($gen_fh, '>', $genfile)) {
                warn "Could not open $genfile for writing: $!";
                next;
            }
            print $gen_fh $g;
            close($gen_fh) or warn "Could not finish writing $genfile: $!";
        }

        # Compare every earlier version with every later one.
        for (my $i = 0; $i < scalar(@statuses)-1; $i++) {
            my $status1 = $statuses[$i];
            my $g1 = "$textdir/$type/$type$number$status1.gen.html";
            if (!-e $g1) { next; }

            for (my $j = $i+1; $j < scalar(@statuses); $j++) {
                my $status2 = $statuses[$j];
                my $g2 = "$textdir/$type/$type$number$status2.gen.html";
                if (!-e $g2) { next; }

                mkdir "$cmpdir";
                mkdir "$cmpdir/$type";

                my $outfile = "$cmpdir/$type/$type${number}_$status1-$status2.xml";
                if ($onlythisbill eq "" && -e $outfile) { next; }
                if ($onlythisbill eq "ALL" && -e $outfile && (-M $outfile) < 2) { next; }

                my $c = ComputeBillTextChanges($session, $type, $number, $status1, $status2);
                my $g = $c->toString(1);
                $g = BillTextMarkup($g);

                my $cmp_fh;
                if (!open($cmp_fh, '>', $outfile)) {
                    warn "Could not open $outfile for writing: $!";
                    next;
                }
                print $cmp_fh $g;
                close($cmp_fh) or warn "Could not finish writing $outfile: $!";
            }
        }
    }

    return;
}
# BillTextMarkup($g)
#
# Takes serialized bill text and returns a copy in which
#   - runs of more than 80 dashes are cut down to 80,
#   - a space is inserted after every 80 consecutive characters that are
#     neither whitespace nor "<" / ">",
#   - U.S. Code citations are wrapped by usctag(),
#   - "Public Law N-M" is wrapped in a <public-law-reference> element.
sub BillTextMarkup {
    my ($markup) = @_;

    # Every substitution below operates on $markup through the alias $_.
    for ($markup) {
        # tame very long unbroken runs
        s/(-{80})-*/$1/g;
        s/([^\s<>]{80})/$1 /g;

        # U.S.C. citations, in "T U.S.C. S (p)" form and in
        # "Section S (p) of title T, United States Code" form
        s/((\d[0-9A-Za-z\-]*) U\.S\.C\. (\d[0-9A-Za-z\-]*)((\s*\([^\) <\&]+\))*))/usctag($1, $2, $3, $4)/eg;
        s/(Section (\d[0-9A-Za-z\-]*)((\s*\([^\) <\&]+\))*) of title ([^\s<\&]+), United States Code)/usctag($1, $5, $2, $3)/egi;

        # public law citations
        s/(Public Law (\d+)-(\d+))/<public-law-reference session="$2" number="$3">$1<\/public-law-reference>/g;
    }

    return $markup;
}
# GetBillStatusList($type)
#
# Returns the list of bill text status codes to consider for a bill of
# the given type.  For types beginning with "h" the House lists come
# first, then the Senate ones; for types beginning with "s" it is the
# other way round.  The codes that apply to both chambers come last in
# either case.
#
# Dies, naming the offending type, when $type begins with neither "h"
# nor "s".
sub GetBillStatusList {
    my ($type) = @_;

    if ($type =~ /^h/) {
        return (@statuslist_h, @statuslist_s, @statuslist_s2, @statuslist_h2, @statuslist_all);
    }
    if ($type =~ /^s/) {
        return (@statuslist_s, @statuslist_h, @statuslist_h2, @statuslist_s2, @statuslist_all);
    }

    die "GetBillStatusList: unrecognized bill type '" . (defined($type) ? $type : '') . "'";
}
# ComputeSimHashes($session, $onlythisbill)
#
# Computes a similarity hash for every available bill text version in
# $session and stores it in the billtextsimhash table.
#
#   $session      - Congress number.
#   $onlythisbill - "", undef or "ALL": every bill; "<type><number>":
#                   only that bill.
#
# For each version the text content of the .html file is written to a
# scratch file, the external shash program is run on it, and the hash is
# stored together with its four 4-hex-digit blocks converted to numbers,
# replacing any row already stored for that version.
#
# Dies if the scratch file cannot be written or shash prints something
# unexpected, because either would store a wrong hash.  A text file that
# cannot be parsed is reported with a warning and skipped.
sub ComputeSimHashes {
    my ($session, $onlythisbill) = @_;
    # An omitted argument means the same as "".
    $onlythisbill = "" if (!defined($onlythisbill));

    GovDBOpen();

    print "Computing simhashes...\n" if (!$OUTPUT_ERRORS_ONLY);

    my $billdir = "../data/us/$session/bills";
    my $textdir = "../data/us/bills.text/$session";
    my $scratch = "/tmp/govtrack-simhash.txt";

    # Read the listing up front and close the handle, which used to be
    # left open.
    my @bill_files;
    if (opendir(my $bills_dh, $billdir)) {
        @bill_files = sort(readdir($bills_dh));
        closedir $bills_dh;
    } else {
        warn "Could not read bill directory $billdir: $!";
    }

    foreach my $bill (@bill_files) {
        if ($bill !~ /([a-z]+)(\d+)\.xml/) { next; }
        my ($type, $number) = ($1, $2);

        if ($onlythisbill ne "" && $onlythisbill ne "ALL" && $onlythisbill ne "$type$number") { next; }

        foreach my $status (GetBillStatusList($type)) {
            my $infile = "$textdir/$type/$type$number$status.html";
            if (!-e $infile) { next; }

            # Compute simhash. Get the text content of the original
            # HTML version, put that in a file, and run a simhash
            # program. Then put the result into a database.

            my $doc;
            eval {
                $doc = $XMLPARSER->parse_file($infile);
            };
            if ($@) {
                warn "$infile: $@";
                next;
            }

            open(my $dat_fh, '>', $scratch)
                or die "Could not open $scratch for writing: $!";
            binmode($dat_fh, ":utf8");
            print $dat_fh $doc->textContent;
            close($dat_fh)
                or die "Could not finish writing $scratch: $!";

            my $shash_output = `simhash/shash-0.3/shash /tmp/govtrack-simhash.txt`;
            #unlink "/tmp/govtrack-simhash.txt";

            # Expected output: sixteen characters followed by a space.
            my ($simhash, @blocks) = ($shash_output =~ /^((....)(....)(....)(....)) /);
            if (!defined($simhash)) {
                die "Unexpected output from shash for $infile: "
                    . (defined($shash_output) ? $shash_output : '');
            }
            my ($b1, $b2, $b3, $b4) = map { hex($_) } @blocks;

            print "$infile $simhash\n";

            DBDelete('billtextsimhash', ["session=$session and type='$type' and number='$number' and status='$status'"]);
            DBInsert('billtextsimhash',
                session => $session, type => $type, number => $number, status => $status,
                simhash => $simhash, block1 => $b1, block2 => $b2, block3 => $b3, block4 => $b4);
        }
    }

    DBClose();
    return;
}
# FindSimilarBills($session, $type, $number, $status2)
#
# Prints the bills whose stored similarity hash is close to that of any
# version of the given bill (or only version $status2 when one is
# given).  Each output line has the form
#   "<session> <type><number> <distance> <our hash>/<their hash>".
sub FindSimilarBills {
    my ($session, $type, $number, $status2) = @_;

    GovDBOpen();

    # Gather the hashes of the versions we compare from and, for each,
    # a WHERE fragment selecting rows that agree on at least three of
    # the four hash blocks.  The leading '0' makes the clause select
    # nothing when no version has a stored hash.
    my @hashes;
    my $comp = '0';
    for my $status (GetBillStatusList($type)) {
        next if ($status2 && $status2 ne $status);

        # Stored hash and blocks for this version, if there are any.
        my ($hash, $b1, $b2, $b3, $b4) = DBSelectFirst(billtextsimhash,
            ["simhash, block1, block2, block3, block4"],
            ["session=$session and type='$type' and number='$number' and status='$status'"]);
        next if (!$hash);

        push @hashes, $hash;
        $comp .= " OR (block1=$b1 and block2=$b2 and block3=$b3)"
               . " or (block1=$b1 and block2=$b2 and block4=$b4)"
               . " or (block1=$b1 and block3=$b3 and block4=$b4)"
               . " or (block2=$b2 and block3=$b3 and block4=$b4)";
    }

    # Candidates: rows matching three of four blocks of any of our
    # versions, so all differing bits lie within a single block.  The
    # status of the candidate is not taken into account.
    my @results = DBSelect(billtextsimhash,
        ["session, type, number, status, simhash"],
        [$comp]
        );

    # Keep a candidate only if its hash is within a Hamming distance of
    # 4 of one of our hashes, and report each bill at most once.
    my %matches;
    for my $row (@results) {
        my ($s, $t, $n, $st, $h) = @$row;
        my $bill_key = "$s$t$n";
        next if ($matches{$bill_key});

        # Closest of our hashes to this candidate; 64 is the starting
        # value for the minimum.
        my ($mind, $minh) = (64, undef);
        for my $hash (@hashes) {
            my $d = hamming($hash, $h);
            ($mind, $minh) = ($d, $hash) if ($d < $mind);
        }
        next if ($mind > 4);

        $matches{$bill_key} = 1;
        print "$s $t$n $mind $minh/$h\n";
    }

    DBClose();
}
# hamming($a, $b)
#
# Returns the number of differing bits between two hexadecimal strings
# of the same length, comparing them two hex digits (one byte) at a
# time.  Dies when the lengths differ.
sub hamming {
    my ($left, $right) = @_;
    my $distance = 0;

    die if (length($left) != length($right));

    for (my $pos = 0; $pos < length($left); $pos += 2) {
        # Bits set in $diff are exactly the bits on which the bytes disagree.
        my $diff = hex(substr($left, $pos, 2)) ^ hex(substr($right, $pos, 2));
        while ($diff) {
            $distance += $diff & 1;
            $diff >>= 1;
        }
    }

    return $distance;
}
# usctag($text, $title, $section, $paragraph)
#
# Returns $text wrapped in a <usc-reference> element carrying the
# title, the section and the paragraph identifier; the identifier is
# $paragraph with tags removed, converted by splitUSCGraphId().
sub usctag {
    my ($text, $title, $section, $paragraph) = @_;

    # tags turn up in the paragraph part only rarely; drop them
    $paragraph =~ s/<[^>]+>//g;
    my $graph_id = splitUSCGraphId($paragraph);

    return sprintf(
        '<usc-reference title="%s" section="%s" paragraph="%s">%s</usc-reference>',
        $title, $section, $graph_id, $text
    );
}
# splitUSCGraphId($x)
#
# Turns a paragraph designation such as "(a)(1)" into "a_1": the string
# is split on runs of parentheses and whitespace and the non-empty
# pieces are joined with underscores.
sub splitUSCGraphId {
    my ($designation) = @_;

    # The split can yield an empty piece (before a leading delimiter);
    # empty pieces are not part of the identifier.
    my @pieces = grep { $_ ne '' } split(/[()\s]+/, $designation);

    return join("_", @pieces);
}