-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.js
156 lines (127 loc) · 6.51 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// Parses the development applications at the South Australian City of Holdfast Bay web site and
// places them in a database.
//
// Michael Bone
// 2nd August 2018
"use strict";
let cheerio = require("cheerio");
let request = require("request-promise-native");
let sqlite3 = require("sqlite3").verbose();
let moment = require("moment");
const DevelopmentApplicationsUrl = "https://chb-web.t1cloud.com/T1PRDefault/WebApps/eProperty/P1/eTrack/eTrackApplicationSearchResults.aspx?Field=S&Period=L28&r=P1.WEBGUEST&f=%24P1.ETR.SEARCH.SL28";
const DevelopmentApplicationUrl = "https://chb-web.t1cloud.com/T1PRDefault/WebApps/eProperty/P1/eTrack/eTrackApplicationDetails.aspx?r=P1.WEBGUEST&f=%24P1.ETR.APPDET.VIW&ApplicationId=";
const CommentUrl = "mailto:[email protected]";
// Sets up an sqlite database.
async function initializeDatabase() {
return new Promise((resolve, reject) => {
let database = new sqlite3.Database("data.sqlite");
database.serialize(() => {
database.run("create table if not exists [data] ([council_reference] text primary key, [address] text, [description] text, [info_url] text, [comment_url] text, [date_scraped] text, [date_received] text, [on_notice_from] text, [on_notice_to] text)");
resolve(database);
});
});
}
// Inserts a row in the database if it does not already exist.
async function insertRow(database, developmentApplication) {
return new Promise((resolve, reject) => {
let sqlStatement = database.prepare("insert or ignore into [data] values (?, ?, ?, ?, ?, ?, ?, ?, ?)");
sqlStatement.run([
developmentApplication.applicationNumber,
developmentApplication.address,
developmentApplication.reason,
developmentApplication.informationUrl,
developmentApplication.commentUrl,
developmentApplication.scrapeDate,
developmentApplication.receivedDate,
null,
null
], function(error, row) {
if (error) {
console.error(error);
reject(error);
} else {
if (this.changes > 0)
console.log(` Inserted: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\" and reason \"${developmentApplication.reason}\" into the database.`);
else
console.log(` Skipped: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\" and reason \"${developmentApplication.reason}\" because it was already present in the database.`);
sqlStatement.finalize(); // releases any locks
resolve(row);
}
});
});
}
// Parses the development applications.
async function main() {
// Ensure that the database exists.
let database = await initializeDatabase();
// Retrieve the first page.
console.log(`Retrieving page: ${DevelopmentApplicationsUrl}`);
let body = await request(DevelopmentApplicationsUrl);
let $ = cheerio.load(body);
// Examine the HTML to determine how many pages need to be retrieved.
let pageCount = Math.max(1, $("tr.pagerRow td").length - 1);
let eventValidation = $("input[name='__EVENTVALIDATION']").val();
let viewState = $("input[name='__VIEWSTATE']").val();
if (pageCount === 1)
console.log(`There is ${pageCount} page to parse.`)
else
console.log(`There are ${pageCount} pages to parse.`)
// Process the text from each page.
for (let pageIndex = 1; pageIndex <= pageCount; pageIndex++) {
console.log(`Parsing page ${pageIndex} of ${pageCount}.`);
// Retrieve a subsequent page.
if (pageIndex >= 2) {
try {
let body = await request.post({
url: DevelopmentApplicationsUrl,
headers: { "Content-Type": "application/x-www-form-urlencoded" },
form: {
__EVENTARGUMENT: `Page$${pageIndex}`,
__EVENTTARGET: "ctl00$Content$cusResultsGrid$repWebGrid$ctl00$grdWebGridTabularView",
__EVENTVALIDATION: eventValidation,
__VIEWSTATE: viewState
}});
$ = cheerio.load(body);
} catch (ex) {
console.log(ex);
console.log("Continuing to the next page.");
continue;
}
}
// Use cheerio to find all development applications listed in the current page.
for (let element of $("table.grid td a").get()) {
// Check that a valid development application number was provided.
let applicationNumber = element.children[0].data.trim();
if (!/^[0-9]{3}\/[0-9]{5}\/[0-9]{2}$/.test(applicationNumber))
continue;
// Retrieve the page that contains the details of the development application.
let developmentApplicationUrl = DevelopmentApplicationUrl + encodeURIComponent(applicationNumber);
let body = null;
try {
body = await request(developmentApplicationUrl);
} catch (ex) {
console.log(ex);
console.log("Continuing to the next development application.");
continue;
}
// Extract the details of the development application and insert those details into the
// database as a row in a table.
let $ = cheerio.load(body);
let receivedDate = moment($("td.headerColumn:contains('Lodgement Date') ~ td").text().trim(), "D/MM/YYYY", true); // allows the leading zero of the day to be omitted
let address = $($("table.grid th:contains('Address')").parent().parent().find("tr.normalRow td")[0]).text().trim();
let reason = $("td.headerColumn:contains('Description') ~ td").text().trim();
if (address.length > 0) {
await insertRow(database, {
applicationNumber: applicationNumber,
address: address,
reason: reason,
informationUrl: developmentApplicationUrl,
commentUrl: CommentUrl,
scrapeDate: moment().format("YYYY-MM-DD"),
receivedDate: receivedDate.isValid() ? receivedDate.format("YYYY-MM-DD") : ""
});
}
}
}
}
main().then(() => console.log("Complete.")).catch(error => console.error(error));