Skip to content

Commit

Permalink
added comment_url and documents fields
Browse files Browse the repository at this point in the history
  • Loading branch information
markdon committed Nov 17, 2023
1 parent 25694c4 commit 8259046
Show file tree
Hide file tree
Showing 4 changed files with 230 additions and 167 deletions.
81 changes: 81 additions & 0 deletions build/db.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    // ES modules already carry their own `default`; wrap plain CJS exports.
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.insertData = exports.fieldNames = void 0;
// Initialise our database
const sqlite3_1 = __importDefault(require("sqlite3"));
/** The field names in the SQL database.
 * Use this for consistent ordering of fields in queries.
 *
 * Order matters:
 * - the first entry becomes the table's PRIMARY KEY (see the CREATE TABLE
 *   statement built from this array below);
 * - insertData() binds values positionally in this exact order.
 */
exports.fieldNames = [
    "council_reference",
    "address",
    "description",
    "info_url",
    "comment_url",
    "date_scraped",
    "on_notice_from",
    "on_notice_to",
    "documents",
];
const db = new sqlite3_1.default.Database("data.sqlite");
db.serialize(function () {
    // One column definition per field; the first field doubles as the
    // primary key so INSERT OR REPLACE can upsert on it.
    const columnDefs = exports.fieldNames
        .map((name, index) => (index === 0 ? `${name} TEXT PRIMARY KEY` : `${name} TEXT`))
        .join(", ");
    const createQuery = `CREATE TABLE IF NOT EXISTS data (${columnDefs})`;
    // Create the table on first run; a no-op when it already exists.
    console.log(`createQuery:`, createQuery);
    db.run(createQuery);
});
// add the documents column if it doesn't exist
db.serialize(function () {
// Check if the column already exists
const checkQuery = `PRAGMA table_info(data)`;
db.all(checkQuery, function (err, rows) {
console.log("🚀 ~ file: index.ts:62 ~ row:", rows);
if (err) {
console.error(err.message);
return;
}
const rowExists = !!rows.find((r) => r.name === "documents");
if (!rowExists) {
// Column doesn't exist, execute the ALTER TABLE statement
db.run(`ALTER TABLE data ADD COLUMN documents TEXT`, function (err) {
if (err) {
console.error(err.message);
return;
}
console.log('Column "documents" added to the table "data"');
});
}
});
});
/**
 * Insert or update the given records in the "data" table.
 *
 * Rows are upserted keyed on the first field (the primary key), so re-running
 * the scraper updates changed records instead of duplicating them.
 *
 * @param data Array of records keyed by the names in exports.fieldNames.
 */
function insertData(data) {
    db.serialize(function () {
        const insertFields = exports.fieldNames.join(", ");
        /** Morph.io appears to persist the database across scraper runs.
         * This should be enough to insert new DAs, update DAs when they change,
         * and keep their data when they are removed from the website.
         */
        const insertQuery = `INSERT OR REPLACE INTO data (${insertFields}) VALUES (${exports.fieldNames
            .map(() => "?")
            .join(", ")})`;
        console.log(`insertQuery:`, insertQuery);
        /** Insert new records */
        const statement = db.prepare(insertQuery);
        data.forEach((record) => {
            // Bind one value per field, in the same order as the column list;
            // deriving the bindings from fieldNames keeps this in sync when
            // fields are added or removed.
            statement.run(...exports.fieldNames.map((field) => record[field]));
        });
        statement.finalize();
        console.log("Inserted/updated", data.length, "records");
    });
}
exports.insertData = insertData;
exports.default = db;
87 changes: 18 additions & 69 deletions build/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,20 @@ const request_promise_1 = __importDefault(require("request-promise"));
const cheerio = __importStar(require("cheerio"));
const sqlite3_1 = __importDefault(require("sqlite3"));
const luxon_1 = require("luxon");
/** The field names in the SQL database.
* Use this for consistent ordering of fields in queries.
*/
const fieldNames = [
'council_reference',
'address',
'description',
'info_url',
'date_scraped',
'on_notice_from',
'on_notice_to',
// 'more_info',
];
const options = {
uri: 'https://www.kingborough.tas.gov.au/development/planning-notices/',
transform: (body) => cheerio.load(body),
};
const db_1 = require("./db");
sqlite3_1.default.verbose();
const info_url = "https://www.kingborough.tas.gov.au/development/planning-notices/";
const comment_url = "mailto:[email protected]";
(async () => {
const $ = await (0, request_promise_1.default)(options);
const $ = await (0, request_promise_1.default)({
uri: "https://www.kingborough.tas.gov.au/development/planning-notices/",
transform: (body) => cheerio.load(body),
});
/** Table rows parsed in to the database fields */
const data = $('#list tbody tr')
const data = $("#list tbody tr")
.toArray()
.map((el) => {
const cells = $(el).find('td');
const cells = $(el).find("td");
/** The first 5 fields are just simple strings */
const strings = cells
.toArray()
Expand All @@ -66,70 +55,30 @@ sqlite3_1.default.verbose();
* - Environmental Impact Assessments
* - etc
*/
const more_info = $(el)
.find('a')
const documents = $(el)
.find("a")
.toArray()
.map((el) => $(el).attr('href'))
.map((el) => $(el).attr("href"))
.filter((s) => !!s);
/** First link is the primary Development Application document */
const info_url = more_info.shift() || '';
/** Assign the string fields to variables */
const [council_reference, address, on_notice_from, on_notice_to, description,] = strings;
return {
council_reference,
address: `${address}, Tasmania`,
description,
info_url,
date_scraped: new Date().toISOString(),
comment_url,
date_scraped: luxon_1.DateTime.now().toISODate(),
on_notice_from:
/** Convert the date strings from localised version to ISO */
luxon_1.DateTime.fromFormat(on_notice_from, 'd MMM yyyy').toISODate() || '',
on_notice_to: luxon_1.DateTime.fromFormat(on_notice_to, 'd MMM yyyy').toISODate() || '',
luxon_1.DateTime.fromFormat(on_notice_from, "d MMM yyyy").toISODate() || "",
on_notice_to: luxon_1.DateTime.fromFormat(on_notice_to, "d MMM yyyy").toISODate() || "",
/** Dump the additional PDF links in to this extra variable
* and figure out what to do with them later 🤷‍♂️.
* morph.io API could be used to access this and download files */
more_info: JSON.stringify(more_info),
documents: JSON.stringify(documents),
};
});
console.log(data);
// Open a database handle
var db = new sqlite3_1.default.Database('data.sqlite');
const createFields = fieldNames
.map((f, i) => {
if (i === 0)
return `${f} TEXT PRIMARY KEY`;
return `${f} TEXT`;
})
.join(', ');
const createQuery = `CREATE TABLE IF NOT EXISTS data (${createFields})`;
const insertFields = fieldNames.join(', ');
/** Morph.io appears to persist the database across scraper runs.
* This should be enough to insert new DAs, update DAs when they change,
* and keep their data when they are removed from the website.
*/
const insertQuery = `INSERT OR REPLACE INTO data (${insertFields}) VALUES (${fieldNames
.map(() => '?')
.join(', ')})`;
db.serialize(function () {
//Create new table
console.log(`createQuery:`, createQuery);
db.run(createQuery);
// TODO: Figure out how to add a column if it doesn't exist
// db.get('SELECT more_info from data', (error) => {
// console.log('error', error);
// if (error) {
// db.run('ALTER TABLE data ADD COLUMN more_info TEXT');
// }
// });
console.log(`insertQuery:`, insertQuery);
/** Insert new records */
var statement = db.prepare(insertQuery);
data.forEach((record) => {
statement.run(record[fieldNames[0]], record[fieldNames[1]], record[fieldNames[2]], record[fieldNames[3]], record[fieldNames[4]], record[fieldNames[5]], record[fieldNames[6]]
// record[fieldNames[7]]
);
});
statement.finalize();
console.log('Inserted/updated', data.length, 'records');
});
(0, db_1.insertData)(data);
})();
105 changes: 105 additions & 0 deletions src/db.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Initialise our database
import sqlite3 from "sqlite3";

/** The field names in the SQL database.
 * Use this for consistent ordering of fields in queries.
 *
 * Order matters:
 * - the first entry becomes the table's PRIMARY KEY (see the CREATE TABLE
 *   statement built from this array below);
 * - insertData() binds values positionally in this exact order.
 */
export const fieldNames = [
  "council_reference",
  "address",
  "description",
  "info_url",
  "comment_url",
  "date_scraped",
  "on_notice_from",
  "on_notice_to",
  "documents",
] as const;

/** Union of all column names, derived from the array above so the type and
 * the runtime list can never drift apart. */
export type FieldNames = (typeof fieldNames)[number];

/** One scraped development application, keyed by column name. */
export type Document = Record<FieldNames, string>;

const db = new sqlite3.Database("data.sqlite");

db.serialize(function () {
  // One column definition per field; the first field doubles as the
  // primary key so INSERT OR REPLACE can upsert on it.
  const columnDefs = fieldNames
    .map((name, i) => (i === 0 ? `${name} TEXT PRIMARY KEY` : `${name} TEXT`))
    .join(", ");
  const createQuery = `CREATE TABLE IF NOT EXISTS data (${columnDefs})`;
  // Create the table on first run; a no-op when it already exists.
  console.log(`createQuery:`, createQuery);
  db.run(createQuery);
});

// Add the "documents" column if it doesn't exist yet: CREATE TABLE IF NOT
// EXISTS does nothing for databases created before the column was added,
// so older tables may still lack it.
db.serialize(function () {
  // List the existing columns of the "data" table.
  const checkQuery = `PRAGMA table_info(data)`;
  db.all(checkQuery, function (err, rows: { name: string }[]) {
    if (err) {
      console.error(err.message);
      return;
    }

    const columnExists = rows.some((r) => r.name === "documents");

    if (!columnExists) {
      // Column doesn't exist, execute the ALTER TABLE statement
      db.run(`ALTER TABLE data ADD COLUMN documents TEXT`, function (err) {
        if (err) {
          console.error(err.message);
          return;
        }
        console.log('Column "documents" added to the table "data"');
      });
    }
  });
});

/**
 * Insert or update the given records in the "data" table.
 *
 * Rows are upserted keyed on the first field (the primary key), so re-running
 * the scraper updates changed records instead of duplicating them.
 *
 * @param data Scraped records, keyed by the names in fieldNames.
 */
export function insertData(data: Document[]) {
  db.serialize(function () {
    const insertFields = fieldNames.join(", ");
    /** Morph.io appears to persist the database across scraper runs.
     * This should be enough to insert new DAs, update DAs when they change,
     * and keep their data when they are removed from the website.
     */
    const insertQuery = `INSERT OR REPLACE INTO data (${insertFields}) VALUES (${fieldNames
      .map(() => "?")
      .join(", ")})`;

    console.log(`insertQuery:`, insertQuery);

    /** Insert new records */
    const statement = db.prepare(insertQuery);
    data.forEach((record) => {
      // Bind one value per field, in the same order as the column list;
      // deriving the bindings from fieldNames keeps this in sync when
      // fields are added or removed.
      statement.run(...fieldNames.map((field) => record[field]));
    });
    statement.finalize();
    console.log("Inserted/updated", data.length, "records");
  });
}

export default db;
Loading

0 comments on commit 8259046

Please sign in to comment.