From 036781205e137fbe05ff45ec94b8e24e559866c9 Mon Sep 17 00:00:00 2001 From: kiwansim Date: Tue, 10 Sep 2024 12:34:26 -0400 Subject: [PATCH] sc-15538 flag star exemption changes on scraper --- doffer.ts | 340 ++++++++++++++++++++++++---------- lib/extract-rentstab-units.ts | 8 + lib/extract-star-program.ts | 13 ++ 3 files changed, 261 insertions(+), 100 deletions(-) create mode 100644 lib/extract-star-program.ts diff --git a/doffer.ts b/doffer.ts index 4014a14..fa9f4da 100644 --- a/doffer.ts +++ b/doffer.ts @@ -1,32 +1,53 @@ -import path from 'path'; -import puppeteer from 'puppeteer'; -import dotenv from 'dotenv'; - -import { FileSystemCacheBackend, asTextCache, asJSONCache, asBrotliCache, DOFCache, DOFCacheBackend } from './lib/cache'; -import { BBL } from './lib/bbl'; -import { searchForBBL, gotoSidebarLink, SidebarLinkName, parseNOPVLinks, NOPVLink, SOALink, parseSOALinks } from './lib/dof'; -import { getPageHTML } from './lib/page-util'; -import { download } from './lib/download'; -import { convertPDFToText, PDFToTextFlags, EXPECTED_PDFTOTEXT_VERSION } from './lib/pdf-to-text'; -import { extractNetOperatingIncome } from './lib/extract-noi'; -import { getFirstGeoSearchResult, GeoSearchProperties } from './lib/geosearch'; -import { extractRentStabilizedUnits } from './lib/extract-rentstab-units'; -import { launchBrowser } from './lib/browser'; -import { Log, defaultLog } from './lib/log'; -import { S3CacheBackend } from './lib/cache-s3'; -import { S3Client } from '@aws-sdk/client-s3-node'; -import docopt from 'docopt'; -import { assertNotNull, assertNullOrInt } from './util'; +import path from "path"; +import puppeteer from "puppeteer"; +import dotenv from "dotenv"; + +import { + FileSystemCacheBackend, + asTextCache, + asJSONCache, + asBrotliCache, + DOFCache, + DOFCacheBackend, +} from "./lib/cache"; +import { BBL } from "./lib/bbl"; +import { + searchForBBL, + gotoSidebarLink, + SidebarLinkName, + parseNOPVLinks, + NOPVLink, + SOALink, + parseSOALinks, +} from "./lib/dof"; +import { getPageHTML } from "./lib/page-util"; +import { download } from "./lib/download"; +import { + convertPDFToText, + PDFToTextFlags, + EXPECTED_PDFTOTEXT_VERSION, +} from "./lib/pdf-to-text"; +import { extractNetOperatingIncome } from "./lib/extract-noi"; +import { getFirstGeoSearchResult, GeoSearchProperties } from "./lib/geosearch"; +import { extractRentStabilizedUnits } from "./lib/extract-rentstab-units"; +import { launchBrowser } from "./lib/browser"; +import { Log, defaultLog } from "./lib/log"; +import { S3CacheBackend } from "./lib/cache-s3"; +import { S3Client } from "@aws-sdk/client-s3-node"; +import docopt from "docopt"; +import { assertNotNull, assertNullOrInt } from "./util"; +import { extractStarProgram } from "./lib/extract-star-program"; dotenv.config(); -export const CACHE_DIR = path.join(__dirname, '.dof-cache'); -export const S3_BUCKET = process.env.S3_BUCKET || ''; +export const CACHE_DIR = path.join(__dirname, ".dof-cache"); +export const S3_BUCKET = process.env.S3_BUCKET || ""; export const DISABLE_BROTLI = !!process.env.DISABLE_BROTLI; -export const HTML_CACHE_KEY_PREFIX = process.env.HTML_CACHE_KEY_PREFIX || 'html'; +export const HTML_CACHE_KEY_PREFIX = + process.env.HTML_CACHE_KEY_PREFIX || "html"; const PAGES_UNTIL_BROWSER_RESTART = 1000; -const VERSION = '0.0.1'; +const VERSION = "0.0.1"; const DOC = ` Tool for scraping the NYC DOF website. @@ -41,23 +62,27 @@ Options: type CommandOptions = { scrape: boolean; - '--only-year': string|null; - '--only-nopv': boolean; - '--only-soa': boolean; - '
': string|null; + "--only-year": string | null; + "--only-nopv": boolean; + "--only-soa": boolean; + "
": string | null; }; export type DofferScrapeOptions = { - onlyYear: number|null, - onlySOA: boolean, - onlyNOPV: boolean, + onlyYear: number | null; + onlySOA: boolean; + onlyNOPV: boolean; }; -export function makeLinkFilter({onlyNOPV, onlySOA, onlyYear}: DofferScrapeOptions): linkFilter { +export function makeLinkFilter({ + onlyNOPV, + onlySOA, + onlyYear, +}: DofferScrapeOptions): linkFilter { return (link) => { if (onlyYear && !link.date.startsWith(onlyYear.toString())) return false; - if (onlySOA && link.kind !== 'soa') return false; - if (onlyNOPV && link.kind !== 'nopv') return false; + if (onlySOA && link.kind !== "soa") return false; + if (onlyNOPV && link.kind !== "nopv") return false; return true; }; } @@ -81,13 +106,12 @@ export function getCacheFromEnvironment(): DOFCache { } export class PageGetter { - private browser: puppeteer.Browser|null = null; - private page: puppeteer.Page|null = null; - private bbl: BBL|null = null; + private browser: puppeteer.Browser | null = null; + private page: puppeteer.Page | null = null; + private bbl: BBL | null = null; private pagesRetrieved: number = 0; - constructor(readonly log: Log = defaultLog, readonly useBrowser = true) { - } + constructor(readonly log: Log = defaultLog, readonly useBrowser = true) {} async getPage(bbl: BBL, linkName: SidebarLinkName): Promise { if (!this.useBrowser) { @@ -107,7 +131,7 @@ export class PageGetter { } if (!this.bbl || this.bbl.toString() !== bbl.toString()) { this.bbl = bbl; - if (!await searchForBBL(this.page, this.bbl, this.log)) { + if (!(await searchForBBL(this.page, this.bbl, this.log))) { // NOTE: Do not change this error message, we currently search for // it in SQL queries in dbtool.ts! throw new Error(`DOF property page for BBL ${this.bbl} does not exist`); @@ -117,27 +141,55 @@ export class PageGetter { return getPageHTML(this.page); } - async cachedGetPageHTML(bbl: BBL, linkName: SidebarLinkName, cache: DOFCache, cacheSubkey: string): Promise { + async cachedGetPageHTML( + bbl: BBL, + linkName: SidebarLinkName, + cache: DOFCache, + cacheSubkey: string + ): Promise { return asTextCache(cache).lazyGet( `${HTML_CACHE_KEY_PREFIX}/${bbl.asPath()}/${cacheSubkey}.html`, () => this.getPage(bbl, linkName) ); } - async cachedDownloadPDF(bbl: BBL, url: string, name: string, cache: DOFCache, cacheSubkey: string): Promise { + async cachedDownloadPDF( + bbl: BBL, + url: string, + name: string, + cache: DOFCache, + cacheSubkey: string + ): Promise { return cache.lazyGet(`pdf/${bbl.asPath()}/${cacheSubkey}.pdf`, () => { this.log(`Downloading ${name} PDF...`); return download(url); }); } - async cachedDownloadAndConvertPDFToText(bbl: BBL, url: string, name: string, cache: DOFCache, cacheSubkey: string, extraFlags?: PDFToTextFlags[]): Promise { - const pdfToTextKey = `pdftotext-${EXPECTED_PDFTOTEXT_VERSION}` + (extraFlags || []).join(''); - return asTextCache(cache).lazyGet(`txt/${bbl.asPath()}/${cacheSubkey}_${pdfToTextKey}.txt`, async () => { - const pdfData = await this.cachedDownloadPDF(bbl, url, name, cache, cacheSubkey); - this.log(`Converting ${name} PDF to text...`); - return convertPDFToText(pdfData, extraFlags); - }); + async cachedDownloadAndConvertPDFToText( + bbl: BBL, + url: string, + name: string, + cache: DOFCache, + cacheSubkey: string, + extraFlags?: PDFToTextFlags[] + ): Promise { + const pdfToTextKey = + `pdftotext-${EXPECTED_PDFTOTEXT_VERSION}` + (extraFlags || []).join(""); + return asTextCache(cache).lazyGet( + `txt/${bbl.asPath()}/${cacheSubkey}_${pdfToTextKey}.txt`, + async () => { + const pdfData = await this.cachedDownloadPDF( + bbl, + url, + name, + cache, + cacheSubkey + ); + this.log(`Converting ${name} PDF to text...`); + return convertPDFToText(pdfData, extraFlags); + } + ); } async shutdown() { @@ -158,41 +210,63 @@ export class PageGetter { * Attempt to geolocate the given search text and return the result, using * a cached value if possible. */ -async function cachedGeoSearch(text: string, cache: DOFCache, log: Log = defaultLog): Promise { - const simpleText = text.toLowerCase().replace(/[^a-z0-9\- ]/g, ''); - const cacheKey = `geosearch/${simpleText.replace(/ /g, '_')}.json`; - return asJSONCache(cache).lazyGet(cacheKey, () => { - log(`Geocoding "${simpleText}"...`); - return getFirstGeoSearchResult(simpleText); - }); +async function cachedGeoSearch( + text: string, + cache: DOFCache, + log: Log = defaultLog +): Promise { + const simpleText = text.toLowerCase().replace(/[^a-z0-9\- ]/g, ""); + const cacheKey = `geosearch/${simpleText.replace(/ /g, "_")}.json`; + return asJSONCache(cache).lazyGet( + cacheKey, + () => { + log(`Geocoding "${simpleText}"...`); + return getFirstGeoSearchResult(simpleText); + } + ); } -export type linkFilter = (link: {kind: 'soa'|'nopv', date: string}) => boolean; +export type linkFilter = (link: { + kind: "soa" | "nopv"; + date: string; +}) => boolean; export const defaultLinkFilter: linkFilter = () => true; /** Information about a BBL's Notice of Property Value (NOPV) for a particular period. */ type NOPVInfo = NOPVLink & { /** The BBL's Net Operating Income (NOI) for the period. */ - noi: string|null + noi: string | null; }; /** Retrieves and extracts all information related to a BBL's Notices of Property Value. */ -async function getNOPVInfo(pageGetter: PageGetter, bbl: BBL, cache: DOFCache, filter: linkFilter = defaultLinkFilter): Promise { +async function getNOPVInfo( + pageGetter: PageGetter, + bbl: BBL, + cache: DOFCache, + filter: linkFilter = defaultLinkFilter +): Promise { const results: NOPVInfo[] = []; const page = SidebarLinkName.noticesOfPropertyValue; - const html = await pageGetter.cachedGetPageHTML(bbl, page, cache, 'nopv'); + const html = await pageGetter.cachedGetPageHTML(bbl, page, cache, "nopv"); const links = parseNOPVLinks(html).filter(filter); for (let link of links) { const name = `${link.date} NOPV for BBL ${bbl}`; const cacheSubkey = `nopv-${link.date}`; const extraFlags: PDFToTextFlags[] = ["-layout"]; - const text = await pageGetter.cachedDownloadAndConvertPDFToText(bbl, link.url, name, cache, cacheSubkey, extraFlags); + const text = await pageGetter.cachedDownloadAndConvertPDFToText( + bbl, + link.url, + name, + cache, + cacheSubkey, + extraFlags + ); assertSuccessfulDownloads(text, bbl, cache, cacheSubkey, extraFlags); const noi = extractNetOperatingIncome(text); - results.push({...link, noi}); + results.push({ ...link, noi }); } return results; @@ -206,9 +280,16 @@ async function getNOPVInfo(pageGetter: PageGetter, bbl: BBL, cache: DOFCache, fi // of clearing errors to scrape again. /** If page text is empty (indicates corrupted PDF download), deletes cached PDF and TXT files and throws error */ -function assertSuccessfulDownloads(pageText: string, bbl: BBL, cache: DOFCache, cacheSubkey: string, extraFlags?: PDFToTextFlags[]): void { +function assertSuccessfulDownloads( + pageText: string, + bbl: BBL, + cache: DOFCache, + cacheSubkey: string, + extraFlags?: PDFToTextFlags[] +): void { if (!pageText.length) { - const pdfToTextKey = `pdftotext-${EXPECTED_PDFTOTEXT_VERSION}` + (extraFlags || []).join(""); + const pdfToTextKey = + `pdftotext-${EXPECTED_PDFTOTEXT_VERSION}` + (extraFlags || []).join(""); cache.delete(`pdf/${bbl.asPath()}/${cacheSubkey}.pdf`); cache.delete(`txt/${bbl.asPath()}/${cacheSubkey}_${pdfToTextKey}.txt`); throw new Error(`DOF PDF download for BBL ${bbl} was corrupted`); @@ -216,32 +297,42 @@ function assertSuccessfulDownloads(pageText: string, bbl: BBL, cache: DOFCache, } type SOAInfo = SOALink & { - rentStabilizedUnits: number|null, + rentStabilizedUnits: number | null; + starEnrolled: boolean | null; }; export type PropertyInfo = { - name: string, - borough: string, - bbl: string, - nopv: NOPVInfo[], - soa: SOAInfo[] + name: string; + borough: string; + bbl: string; + nopv: NOPVInfo[]; + soa: SOAInfo[]; }; /** * Returns the URL to the cached PDF for the given BBL and ISO date. */ -export function getCachedSoaPdfUrl(cache: DOFCache, bblString: string, isoDate: string) { +export function getCachedSoaPdfUrl( + cache: DOFCache, + bblString: string, + isoDate: string +) { // This implementation isn't very DRY but I'm not sure how to // make it any more DRY, unfortunately. const bbl = BBL.from(bblString); return cache.urlForKey(`pdf/${bbl.asPath()}/soa-${isoDate}.pdf`); } -async function getSOAInfo(pageGetter: PageGetter, bbl: BBL, cache: DOFCache, filter: linkFilter = defaultLinkFilter): Promise { +async function getSOAInfo( + pageGetter: PageGetter, + bbl: BBL, + cache: DOFCache, + filter: linkFilter = defaultLinkFilter +): Promise { const results: SOAInfo[] = []; const page = SidebarLinkName.propertyTaxBills; - const html = await pageGetter.cachedGetPageHTML(bbl, page, cache, 'soa'); + const html = await pageGetter.cachedGetPageHTML(bbl, page, cache, "soa"); const links = parseSOALinks(html).filter(filter); for (let link of links) { @@ -250,90 +341,139 @@ async function getSOAInfo(pageGetter: PageGetter, bbl: BBL, cache: DOFCache, fil const name = `${link.date} Q1 SOA for BBL ${bbl}`; const cacheSubkey = `soa-${link.date}`; const extraFlags: PDFToTextFlags[] = ["-table"]; - const text = await pageGetter.cachedDownloadAndConvertPDFToText(bbl, link.url, name, cache, cacheSubkey, extraFlags); + const text = await pageGetter.cachedDownloadAndConvertPDFToText( + bbl, + link.url, + name, + cache, + cacheSubkey, + extraFlags + ); assertSuccessfulDownloads(text, bbl, cache, cacheSubkey, extraFlags); const rentStabilizedUnits = extractRentStabilizedUnits(text); - - results.push({...link, rentStabilizedUnits}); + const starEnrolled = extractStarProgram(text); + results.push({ ...link, rentStabilizedUnits, starEnrolled }); } return results; } -export type BasicPropertyInfo = Omit; +export type BasicPropertyInfo = Omit; -export async function getPropertyInfoForBBLWithPageGetter(bbl: BBL, cache: DOFCache, pageGetter: PageGetter, filter: linkFilter = defaultLinkFilter): Promise { +export async function getPropertyInfoForBBLWithPageGetter( + bbl: BBL, + cache: DOFCache, + pageGetter: PageGetter, + filter: linkFilter = defaultLinkFilter +): Promise { const soa = await getSOAInfo(pageGetter, bbl, cache, filter); const nopv = await getNOPVInfo(pageGetter, bbl, cache, filter); - return {bbl: bbl.toString(), nopv, soa}; + return { bbl: bbl.toString(), nopv, soa }; } -async function getPropertyInfoForBBL(bbl: BBL, name: string, borough: string, cache: DOFCache, log: Log = defaultLog, filter: linkFilter = defaultLinkFilter): Promise { +async function getPropertyInfoForBBL( + bbl: BBL, + name: string, + borough: string, + cache: DOFCache, + log: Log = defaultLog, + filter: linkFilter = defaultLinkFilter +): Promise { const pageGetter = new PageGetter(log); try { - return {...await getPropertyInfoForBBLWithPageGetter(bbl, cache, pageGetter, filter), name, borough}; + return { + ...(await getPropertyInfoForBBLWithPageGetter( + bbl, + cache, + pageGetter, + filter + )), + name, + borough, + }; } finally { await pageGetter.shutdown(); } } -export async function getPropertyInfoForAddress(address: string, cache: DOFCache, log: Log = defaultLog, filter: linkFilter = defaultLinkFilter): Promise { +export async function getPropertyInfoForAddress( + address: string, + cache: DOFCache, + log: Log = defaultLog, + filter: linkFilter = defaultLinkFilter +): Promise { const geo = await cachedGeoSearch(address, cache, log); if (!geo) { throw new GracefulError("The search text is invalid."); } const bbl = BBL.from(geo.addendum.pad.bbl); - log(`Searching NYC DOF website for BBL ${bbl} (${geo.name}, ${geo.borough}).`); + log( + `Searching NYC DOF website for BBL ${bbl} (${geo.name}, ${geo.borough}).` + ); return getPropertyInfoForBBL(bbl, geo.name, geo.borough, cache, log, filter); } -async function scrape(searchText: string, log: Log = defaultLog, filter: linkFilter = defaultLinkFilter) { +async function scrape( + searchText: string, + log: Log = defaultLog, + filter: linkFilter = defaultLinkFilter +) { const cache = getCacheFromEnvironment(); console.log(`Using cache ${cache.description}.`); - const rtfl = new Intl.RelativeTimeFormat('en'); + const rtfl = new Intl.RelativeTimeFormat("en"); const start = Date.now(); - const {nopv, soa} = await getPropertyInfoForAddress(searchText, cache, log, filter); - for (let {period, noi} of nopv) { + const { nopv, soa } = await getPropertyInfoForAddress( + searchText, + cache, + log, + filter + ); + for (let { period, noi } of nopv) { if (noi) { log(`The net operating income for ${period} is ${noi}.`); } } - for (let {period, rentStabilizedUnits} of soa) { + for (let { period, rentStabilizedUnits } of soa) { if (rentStabilizedUnits) { - log(`During ${period}, the property had ${rentStabilizedUnits} rent stabilized units.`); + log( + `During ${period}, the property had ${rentStabilizedUnits} rent stabilized units.` + ); } } - const relTime = rtfl.format((Date.now() - start) / 1000.0, 'second'); + const relTime = rtfl.format((Date.now() - start) / 1000.0, "second"); log(`Done ${relTime}.`); } /** The main CLI program. */ async function main(log: Log = defaultLog) { - const options: CommandOptions = docopt.docopt(DOC, {version: VERSION}); + const options: CommandOptions = docopt.docopt(DOC, { version: VERSION }); if (options.scrape) { - const searchText = assertNotNull(options['
']); - - return scrape(searchText, log, makeLinkFilter({ - onlyYear: assertNullOrInt(options['--only-year']), - onlyNOPV: options['--only-nopv'], - onlySOA: options['--only-soa'], - })); + const searchText = assertNotNull(options["
"]); + + return scrape( + searchText, + log, + makeLinkFilter({ + onlyYear: assertNullOrInt(options["--only-year"]), + onlyNOPV: options["--only-nopv"], + onlySOA: options["--only-soa"], + }) + ); } } /** Error subclass that represents a graceful failure of the CLI. */ -export class GracefulError extends Error { -} +export class GracefulError extends Error {} if (module.parent === null) { - main().catch(e => { + main().catch((e) => { if (e instanceof GracefulError) { e.message && console.log(e.message); } else { diff --git a/lib/extract-rentstab-units.ts b/lib/extract-rentstab-units.ts index 5d827cd..b1a23c9 100644 --- a/lib/extract-rentstab-units.ts +++ b/lib/extract-rentstab-units.ts @@ -11,4 +11,12 @@ export function extractRentStabilizedUnits(text: string): number|null { } while (match); return total === 0 ? null : total; + /* TODO: potential solutions to double counting + * 1. return 9999 if there's more than 1 match for manual flagging + Q: are there examples of rent stab units showing up partially to form a total sum? + * 2. execute do while loop just once + * + * test on regex101.com + * (?:Housing-Rent\s+Stabilization|Rent\s+Stabilization(?:\s+Fee)?-\s+Chg)\s+(\d+) + */ } diff --git a/lib/extract-star-program.ts b/lib/extract-star-program.ts new file mode 100644 index 0000000..daaef69 --- /dev/null +++ b/lib/extract-star-program.ts @@ -0,0 +1,13 @@ +export function extractStarProgram(text: string): boolean|null { + const re = /(Star\s+Savings|\w*\s*Star\s+-\s+School\s+Tax\s+Relief)/ig; + let match: RegExpExecArray|null = null; + + do { + match = re.exec(text); + if (match) { + return true; + } + } while (match); + + return false; +}