-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #155 from declanrjb/master
data and parsers for new counties
- Loading branch information
Showing
7 changed files
with
51,248 additions
and
0 deletions.
There are no files selected for viewing
30,368 changes: 30,368 additions & 0 deletions
30,368
2024/20240423__pa__primary__blair__precinct.csv
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
15,901 changes: 15,901 additions & 0 deletions
15,901
2024/20240423__pa__primary__centre__precinct.csv
Large diffs are not rendered by default.
Oops, something went wrong.
241 changes: 241 additions & 0 deletions
241
parsers/2024-primary_parsers/pa_blair_primary_2024_results_parser.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 97, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"import pdfplumber\n", | ||
"import re" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 98, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def extract_votes_from_row(data_row,header):\n", | ||
"    \"\"\"Turn one results line into a single-row DataFrame.\n", | ||
"\n", | ||
"    The line must end with five whitespace-separated numeric fields (the second\n", | ||
"    may contain a decimal point; '%' and ',' are not accepted). The text before\n", | ||
"    those fields is the candidate name.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        data_row: text of one table line.\n", | ||
"        header: column names for the numeric fields, in order.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        DataFrame with one column per name in header (values kept as strings)\n", | ||
"        followed by a 'candidate' column.\n", | ||
"\n", | ||
"    Raises:\n", | ||
"        ValueError: if the line does not end with the five numeric fields.\n", | ||
"        IndexError: if header names more columns than there are fields.\n", | ||
"    \"\"\"\n", | ||
"    match = re.search(r'[0-9]+\\s[0-9\\.]+\\s[0-9]+\\s[0-9]+\\s[0-9]+$',data_row)\n", | ||
"    if match is None:\n", | ||
"        raise ValueError(f'row does not end with vote columns: {data_row!r}')\n", | ||
"    votes_ls = match.group(0).split()\n", | ||
"    votes = {header[i]: [votes_ls[i]] for i in range(len(header))}\n", | ||
"    df = pd.DataFrame(votes)\n", | ||
"    # Slice off the matched tail instead of using str.replace, which would also\n", | ||
"    # delete an identical digit run earlier in the line. Strip the result so\n", | ||
"    # exact-match filters on the candidate name are not defeated by the\n", | ||
"    # separator space left in front of the numbers.\n", | ||
"    df['candidate'] = data_row[:match.start()].strip()\n", | ||
"    return df\n", | ||
"\n", | ||
"def extract_votes(data_rows,header):\n", | ||
"    \"\"\"Stack the single-row frames built from every line in data_rows.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        data_rows: result lines, each parsed by extract_votes_from_row.\n", | ||
"        header: column names for the numeric fields of each line.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        One DataFrame with a row per input line. For an empty data_rows the\n", | ||
"        result is an empty frame with the header columns plus 'candidate',\n", | ||
"        because pd.concat raises ValueError when given an empty list.\n", | ||
"    \"\"\"\n", | ||
"    frames = [extract_votes_from_row(data_row,header) for data_row in data_rows]\n", | ||
"    if not frames:\n", | ||
"        return pd.DataFrame(columns=list(header) + ['candidate'])\n", | ||
"    return pd.concat(frames)\n", | ||
"\n", | ||
"def extract_data_rows(table_rows):\n", | ||
"    \"\"\"Select the lines of a contest box that carry vote counts.\n", | ||
"\n", | ||
"    '%' and ',' are removed from every line first. A line is kept, in its\n", | ||
"    cleaned form, only when it ends with five whitespace-separated numeric\n", | ||
"    fields, the second of which may contain a decimal point.\n", | ||
"    \"\"\"\n", | ||
"    vote_tail = re.compile(r'[0-9]+\\s[0-9\\.]+\\s[0-9]+\\s[0-9]+\\s[0-9]+$')\n", | ||
"    cleaned_lines = (line.replace('%','').replace(',','') for line in table_rows)\n", | ||
"    return [line for line in cleaned_lines if vote_tail.search(line) is not None]\n", | ||
"\n", | ||
"def extract_box_data(page,bbox):\n", | ||
"    \"\"\"Parse one contest box of a page into a DataFrame of candidate rows.\n", | ||
"\n", | ||
"    The text inside bbox is read line by line: the first line is taken as the\n", | ||
"    contest title and the third line as the vote-column header.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        page: page object providing crop(bbox).extract_text().\n", | ||
"        bbox: crop rectangle passed unchanged to page.crop.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        The frame built by extract_votes, with 'party', 'district' and 'office'\n", | ||
"        columns added when the matching part of the title is found.\n", | ||
"    \"\"\"\n", | ||
"    table_rows = page.crop(bbox).extract_text().split('\\n')\n", | ||
"    race_title = table_rows[0]\n", | ||
"\n", | ||
"    # split() with no argument collapses runs of whitespace, so the gap left by\n", | ||
"    # removing '%' cannot turn into an empty column name.\n", | ||
"    table_header = table_rows[2].replace('%','').split()\n", | ||
"\n", | ||
"    data_rows = extract_data_rows(table_rows)\n", | ||
"    df = extract_votes(data_rows,table_header)\n", | ||
"\n", | ||
"    # NOTE(review): {,3} also matches zero letters, so this search always\n", | ||
"    # succeeds and yields up to the first three capitals of any title.\n", | ||
"    # Confirm that every contest title starts with a party code.\n", | ||
"    party_match = re.search(r'^[A-Z]{,3}',race_title)\n", | ||
"    if party_match is not None:\n", | ||
"        df['party'] = party_match.group(0)\n", | ||
"\n", | ||
"    # Everything from the first digit to the end of the title.\n", | ||
"    district_match = re.search(r'[0-9]+.*',race_title)\n", | ||
"    if district_match is not None:\n", | ||
"        df['district'] = district_match.group(0)\n", | ||
"\n", | ||
"    # The non-digit run that follows the first whitespace character.\n", | ||
"    office_match = re.search(r'\\s\\D+',race_title)\n", | ||
"    if office_match is not None:\n", | ||
"        df['office'] = office_match.group(0).strip()\n", | ||
"\n", | ||
"    return df\n", | ||
"\n", | ||
"def extract_precinct_name(page,strip_start=70,strip_height=15):\n", | ||
"    \"\"\"Return the text found in a full-width horizontal strip of the page.\n", | ||
"\n", | ||
"    The strip spans the whole page width between the vertical coordinates\n", | ||
"    strip_start and strip_start + strip_height.\n", | ||
"    \"\"\"\n", | ||
"    strip_box = (0,strip_start,page.width,strip_start+strip_height)\n", | ||
"    name_strip = page.crop(strip_box)\n", | ||
"    return name_strip.extract_text()\n", | ||
"\n", | ||
"def extract_page_data(page,title_offset=30,next_gap=20):\n", | ||
"    \"\"\"Parse every contest box on a page into one DataFrame.\n", | ||
"\n", | ||
"    Each 'Vote For' hit marks one contest. The box of a contest starts at the\n", | ||
"    hit's 'bottom' coordinate minus title_offset and ends at the next hit's\n", | ||
"    'top' coordinate minus next_gap, or at page.height for the last contest.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        page: page object providing search(), crop(), width and height.\n", | ||
"        title_offset: amount subtracted from a hit's 'bottom' to place the\n", | ||
"            start of its box.\n", | ||
"        next_gap: amount subtracted from the next hit's 'top' to place the end\n", | ||
"            of the box.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        All contest rows of the page with a 'precinct' column added. A page\n", | ||
"        without any 'Vote For' hit gives an empty DataFrame, because pd.concat\n", | ||
"        raises ValueError when given an empty list.\n", | ||
"    \"\"\"\n", | ||
"    vote_headers = page.search('Vote For')\n", | ||
"    if not vote_headers:\n", | ||
"        return pd.DataFrame()\n", | ||
"\n", | ||
"    all_data = []\n", | ||
"    last_index = len(vote_headers) - 1\n", | ||
"    for i, header_hit in enumerate(vote_headers):\n", | ||
"        box_start = header_hit['bottom'] - title_offset\n", | ||
"        if i < last_index:\n", | ||
"            box_end = vote_headers[i+1]['top'] - next_gap\n", | ||
"        else:\n", | ||
"            box_end = page.height\n", | ||
"        all_data.append(extract_box_data(page,(0,box_start,page.width,box_end)))\n", | ||
"\n", | ||
"    df = pd.concat(all_data)\n", | ||
"    df['precinct'] = extract_precinct_name(page)\n", | ||
"    return df\n", | ||
"\n", | ||
"def extract_statistics(page):\n", | ||
"    \"\"\"Read the registered-voter and ballots-cast totals of a page.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        page: page object providing search(), crop() and width.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A two-row DataFrame with columns 'office' ('Registered Voters' and\n", | ||
"        'Ballots Cast'), 'votes' and 'precinct', or None when the page has no\n", | ||
"        'Statistics' hit.\n", | ||
"\n", | ||
"    Raises:\n", | ||
"        ValueError: if the page has a 'Statistics' hit but one of the two\n", | ||
"            totals cannot be found below it.\n", | ||
"    \"\"\"\n", | ||
"    hits = page.search('Statistics')\n", | ||
"    if not hits:\n", | ||
"        return None\n", | ||
"\n", | ||
"    # The totals are read from a fixed 150-unit band below the first hit.\n", | ||
"    band_start = hits[0]['bottom']\n", | ||
"    stats_text = page.crop((0,band_start,page.width,band_start + 150)).extract_text()\n", | ||
"\n", | ||
"    def read_total(label):\n", | ||
"        # Accept thousands separators: a digits-only pattern stops at the comma\n", | ||
"        # and would record '1,234' as '1'.\n", | ||
"        found = re.search(label + r' - Total ([0-9,]*)',stats_text)\n", | ||
"        if found is None:\n", | ||
"            raise ValueError(f'{label!r} total not found in the Statistics section')\n", | ||
"        return found.group(1).replace(',','')\n", | ||
"\n", | ||
"    stats = pd.DataFrame({\n", | ||
"        'Registered Voters': [read_total('Registered Voters')],\n", | ||
"        'Ballots Cast': [read_total('Ballots Cast')]\n", | ||
"    })\n", | ||
"\n", | ||
"    # One row per statistic, using the same column names as the contest rows.\n", | ||
"    stats = stats.melt().rename(columns={\n", | ||
"        'variable':'office',\n", | ||
"        'value':'votes'\n", | ||
"    })\n", | ||
"\n", | ||
"    stats['precinct'] = extract_precinct_name(page)\n", | ||
"\n", | ||
"    return stats" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 99, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# County covered by this notebook; the name is also used for the output file.\n", | ||
"county_name = 'Blair'\n", | ||
"\n", | ||
"# Precinct summary report to parse, opened once and reused by later cells.\n", | ||
"file = '../data_2024/primary/Blair PA Official-Precinct-Summary-Report.pdf'\n", | ||
"pdf = pdfplumber.open(file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 100, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Parse each page of the report, then stack the per-page frames.\n", | ||
"page_frames = [extract_page_data(page) for page in pdf.pages]\n", | ||
"df = pd.concat(page_frames)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 101, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Every row of this report belongs to the same county.\n", | ||
"df = df.assign(county=county_name)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 102, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Rename the report's vote columns to the output column names.\n", | ||
"df = df.rename(columns={\n", | ||
"    'TOTAL':'votes',\n", | ||
"    'Election':'election_day',\n", | ||
"    'Provision':'provisional',\n", | ||
"    'Mail':'absentee'\n", | ||
"})\n", | ||
"\n", | ||
"# Drop the per-contest summary rows. The parsed candidate text can still carry\n", | ||
"# the space that separated the name from the numbers, so strip it before the\n", | ||
"# exact-match comparison; otherwise the summary rows slip through.\n", | ||
"summary_rows = ['Total Votes Cast','Overvotes','Undervotes','Contest Totals']\n", | ||
"df = df[~df['candidate'].str.strip().isin(summary_rows)]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 103, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# extract_statistics returns None for a page without a 'Statistics' section,\n", | ||
"# so call it once per page and keep only the frames it produced instead of\n", | ||
"# searching every page a second time here.\n", | ||
"page_stats = [extract_statistics(page) for page in pdf.pages]\n", | ||
"stats_df = pd.concat([stats for stats in page_stats if stats is not None])\n", | ||
"stats_df['county'] = county_name" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 104, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df = pd.concat([df,stats_df])\n", | ||
"# Keep only the district number. The pattern is a raw string because '\\d' in a\n", | ||
"# plain string literal is an invalid escape sequence.\n", | ||
"df['district'] = df['district'].str.extract(r'(\\d+)',expand=False)\n", | ||
"df = df.fillna('')\n", | ||
"# drop=True: the old index is not an output column, so do not keep it.\n", | ||
"df = df.reset_index(drop=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 105, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Columns of the output file, in their final order.\n", | ||
"output_columns = [\n", | ||
"    'county','precinct','office','district','party',\n", | ||
"    'candidate','votes','election_day','provisional','absentee'\n", | ||
"]\n", | ||
"df = df[output_columns]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 106, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# data cleaning\n", | ||
"# Office titles are matched in upper case and replaced by their short names.\n", | ||
"office_names = {\n", | ||
"    'President of the United States'.upper():'President',\n", | ||
"    'United States Senator'.upper():'U.S. Senate',\n", | ||
"    'Representative in Congress'.upper():'U.S. House',\n", | ||
"    'Senator in the General Assembly'.upper():'State Senate',\n", | ||
"    'Representative in the General Assembly'.upper():'General Assembly'\n", | ||
"}\n", | ||
"\n", | ||
"df['candidate'] = df['candidate'].str.title().str.strip()\n", | ||
"df['office'] = df['office'].replace(office_names)\n", | ||
"df['precinct'] = df['precinct'].str.title()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 107, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Keep the first occurrence of each fully identical row.\n", | ||
"df = df[~df.duplicated()]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 108, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Write the cleaned results without the DataFrame index.\n", | ||
"output_path = f'../data_cleaned/20240423__pa__primary__{county_name.lower()}__precinct.csv'\n", | ||
"df.to_csv(output_path,index=False)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.8" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.