Skip to content

Commit

Permalink
Merge pull request #155 from declanrjb/master
Browse files Browse the repository at this point in the history
data and parsers for new counties
  • Loading branch information
dwillis authored Aug 18, 2024
2 parents be9545f + 8fb707b commit aa20dcf
Show file tree
Hide file tree
Showing 7 changed files with 51,248 additions and 0 deletions.
30,368 changes: 30,368 additions & 0 deletions 2024/20240423__pa__primary__blair__precinct.csv

Large diffs are not rendered by default.

4,005 changes: 4,005 additions & 0 deletions 2024/20240423__pa__primary__carbon__precinct.csv

Large diffs are not rendered by default.

15,901 changes: 15,901 additions & 0 deletions 2024/20240423__pa__primary__centre__precinct.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import pdfplumber\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"def extract_votes_from_row(data_row,header):\n",
"    \"\"\"Turn one result line into a single-row DataFrame.\n",
"\n",
"    The trailing five numeric fields of `data_row` are stored (as strings)\n",
"    under the labels in `header`, in order; the text in front of them becomes\n",
"    the `candidate` column. Raises ValueError when the line does not end in\n",
"    five numeric fields.\n",
"    \"\"\"\n",
"    match = re.search(r'[0-9]+\\s[0-9\\.]+\\s[0-9]+\\s[0-9]+\\s[0-9]+$',data_row)\n",
"    if match is None:\n",
"        raise ValueError(f'no vote columns found in row: {data_row!r}')\n",
"    # split() rather than split(' '): the pattern's whitespace class also accepts tabs\n",
"    values = match.group(0).split()\n",
"    df = pd.DataFrame({label: [values[i]] for i, label in enumerate(header)})\n",
"    # Slice at the match instead of str.replace so identical digits inside the\n",
"    # name are untouched, and strip so exact-match filters on candidate work.\n",
"    df['candidate'] = data_row[:match.start()].strip()\n",
"    return df\n",
"\n",
"def extract_votes(data_rows,header):\n",
"    \"\"\"Stack the parsed rows of one contest into a single DataFrame.\n",
"\n",
"    Returns an empty frame with the expected columns when `data_rows` is\n",
"    empty, because pd.concat raises on an empty list.\n",
"    \"\"\"\n",
"    if not data_rows:\n",
"        return pd.DataFrame(columns=list(header) + ['candidate'])\n",
"    return pd.concat([extract_votes_from_row(data_row,header) for data_row in data_rows])\n",
"\n",
"def extract_data_rows(table_rows):\n",
"    \"\"\"Return the cleaned lines of `table_rows` that end in five numeric fields.\n",
"\n",
"    Percent signs and commas are removed from every line before it is tested,\n",
"    and the cleaned line is what gets returned.\n",
"    \"\"\"\n",
"    vote_columns = re.compile(r'[0-9]+\\s[0-9\\.]+\\s[0-9]+\\s[0-9]+\\s[0-9]+$')\n",
"    cleaned = (row.replace('%','').replace(',','') for row in table_rows)\n",
"    return [row for row in cleaned if vote_columns.search(row)]\n",
"\n",
"def extract_box_data(page,bbox):\n",
"    \"\"\"Parse the contest inside `bbox` on `page` into a DataFrame.\n",
"\n",
"    The first line of the cropped text is treated as the race title and the\n",
"    third line as the column header. `party` is always set from the title;\n",
"    `district` and `office` are only set when the title contains a match.\n",
"    \"\"\"\n",
"    table_rows = page.crop(bbox).extract_text().split('\\n')\n",
"    race_title = table_rows[0]\n",
"\n",
"    # split() collapses the gap left behind once the '%' marker is removed\n",
"    table_header = table_rows[2].replace('%','').split()\n",
"\n",
"    df = extract_votes(extract_data_rows(table_rows),table_header)\n",
"\n",
"    # {,3} also matches the empty string, so this search always succeeds\n",
"    df['party'] = re.search(r'^[A-Z]{,3}',race_title).group(0)\n",
"    district = re.search(r'[0-9]+.*',race_title)\n",
"    if district is not None:\n",
"        df['district'] = district.group(0)\n",
"    office = re.search(r'\\s\\D+',race_title)\n",
"    if office is not None:\n",
"        df['office'] = office.group(0).strip()\n",
"\n",
"    return df\n",
"\n",
"def extract_precinct_name(page,strip_start=70,strip_height=15):\n",
"    \"\"\"Return the text inside a full-width horizontal strip of `page`.\n",
"\n",
"    The strip spans from `strip_start` to `strip_start + strip_height` on the\n",
"    vertical axis of the crop box.\n",
"    \"\"\"\n",
"    strip = (0,strip_start,page.width,strip_start+strip_height)\n",
"    return page.crop(strip).extract_text()\n",
"\n",
"def extract_page_data(page):\n",
"    \"\"\"Parse every contest on `page` and tag the rows with the precinct name.\n",
"\n",
"    Contests are located through their 'Vote For' lines. A page without any\n",
"    yields an empty DataFrame, because pd.concat raises on an empty list.\n",
"    \"\"\"\n",
"    vote_headers = page.search('Vote For')\n",
"    if not vote_headers:\n",
"        return pd.DataFrame()\n",
"\n",
"    # Each box starts 30 units above the bottom of its 'Vote For' line and stops\n",
"    # 20 units above the top of the next one; the last box runs to the page bottom.\n",
"    box_ends = [following['top']-20 for following in vote_headers[1:]] + [page.height]\n",
"    all_data = [\n",
"        extract_box_data(page,(0,current['bottom']-30,page.width,box_end))\n",
"        for current, box_end in zip(vote_headers,box_ends)\n",
"    ]\n",
"\n",
"    df = pd.concat(all_data)\n",
"    df['precinct'] = extract_precinct_name(page)\n",
"    return df\n",
"\n",
"def extract_statistics(page):\n",
"    \"\"\"Return registered-voter and ballots-cast totals for `page`.\n",
"\n",
"    The result has one row per figure with `office`, `votes` and `precinct`\n",
"    columns. Returns None when the page has no 'Statistics' text.\n",
"    \"\"\"\n",
"    hits = page.search('Statistics')\n",
"    if not hits:\n",
"        return None\n",
"\n",
"    top = hits[0]['bottom']\n",
"    # keep the 150-unit strip inside the page so the crop box stays valid\n",
"    bbox = (0,top,page.width,min(top + 150,page.height))\n",
"    # drop thousands separators so ([0-9]*) captures the whole number\n",
"    stats_text = page.crop(bbox).extract_text().replace(',','')\n",
"\n",
"    registered = re.search(r'Registered Voters - Total ([0-9]*)',stats_text).group(1)\n",
"    ballots = re.search(r'Ballots Cast - Total ([0-9]*)',stats_text).group(1)\n",
"\n",
"    stats = pd.DataFrame({\n",
"        'office': ['Registered Voters','Ballots Cast'],\n",
"        'votes': [registered,ballots]\n",
"    })\n",
"    stats['precinct'] = extract_precinct_name(page)\n",
"    return stats"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"# County label used for the output rows and file name, plus the source report\n",
"county_name = 'Blair'\n",
"file = '../data_2024/primary/Blair PA Official-Precinct-Summary-Report.pdf'\n",
"pdf = pdfplumber.open(file)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"# Parse every page of the report and stack the per-page frames\n",
"df = pd.concat(list(map(extract_page_data,pdf.pages)))"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"# Tag every row with the county being processed\n",
"df = df.assign(county=county_name)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"# Map the report's column labels onto the output column names\n",
"df = df.rename(columns={\n",
"    'TOTAL':'votes',\n",
"    'Election':'election_day',\n",
"    'Provision':'provisional',\n",
"    'Mail':'absentee'\n",
"})\n",
"# Candidate text can still carry surrounding whitespace at this point, so\n",
"# strip it before the exact comparison against the summary-row labels\n",
"summary_rows = ['Total Votes Cast','Overvotes','Undervotes','Contest Totals']\n",
"df = df[~df['candidate'].str.strip().isin(summary_rows)]"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"# Collect the statistics block from each page that contains one\n",
"stats_df = pd.concat([extract_statistics(page) for page in pdf.pages if page.search('Statistics')])\n",
"stats_df = stats_df.assign(county=county_name)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"# Append the per-precinct statistics rows to the contest rows\n",
"df = pd.concat([df,stats_df])\n",
"# Raw string: a backslash-d in a plain literal is an invalid escape sequence.\n",
"# expand=False returns a Series, which is what a single column expects.\n",
"df['district'] = df['district'].str.extract(r'(\\d+)',expand=False)\n",
"df = df.fillna('')\n",
"# drop=True: the old row labels are not wanted as an extra 'index' column\n",
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the output columns, in their final order\n",
"output_columns = ['county','precinct','office','district','party','candidate','votes','election_day','provisional','absentee']\n",
"df = df.loc[:,output_columns]"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"# data cleaning\n",
"office_names = {\n",
"    'President of the United States':'President',\n",
"    'United States Senator':'U.S. Senate',\n",
"    'Representative in Congress':'U.S. House',\n",
"    'Senator in the General Assembly':'State Senate',\n",
"    'Representative in the General Assembly':'General Assembly'\n",
"}\n",
"df['candidate'] = df['candidate'].str.title().str.strip()\n",
"# the long office titles are upper-cased before they are matched\n",
"df['office'] = df['office'].replace({long_name.upper(): short_name for long_name, short_name in office_names.items()})\n",
"\n",
"df['precinct'] = df['precinct'].str.title()"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"# Keep the first occurrence of every fully identical row\n",
"df = df[~df.duplicated()]"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"# Write the cleaned results without the row index\n",
"output_path = f'../data_cleaned/20240423__pa__primary__{county_name.lower()}__precinct.csv'\n",
"df.to_csv(output_path,index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit aa20dcf

Please sign in to comment.