Merge pull request #155 from declanrjb/master

data and parsers for new counties
openelections · Aug 18, 2024 · aa20dcf · aa20dcf
2 parents be9545f + 8fb707b
commit aa20dcf
Show file tree

Hide file tree

Showing 7 changed files with 51,248 additions and 0 deletions.
diff --git a/2024/20240423__pa__primary__blair__precinct.csv b/2024/20240423__pa__primary__blair__precinct.csv
diff --git a/2024/20240423__pa__primary__carbon__precinct.csv b/2024/20240423__pa__primary__carbon__precinct.csv
diff --git a/2024/20240423__pa__primary__centre__precinct.csv b/2024/20240423__pa__primary__centre__precinct.csv
diff --git a/parsers/2024-primary_parsers/pa_blair_primary_2024_results_parser.ipynb b/parsers/2024-primary_parsers/pa_blair_primary_2024_results_parser.ipynb
@@ -0,0 +1,241 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import pdfplumber\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_votes_from_row(data_row,header):\n",
+    "    votes_section = re.search(r'[0-9]+\\s[0-9\\.]+\\s[0-9]+\\s[0-9]+\\s[0-9]+$',data_row).group(0)\n",
+    "    votes_ls = votes_section.split(' ')\n",
+    "    votes = {}\n",
+    "    for i in range(0,len(header)):\n",
+    "        votes[header[i]] = [votes_ls[i]]\n",
+    "    df = pd.DataFrame(votes)\n",
+    "    #df['candidate'] = re.search(r'[^0-9]*',data_row).group(0).strip()\n",
+    "    df['candidate'] = data_row.replace(votes_section,'')\n",
+    "    return df\n",
+    "\n",
+    "def extract_votes(data_rows,header):\n",
+    "    return pd.concat([extract_votes_from_row(data_row,header) for data_row in data_rows])\n",
+    "\n",
+    "def extract_data_rows(table_rows):\n",
+    "    data_rows = []\n",
+    "    for row in table_rows:\n",
+    "        row = row.replace('%','')\n",
+    "        row = row.replace(',','')\n",
+    "        if not re.search(r'[0-9]+\\s[0-9\\.]+\\s[0-9]+\\s[0-9]+\\s[0-9]+$',row) is None:\n",
+    "            data_rows.append(row)\n",
+    "\n",
+    "    return data_rows\n",
+    "\n",
+    "def extract_box_data(page,bbox):\n",
+    "    data_section = page.crop(bbox)\n",
+    "    data_text = data_section.extract_text()\n",
+    "    table_rows = data_text.split('\\n')\n",
+    "    race_title = table_rows[0]\n",
+    "\n",
+    "    table_header = table_rows[2].replace('%','').replace('  ',' ').split(' ')\n",
+    "    \n",
+    "    data_rows = extract_data_rows(table_rows)\n",
+    "\n",
+    "    df = extract_votes(data_rows,table_header)\n",
+    "\n",
+    "    if not re.search(r'^[A-Z]{,3}',race_title) is None:\n",
+    "        df['party'] = re.search(r'^[A-Z]{,3}',race_title).group(0)\n",
+    "    if not re.search(r'[0-9]+.*',race_title) is None:\n",
+    "        df['district'] = re.search(r'[0-9]+.*',race_title).group(0)\n",
+    "    if not re.search(r'\\s\\D+',race_title) is None:\n",
+    "        df['office'] = re.search(r'\\s\\D+',race_title).group(0).strip()\n",
+    "\n",
+    "    return df\n",
+    "\n",
+    "def extract_precinct_name(page,strip_start=70,strip_height=15):\n",
+    "    return page.crop((0,strip_start,page.width,strip_start+strip_height)).extract_text()\n",
+    "\n",
+    "def extract_page_data(page):\n",
+    "    vote_headers = page.search('Vote For')\n",
+    "    all_data = []\n",
+    "    i = 0\n",
+    "    while i < len(vote_headers):\n",
+    "        if i < len(vote_headers) - 1:\n",
+    "            pair = vote_headers[i:i+2]\n",
+    "            bbox = (0,pair[0]['bottom']-30,page.width,pair[1]['top']-20)\n",
+    "        else:\n",
+    "            bbox = (0,vote_headers[i]['bottom']-30,page.width,page.height)\n",
+    "        temp = extract_box_data(page,bbox)\n",
+    "        all_data.append(temp)\n",
+    "        i += 1\n",
+    "\n",
+    "    df = pd.concat(all_data)\n",
+    "    df['precinct'] = extract_precinct_name(page)\n",
+    "    return df\n",
+    "\n",
+    "def extract_statistics(page):\n",
+    "    if len(page.search('Statistics')) > 0:\n",
+    "        bbox = (0,page.search('Statistics')[0]['bottom'],page.width,page.search('Statistics')[0]['bottom'] + 150)\n",
+    "        stats_text = page.crop(bbox).extract_text()\n",
+    "\n",
+    "        stats = pd.DataFrame({\n",
+    "            'Registered Voters': [re.search(r'Registered Voters - Total ([0-9]*)',stats_text).group(1)],\n",
+    "            'Ballots Cast': [re.search(r'Ballots Cast - Total ([0-9]*)',stats_text).group(1)]\n",
+    "        })\n",
+    "\n",
+    "        stats = stats.melt().rename(columns={\n",
+    "            'variable':'office',\n",
+    "            'value':'votes'\n",
+    "        })\n",
+    "\n",
+    "        stats['precinct'] = extract_precinct_name(page)\n",
+    "\n",
+    "        return stats\n",
+    "    else:\n",
+    "        return None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file = '../data_2024/primary/Blair PA Official-Precinct-Summary-Report.pdf'\n",
+    "pdf = pdfplumber.open(file)\n",
+    "county_name = 'Blair'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.concat([extract_page_data(page) for page in pdf.pages])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['county'] = county_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.rename(columns={\n",
+    "    'TOTAL':'votes',\n",
+    "    'Election':'election_day',\n",
+    "    'Provision':'provisional',\n",
+    "    'Mail':'absentee'\n",
+    "})\n",
+    "df = df[df['candidate'].apply(lambda x: x not in ['Total Votes Cast','Overvotes','Undervotes','Contest Totals'])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stats_df = pd.concat([extract_statistics(page) for page in pdf.pages if len(page.search('Statistics')) > 0])\n",
+    "stats_df['county'] = county_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.concat([df,stats_df])\n",
+    "df['district'] = df['district'].str.extract('(\\d+)')\n",
+    "df = df.fillna('')\n",
+    "df = df.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df[['county','precinct','office','district','party','candidate','votes','election_day','provisional','absentee']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data cleaning\n",
+    "df['candidate'] = df['candidate'].str.title().str.strip()\n",
+    "df['office'] = df['office'].replace('President of the United States'.upper(),'President')\n",
+    "df['office'] = df['office'].replace('United States Senator'.upper(),'U.S. Senate')\n",
+    "df['office'] = df['office'].replace('Representative in Congress'.upper(),'U.S. House')\n",
+    "df['office'] = df['office'].replace('Senator in the General Assembly'.upper(),'State Senate')\n",
+    "df['office'] = df['office'].replace('Representative in the General Assembly'.upper(),'General Assembly')\n",
+    "\n",
+    "df['precinct'] = df['precinct'].str.title()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv(f'../data_cleaned/20240423__pa__primary__{county_name.lower()}__precinct.csv',index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}