-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
224 lines (175 loc) · 9.26 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import pdfplumber
import pandas as pd
import re
from datetime import datetime
import os
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill, Font, Alignment
import glob
import numpy as np
def get_college_name(pdf_path):
try:
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
text = page.extract_text()
if "BASIC INFORMATION" in text:
tables = page.extract_tables()
for table in tables:
# Convert table cells to strings and check if we found the right table
str_table = [[str(cell).strip() if cell is not None else "" for cell in row] for row in table]
for i, row in enumerate(str_table):
# Look for the row with college name (typically second row)
if i > 0 and len(row) > 1: # Skip header row
college_name = row[1].strip()
if college_name and college_name.lower() != "name of the college":
return college_name
except Exception as e:
print(f"Error extracting college name from {pdf_path}: {str(e)}")
return None
def get_pdf_files(folder_path):
pdf_pattern = os.path.join(folder_path, '**', '*.[pP][dD][fF]')
pdf_files = glob.glob(pdf_pattern, recursive=True)
if not pdf_files:
print(f"No PDF files found in: {folder_path}")
return []
print(f"Found {len(pdf_files)} PDF files")
return pdf_files
def extract_table(pdf_path, table_index=0):
files_with_heading_no_table = []
try:
with pdfplumber.open(pdf_path) as pdf:
# Pattern to match the specific heading
heading_pattern = r'((?:\d+\.)+\d+)\s*Number\s+of\s+complaints/grievances\s+about\s+evaluation\s+year\s+wise\s+during\s+last\s+five\s+years'
# Track if heading is found and table is successfully extracted
heading_found = False
table_extracted = False
for page_num, page in enumerate(pdf.pages):
text = page.extract_text()
# Check if the heading is on this page
match = re.search(heading_pattern, text, re.IGNORECASE)
if match:
heading_found = True
section_number = match.group(1)
# Extract tables on this page
tables = page.extract_tables()
# Check if we have enough tables to extract the specified index
if tables and len(tables) > table_index:
table = tables[table_index]
# Check if the table has the expected structure
if len(table) >= 2 and len(table[0]) == 5:
first_row = table[0]
# Verify the first row contains year-like patterns
if all(re.match(r'\d{4}-\d{2}', str(cell)) for cell in first_row):
df = pd.DataFrame(table[1:], columns=table[0])
df = df.apply(pd.to_numeric, errors='ignore')
print(f"Found table in {pdf_path} on page {page_num + 1}, table index {table_index}")
table_extracted = True
return df, section_number, files_with_heading_no_table
# If we reached here, heading was found but table couldn't be extracted
if not table_extracted:
files_with_heading_no_table.append((pdf_path, section_number))
break
except Exception as e:
print(f"Error processing {pdf_path}: {str(e)}")
return None, None, files_with_heading_no_table
def save_to_excel(dataframes, output_path):
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
workbook = writer.book
# Create main data sheet
worksheet = workbook.create_sheet("College Data", 0)
# Starting column for first college
current_col = 1
# Default years to use when no data is available
default_years = ['x-x', 'x-x', 'x-x', 'x-x' , 'x-x']
# Process each college's data
for pdf_path, (df, section_number) in dataframes.items():
# Get college name
college_name = get_college_name(pdf_path) or os.path.splitext(os.path.basename(pdf_path))[0]
# Write college name
cell = worksheet.cell(row=1, column=current_col)
cell.value = college_name
cell.font = Font(bold=True)
cell.alignment = Alignment(horizontal='center')
# Merge cells for college name across its years
worksheet.merge_cells(
start_row=1,
start_column=current_col,
end_row=1,
end_column=current_col + 4 # Merge 5 cells for years
)
# Determine years and data to use
if df is not None and len(df) > 0:
# Use actual data from the dataframe
years = df.columns
data_row = df.iloc[0]
else:
# Use default years and 'x' for data
years = default_years
data_row = ['x'] * len(default_years)
# Write years (row 2)
for i, year in enumerate(years):
cell = worksheet.cell(row=2, column=current_col + i)
cell.value = year
cell.font = Font(bold=True)
cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
cell.font = Font(color='FFFFFF', bold=True)
cell.alignment = Alignment(horizontal='center')
# Write data (row 3)
for i, value in enumerate(data_row):
cell = worksheet.cell(row=3, column=current_col + i)
cell.value = value
cell.alignment = Alignment(horizontal='center')
# Adjust column widths
for i in range(5): # 5 columns for each college
col_letter = get_column_letter(current_col + i)
worksheet.column_dimensions[col_letter].width = 15
# Move to next college's columns
current_col += 5
if 'Sheet' in workbook.sheetnames:
workbook.remove(workbook['Sheet'])
def process_folder(input_folder, output_dir, table_index=0):
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Get all PDF files from the folder
pdf_files = get_pdf_files(input_folder)
if not pdf_files:
return
# Process all PDFs
results = {}
files_with_heading_no_table = []
for pdf_path in pdf_files:
try:
print(f"Processing: {pdf_path}")
df, section_number, no_table_files = extract_table(pdf_path, table_index)
results[pdf_path] = (df, section_number)
files_with_heading_no_table.extend(no_table_files)
if df is not None:
print(f"✓ Successfully extracted table from: {os.path.basename(pdf_path)}")
else:
print(f"✗ No matching table found in: {os.path.basename(pdf_path)}")
except Exception as e:
print(f"Error processing {pdf_path}: {str(e)}")
results[pdf_path] = (None, None)
# Timestamp for output files
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Save to Excel (ALL files, now including those without data)
excel_path = os.path.join(output_dir, f'listed_{timestamp}.xlsx')
save_to_excel(results, excel_path)
print(f"Excel output saved to: {excel_path}")
# Print processing summary
print(f"\nProcessing Summary:")
print(f"Total PDFs processed: {len(pdf_files)}")
successful_extractions = sum(1 for _, (df, _) in results.items() if df is not None)
print(f"Successfully extracted tables: {successful_extractions}")
print(f"Failed extractions: {len(pdf_files) - successful_extractions}")
# Print files with heading but no extractable table
if files_with_heading_no_table:
print("\nFiles with heading but no extractable table:")
for file, section_number in files_with_heading_no_table:
print(f"- {os.path.basename(file)} (Section: {section_number})")
else:
print("\nNo PDF files were found to process")
if __name__ == "__main__":
input_folder = "ALL" # Folder containing PDFs
output_dir = "heading_complaints" # Where to save the results
process_folder(input_folder, output_dir, table_index=2)