-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_table.py
172 lines (130 loc) · 5.47 KB
/
parse_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import cv2
import io
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
try:
import urllib.request as urllib
except ModuleNotFoundError:
import urllib
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
from config import debug_mode
BAR_LENGTH = 50
class Cell:
def __init__(self, x, y, w, h, text, conf):
self.x = x
self.y = y
self.w = w
self.h = h
self.text = text
self.conf = conf
def __le___(self, other):
return (self.x + self.w / 2) <= (other.x + other.w / 2)
def __lt__(self, other):
return (self.x + self.w / 2) < (other.x + other.w / 2)
# Read an image by filepath
def imgread(im):
try:
image = Image.open(io.BytesIO(urllib.urlopen(im).read()))
except ValueError:
try:
image = Image.open(im)
except FileExistsError:
return None
return cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
def extract_value(x):
return x[0].text
async def convert_to_csv(filename, page_index, output_path, prefix_path, user_connected, capture_stdout, sio=None, sid=None):
# Create page render debug directory
debug_dir = f"{prefix_path}output/debug/cells_{str(page_index).zfill(4)}"
os.makedirs(debug_dir, exist_ok=True)
im = imgread(filename)
width, height, _ = im.shape
im = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY)
threshold = cv2.adaptiveThreshold(~im, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)
# Copy threshold, then for horizontal and vertical lines' detection.
horizontal = threshold.copy()
vertical = threshold.copy()
scale = 15 # Play with this variable in order to increase/decrease the amount of lines to be detected
# Specify size on horizontal axis
horizontal_size = horizontal.shape[1] // scale
horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
horizontal = cv2.erode(horizontal, horizontal_structure, (-1, -1))
horizontal = cv2.dilate(horizontal, horizontal_structure, (-1, -1))
# Vertical lines
vertical_size = vertical.shape[0] // scale
vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
vertical = cv2.erode(vertical, vertical_structure, (-1, -1))
vertical = cv2.dilate(vertical, vertical_structure, (-1, -1))
# All table lines
table = horizontal + vertical
cont, _ = cv2.findContours(table, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# Remove table lines (vertical and horizontal) from the image
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
thick_table = cv2.dilate(table, kernel, iterations=1) # Thicken mask
raw_image = im
for i, _ in enumerate(thick_table):
for j, _ in enumerate(thick_table[i]):
if thick_table[i][j] == 255:
raw_image[i][j] = 255
last_y = -1
row_index = 0
table_array, row = [], []
# Iterate over all contours and recognise text inside
if capture_stdout:
iterate_obj = enumerate(cont)
else:
iterate_obj = enumerate(tqdm(cont))
for ind, cnt in iterate_obj:
x, y, w, h = cv2.boundingRect(cnt)
if (w * h) > (width * height * 0.75):
continue # Skip if the contour is the whole table itself
cropped_image = raw_image[y:y + h, x: x + w]
cv2.imwrite(os.path.join(debug_dir, f"box_{str(ind).zfill(4)}.jpg"), cropped_image)
# Parse text
text = pytesseract.image_to_string(cropped_image, lang="rus", config="--psm 4")
text = text.strip('\u000C') # Remove unwanted characters
text = text.replace('\n', '')
if text == '': # If text is not recognised try searching for digits specifically
text = pytesseract.image_to_string(cropped_image, lang='eng',
config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789')
# Remove unwanted characters
text = text.strip('\u000C')
text = text.replace('\n', '')
if (y + h / 2) < last_y: # If the new cell is on a different row than the predecessor
row_index += 1
table_array.append(row)
row = []
last_y = y
# text = text[text.conf != -1]
# lines = text.groupby('block_num')['text'].apply(list)
# conf = text.groupby(['block_num'])['conf'].mean()
row.append(Cell(x, y, w, h, text, 0))
if debug_mode: # Show a window with the image we are trying to recognise
cv2.namedWindow("output", cv2.WINDOW_NORMAL) # Create window with freedom of dimensions
cv2.imshow("output", cropped_image) # Show image
cv2.waitKey(0)
if capture_stdout:
await sio.emit('progress', {
'stdout': int(ind / len(cont) * 100),
'index': page_index
}, room=sid)
if user_connected is not None and not user_connected[sid]:
print("User disconnected or pressed stop")
break
# sio.emit('send_table', {
# 'table': table_array,
# }, room=sid)
for i, _ in enumerate(table_array):
table_array[i].sort() # Sort cells in every row to get the correct order (initially it's not the correct)
for i in range(len(table_array)):
for j in range(len(table_array[i])):
table_array[i][j] = table_array[i][j].text
table_array.reverse()
df = pd.DataFrame(table_array)
df.to_csv(output_path)