Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use pathlib for path handling #57

Merged
merged 1 commit into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 22 additions & 26 deletions run_app.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import argparse
import glob
import imghdr
import os
import re
from datetime import datetime
from pathlib import Path

from tqdm import tqdm

Expand Down Expand Up @@ -37,10 +36,11 @@ def get_file_type(file_path):
:param file_path: file path to be checked
:return: "directory", "main_text", "linked_tables" or "table_image"
'''
if os.path.isdir(file_path):
file_path = Path(file_path)
if file_path.is_dir():
return ("directory")
elif file_path.endswith(".html"):
if re.search("table_\d+.html", file_path):
elif file_path.suffix == ".html":
if re.search("table_\d+.html", file_path.name):
return ("linked_tables")
else:
return ("main_text")
Expand Down Expand Up @@ -86,15 +86,13 @@ def read_file_structure(file_path, target_dir):
:return: list of dicts
'''
structure = {}
if os.path.exists(file_path):
omit_dir = "/".join(file_path.split("/"))
if os.path.isdir(file_path):
all_fpaths = glob.iglob(file_path + '/**', recursive=True)
# turn the 3d file structure into a flat 2d list of file paths
file_path = Path(file_path)
if file_path.exists():
if file_path.is_dir():
all_fpaths = file_path.rglob('*')
for fpath in all_fpaths:
tmp_out = fpath.replace(omit_dir, "")
tmp_out = "/".join(tmp_out.split("/")[:-1])
out_dir = target_dir + tmp_out
tmp_out = fpath.relative_to(file_path).parent
out_dir = Path(target_dir) / tmp_out
ftype = get_file_type(fpath)
base_file = None
if ftype == "directory":
Expand Down Expand Up @@ -150,11 +148,12 @@ def read_file_structure(file_path, target_dir):
error_occurred = False
output_format = args.output_format if args.output_format else "JSON"
trained_data = args.trained_data_set if args.output_format else "eng"
if not os.path.exists(target_dir):
os.makedirs(target_dir)
logFileName = F"{target_dir}/autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"
target_dir = Path(target_dir)
if not target_dir.exists():
target_dir.mkdir(parents=True)
logFileName = target_dir / f"autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"

with open(logFileName, "w") as log_file:
with logFileName.open("w") as log_file:
log_file.write(F"Auto-CORPus log file from {cdate.hour}:{cdate.minute} on {cdate.day}/{cdate.month}/{cdate.year}\n")
log_file.write(F"Input directory provided: {file_path}\n")
log_file.write(F"Output directory provided: {target_dir}\n")
Expand All @@ -170,30 +169,27 @@ def read_file_structure(file_path, target_dir):
"table_images": len(structure[key]['table_images'])
}
)
if os.path.isdir(file_path):
base_dir = file_path
else:
base_dir = "/".join(file_path.split("/")[:-1])
base_dir = Path(file_path).parent if not Path(file_path).is_dir() else Path(file_path)
try:
AC = autoCORPus(config, base_dir=base_dir, main_text=structure[key]['main_text'],
linked_tables=sorted(structure[key]['linked_tables']),
table_images=sorted(structure[key]['table_images']), trainedData=trained_data)

out_dir = structure[key]['out_dir']
out_dir = Path(structure[key]['out_dir'])
if structure[key]["main_text"]:
key = key.replace('\\', '/')
if output_format.lower() == "json":
with open(out_dir + "/" + key.split("/")[-1] + "_bioc.json", "w", encoding='utf-8') as outfp:
with open(out_dir / f"{Path(key).name}_bioc.json", "w", encoding='utf-8') as outfp:
outfp.write(AC.main_text_to_bioc_json())
else:
with open(out_dir + "/" + key.split("/")[-1] + "_bioc.xml", "w", encoding='utf-8') as outfp:
with open(out_dir / f"{Path(key).name}_bioc.xml", "w", encoding='utf-8') as outfp:
outfp.write(AC.main_text_to_bioc_xml())
with open(out_dir + "/" + key.split("/")[-1] + "_abbreviations.json", "w", encoding='utf-8') as outfp:
with open(out_dir / f"{Path(key).name}_abbreviations.json", "w", encoding='utf-8') as outfp:
outfp.write(AC.abbreviations_to_bioc_json())

# AC does not support the conversion of tables or abbreviations to the XML format
if AC.has_tables:
with open(out_dir + "/" + key.split("/")[-1] + "_tables.json", "w", encoding='utf-8') as outfp:
with open(out_dir / f"{Path(key).name}_tables.json", "w", encoding='utf-8') as outfp:
outfp.write(AC.tables_to_bioc_json())
success.append(F"{key} was processed successfully.")
except Exception as e:
Expand Down
15 changes: 10 additions & 5 deletions src/autoCORPus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import json
import sys
from pathlib import Path

from bioc import loads, dumps, BioCFileType
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -37,28 +38,32 @@ class autoCORPus:

@handle_path
def __read_config(self, config_path):
with open(config_path, "r") as f:
config_path = Path(config_path)
with config_path.open("r") as f:
## TODO: validate config file here if possible
content = json.load(f)
return content["config"]

@handle_path
def __import_file(self, file_path):
with open(file_path, "r") as f:
file_path = Path(file_path)
with file_path.open("r") as f:
return f.read(), file_path

@handle_path
def __handle_target_dir(self, target_dir):
if not os.path.exists(target_dir):
os.makedirs(target_dir)
target_dir = Path(target_dir)
if not target_dir.exists():
target_dir.mkdir(parents=True)
return

def __validate_infile(self):
pass

def __soupify_infile(self, fpath):
fpath = Path(fpath)
try:
with open(fpath, "r", encoding="utf-8") as fp:
with fpath.open("r", encoding="utf-8") as fp:
soup = BeautifulSoup(fp.read(), 'html.parser')
for e in soup.find_all(attrs={'style': ['display:none', 'visibility:hidden']}):
e.extract()
Expand Down
11 changes: 7 additions & 4 deletions src/table_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from datetime import datetime
from operator import itemgetter
from pathlib import Path

import cv2
import pytesseract
Expand Down Expand Up @@ -389,13 +390,15 @@ def __init__(self, table_images, base_dir, trainedData="eng"):
"infons": {},
"documents": []
}
base_dir = Path(base_dir)
for image_path in table_images:
imgname = image_path.split('/')[-1]
image_path = Path(image_path)
imgname = image_path.name
self.tableIdentifier = imgname.split("_")[-1].split(".")[0]
self.file_name = imgname.replace(base_dir + "/", "")
pmc = imgname[0:imgname.rfind('.')]
self.file_name = str(image_path.relative_to(base_dir))
pmc = imgname.stem

img = cv2.imread(image_path)
img = cv2.imread(str(image_path))

cells, added, thresh = self.find_cells(img)
table_row = self.cell2table(cells, added, thresh, "imagesOut", pmc)
Expand Down
16 changes: 8 additions & 8 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import re
import unicodedata
from pathlib import Path

import bs4
import networkx as nx
Expand All @@ -21,13 +21,13 @@ def get_files(base_dir, pattern=r'(.*).html'):
file_list: a list of file paths
"""
file_list = []
files = os.listdir(base_dir)
for i in files:
abs_path = os.path.join(base_dir, i)
if re.match(pattern, abs_path):
file_list.append(abs_path)
elif os.path.isdir(abs_path) & ('ipynb_checkpoints' not in abs_path):
file_list += get_files(abs_path)
base_dir = Path(base_dir)
for item in base_dir.iterdir():
abs_path = item.resolve()
if abs_path.is_file() and re.match(pattern, str(abs_path)):
file_list.append(str(abs_path))
elif abs_path.is_dir() and 'ipynb_checkpoints' not in str(abs_path):
file_list += get_files(abs_path, pattern)
return file_list


Expand Down
52 changes: 25 additions & 27 deletions tests/quick_comparison.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,35 @@
import os
import re
from pathlib import Path

folder1_path = 'OldOutput'
folder2_path = 'NewOutput'
folder1_path = Path('OldOutput')
folder2_path = Path('NewOutput')
lines_to_ignore = ["\"date\":", "\"offset\":", "\"inputfile\":"]

different_files = []

for root, dirs, files in os.walk(folder1_path):
for filename in files:
if "_bioc" not in filename:
continue
folder1_file_path = os.path.join(root, filename)
folder2_file_path = os.path.join(folder2_path, filename)
if os.path.exists(folder2_file_path):
with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
lines1 = f1.readlines()
lines2 = f2.readlines()
different_lines = [i for i, (line1, line2) in enumerate(zip(lines1, lines2)) if
re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2,
flags=re.UNICODE) and
not [x for x in lines_to_ignore if x in line1]]
false_positives = different_lines
different_lines = []
for i in range(len(false_positives)):
if "[PMC free article]" not in lines1[false_positives[i]] and "[PMC free article]" in lines2[
false_positives[i]]:
continue
else:
different_lines.append(false_positives[i])
for folder1_file_path in folder1_path.rglob('*'):
if "_bioc" not in folder1_file_path.name:
continue
folder2_file_path = folder2_path / folder1_file_path.name
if folder2_file_path.exists():
with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
lines1 = f1.readlines()
lines2 = f2.readlines()
different_lines = [
i for i, (line1, line2) in enumerate(zip(lines1, lines2))
if re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2, flags=re.UNICODE) and
not [x for x in lines_to_ignore if x in line1]
]
false_positives = different_lines
different_lines = []
for i in range(len(false_positives)):
if "[PMC free article]" not in lines1[false_positives[i]] and "[PMC free article]" in lines2[false_positives[i]]:
continue
else:
different_lines.append(false_positives[i])

if different_lines:
different_files.append(filename)
if different_lines:
different_files.append(folder1_file_path.name)

print("\n".join(different_files))
print(len(different_files))