Commit

Merge pull request #57 from tsmbland/pathlib

Use pathlib for path handling

Thomas-Rowlands authored Oct 24, 2024
2 parents fb6ef37 + e5808ee commit 39dc6ce
Showing 5 changed files with 72 additions and 70 deletions.
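
Note: the change is mechanical throughout — string-based os/glob calls are swapped for their pathlib equivalents. A minimal sketch of the mapping applied below (illustrative paths, not code from this repo):

from pathlib import Path

p = Path("some/dir/table_1.html")  # hypothetical input

p.exists(), p.is_dir()             # were: os.path.exists(p), os.path.isdir(p)
p.suffix                           # ".html"         was: p.endswith(".html")
p.name                             # "table_1.html"  was: p.split("/")[-1]
p.stem                             # "table_1"       was: p[0:p.rfind(".")]
p.parent                           # some/dir        was: "/".join(p.split("/")[:-1])
Path("out") / p.name               # joining         was: "out" + "/" + p.split("/")[-1]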
48 changes: 22 additions & 26 deletions run_app.py
@@ -1,9 +1,8 @@
 import argparse
-import glob
 import imghdr
-import os
 import re
 from datetime import datetime
+from pathlib import Path
 
 from tqdm import tqdm
@@ -37,10 +36,11 @@ def get_file_type(file_path):
     :param file_path: file path to be checked
     :return: "directory", "main_text", "linked_table" or "table_image"
     '''
-    if os.path.isdir(file_path):
+    file_path = Path(file_path)
+    if file_path.is_dir():
         return ("directory")
-    elif file_path.endswith(".html"):
-        if re.search("table_\d+.html", file_path):
+    elif file_path.suffix == ".html":
+        if re.search("table_\d+.html", file_path.name):
             return ("linked_tables")
         else:
             return ("main_text")
@@ -86,15 +86,13 @@ def read_file_structure(file_path, target_dir):
     :return: list of dicts
     '''
     structure = {}
-    if os.path.exists(file_path):
-        omit_dir = "/".join(file_path.split("/"))
-        if os.path.isdir(file_path):
-            all_fpaths = glob.iglob(file_path + '/**', recursive=True)
-            # turn the 3d file structure into a flat 2d list of file paths
+    file_path = Path(file_path)
+    if file_path.exists():
+        if file_path.is_dir():
+            all_fpaths = file_path.rglob('*')
             for fpath in all_fpaths:
-                tmp_out = fpath.replace(omit_dir, "")
-                tmp_out = "/".join(tmp_out.split("/")[:-1])
-                out_dir = target_dir + tmp_out
+                tmp_out = fpath.relative_to(file_path).parent
+                out_dir = Path(target_dir) / tmp_out
                 ftype = get_file_type(fpath)
                 base_file = None
                 if ftype == "directory":
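
Note: Path.rglob('*') stands in for glob.iglob(file_path + '/**', recursive=True); it yields Path objects for every file and directory under the root (but not the root itself), and relative_to(...).parent replaces the manual split/join surgery. A minimal sketch with hypothetical directories:

from pathlib import Path

root = Path("input")                      # hypothetical input tree
for fpath in root.rglob('*'):             # every file and directory below root
    rel = fpath.relative_to(root).parent  # subdirectory of fpath, relative to root
    out_dir = Path("output") / rel        # mirror it under the output root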
@@ -150,11 +148,12 @@ def read_file_structure(file_path, target_dir):
     error_occurred = False
     output_format = args.output_format if args.output_format else "JSON"
     trained_data = args.trained_data_set if args.trained_data_set else "eng"
-    if not os.path.exists(target_dir):
-        os.makedirs(target_dir)
-    logFileName = F"{target_dir}/autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"
+    target_dir = Path(target_dir)
+    if not target_dir.exists():
+        target_dir.mkdir(parents=True)
+    logFileName = target_dir / f"autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"
 
-    with open(logFileName, "w") as log_file:
+    with logFileName.open("w") as log_file:
         log_file.write(F"Auto-CORPus log file from {cdate.hour}:{cdate.minute} on {cdate.day}/{cdate.month}/{cdate.year}\n")
         log_file.write(F"Input directory provided: {file_path}\n")
         log_file.write(F"Output directory provided: {target_dir}\n")
@@ -170,30 +169,27 @@ def read_file_structure(file_path, target_dir):
                 "table_images": len(structure[key]['table_images'])
             }
         )
-        if os.path.isdir(file_path):
-            base_dir = file_path
-        else:
-            base_dir = "/".join(file_path.split("/")[:-1])
+        base_dir = Path(file_path).parent if not Path(file_path).is_dir() else Path(file_path)
         try:
            AC = autoCORPus(config, base_dir=base_dir, main_text=structure[key]['main_text'],
                            linked_tables=sorted(structure[key]['linked_tables']),
                            table_images=sorted(structure[key]['table_images']), trainedData=trained_data)
 
-            out_dir = structure[key]['out_dir']
+            out_dir = Path(structure[key]['out_dir'])
             if structure[key]["main_text"]:
                 key = key.replace('\\', '/')
                 if output_format.lower() == "json":
-                    with open(out_dir + "/" + key.split("/")[-1] + "_bioc.json", "w", encoding='utf-8') as outfp:
+                    with open(out_dir / f"{Path(key).name}_bioc.json", "w", encoding='utf-8') as outfp:
                         outfp.write(AC.main_text_to_bioc_json())
                 else:
-                    with open(out_dir + "/" + key.split("/")[-1] + "_bioc.xml", "w", encoding='utf-8') as outfp:
+                    with open(out_dir / f"{Path(key).name}_bioc.xml", "w", encoding='utf-8') as outfp:
                         outfp.write(AC.main_text_to_bioc_xml())
-            with open(out_dir + "/" + key.split("/")[-1] + "_abbreviations.json", "w", encoding='utf-8') as outfp:
+            with open(out_dir / f"{Path(key).name}_abbreviations.json", "w", encoding='utf-8') as outfp:
                 outfp.write(AC.abbreviations_to_bioc_json())
 
             # AC does not support the conversion of tables or abbreviations to the XML format
             if AC.has_tables:
-                with open(out_dir + "/" + key.split("/")[-1] + "_tables.json", "w", encoding='utf-8') as outfp:
+                with open(out_dir / f"{Path(key).name}_tables.json", "w", encoding='utf-8') as outfp:
                     outfp.write(AC.tables_to_bioc_json())
             success.append(F"{key} was processed successfully.")
         except Exception as e:
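
Note: the remaining builtin open() calls above work unchanged with Path arguments, since open() accepts any os.PathLike (PEP 519); choosing between open(path) and path.open() is purely stylistic. For example:

from pathlib import Path

out = Path("output") / "PMC123_bioc.json"        # hypothetical output file
with open(out, "w", encoding="utf-8") as outfp:  # builtin open() accepts Path objects
    outfp.write("{}")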
15 changes: 10 additions & 5 deletions src/autoCORPus.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import sys
+from pathlib import Path
 
 from bioc import loads, dumps, biocxml, biocjson
 from bs4 import BeautifulSoup
@@ -37,28 +38,32 @@ class autoCORPus:
 
     @handle_path
     def __read_config(self, config_path):
-        with open(config_path, "r") as f:
+        config_path = Path(config_path)
+        with config_path.open("r") as f:
             ## TODO: validate config file here if possible
             content = json.load(f)
             return content["config"]
 
     @handle_path
     def __import_file(self, file_path):
-        with open(file_path, "r") as f:
+        file_path = Path(file_path)
+        with file_path.open("r") as f:
             return f.read(), file_path
 
     @handle_path
     def __handle_target_dir(self, target_dir):
-        if not os.path.exists(target_dir):
-            os.makedirs(target_dir)
+        target_dir = Path(target_dir)
+        if not target_dir.exists():
+            target_dir.mkdir(parents=True)
         return
 
     def __validate_infile(self):
         pass
 
     def __soupify_infile(self, fpath):
+        fpath = Path(fpath)
         try:
-            with open(fpath, "r", encoding="utf-8") as fp:
+            with fpath.open("r", encoding="utf-8") as fp:
                 soup = BeautifulSoup(fp.read(), 'html.parser')
                 for e in soup.find_all(attrs={'style': ['display:none', 'visibility:hidden']}):
                     e.extract()
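
Note: the x = Path(x) pattern at the top of each method is safe whichever type the caller passes, because the Path constructor accepts both strings and existing Path objects:

from pathlib import Path

assert Path("a/b") == Path(Path("a/b"))  # re-wrapping a Path is a no-op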
11 changes: 7 additions & 4 deletions src/table_image.py
@@ -3,6 +3,7 @@
 
 from datetime import datetime
 from operator import itemgetter
+from pathlib import Path
 
 import cv2
 import pytesseract
@@ -389,13 +390,15 @@ def __init__(self, table_images, base_dir, trainedData="eng"):
             "infons": {},
             "documents": []
         }
+        base_dir = Path(base_dir)
         for image_path in table_images:
-            imgname = image_path.split('/')[-1]
+            image_path = Path(image_path)
+            imgname = image_path.name
             self.tableIdentifier = imgname.split("_")[-1].split(".")[0]
-            self.file_name = imgname.replace(base_dir + "/", "")
-            pmc = imgname[0:imgname.rfind('.')]
+            self.file_name = str(image_path.relative_to(base_dir))
+            pmc = image_path.stem
 
-            img = cv2.imread(image_path)
+            img = cv2.imread(str(image_path))
 
             cells, added, thresh = self.find_cells(img)
             table_row = self.cell2table(cells, added, thresh, "imagesOut", pmc)
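
Note on the str() wrapper: cv2.imread expects a string filename — OpenCV's Python bindings do not, as far as I know, accept os.PathLike objects — hence str(image_path). A defensive sketch (hypothetical image path):

import cv2
from pathlib import Path

image_path = Path("images/PMC123_table_1.png")  # hypothetical
img = cv2.imread(str(image_path))               # imread wants a str, not a Path
if img is None:                                 # imread signals failure by returning None
    raise FileNotFoundError(image_path)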
16 changes: 8 additions & 8 deletions src/utils.py
@@ -1,6 +1,6 @@
-import os
 import re
 import unicodedata
+from pathlib import Path
 
 import bs4
 import networkx as nx
@@ -21,13 +21,13 @@ def get_files(base_dir, pattern=r'(.*).html'):
         file_list: a list of filepath
     """
     file_list = []
-    files = os.listdir(base_dir)
-    for i in files:
-        abs_path = os.path.join(base_dir, i)
-        if re.match(pattern, abs_path):
-            file_list.append(abs_path)
-        elif os.path.isdir(abs_path) & ('ipynb_checkpoints' not in abs_path):
-            file_list += get_files(abs_path)
+    base_dir = Path(base_dir)
+    for item in base_dir.iterdir():
+        abs_path = item.resolve()
+        if abs_path.is_file() and re.match(pattern, str(abs_path)):
+            file_list.append(str(abs_path))
+        elif abs_path.is_dir() and 'ipynb_checkpoints' not in str(abs_path):
+            file_list += get_files(abs_path, pattern)
     return file_list
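
Note: two behavioural changes ride along here: item.resolve() yields absolute paths where os.path.join(base_dir, i) kept them relative, and the recursive call now forwards pattern, which the old code silently dropped. An illustration of the first point (hypothetical tree):

from pathlib import Path

base = Path("input")       # hypothetical, relative to the working directory
for item in base.iterdir():
    print(item)            # e.g. input/file.html (relative, as os.path.join gave)
    print(item.resolve())  # e.g. /home/user/project/input/file.html (absolute)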


52 changes: 25 additions & 27 deletions tests/quick_comparison.py
@@ -1,37 +1,35 @@
-import os
 import re
+from pathlib import Path
 
-folder1_path = 'OldOutput'
-folder2_path = 'NewOutput'
+folder1_path = Path('OldOutput')
+folder2_path = Path('NewOutput')
 lines_to_ignore = ["\"date\":", "\"offset\":", "\"inputfile\":"]
 
 different_files = []
 
-for root, dirs, files in os.walk(folder1_path):
-    for filename in files:
-        if "_bioc" not in filename:
-            continue
-        folder1_file_path = os.path.join(root, filename)
-        folder2_file_path = os.path.join(folder2_path, filename)
-        if os.path.exists(folder2_file_path):
-            with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
-                lines1 = f1.readlines()
-                lines2 = f2.readlines()
-                different_lines = [i for i, (line1, line2) in enumerate(zip(lines1, lines2)) if
-                                   re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2,
-                                                                                         flags=re.UNICODE) and
-                                   not [x for x in lines_to_ignore if x in line1]]
-                false_positives = different_lines
-                different_lines = []
-                for i in range(len(false_positives)):
-                    if "[PMC free article]" not in lines1[false_positives[i]] and "[PMC free article]" in lines2[
-                        false_positives[i]]:
-                        continue
-                    else:
-                        different_lines.append(false_positives[i])
-
-            if different_lines:
-                different_files.append(filename)
+for folder1_file_path in folder1_path.rglob('*'):
+    if "_bioc" not in folder1_file_path.name:
+        continue
+    folder2_file_path = folder2_path / folder1_file_path.name
+    if folder2_file_path.exists():
+        with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
+            lines1 = f1.readlines()
+            lines2 = f2.readlines()
+            different_lines = [
+                i for i, (line1, line2) in enumerate(zip(lines1, lines2))
+                if re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2, flags=re.UNICODE) and
+                not [x for x in lines_to_ignore if x in line1]
+            ]
+            false_positives = different_lines
+            different_lines = []
+            for i in range(len(false_positives)):
+                if "[PMC free article]" not in lines1[false_positives[i]] and "[PMC free article]" in lines2[false_positives[i]]:
+                    continue
+                else:
+                    different_lines.append(false_positives[i])
 
+        if different_lines:
+            different_files.append(folder1_file_path.name)
 
 print("\n".join(different_files))
 print(len(different_files))
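
Note: unlike the nested os.walk loops, rglob('*') yields both files and directories at any depth in a single pass; anything without "_bioc" in its name (which in practice includes the directories) is skipped before any open(). A condensed sketch of the traversal:

from pathlib import Path

old = Path("OldOutput")                           # as in the script above
for p in old.rglob('*'):                          # files and directories, any depth
    if p.is_file() and "_bioc" in p.name:
        counterpart = Path("NewOutput") / p.name  # flat lookup by filename, as before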
