diff --git a/run_app.py b/run_app.py
index 898d160..f1e6911 100644
--- a/run_app.py
+++ b/run_app.py
@@ -1,9 +1,8 @@
 import argparse
-import glob
 import imghdr
-import os
 import re
 from datetime import datetime
+from pathlib import Path
 
 from tqdm import tqdm
 
@@ -37,10 +36,11 @@ def get_file_type(file_path):
     :param file_path: file path to be checked
     :return: "directory", "main_text", "linked_table" or "table_image"
     '''
-    if os.path.isdir(file_path):
+    file_path = Path(file_path)
+    if file_path.is_dir():
         return ("directory")
-    elif file_path.endswith(".html"):
-        if re.search("table_\d+.html", file_path):
+    elif file_path.suffix == ".html":
+        if re.search(r"table_\d+\.html", file_path.name):
             return ("linked_tables")
         else:
             return ("main_text")
@@ -86,15 +86,13 @@ def read_file_structure(file_path, target_dir):
     :return: list of dicts
     '''
     structure = {}
-    if os.path.exists(file_path):
-        omit_dir = "/".join(file_path.split("/"))
-        if os.path.isdir(file_path):
-            all_fpaths = glob.iglob(file_path + '/**', recursive=True)
-            # turn the 3d file structure into a flat 2d list of file paths
+    file_path = Path(file_path)
+    if file_path.exists():
+        if file_path.is_dir():
+            all_fpaths = file_path.rglob('*')
             for fpath in all_fpaths:
-                tmp_out = fpath.replace(omit_dir, "")
-                tmp_out = "/".join(tmp_out.split("/")[:-1])
-                out_dir = target_dir + tmp_out
+                tmp_out = fpath.relative_to(file_path).parent
+                out_dir = Path(target_dir) / tmp_out
                 ftype = get_file_type(fpath)
                 base_file = None
                 if ftype == "directory":
@@ -150,11 +148,12 @@ def read_file_structure(file_path, target_dir):
 error_occurred = False
 output_format = args.output_format if args.output_format else "JSON"
 trained_data = args.trained_data_set if args.output_format else "eng"
-if not os.path.exists(target_dir):
-    os.makedirs(target_dir)
-logFileName = F"{target_dir}/autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"
+target_dir = Path(target_dir)
+if not target_dir.exists():
+    target_dir.mkdir(parents=True)
+logFileName = target_dir / f"autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"
 
-with open(logFileName, "w") as log_file:
+with logFileName.open("w") as log_file:
     log_file.write(F"Auto-CORPus log file from {cdate.hour}:{cdate.minute} on {cdate.day}/{cdate.month}/{cdate.year}\n")
     log_file.write(F"Input directory provided: {file_path}\n")
     log_file.write(F"Output directory provided: {target_dir}\n")
@@ -170,30 +169,27 @@ def read_file_structure(file_path, target_dir):
                     "table_images": len(structure[key]['table_images'])
                 }
             )
 
-        if os.path.isdir(file_path):
-            base_dir = file_path
-        else:
-            base_dir = "/".join(file_path.split("/")[:-1])
+        base_dir = Path(file_path) if Path(file_path).is_dir() else Path(file_path).parent
         try:
             AC = autoCORPus(config, base_dir=base_dir, main_text=structure[key]['main_text'],
                             linked_tables=sorted(structure[key]['linked_tables']),
                             table_images=sorted(structure[key]['table_images']), trainedData=trained_data)
-            out_dir = structure[key]['out_dir']
+            out_dir = Path(structure[key]['out_dir'])
             if structure[key]["main_text"]:
                 key = key.replace('\\', '/')
                 if output_format.lower() == "json":
-                    with open(out_dir + "/" + key.split("/")[-1] + "_bioc.json", "w", encoding='utf-8') as outfp:
+                    with open(out_dir / f"{Path(key).name}_bioc.json", "w", encoding='utf-8') as outfp:
                         outfp.write(AC.main_text_to_bioc_json())
                 else:
-                    with open(out_dir + "/" + key.split("/")[-1] + "_bioc.xml", "w", encoding='utf-8') as outfp:
+                    with open(out_dir / f"{Path(key).name}_bioc.xml", "w", encoding='utf-8') as outfp:
                         outfp.write(AC.main_text_to_bioc_xml())
-            with open(out_dir + "/" + key.split("/")[-1] + "_abbreviations.json", "w", encoding='utf-8') as outfp:
+            with open(out_dir / f"{Path(key).name}_abbreviations.json", "w", encoding='utf-8') as outfp:
                 outfp.write(AC.abbreviations_to_bioc_json())
             # AC does not support the conversion of tables or abbreviations to the XML format
             if AC.has_tables:
-                with open(out_dir + "/" + key.split("/")[-1] + "_tables.json", "w", encoding='utf-8') as outfp:
+                with open(out_dir / f"{Path(key).name}_tables.json", "w", encoding='utf-8') as outfp:
                     outfp.write(AC.tables_to_bioc_json())
 
             success.append(F"{key} was processed successfully.")
         except Exception as e:
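The read_file_structure hunk above swaps glob.iglob plus string surgery for Path.rglob and relative_to. A minimal standalone sketch of that directory-mirroring pattern, not part of the patch (mirror_tree and its arguments are hypothetical names):

    from pathlib import Path

    def mirror_tree(src, dest):
        """Map every file under src to its output directory under dest."""
        src, dest = Path(src), Path(dest)
        mapping = {}
        for fpath in src.rglob('*'):  # recursive walk, like glob.iglob(src + '/**', recursive=True)
            if fpath.is_file():
                # relative_to(...).parent replaces the old replace()/split()/join() chain
                mapping[fpath] = dest / fpath.relative_to(src).parent
        return mapping

Note that rglob yields Path objects, so the downstream get_file_type must accept them, which is why it now coerces its argument with Path(file_path).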
diff --git a/src/autoCORPus.py b/src/autoCORPus.py
index c4a6e4b..f93ccee 100644
--- a/src/autoCORPus.py
+++ b/src/autoCORPus.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import sys
+from pathlib import Path
 
 from bioc import loads, dumps, BioCFileType
 from bs4 import BeautifulSoup
@@ -37,28 +38,32 @@ class autoCORPus:
 
     @handle_path
     def __read_config(self, config_path):
-        with open(config_path, "r") as f:
+        config_path = Path(config_path)
+        with config_path.open("r") as f:
             ## TODO: validate config file here if possible
             content = json.load(f)
             return content["config"]
 
     @handle_path
     def __import_file(self, file_path):
-        with open(file_path, "r") as f:
+        file_path = Path(file_path)
+        with file_path.open("r") as f:
             return f.read(), file_path
 
     @handle_path
     def __handle_target_dir(self, target_dir):
-        if not os.path.exists(target_dir):
-            os.makedirs(target_dir)
+        target_dir = Path(target_dir)
+        if not target_dir.exists():
+            target_dir.mkdir(parents=True)
         return
 
     def __validate_infile(self):
         pass
 
     def __soupify_infile(self, fpath):
+        fpath = Path(fpath)
         try:
-            with open(fpath, "r", encoding="utf-8") as fp:
+            with fpath.open("r", encoding="utf-8") as fp:
                 soup = BeautifulSoup(fp.read(), 'html.parser')
                 for e in soup.find_all(attrs={'style': ['display:none', 'visibility:hidden']}):
                     e.extract()
diff --git a/src/table_image.py b/src/table_image.py
index 314866c..57ba101 100644
--- a/src/table_image.py
+++ b/src/table_image.py
@@ -3,6 +3,7 @@
 from datetime import datetime
 from operator import itemgetter
+from pathlib import Path
 
 import cv2
 import pytesseract
 
@@ -389,13 +390,15 @@ def __init__(self, table_images, base_dir, trainedData="eng"):
             "infons": {},
             "documents": []
         }
 
+        base_dir = Path(base_dir)
         for image_path in table_images:
-            imgname = image_path.split('/')[-1]
+            image_path = Path(image_path)
+            imgname = image_path.name
             self.tableIdentifier = imgname.split("_")[-1].split(".")[0]
-            self.file_name = imgname.replace(base_dir + "/", "")
-            pmc = imgname[0:imgname.rfind('.')]
+            self.file_name = str(image_path.relative_to(base_dir))
+            pmc = image_path.stem
 
-            img = cv2.imread(image_path)
+            img = cv2.imread(str(image_path))
             cells, added, thresh = self.find_cells(img)
             table_row = self.cell2table(cells, added, thresh, "imagesOut", pmc)
diff --git a/src/utils.py b/src/utils.py
index fb7bd01..a184c37 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -1,6 +1,6 @@
-import os
 import re
 import unicodedata
+from pathlib import Path
 
 import bs4
 import networkx as nx
@@ -21,13 +21,13 @@ def get_files(base_dir, pattern=r'(.*).html'):
 
         file_list: a list of filepath
     """
     file_list = []
-    files = os.listdir(base_dir)
-    for i in files:
-        abs_path = os.path.join(base_dir, i)
-        if re.match(pattern, abs_path):
-            file_list.append(abs_path)
-        elif os.path.isdir(abs_path) & ('ipynb_checkpoints' not in abs_path):
-            file_list += get_files(abs_path)
+    base_dir = Path(base_dir)
+    for item in base_dir.iterdir():
+        abs_path = item.resolve()
+        if abs_path.is_file() and re.match(pattern, str(abs_path)):
+            file_list.append(str(abs_path))
+        elif abs_path.is_dir() and 'ipynb_checkpoints' not in str(abs_path):
+            file_list += get_files(abs_path, pattern)
     return file_list
 
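get_files keeps its manual recursion, but the same walk can be written flat with Path.rglob. A sketch for comparison only, assuming that filtering out paths containing ipynb_checkpoints is equivalent to not descending into those directories (get_files_rglob is an illustrative name, not part of the patch):

    from pathlib import Path
    import re

    def get_files_rglob(base_dir, pattern=r'(.*).html'):
        # Flat recursive walk; resolve() mirrors item.resolve() in the patch.
        matches = []
        for p in Path(base_dir).rglob('*'):
            abs_path = str(p.resolve())
            if p.is_file() and 'ipynb_checkpoints' not in abs_path and re.match(pattern, abs_path):
                matches.append(abs_path)
        return matches

The patch's recursive version has one advantage: it never descends into skipped directories at all, while rglob visits them and discards the results.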
diff --git a/tests/quick_comparison.py b/tests/quick_comparison.py
index 8cd0dc8..5b9fbac 100644
--- a/tests/quick_comparison.py
+++ b/tests/quick_comparison.py
@@ -1,37 +1,35 @@
-import os
 import re
+from pathlib import Path
 
-folder1_path = 'OldOutput'
-folder2_path = 'NewOutput'
+folder1_path = Path('OldOutput')
+folder2_path = Path('NewOutput')
 
 lines_to_ignore = ["\"date\":", "\"offset\":", "\"inputfile\":"]
 different_files = []
 
-for root, dirs, files in os.walk(folder1_path):
-    for filename in files:
-        if "_bioc" not in filename:
-            continue
-        folder1_file_path = os.path.join(root, filename)
-        folder2_file_path = os.path.join(folder2_path, filename)
-        if os.path.exists(folder2_file_path):
-            with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
-                lines1 = f1.readlines()
-                lines2 = f2.readlines()
-                different_lines = [i for i, (line1, line2) in enumerate(zip(lines1, lines2)) if
-                                   re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2,
-                                                                                         flags=re.UNICODE) and
-                                   not [x for x in lines_to_ignore if x in line1]]
-                false_positives = different_lines
-                different_lines = []
-                for i in range(len(false_positives)):
-                    if "[PMC free article]" not in lines1[false_positives[i]] and "[PMC free article]" in lines2[
-                        false_positives[i]]:
-                        continue
-                    else:
-                        different_lines.append(false_positives[i])
+for folder1_file_path in folder1_path.rglob('*'):
+    if not folder1_file_path.is_file() or "_bioc" not in folder1_file_path.name:
+        continue
+    folder2_file_path = folder2_path / folder1_file_path.name
+    if folder2_file_path.exists():
+        with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
+            lines1 = f1.readlines()
+            lines2 = f2.readlines()
+            different_lines = [
+                i for i, (line1, line2) in enumerate(zip(lines1, lines2))
+                if re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2, flags=re.UNICODE) and
+                not [x for x in lines_to_ignore if x in line1]
+            ]
+            false_positives = different_lines
+            different_lines = []
+            for idx in false_positives:
+                if "[PMC free article]" not in lines1[idx] and "[PMC free article]" in lines2[idx]:
+                    continue
+                else:
+                    different_lines.append(idx)
 
-            if different_lines:
-                different_files.append(filename)
+        if different_lines:
+            different_files.append(folder1_file_path.name)
 
 print("\n".join(different_files))
 print(len(different_files))
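The comparison script above turns on a single predicate: two lines count as different only when they still differ after all whitespace is stripped and the first line carries none of the volatile fields. That check in isolation, as a sketch (lines_differ is a hypothetical name):

    import re

    IGNORED_MARKERS = ("\"date\":", "\"offset\":", "\"inputfile\":")
    WHITESPACE = re.compile(r"\s+", flags=re.UNICODE)

    def lines_differ(line1, line2):
        # Lines carrying volatile fields never count as differences.
        if any(marker in line1 for marker in IGNORED_MARKERS):
            return False
        # Compare with all whitespace stripped, as in the list comprehension above.
        return WHITESPACE.sub("", line1) != WHITESPACE.sub("", line2)

The is_file() guard added to the rglob loop matters here: unlike os.walk, rglob('*') also yields directories, and a directory whose name happened to contain "_bioc" would otherwise reach open() and raise.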