Commit

Merge pull request #57 from tsmbland/pathlib

Use pathlib for path handling

Thomas-Rowlands authored Oct 24, 2024
2 parents fb6ef37 + e5808ee commit 39dc6ce
Showing 5 changed files with 72 additions and 70 deletions.
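
Note: the change is mechanical throughout — string-based os/glob calls are swapped for their pathlib equivalents. A minimal sketch of the mapping applied below (illustrative paths, not code from this repo):

from pathlib import Path

p = Path("some/dir/table_1.html")  # hypothetical input

p.exists(), p.is_dir()             # were: os.path.exists(p), os.path.isdir(p)
p.suffix                           # ".html"         was: p.endswith(".html")
p.name                             # "table_1.html"  was: p.split("/")[-1]
p.stem                             # "table_1"       was: p[0:p.rfind(".")]
p.parent                           # some/dir        was: "/".join(p.split("/")[:-1])
Path("out") / p.name               # joining         was: "out" + "/" + p.split("/")[-1]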
48 changes: 22 additions & 26 deletions run_app.py
@@ -1,9 +1,8 @@
 import argparse
-import glob
 import imghdr
-import os
 import re
 from datetime import datetime
+from pathlib import Path
 
 from tqdm import tqdm
@@ -37,10 +36,11 @@ def get_file_type(file_path):
     :param file_path: file path to be checked
     :return: "directory", "main_text", "linked_table" or "table_image"
     '''
-    if os.path.isdir(file_path):
+    file_path = Path(file_path)
+    if file_path.is_dir():
         return ("directory")
-    elif file_path.endswith(".html"):
-        if re.search("table_\d+.html", file_path):
+    elif file_path.suffix == ".html":
+        if re.search("table_\d+.html", file_path.name):
             return ("linked_tables")
         else:
             return ("main_text")
@@ -86,15 +86,13 @@ def read_file_structure(file_path, target_dir):
     :return: list of dicts
     '''
     structure = {}
-    if os.path.exists(file_path):
-        omit_dir = "/".join(file_path.split("/"))
-        if os.path.isdir(file_path):
-            all_fpaths = glob.iglob(file_path + '/**', recursive=True)
-            # turn the 3d file structure into a flat 2d list of file paths
+    file_path = Path(file_path)
+    if file_path.exists():
+        if file_path.is_dir():
+            all_fpaths = file_path.rglob('*')
             for fpath in all_fpaths:
-                tmp_out = fpath.replace(omit_dir, "")
-                tmp_out = "/".join(tmp_out.split("/")[:-1])
-                out_dir = target_dir + tmp_out
+                tmp_out = fpath.relative_to(file_path).parent
+                out_dir = Path(target_dir) / tmp_out
                 ftype = get_file_type(fpath)
                 base_file = None
                 if ftype == "directory":
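
Note: Path.rglob('*') stands in for glob.iglob(file_path + '/**', recursive=True); it yields Path objects for every file and directory under the root (but not the root itself), and relative_to(...).parent replaces the manual split/join surgery. A minimal sketch with hypothetical directories:

from pathlib import Path

root = Path("input")                      # hypothetical input tree
for fpath in root.rglob('*'):             # every file and directory below root
    rel = fpath.relative_to(root).parent  # subdirectory of fpath, relative to root
    out_dir = Path("output") / rel        # mirror it under the output root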
@@ -150,11 +148,12 @@ def read_file_structure(file_path, target_dir):
     error_occurred = False
     output_format = args.output_format if args.output_format else "JSON"
     trained_data = args.trained_data_set if args.trained_data_set else "eng"
-    if not os.path.exists(target_dir):
-        os.makedirs(target_dir)
-    logFileName = F"{target_dir}/autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"
+    target_dir = Path(target_dir)
+    if not target_dir.exists():
+        target_dir.mkdir(parents=True)
+    logFileName = target_dir / f"autoCORPus-log-{cdate.day}-{cdate.month}-{cdate.year}-{cdate.hour}-{cdate.minute}"
 
-    with open(logFileName, "w") as log_file:
+    with logFileName.open("w") as log_file:
         log_file.write(F"Auto-CORPus log file from {cdate.hour}:{cdate.minute} on {cdate.day}/{cdate.month}/{cdate.year}\n")
         log_file.write(F"Input directory provided: {file_path}\n")
         log_file.write(F"Output directory provided: {target_dir}\n")
@@ -170,30 +169,27 @@ def read_file_structure(file_path, target_dir):
                 "table_images": len(structure[key]['table_images'])
             }
         )
-        if os.path.isdir(file_path):
-            base_dir = file_path
-        else:
-            base_dir = "/".join(file_path.split("/")[:-1])
+        base_dir = Path(file_path).parent if not Path(file_path).is_dir() else Path(file_path)
         try:
            AC = autoCORPus(config, base_dir=base_dir, main_text=structure[key]['main_text'],
                            linked_tables=sorted(structure[key]['linked_tables']),
                            table_images=sorted(structure[key]['table_images']), trainedData=trained_data)
 
-            out_dir = structure[key]['out_dir']
+            out_dir = Path(structure[key]['out_dir'])
             if structure[key]["main_text"]:
                 key = key.replace('\\', '/')
                 if output_format.lower() == "json":
-                    with open(out_dir + "/" + key.split("/")[-1] + "_bioc.json", "w", encoding='utf-8') as outfp:
+                    with open(out_dir / f"{Path(key).name}_bioc.json", "w", encoding='utf-8') as outfp:
                         outfp.write(AC.main_text_to_bioc_json())
                 else:
-                    with open(out_dir + "/" + key.split("/")[-1] + "_bioc.xml", "w", encoding='utf-8') as outfp:
+                    with open(out_dir / f"{Path(key).name}_bioc.xml", "w", encoding='utf-8') as outfp:
                         outfp.write(AC.main_text_to_bioc_xml())
-            with open(out_dir + "/" + key.split("/")[-1] + "_abbreviations.json", "w", encoding='utf-8') as outfp:
+            with open(out_dir / f"{Path(key).name}_abbreviations.json", "w", encoding='utf-8') as outfp:
                 outfp.write(AC.abbreviations_to_bioc_json())
 
             # AC does not support the conversion of tables or abbreviations to the XML format
             if AC.has_tables:
-                with open(out_dir + "/" + key.split("/")[-1] + "_tables.json", "w", encoding='utf-8') as outfp:
+                with open(out_dir / f"{Path(key).name}_tables.json", "w", encoding='utf-8') as outfp:
                     outfp.write(AC.tables_to_bioc_json())
             success.append(F"{key} was processed successfully.")
         except Exception as e:
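
Note: the remaining builtin open() calls above work unchanged with Path arguments, since open() accepts any os.PathLike (PEP 519); choosing between open(path) and path.open() is purely stylistic. For example:

from pathlib import Path

out = Path("output") / "PMC123_bioc.json"        # hypothetical output file
with open(out, "w", encoding="utf-8") as outfp:  # builtin open() accepts Path objects
    outfp.write("{}")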
15 changes: 10 additions & 5 deletions src/autoCORPus.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import sys
+from pathlib import Path
 
 from bioc import loads, dumps, biocxml, biocjson
 from bs4 import BeautifulSoup
@@ -37,28 +38,32 @@ class autoCORPus:
 
     @handle_path
     def __read_config(self, config_path):
-        with open(config_path, "r") as f:
+        config_path = Path(config_path)
+        with config_path.open("r") as f:
             ## TODO: validate config file here if possible
             content = json.load(f)
             return content["config"]
 
     @handle_path
     def __import_file(self, file_path):
-        with open(file_path, "r") as f:
+        file_path = Path(file_path)
+        with file_path.open("r") as f:
             return f.read(), file_path
 
     @handle_path
     def __handle_target_dir(self, target_dir):
-        if not os.path.exists(target_dir):
-            os.makedirs(target_dir)
+        target_dir = Path(target_dir)
+        if not target_dir.exists():
+            target_dir.mkdir(parents=True)
         return
 
     def __validate_infile(self):
         pass
 
     def __soupify_infile(self, fpath):
+        fpath = Path(fpath)
         try:
-            with open(fpath, "r", encoding="utf-8") as fp:
+            with fpath.open("r", encoding="utf-8") as fp:
                 soup = BeautifulSoup(fp.read(), 'html.parser')
                 for e in soup.find_all(attrs={'style': ['display:none', 'visibility:hidden']}):
                     e.extract()
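
Note: the x = Path(x) pattern at the top of each method is safe whichever type the caller passes, because the Path constructor accepts both strings and existing Path objects:

from pathlib import Path

assert Path("a/b") == Path(Path("a/b"))  # re-wrapping a Path is a no-op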
11 changes: 7 additions & 4 deletions src/table_image.py
@@ -3,6 +3,7 @@
 
 from datetime import datetime
 from operator import itemgetter
+from pathlib import Path
 
 import cv2
 import pytesseract
@@ -389,13 +390,15 @@ def __init__(self, table_images, base_dir, trainedData="eng"):
             "infons": {},
             "documents": []
         }
+        base_dir = Path(base_dir)
         for image_path in table_images:
-            imgname = image_path.split('/')[-1]
+            image_path = Path(image_path)
+            imgname = image_path.name
             self.tableIdentifier = imgname.split("_")[-1].split(".")[0]
-            self.file_name = imgname.replace(base_dir + "/", "")
-            pmc = imgname[0:imgname.rfind('.')]
+            self.file_name = str(image_path.relative_to(base_dir))
+            pmc = image_path.stem
 
-            img = cv2.imread(image_path)
+            img = cv2.imread(str(image_path))
 
             cells, added, thresh = self.find_cells(img)
             table_row = self.cell2table(cells, added, thresh, "imagesOut", pmc)
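
Note on the str() wrapper: cv2.imread expects a string filename — OpenCV's Python bindings do not, as far as I know, accept os.PathLike objects — hence str(image_path). A defensive sketch (hypothetical image path):

import cv2
from pathlib import Path

image_path = Path("images/PMC123_table_1.png")  # hypothetical
img = cv2.imread(str(image_path))               # imread wants a str, not a Path
if img is None:                                 # imread signals failure by returning None
    raise FileNotFoundError(image_path)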
16 changes: 8 additions & 8 deletions src/utils.py
@@ -1,6 +1,6 @@
-import os
 import re
 import unicodedata
+from pathlib import Path
 
 import bs4
 import networkx as nx
@@ -21,13 +21,13 @@ def get_files(base_dir, pattern=r'(.*).html'):
         file_list: a list of filepath
     """
     file_list = []
-    files = os.listdir(base_dir)
-    for i in files:
-        abs_path = os.path.join(base_dir, i)
-        if re.match(pattern, abs_path):
-            file_list.append(abs_path)
-        elif os.path.isdir(abs_path) & ('ipynb_checkpoints' not in abs_path):
-            file_list += get_files(abs_path)
+    base_dir = Path(base_dir)
+    for item in base_dir.iterdir():
+        abs_path = item.resolve()
+        if abs_path.is_file() and re.match(pattern, str(abs_path)):
+            file_list.append(str(abs_path))
+        elif abs_path.is_dir() and 'ipynb_checkpoints' not in str(abs_path):
+            file_list += get_files(abs_path, pattern)
     return file_list
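
Note: two behavioural changes ride along here: item.resolve() yields absolute paths where os.path.join(base_dir, i) kept them relative, and the recursive call now forwards pattern, which the old code silently dropped. An illustration of the first point (hypothetical tree):

from pathlib import Path

base = Path("input")       # hypothetical, relative to the working directory
for item in base.iterdir():
    print(item)            # e.g. input/file.html (relative, as os.path.join gave)
    print(item.resolve())  # e.g. /home/user/project/input/file.html (absolute)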


52 changes: 25 additions & 27 deletions tests/quick_comparison.py
@@ -1,37 +1,35 @@
-import os
 import re
+from pathlib import Path
 
-folder1_path = 'OldOutput'
-folder2_path = 'NewOutput'
+folder1_path = Path('OldOutput')
+folder2_path = Path('NewOutput')
 lines_to_ignore = ["\"date\":", "\"offset\":", "\"inputfile\":"]
 
 different_files = []
 
-for root, dirs, files in os.walk(folder1_path):
-    for filename in files:
-        if "_bioc" not in filename:
-            continue
-        folder1_file_path = os.path.join(root, filename)
-        folder2_file_path = os.path.join(folder2_path, filename)
-        if os.path.exists(folder2_file_path):
-            with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
-                lines1 = f1.readlines()
-                lines2 = f2.readlines()
-                different_lines = [i for i, (line1, line2) in enumerate(zip(lines1, lines2)) if
-                                   re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2,
-                                                                                         flags=re.UNICODE) and
-                                   not [x for x in lines_to_ignore if x in line1]]
-                false_positives = different_lines
-                different_lines = []
-                for i in range(len(false_positives)):
-                    if "[PMC free article]" not in lines1[false_positives[i]] and "[PMC free article]" in lines2[
-                        false_positives[i]]:
-                        continue
-                    else:
-                        different_lines.append(false_positives[i])
-
-            if different_lines:
-                different_files.append(filename)
+for folder1_file_path in folder1_path.rglob('*'):
+    if "_bioc" not in folder1_file_path.name:
+        continue
+    folder2_file_path = folder2_path / folder1_file_path.name
+    if folder2_file_path.exists():
+        with open(folder1_file_path, 'r') as f1, open(folder2_file_path, 'r') as f2:
+            lines1 = f1.readlines()
+            lines2 = f2.readlines()
+            different_lines = [
+                i for i, (line1, line2) in enumerate(zip(lines1, lines2))
+                if re.sub(r"\s+", "", line1, flags=re.UNICODE) != re.sub(r"\s+", "", line2, flags=re.UNICODE) and
+                not [x for x in lines_to_ignore if x in line1]
+            ]
+            false_positives = different_lines
+            different_lines = []
+            for i in range(len(false_positives)):
+                if "[PMC free article]" not in lines1[false_positives[i]] and "[PMC free article]" in lines2[false_positives[i]]:
+                    continue
+                else:
+                    different_lines.append(false_positives[i])
 
+        if different_lines:
+            different_files.append(folder1_file_path.name)
 
 print("\n".join(different_files))
 print(len(different_files))
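
Note: unlike the nested os.walk loops, rglob('*') yields both files and directories at any depth in a single pass; anything without "_bioc" in its name (which in practice includes the directories) is skipped before any open(). A condensed sketch of the traversal:

from pathlib import Path

old = Path("OldOutput")                           # as in the script above
for p in old.rglob('*'):                          # files and directories, any depth
    if p.is_file() and "_bioc" in p.name:
        counterpart = Path("NewOutput") / p.name  # flat lookup by filename, as before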
