fix: add parse tests for every supported extension (#198)

* fix: add parse tests for every supported extension * add: each parser has supported FileExtensions * fix: ValueError for unsupported extensions * fix: python version required * fix: python version * fix: python version
QuivrHQ · Dec 20, 2024 · 9dff0de · 9dff0de
1 parent 13c2677
commit 9dff0de
Show file tree

Hide file tree

Showing 31 changed files with 175 additions and 199 deletions.
diff --git a/libs/megaparse/src/megaparse/megaparse.py b/libs/megaparse/src/megaparse/megaparse.py
@@ -67,11 +67,6 @@ def validate_input(
                 raise ValueError(
                     f"Format Checker : Unsupported file extension: {file_extension}"
                 )
-            # TODO(@chloedia): each parser should have a list of supported extensions
-            if not isinstance(self.parser, UnstructuredParser):
-                raise ValueError(
-                    f" Unsupported file extension : Parser {self.parser} do not support {file_extension}"
-                )
         return file_extension
 
     async def aload(

diff --git a/libs/megaparse/src/megaparse/parser/base.py b/libs/megaparse/src/megaparse/parser/base.py
@@ -8,6 +8,23 @@
 class BaseParser(ABC):
     """Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]"""
 
+    supported_extensions = []
+
+    def check_supported_extension(
+        self, file_extension: FileExtension | None, file_path: str | Path | None = None
+    ):
+        """Ensure this parser can handle the given file type.
+
+        The extension is taken from ``file_extension`` when provided;
+        otherwise it is derived from the suffix of ``file_path``.
+
+        Args:
+            file_extension: Extension of the document, if already known.
+            file_path: Location of the document; only consulted when
+                ``file_extension`` is not given.
+
+        Raises:
+            ValueError: If neither argument is provided, or if the resolved
+                extension is not listed in ``supported_extensions``.
+        """
+        parser_name = self.__class__.__name__
+        if not file_extension and not file_path:
+            # Must be an f-string so the parser class name is interpolated
+            # rather than printed as a literal placeholder.
+            raise ValueError(
+                f"Either file_path or file_extension must be provided for {parser_name}"
+            )
+        if file_path and not file_extension:
+            file_path = Path(file_path) if isinstance(file_path, str) else file_path
+            file_extension = FileExtension(file_path.suffix)
+        if file_extension not in self.supported_extensions:
+            raise ValueError(
+                f"Unsupported file extension {file_extension.value} for {parser_name}"
+            )
+
     @abstractmethod
     async def aconvert(
         self,

diff --git a/libs/megaparse/src/megaparse/parser/doctr_parser.py b/libs/megaparse/src/megaparse/parser/doctr_parser.py
@@ -14,6 +14,8 @@
 
 
 class DoctrParser(BaseParser):
+    supported_extensions = [FileExtension.PDF]
+
     def __init__(
         self,
         det_predictor_model: str = "db_resnet50",
@@ -74,6 +76,9 @@ def convert(
             pdf = file_path  # type: ignore
         else:
             raise ValueError("Can't convert if file and file_path are None")
+
+        self.check_supported_extension(file_extension, file_path)
+
         doc = DocumentFile.from_pdf(pdf)
         # Analyze
         result = self.predictor(doc)

diff --git a/libs/megaparse/src/megaparse/parser/llama.py b/libs/megaparse/src/megaparse/parser/llama.py
@@ -11,6 +11,8 @@
 
 
 class LlamaParser(BaseParser):
+    supported_extensions = [FileExtension.PDF]
+
     def __init__(
         self,
         api_key: str,
@@ -37,6 +39,7 @@ async def aconvert(
     ) -> str:
         if not file_path:
             raise ValueError("File_path should be provided to run LlamaParser")
+        self.check_supported_extension(file_extension, file_path)
 
         llama_parser = _LlamaParse(
             api_key=self.api_key,
@@ -64,6 +67,7 @@ def convert(
     ) -> str:
         if not file_path:
             raise ValueError("File_path should be provided to run LlamaParser")
+        self.check_supported_extension(file_extension, file_path)
 
         llama_parser = _LlamaParse(
             api_key=self.api_key,

diff --git a/libs/megaparse/src/megaparse/parser/megaparse_vision.py b/libs/megaparse/src/megaparse/parser/megaparse_vision.py
@@ -53,6 +53,8 @@
 
 
 class MegaParseVision(BaseParser):
+    supported_extensions = [FileExtension.PDF]
+
     def __init__(self, model: BaseChatModel, **kwargs):
         if hasattr(model, "model_name"):
             if not SupportedModel.is_supported(model.model_name):
@@ -158,6 +160,9 @@ async def aconvert(
 
         if isinstance(file_path, Path):
             file_path = str(file_path)
+
+        self.check_supported_extension(file_extension, file_path)
+
         pdf_base64 = self.process_file(file_path)
         tasks = [
             self.asend_to_mlm(pdf_base64[i : i + batch_size])
@@ -187,6 +192,9 @@ def convert(
 
         if isinstance(file_path, Path):
             file_path = str(file_path)
+
+        self.check_supported_extension(file_extension, file_path)
+
         pdf_base64 = self.process_file(file_path)
         chunks = [
             pdf_base64[i : i + batch_size]

diff --git a/libs/megaparse/src/megaparse/parser/unstructured_parser.py b/libs/megaparse/src/megaparse/parser/unstructured_parser.py
@@ -15,6 +15,21 @@
 
 class UnstructuredParser(BaseParser):
     load_dotenv()
+    supported_extensions = [
+        FileExtension.PDF,
+        FileExtension.DOCX,
+        FileExtension.TXT,
+        FileExtension.OTF,
+        FileExtension.EPUB,
+        FileExtension.HTML,
+        FileExtension.XML,
+        FileExtension.CSV,
+        FileExtension.XLSX,
+        FileExtension.XLS,
+        FileExtension.PPTX,
+        FileExtension.MD,
+        FileExtension.MARKDOWN,
+    ]
 
     def __init__(
         self, strategy=StrategyEnum.AUTO, model: BaseChatModel | None = None, **kwargs
@@ -107,6 +122,7 @@ async def aconvert(
         file_extension: FileExtension | None = None,
         **kwargs,
     ) -> str:
+        self.check_supported_extension(file_extension, file_path)
         warnings.warn(
             "The UnstructuredParser is a sync parser, please use the sync convert method",
             UserWarning,
@@ -121,6 +137,8 @@ def convert(
         file_extension: FileExtension | None = None,
         **kwargs,
     ) -> str:
+        self.check_supported_extension(file_extension, file_path)
+
         elements = partition(
             filename=str(file_path) if file_path else None,
             file=file,

diff --git a/libs/megaparse/tests/docx/test_docx_processing.py b/libs/megaparse/tests/docx/test_docx_processing.py
diff --git a/libs/megaparse/tests/epub/test_epub_processing.py b/libs/megaparse/tests/epub/test_epub_processing.py
diff --git a/libs/megaparse/tests/fixtures/__init__.py b/libs/megaparse/tests/fixtures/__init__.py
diff --git a/libs/megaparse/tests/html/test_html_processing.py b/libs/megaparse/tests/html/test_html_processing.py
diff --git a/libs/megaparse/tests/odt/test_odt_processing.py b/libs/megaparse/tests/odt/test_odt_processing.py
diff --git a/...rse/tests/pdf/test_unstructured_parser.py → ...megaparse/tests/pdf/test_pdfium_parser.py b/...rse/tests/pdf/test_unstructured_parser.py → ...megaparse/tests/pdf/test_pdfium_parser.py
diff --git a/libs/megaparse/tests/pptx/test_pptx_processing.py b/libs/megaparse/tests/pptx/test_pptx_processing.py
diff --git a/libs/megaparse/tests/epub/Sway.epub → .../megaparse/tests/supported_docs/Sway.epub b/libs/megaparse/tests/epub/Sway.epub → .../megaparse/tests/supported_docs/Sway.epub
diff --git a/...megaparse/tests/odt/file-sample_500kB.odt → ...ests/supported_docs/file-sample_500kB.odt b/...megaparse/tests/odt/file-sample_500kB.odt → ...ests/supported_docs/file-sample_500kB.odt
diff --git a/...parse/tests/xls/file_example_XLSX_50.xlsx → .../supported_docs/file_example_XLSX_50.xlsx b/...parse/tests/xls/file_example_XLSX_50.xlsx → .../supported_docs/file_example_XLSX_50.xlsx
diff --git a/...gaparse/tests/xls/file_example_XLS_50.xls → ...ts/supported_docs/file_example_XLS_50.xls b/...gaparse/tests/xls/file_example_XLS_50.xls → ...ts/supported_docs/file_example_XLS_50.xls
diff --git a/libs/megaparse/tests/supported_docs/sample.csv b/libs/megaparse/tests/supported_docs/sample.csv
@@ -0,0 +1,4 @@
+Name,Description
+MegaParse,"MegaParse is the best parser, even with accents like é, è, and ñ."
+OtherParse,"OtherParse is a decent parser, but it struggles with accents."
+RandomParse,"RandomParse is another parser, but it often fails with special characters."
diff --git a/libs/megaparse/tests/docx/sample.docx → ...egaparse/tests/supported_docs/sample.docx b/libs/megaparse/tests/docx/sample.docx → ...egaparse/tests/supported_docs/sample.docx
diff --git a/libs/megaparse/tests/supported_docs/sample.markdown b/libs/megaparse/tests/supported_docs/sample.markdown
@@ -0,0 +1,21 @@
+# The Difficulty of Parsing Files
+
+Parsing files can be a challenging task due to several factors:
+
+## 1. File Format Variability
+Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.
+
+## 2. Inconsistent Data
+Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.
+
+## 3. Large File Sizes
+Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.
+
+## 4. Encoding Issues
+Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.
+
+## 5. Nested Structures
+Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.
+
+## Conclusion
+Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.
diff --git a/libs/megaparse/tests/supported_docs/sample.md b/libs/megaparse/tests/supported_docs/sample.md
@@ -0,0 +1,21 @@
+# The Difficulty of Parsing Files
+
+Parsing files can be a challenging task due to several factors:
+
+## 1. File Format Variability
+Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.
+
+## 2. Inconsistent Data
+Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.
+
+## 3. Large File Sizes
+Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.
+
+## 4. Encoding Issues
+Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.
+
+## 5. Nested Structures
+Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.
+
+## Conclusion
+Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.
diff --git a/libs/megaparse/tests/supported_docs/sample.otf b/libs/megaparse/tests/supported_docs/sample.otf
diff --git a/libs/megaparse/tests/pptx/sample.pptx → ...egaparse/tests/supported_docs/sample.pptx b/libs/megaparse/tests/pptx/sample.pptx → ...egaparse/tests/supported_docs/sample.pptx
diff --git a/libs/megaparse/tests/supported_docs/sample.txt b/libs/megaparse/tests/supported_docs/sample.txt
@@ -0,0 +1,13 @@
+Lorem ipsum 
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. 
+
+Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.
+Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.
+Maecenas non lorem quis tellus placerat varius. 
+Nulla facilisi. 
+Aenean congue fringilla justo ut aliquam. 
+Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. 
+Morbi viverra semper lorem nec molestie. 
+Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.
+https://github.com/QuivrHQ/MegaParse
diff --git a/libs/megaparse/tests/supported_docs/sample.xml b/libs/megaparse/tests/supported_docs/sample.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<customers>
+   <customer id="55000">
+      <name>Charter Group</name>
+      <address>
+         <street>100 Main</street>
+         <city>Framingham</city>
+         <state>MA</state>
+         <zip>01701</zip>
+      </address>
+      <address>
+         <street>720 Prospect</street>
+         <city>Framingham</city>
+         <state>MA</state>
+         <zip>01701</zip>
+      </address>
+      <address>
+         <street>120 Ridge</street>
+         <state>MA</state>
+         <zip>01760</zip>
+      </address>
+   </customer>
+</customers>
diff --git a/...megaparse/tests/html/sample_complexe.html → ...tests/supported_docs/sample_complexe.html b/...megaparse/tests/html/sample_complexe.html → ...tests/supported_docs/sample_complexe.html
diff --git a/libs/megaparse/tests/supported_docs/sample_native.pdf b/libs/megaparse/tests/supported_docs/sample_native.pdf