Skip to content

Commit

Permalink
fix: add parse tests for every supported extensions (#198)
Browse files Browse the repository at this point in the history
* fix: add parse tests for every supported extensions

* add: each parser has supported FileExtensions

* fix: ValueError for unsupported extensions

* fix: python version required

* fix: python version

* fix: python version
  • Loading branch information
chloedia authored Dec 20, 2024
1 parent 13c2677 commit 9dff0de
Show file tree
Hide file tree
Showing 31 changed files with 175 additions and 199 deletions.
5 changes: 0 additions & 5 deletions libs/megaparse/src/megaparse/megaparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,6 @@ def validate_input(
raise ValueError(
f"Format Checker : Unsupported file extension: {file_extension}"
)
# TODO(@chloedia): each parser should have a list of supported extensions
if not isinstance(self.parser, UnstructuredParser):
raise ValueError(
f" Unsupported file extension : Parser {self.parser} do not support {file_extension}"
)
return file_extension

async def aload(
Expand Down
17 changes: 17 additions & 0 deletions libs/megaparse/src/megaparse/parser/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@
class BaseParser(ABC):
"""Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]"""

supported_extensions = []

def check_supported_extension(
    self, file_extension: FileExtension | None, file_path: str | Path | None = None
) -> None:
    """Raise ``ValueError`` if this parser does not support the given file type.

    The extension is compared against the class-level ``supported_extensions``
    list. An explicitly passed ``file_extension`` takes precedence; it is only
    derived from the suffix of ``file_path`` when no extension is given.

    Args:
        file_extension: Extension of the document to parse, or ``None`` to
            derive it from ``file_path``.
        file_path: Path of the document, used only when ``file_extension``
            is not provided.

    Raises:
        ValueError: If neither argument is provided, or if the resolved
            extension is not listed in ``supported_extensions``.
    """
    if not file_extension and not file_path:
        # Must be an f-string: without the prefix the placeholder was emitted
        # literally instead of the parser's class name.
        raise ValueError(
            f"Either file_path or file_extension must be provided for {self.__class__.__name__}"
        )
    if file_path and not file_extension:
        file_path = Path(file_path) if isinstance(file_path, str) else file_path
        # NOTE(review): presumably FileExtension rejects unknown suffixes on
        # construction; confirm against its definition.
        file_extension = FileExtension(file_path.suffix)
    if file_extension not in self.supported_extensions:
        raise ValueError(
            f"Unsupported file extension {file_extension.value} for {self.__class__.__name__}"
        )

@abstractmethod
async def aconvert(
self,
Expand Down
5 changes: 5 additions & 0 deletions libs/megaparse/src/megaparse/parser/doctr_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@


class DoctrParser(BaseParser):
supported_extensions = [FileExtension.PDF]

def __init__(
self,
det_predictor_model: str = "db_resnet50",
Expand Down Expand Up @@ -74,6 +76,9 @@ def convert(
pdf = file_path # type: ignore
else:
raise ValueError("Can't convert if file and file_path are None")

self.check_supported_extension(file_extension, file_path)

doc = DocumentFile.from_pdf(pdf)
# Analyze
result = self.predictor(doc)
Expand Down
4 changes: 4 additions & 0 deletions libs/megaparse/src/megaparse/parser/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@


class LlamaParser(BaseParser):
supported_extensions = [FileExtension.PDF]

def __init__(
self,
api_key: str,
Expand All @@ -37,6 +39,7 @@ async def aconvert(
) -> str:
if not file_path:
raise ValueError("File_path should be provided to run LlamaParser")
self.check_supported_extension(file_extension, file_path)

llama_parser = _LlamaParse(
api_key=self.api_key,
Expand Down Expand Up @@ -64,6 +67,7 @@ def convert(
) -> str:
if not file_path:
raise ValueError("File_path should be provided to run LlamaParser")
self.check_supported_extension(file_extension, file_path)

llama_parser = _LlamaParse(
api_key=self.api_key,
Expand Down
8 changes: 8 additions & 0 deletions libs/megaparse/src/megaparse/parser/megaparse_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@


class MegaParseVision(BaseParser):
supported_extensions = [FileExtension.PDF]

def __init__(self, model: BaseChatModel, **kwargs):
if hasattr(model, "model_name"):
if not SupportedModel.is_supported(model.model_name):
Expand Down Expand Up @@ -158,6 +160,9 @@ async def aconvert(

if isinstance(file_path, Path):
file_path = str(file_path)

self.check_supported_extension(file_extension, file_path)

pdf_base64 = self.process_file(file_path)
tasks = [
self.asend_to_mlm(pdf_base64[i : i + batch_size])
Expand Down Expand Up @@ -187,6 +192,9 @@ def convert(

if isinstance(file_path, Path):
file_path = str(file_path)

self.check_supported_extension(file_extension, file_path)

pdf_base64 = self.process_file(file_path)
chunks = [
pdf_base64[i : i + batch_size]
Expand Down
18 changes: 18 additions & 0 deletions libs/megaparse/src/megaparse/parser/unstructured_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,21 @@

class UnstructuredParser(BaseParser):
load_dotenv()
supported_extensions = [
FileExtension.PDF,
FileExtension.DOCX,
FileExtension.TXT,
FileExtension.OTF,
FileExtension.EPUB,
FileExtension.HTML,
FileExtension.XML,
FileExtension.CSV,
FileExtension.XLSX,
FileExtension.XLS,
FileExtension.PPTX,
FileExtension.MD,
FileExtension.MARKDOWN,
]

def __init__(
self, strategy=StrategyEnum.AUTO, model: BaseChatModel | None = None, **kwargs
Expand Down Expand Up @@ -107,6 +122,7 @@ async def aconvert(
file_extension: FileExtension | None = None,
**kwargs,
) -> str:
self.check_supported_extension(file_extension, file_path)
warnings.warn(
"The UnstructuredParser is a sync parser, please use the sync convert method",
UserWarning,
Expand All @@ -121,6 +137,8 @@ def convert(
file_extension: FileExtension | None = None,
**kwargs,
) -> str:
self.check_supported_extension(file_extension, file_path)

elements = partition(
filename=str(file_path) if file_path else None,
file=file,
Expand Down
32 changes: 0 additions & 32 deletions libs/megaparse/tests/docx/test_docx_processing.py

This file was deleted.

30 changes: 0 additions & 30 deletions libs/megaparse/tests/epub/test_epub_processing.py

This file was deleted.

Empty file.
30 changes: 0 additions & 30 deletions libs/megaparse/tests/html/test_html_processing.py

This file was deleted.

30 changes: 0 additions & 30 deletions libs/megaparse/tests/odt/test_odt_processing.py

This file was deleted.

30 changes: 0 additions & 30 deletions libs/megaparse/tests/pptx/test_pptx_processing.py

This file was deleted.

File renamed without changes.
4 changes: 4 additions & 0 deletions libs/megaparse/tests/supported_docs/sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,Description
MegaParse,"MegaParse is the best parser, even with accents like é, è, and ñ."
OtherParse,"OtherParse is a decent parser, but it struggles with accents."
RandomParse,"RandomParse is another parser, but it often fails with special characters."
File renamed without changes.
21 changes: 21 additions & 0 deletions libs/megaparse/tests/supported_docs/sample.markdown
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# The Difficulty of Parsing Files

Parsing files can be a challenging task due to several factors:

## 1. File Format Variability
Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.

## 2. Inconsistent Data
Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.

## 3. Large File Sizes
Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.

## 4. Encoding Issues
Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.

## 5. Nested Structures
Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.

## Conclusion
Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.
21 changes: 21 additions & 0 deletions libs/megaparse/tests/supported_docs/sample.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# The Difficulty of Parsing Files

Parsing files can be a challenging task due to several factors:

## 1. File Format Variability
Different file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.

## 2. Inconsistent Data
Files often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.

## 3. Large File Sizes
Parsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.

## 4. Encoding Issues
Files may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.

## 5. Nested Structures
Some file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.

## Conclusion
Despite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.
Binary file added libs/megaparse/tests/supported_docs/sample.otf
Binary file not shown.
File renamed without changes.
13 changes: 13 additions & 0 deletions libs/megaparse/tests/supported_docs/sample.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Lorem ipsum

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio.

Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.
Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.
Maecenas non lorem quis tellus placerat varius.
Nulla facilisi.
Aenean congue fringilla justo ut aliquam.
Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis.
Morbi viverra semper lorem nec molestie.
Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.
https://github.com/QuivrHQ/MegaParse
23 changes: 23 additions & 0 deletions libs/megaparse/tests/supported_docs/sample.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0"?>
<customers>
<customer id="55000">
<name>Charter Group</name>
<address>
<street>100 Main</street>
<city>Framingham</city>
<state>MA</state>
<zip>01701</zip>
</address>
<address>
<street>720 Prospect</street>
<city>Framingham</city>
<state>MA</state>
<zip>01701</zip>
</address>
<address>
<street>120 Ridge</street>
<state>MA</state>
<zip>01760</zip>
</address>
</customer>
</customers>
Binary file not shown.
Loading

0 comments on commit 9dff0de

Please sign in to comment.