From b092fb7f474cc585d14db6773f25a0b1c62f2e82 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 16 Dec 2024 11:39:55 -0800 Subject: [PATCH 1/5] fix: add .grype.yaml (#3834) **Summary** CVE-2024-11053 https://curl.se/docs/CVE-2024-11053.html (severity Low) was published on Dec 11, 2024 and began failing CI builds on open-core on Dec 13, 2024 when it appeared in `grype` apparently misclassified as a critical vulnerability. The severity reported on the CVE is "Low" so it should not fail builds. Add a `.grype.yaml` file to ignore this CVE until grype is updated. --- .github/workflows/ci.yml | 1 + .grype.yaml | 2 ++ CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 .grype.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 88fe84680b..8d364840dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -398,3 +398,4 @@ jobs: image: "unstructured:dev" severity-cutoff: critical only-fixed: true + output-format: table diff --git a/.grype.yaml b/.grype.yaml new file mode 100644 index 0000000000..5e041c462f --- /dev/null +++ b/.grype.yaml @@ -0,0 +1,2 @@ +ignore: + - vulnerability: CVE-2024-11053 diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f929c0429..14c8afe1f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev0 +## 0.16.12-dev1 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5e9d1b8bb0..095d056ec7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12-dev0" # pragma: no cover +__version__ = "0.16.12-dev1" # pragma: no cover From 10f0d54ac2ba5e2998da428c3d459f94d8f3c3b0 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 16 Dec 2024 15:01:22 -0800 Subject: [PATCH 2/5] build: remove ruff version upper bound (#3829) **Summary** Remove pin on `ruff` linter and fix the handful of lint errors a newer version catches. 
--- CHANGELOG.md | 4 +- pyproject.toml | 20 ++++---- requirements/test.in | 4 +- requirements/test.txt | 2 +- .../partition/pdf_image/test_pdf.py | 48 ++++++++++--------- test_unstructured/partition/test_docx.py | 29 +++++------ test_unstructured/partition/test_pptx.py | 24 +++++----- test_unstructured/partition/test_xlsx.py | 10 ++-- unstructured/__version__.py | 2 +- .../partition/html/transformations.py | 21 +++----- 10 files changed, 81 insertions(+), 83 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14c8afe1f3..959f6c581b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev1 +## 0.16.12-dev2 ### Enhancements @@ -8,6 +8,8 @@ ### Fixes +- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. + ## 0.16.11 ### Enhancements diff --git a/pyproject.toml b/pyproject.toml index ea90b03210..f91637cae6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,9 +12,17 @@ verboseOutput = true [tool.ruff] line-length = 100 +target-version = "py39" -# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` -- -lint.select = [ +[tool.ruff.lint] +ignore = [ + "COM812", # -- over aggressively insists on trailing commas where not desireable -- + "PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) -- + "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- + "PT012", # -- pytest.raises() block should contain a single simple statement -- + "SIM117", # -- merge `with` statements for context managers that have same scope -- +] +select = [ "C4", # -- flake8-comprehensions -- "COM", # -- flake8-commas -- "E", # -- pycodestyle errors -- @@ -29,11 +37,3 @@ lint.select = [ "UP034", # -- Avoid extraneous parentheses -- "W", # -- Warnings, including invalid escape-sequence -- ] -lint.ignore = [ - "COM812", # -- over aggressively insists on trailing commas where not 
desireable -- - "PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) -- - "PT005", # -- flags mock fixtures with names intentionally matching private method name -- - "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- - "PT012", # -- pytest.raises() block should contain a single simple statement -- - "SIM117", # -- merge `with` statements for context managers that have same scope -- -] diff --git a/requirements/test.in b/requirements/test.in index c763c091d1..ca9d2d5bfe 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -11,9 +11,7 @@ mypy pydantic pytest-cov pytest-mock -# NOTE(robison) - we need to do additional cleanup to pass -# linting for the latest version of ruff -ruff<0.5.0 +ruff types-Markdown types-requests types-tabulate diff --git a/requirements/test.txt b/requirements/test.txt index 241f941d87..e3762557da 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -171,7 +171,7 @@ requests==2.32.3 # requests-mock requests-mock==1.12.1 # via label-studio-sdk -ruff==0.4.10 +ruff==0.8.3 # via -r ./test.in semantic-version==2.10.0 # via liccheck diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index cea6b44129..9b1b8de6e1 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -245,12 +245,14 @@ def _test(result): _test(result) else: with open(filename, "rb") as test_file: - spooled_temp_file = SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) - spooled_temp_file.seek(0) - result = pdf.partition_pdf( - file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number - ) + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + result = pdf.partition_pdf( + file=spooled_temp_file, + strategy=strategy, + 
starting_page_number=starting_page_number, + ) _test(result) @@ -757,14 +759,14 @@ def test_partition_pdf_metadata_date( ) else: with open(filename, "rb") as test_file: - spooled_temp_file = SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) - spooled_temp_file.seek(0) - elements = pdf.partition_pdf( - file=spooled_temp_file, - strategy=strategy, - metadata_last_modified=metadata_last_modified, - ) + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = pdf.partition_pdf( + file=spooled_temp_file, + strategy=strategy, + metadata_last_modified=metadata_last_modified, + ) assert {el.metadata.last_modified for el in elements} == {expected_last_modified} @@ -1131,15 +1133,15 @@ def test_partition_pdf_with_ocr_only_strategy( ) else: with open(filename, "rb") as test_file: - spooled_temp_file = SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) - spooled_temp_file.seek(0) - elements = pdf.partition_pdf( - file=spooled_temp_file, - strategy=PartitionStrategy.OCR_ONLY, - languages=["eng"], - is_image=is_image, - ) + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = pdf.partition_pdf( + file=spooled_temp_file, + strategy=PartitionStrategy.OCR_ONLY, + languages=["eng"], + is_image=is_image, + ) assert elements[0].metadata.languages == ["eng"] # check pages diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index 4b30e03379..1330b4a79a 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -77,14 +77,15 @@ def test_partition_docx_with_spooled_file( `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need to ensure the source file is appropriately converted in this case. 
""" - with open(mock_document_file_path, "rb") as test_file: - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + with open(mock_document_file_path, "rb") as test_file: + spooled_temp_file.write(test_file.read()) spooled_temp_file.seek(0) + elements = partition_docx(file=spooled_temp_file) - assert elements == expected_elements - for element in elements: - assert element.metadata.filename is None + + assert elements == expected_elements + assert all(e.metadata.filename is None for e in elements) def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]): @@ -921,16 +922,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( self, opts_args: dict[str, Any] ): - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(b"abcdefg") - opts_args["file"] = spooled_temp_file - opts = DocxPartitionerOptions(**opts_args) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(b"abcdefg") + opts_args["file"] = spooled_temp_file + opts = DocxPartitionerOptions(**opts_args) - docx_file = opts._docx_file + docx_file = opts._docx_file - assert docx_file is not spooled_temp_file - assert isinstance(docx_file, io.BytesIO) - assert docx_file.getvalue() == b"abcdefg" + assert docx_file is not spooled_temp_file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( self, opts_args: dict[str, Any] diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py index e19d87ff98..afa3ea2bf5 100644 --- a/test_unstructured/partition/test_pptx.py +++ b/test_unstructured/partition/test_pptx.py @@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file(): 
Including one that does not have its read-pointer set to the start. """ - with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file: - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file: + spooled_temp_file.write(test_file.read()) + elements = partition_pptx(file=spooled_temp_file) + assert elements == EXPECTED_PPTX_OUTPUT for element in elements: assert element.metadata.filename is None @@ -701,16 +703,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( self, opts_args: dict[str, Any] ): - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(b"abcdefg") - opts_args["file"] = spooled_temp_file - opts = PptxPartitionerOptions(**opts_args) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(b"abcdefg") + opts_args["file"] = spooled_temp_file + opts = PptxPartitionerOptions(**opts_args) - pptx_file = opts.pptx_file + pptx_file = opts.pptx_file - assert pptx_file is not spooled_temp_file - assert isinstance(pptx_file, io.BytesIO) - assert pptx_file.getvalue() == b"abcdefg" + assert pptx_file is not spooled_temp_file + assert isinstance(pptx_file, io.BytesIO) + assert pptx_file.getvalue() == b"abcdefg" def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( self, opts_args: dict[str, Any] diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py index 2e951d321f..6697577a1e 100644 --- a/test_unstructured/partition/test_xlsx.py +++ b/test_unstructured/partition/test_xlsx.py @@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename(): def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji(): - f = tempfile.SpooledTemporaryFile() - with 
open("example-docs/emoji.xlsx", "rb") as g: - f.write(g.read()) - elements = partition_xlsx(file=f, include_header=False) + with tempfile.SpooledTemporaryFile() as f: + with open("example-docs/emoji.xlsx", "rb") as g: + f.write(g.read()) + + elements = partition_xlsx(file=f, include_header=False) + assert sum(isinstance(element, Text) for element in elements) == 1 assert len(elements) == 1 assert clean_extra_whitespace(elements[0].text) == "🤠😅" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 095d056ec7..0dbfa1eb73 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12-dev1" # pragma: no cover +__version__ = "0.16.12-dev2" # pragma: no cover diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index d90e589f8f..c595055ec0 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -53,7 +53,6 @@ def ontology_to_unstructured_elements( """ elements_to_return = [] if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT: - if page_number is None and isinstance(ontology_element, ontology.Page): page_number = ontology_element.page_number @@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool: if any(isinstance(ontology_element, class_) for class_ in text_classes): return True - if any(ontology_element.elementType == category for category in text_categories): - return True - - return False + return any(ontology_element.elementType == category for category in text_categories) def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: @@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: if any(isinstance(ontology_element, class_) for class_ in inline_classes): return True - if any(ontology_element.elementType == category for category in 
inline_categories): - return True - - return False + return any(ontology_element.elementType == category for category in inline_categories) def unstructured_elements_to_ontology( @@ -327,10 +320,7 @@ def is_empty(tag): if tag.attrs: return False - if not tag.get_text(strip=True): - return True - - return False + return bool(not tag.get_text(strip=True)) def remove_empty_tags(soup): for tag in soup.find_all(): @@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag( # Scenario 1: Valid Ontology Element if soup.attrs.get("class"): - html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get( - (soup.name, soup.attrs["class"][0]) + html_tag, element_class = ( + soup.name, + HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])), ) # Scenario 2: HTML tag incorrect, CSS class correct From b5ff79d8dbc4c86b9fec9f7632a6a6e2c20b5f17 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 16 Dec 2024 16:56:21 -0800 Subject: [PATCH 3/5] fix: refine filetype detection (#3828) **Summary** Fixes a bug where a CSV file with asserted content-type `application/vnd.ms-excel` was incorrectly identified as an XLS file and failed partitioning. **Additional Context** The `content_type` argument to partitioning is often authored by the client system (e.g. Unstructured SDK) and is both unreliable and outside the control of the user. In this case the `.csv -> XLS` mapping is correct for certain purposes (Excel is often used to load and edit CSV files) but not for partitioning, and the user has no readily available way to override the mapping. XLS files as well as seven other common binary file types can be efficiently detected 100% of the time (at least 99.999%) using code we already have in the file detector. - Promote this direct-inspection strategy to be tried first. - When DOC, DOCX, EPUB, ODT, PPT, PPTX, XLS, or XLSX is detected, use that file-type. 
- When one of those types is NOT detected, clear the asserted `content_type` when it matches any of those types. This prevents the problem seen in the bug where the asserted content type was used to determine the file-type. - The remaining content_type, guess MIME-type, and filename-extension mapping strategies are tried, in that order, only when direct inspection fails. This is largely the same as it was before. - Fix #3781 while we were in the neighborhood. - Fix #3596 as well, essentially an earlier report of #3781. --- CHANGELOG.md | 3 +- test_unstructured/file_utils/test_filetype.py | 440 +++--------------- unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 258 +++++----- 4 files changed, 224 insertions(+), 479 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 959f6c581b..d13d859802 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev2 +## 0.16.12-dev3 ### Enhancements @@ -9,6 +9,7 @@ ### Fixes - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. +- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. 
## 0.16.11 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 933882f9e2..c1f7ad1f8d 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -14,15 +14,14 @@ LogCaptureFixture, Mock, example_doc_path, - function_mock, patch, property_mock, ) from unstructured.file_utils.filetype import ( _FileTypeDetectionContext, - _OleFileDifferentiator, + _OleFileDetector, _TextFileDifferentiator, - _ZipFileDifferentiator, + _ZipFileDetector, detect_filetype, is_json_processable, ) @@ -31,7 +30,41 @@ is_in_docker = os.path.exists("/.dockerenv") # ================================================================================================ -# STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL +# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES) +# ================================================================================================ + + +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOC, "simple.doc"), + (FileType.DOCX, "simple.docx"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.ODT, "simple.odt"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.XLS, "tests-example.xls"), + (FileType.XLSX, "stanley-cups.xlsx"), + ], +) +def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direct_inspection( + file_name: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- disable other strategies; no content-type, guessed MIME-type or extension -- + ctx_mime_type_.return_value = None + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + + file_type = detect_filetype(file=file) + + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall back to MIME-type guessing for any of these test cases. 
+ ctx_mime_type_.assert_not_called() + assert file_type == expected_value + + +# ================================================================================================ +# STRATEGY #2 - CONTENT-TYPE ASSERTED IN CALL # ================================================================================================ @@ -40,41 +73,21 @@ [ (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), (FileType.CSV, "stanley-cups.csv", "text/csv"), - (FileType.DOC, "simple.doc", "application/msword"), - ( - FileType.DOCX, - "simple.docx", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), (FileType.EML, "eml/fake-email.eml", "message/rfc822"), - (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), - (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PNG, "img/DA-1p.png", "image/png"), - (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), - ( - FileType.PPTX, - "fake-power-point.pptx", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ), (FileType.RST, "README.rst", "text/x-rst"), (FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), (FileType.TSV, "stanley-cups.tsv", "text/tsv"), (FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"), - (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), - ( - FileType.XLSX, - "stanley-cups.xlsx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), (FileType.XML, "factbook.xml", "application/xml"), (FileType.ZIP, 
"simple.zip", "application/zip"), ], @@ -82,13 +95,13 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type( file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock ): - # -- disable strategy #2, leaving only asserted content-type and extension -- + # -- disable mime-guessing leaving only asserted content-type and extension -- ctx_mime_type_.return_value = None file_type = detect_filetype(example_doc_path(file_name), content_type=content_type) - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall back to strategy 2 for any of these test cases. + # -- Content-type strategy should not need to refer to guessed MIME-type and detection should + # -- not fall back to MIME-type guessing for any of these test cases. ctx_mime_type_.assert_not_called() assert file_type == expected_value @@ -98,41 +111,21 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte [ (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), (FileType.CSV, "stanley-cups.csv", "text/csv"), - (FileType.DOC, "simple.doc", "application/msword"), - ( - FileType.DOCX, - "simple.docx", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), (FileType.EML, "eml/fake-email.eml", "message/rfc822"), - (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), - (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PNG, "img/DA-1p.png", "image/png"), - (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), - ( - FileType.PPTX, -
"fake-power-point.pptx", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ), (FileType.RST, "README.rst", "text/x-rst"), (FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), (FileType.TSV, "stanley-cups.tsv", "text/tsv"), (FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"), - (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), - ( - FileType.XLSX, - "stanley-cups.xlsx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), (FileType.XML, "factbook.xml", "application/xml"), (FileType.ZIP, "simple.zip", "application/zip"), ], @@ -140,93 +133,22 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type( file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock ): - # -- disable strategy #2 (guessed mime-type) -- - ctx_mime_type_.return_value = None - # -- disable strategy #3 (filename extension) by supplying no source of file name -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - file_type = detect_filetype(file=file, content_type=content_type) - - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall-back to strategy 2 for any of these test cases. 
- ctx_mime_type_.assert_not_called() - assert file_type is expected_value - - -@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOCX, "simple.docx"), - (FileType.PPTX, "fake-power-point.pptx"), - (FileType.XLSX, "stanley-cups.xlsx"), - ], -) -@pytest.mark.parametrize( - "content_type", - [ - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ], -) -def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type( - file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock -): - # -- disable strategies 2 & 3, content-type strategy should get this on its own -- + # -- disable mime-guessing -- ctx_mime_type_.return_value = None + # -- disable filename extension mapping by supplying no source of file name -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) file_type = detect_filetype(file=file, content_type=content_type) - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall-back to strategy 2 for any of these test cases. - ctx_mime_type_.assert_not_called() - assert file_type is expected_value - - -@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOC, "simple.doc"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.XLS, "tests-example.xls"), - ], -) -@pytest.mark.parametrize( - "content_type", - [ - "application/msword", - "application/vnd.ms-outlook", - "application/vnd.ms-powerpoint", - "application/vnd.ms-excel", - "anything/else", - ], -) -def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type( - file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock -): - """Fixes wrong XLS asserted as DOC, PPT, etc. 
- - Asserted content-type can be anything except `None` and differentiator will fix it if the file - is DOC, PPT, or XLS type. - """ - # -- disable strategies 2 & 3, content-type strategy should get this on its own -- - ctx_mime_type_.return_value = None - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - file_type = detect_filetype(file=file, content_type=content_type) - - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall-back to strategy 2 for any of these test cases. + # -- Content-type strategy should not need to refer to guessed MIME-type and detection should + # -- not fall back to MIME-type guessing for any of these test cases. ctx_mime_type_.assert_not_called() assert file_type is expected_value # ================================================================================================ -# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC +# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY # ================================================================================================ @@ -237,31 +159,16 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ (FileType.CSV, "stanley-cups.csv", "text/csv"), (FileType.CSV, "stanley-cups.csv", "application/csv"), (FileType.CSV, "stanley-cups.csv", "application/x-csv"), - (FileType.DOC, "simple.doc", "application/msword"), - ( - FileType.DOCX, - "simple.docx", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), (FileType.EML, "eml/fake-email.eml", "message/rfc822"), - (FileType.EPUB, "winter-sports.epub", "application/epub"), - (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), (FileType.MD,
"README.md", "text/x-markdown"), - (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PNG, "img/DA-1p.png", "image/png"), - (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), - ( - FileType.PPTX, - "fake-power-point.pptx", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ), (FileType.RST, "README.rst", "text/x-rst"), (FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.RTF, "fake-doc.rtf", "application/rtf"), @@ -270,18 +177,11 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ (FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.TXT, "simple.yaml", "text/yaml"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"), - (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), - ( - FileType.XLSX, - "stanley-cups.xlsx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), (FileType.XML, "factbook.xml", "application/xml"), (FileType.XML, "factbook.xml", "text/xml"), - (FileType.ZIP, "simple.zip", "application/zip"), ], ) -def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type( +def test_it_detects_correct_file_type_by_guessed_MIME_when_libmagic_guesses_recognized_mime_type( file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock ): # -- libmagic guesses a MIME-type mapped to a `FileType` -- @@ -290,7 +190,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) - # -- disable strategy #1 by not asserting a content_type in the call -- + # -- disable content-type strategy by not asserting a content_type in the call -- file_type = detect_filetype(file=file) # -- ctx.mime_type may be referenced multiple times, but at least once -- @@ -303,30 
+203,22 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec [ (FileType.BMP, "img/bmp_24.bmp"), (FileType.CSV, "stanley-cups.csv"), - (FileType.DOC, "simple.doc"), - (FileType.DOCX, "simple.docx"), (FileType.EML, "eml/fake-email.eml"), - (FileType.EPUB, "winter-sports.epub"), (FileType.HEIC, "img/DA-1p.heic"), (FileType.HTML, "ideas-page.html"), (FileType.JPG, "img/example.jpg"), (FileType.JSON, "spring-weather.html.json"), - (FileType.ODT, "simple.odt"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.PPTX, "fake-power-point.pptx"), (FileType.RTF, "fake-doc.rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.TXT, "norwich-city.txt"), (FileType.WAV, "CantinaBand3.wav"), - (FileType.XLS, "tests-example.xls"), - (FileType.XLSX, "stanley-cups.xlsx"), (FileType.XML, "factbook.xml"), (FileType.ZIP, "simple.zip"), ], ) -def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself( +def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mime_type_for_itself( file_name: str, expected_value: FileType ): """Does not work for all types, in particular: @@ -339,90 +231,26 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_ - ORG is identified as TXT - RST is identified as TXT """ - # -- disable strategy #1 by not asserting a content_type in the call -- - # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + # -- disable content-type strategy by not asserting a content_type in the call -- + # -- disable extension-mapping strategy by passing file-like object with no `.name` attribute -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) assert detect_filetype(file=file) is expected_value -@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOC, "simple.doc"), 
- (FileType.PPT, "fake-power-point.ppt"), - (FileType.XLS, "tests-example.xls"), - ], -) -@pytest.mark.parametrize( - "guessed_mime_type", - [ - "application/msword", - "application/vnd.ms-excel", - "application/vnd.ms-outlook", - "application/vnd.ms-powerpoint", - "application/x-ole-storage", - "anything/else", - ], -) -def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type( - file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock -): - """Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc. - - It's better than that actually, the OLE differentiator will get the right file-type for any DOC, - PPT, XLS, or MSG file, regardless of guessed MIME-type. - """ - ctx_mime_type_.return_value = guessed_mime_type - # -- disable strategy 3 by not providing a file-name source -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - # -- disable strategy 1 by not asserting a content-type -- - file_type = detect_filetype(file=file) - - ctx_mime_type_.assert_called_with() - assert file_type is expected_value - - -@pytest.mark.parametrize( - ("filename", "mime_type", "expected"), - [ - ("fake.doc", "application/vnd.ms-excel", FileType.DOC), - ("fake-power-point.ppt", "application/vnd.ms-excel", FileType.PPT), - ("tests-example.xls", "application/msword", FileType.XLS), - ("fake-email.msg", "application/vnd.ms-excel", FileType.MSG), - ], -) -def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, expected): - def _guess_mime(*args, **kwargs): - return mime_type - - with patch("filetype.guess_mime", _guess_mime): - detect_filetype(example_doc_path(filename)) == expected - - @pytest.mark.parametrize( ("expected_value", "file_name"), [ # -- `filetype` lib recognizes all these binary file-types -- (FileType.BMP, "img/bmp_24.bmp"), - (FileType.DOC, "simple.doc"), - (FileType.DOCX, "simple.docx"), - (FileType.EPUB, "winter-sports.epub"), 
(FileType.HEIC, "img/DA-1p.heic"), (FileType.JPG, "img/example.jpg"), - (FileType.ODT, "simple.odt"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.PPTX, "fake-power-point.pptx"), (FileType.RTF, "fake-doc.rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.WAV, "CantinaBand3.wav"), - (FileType.XLS, "tests-example.xls"), - (FileType.XLSX, "stanley-cups.xlsx"), (FileType.ZIP, "simple.zip"), # -- but it doesn't recognize textual file-types at all -- (FileType.UNK, "stanley-cups.csv"), @@ -435,11 +263,9 @@ def _guess_mime(*args, **kwargs): (FileType.UNK, "stanley-cups.tsv"), (FileType.UNK, "norwich-city.txt"), (FileType.UNK, "factbook.xml"), - # -- and it doesn't recognize MSG files -- - (FileType.UNK, "fake-email.msg"), ], ) -def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable( +def test_strategy_mime_guessing_can_detect_only_binary_file_types_when_libmagic_is_unavailable( file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool ): """File-type is detected using `filetype` library when libmagic is not available. @@ -447,7 +273,7 @@ def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailab `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office), but doesn't even try to guess textual file-types. 
""" - # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + # -- disable detection by extension by passing file-like object with no `.name` attribute -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) # -- simulate libmagic is not available -- @@ -470,7 +296,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( # ================================================================================================ -# STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE +# STRATEGY #4 - MAP FILENAME EXTENSION TO FILETYPE # ================================================================================================ @@ -479,35 +305,25 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( [ (FileType.BMP, "img/bmp_24.bmp"), (FileType.CSV, "stanley-cups.csv"), - (FileType.DOC, "simple.doc"), - (FileType.DOCX, "simple.docx"), (FileType.EML, "eml/fake-email.eml"), - (FileType.EPUB, "winter-sports.epub"), (FileType.HEIC, "img/DA-1p.heic"), (FileType.HTML, "example-10k-1p.html"), (FileType.JPG, "img/example.jpg"), (FileType.JSON, "spring-weather.html.json"), (FileType.MD, "README.md"), - (FileType.MSG, "fake-email.msg"), - (FileType.ODT, "simple.odt"), (FileType.ORG, "README.org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.PPTX, "fake-power-point.pptx"), (FileType.RST, "README.rst"), (FileType.RTF, "fake-doc.rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.TSV, "stanley-cups.tsv"), (FileType.TXT, "norwich-city.txt"), (FileType.WAV, "CantinaBand3.wav"), - (FileType.XLS, "tests-example.xls"), - (FileType.XLSX, "stanley-cups.xlsx"), (FileType.XML, "factbook.xml"), - (FileType.ZIP, "simple.zip"), ], ) -def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type( +def 
test_it_detects_correct_file_type_from_extension_when_that_maps_to_a_file_type( file_name: str, expected_value: FileType, ctx_mime_type_: Mock ): # -- disable strategy #2 by making libmagic always guess `None` -- @@ -525,10 +341,8 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil @pytest.mark.parametrize( ("expected_value", "file_name", "mime_type"), [ - (FileType.BMP, "img/bmp_24.bmp", "application/zip"), - (FileType.DOC, "simple.doc", None), - (FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"), - (FileType.MSG, "fake-email.msg", "application/octet-stream"), + (FileType.BMP, "img/bmp_24.bmp", "application/octet-stream"), + (FileType.HEIC, "img/DA-1p.heic", "application/octet-stream"), ], ) def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( @@ -547,6 +361,12 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( # ================================================================================================ +@pytest.mark.parametrize("mime_type", [FileType.XLS.mime_type, FileType.XLSX.mime_type]) +def test_it_ignores_asserted_XLS_content_type_when_file_is_CSV(mime_type: str): + file_path = example_doc_path("stanley-cups.csv") + assert detect_filetype(file_path, content_type=mime_type) == FileType.CSV + + @pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"]) @pytest.mark.parametrize("extension", [".html", ".htm"]) def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( @@ -563,39 +383,6 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi assert file_type is FileType.HTML -@pytest.mark.parametrize( - "mime_type", - [ - "application/octet-stream", - "application/zip", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ], -) 
-@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOCX, "simple.docx"), - (FileType.PPTX, "fake-power-point.pptx"), - (FileType.XLSX, "stanley-cups.xlsx"), - (FileType.ZIP, "simple.zip"), - ], -) -def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office( - mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock -): - ctx_mime_type_.return_value = mime_type - # -- disable extension-based strategy #3 -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - file_type = detect_filetype(file=file) - - ctx_mime_type_.assert_called_with() - assert file_type is expected_value - - @pytest.mark.parametrize( ("mime_type", "file_name"), [ @@ -1000,29 +787,8 @@ def mime_type_prop_(self, request: FixtureRequest): return property_mock(request, _FileTypeDetectionContext, "mime_type") -class Describe_OleFileDifferentiator: - """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`.""" - - # -- .applies() --------------------------------------------- - - def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): - """The constructor determines whether this differentiator is applicable. - - It returns an instance only when differentiating a CFBF file-type is required, which it - judges by inspecting the initial bytes of the file for the CFBF magic-bytes. 
- """ - ctx = _FileTypeDetectionContext(example_doc_path("simple.doc")) - - differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar") - - assert differentiator is not None - assert isinstance(differentiator, _OleFileDifferentiator) - - def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self): - ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub")) - assert _OleFileDifferentiator.applies(ctx, "application/epub") is None - - # -- .file_type --------------------------------------------- +class Describe_OleFileDetector: + """Unit-test suite for `unstructured.file_utils.filetype._OleFileDetector`.""" @pytest.mark.parametrize( ("file_name", "expected_value"), @@ -1034,59 +800,15 @@ def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_t ("README.org", None), ], ) - def it_distinguishes_the_file_type_of_applicable_OLE_files( + def it_distinguishes_the_file_type_of_applicable_CFB_files( self, file_name: str, expected_value: FileType | None ): # -- no file-name available, just to make sure we're not relying on an extension -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) ctx = _FileTypeDetectionContext(file=file) - differentiator = _OleFileDifferentiator(ctx) - assert differentiator.file_type is expected_value - - @pytest.mark.parametrize( - ("file_name", "expected_value"), - [ - ("simple.doc", FileType.DOC), - ("fake-power-point.ppt", FileType.PPT), - ("tests-example.xls", FileType.XLS), - ("fake-email.msg", FileType.MSG), - ], - ) - def it_distinguishes_the_file_type_of_applicable_OLE_files_from_storage_content( - self, file_name: str, expected_value: FileType | None - ): - # -- no file-name available, just to make sure we're not relying on an extension -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - ctx = _FileTypeDetectionContext(file=file) - differentiator = _OleFileDifferentiator(ctx) - - assert 
differentiator._check_ole_file_type(ctx) is expected_value - - def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime( - self, guess_mime_: Mock - ): - guess_mime_.return_value = None - # -- no file-name available, just to make sure we're not relying on an extension -- - with open(example_doc_path("fake-email.msg"), "rb") as f: - file = io.BytesIO(f.read()) - ctx = _FileTypeDetectionContext(file=file) - differentiator = _OleFileDifferentiator(ctx) - # -- force method to return None to trigger the mime type being guessed - differentiator._check_ole_file_type = lambda ctx: None - - file_type = differentiator.file_type - - guess_mime_.assert_called_once_with(file) - assert file_type is None - - # -- fixtures -------------------------------------------------------------------------------- - - @pytest.fixture - def guess_mime_(self, request: FixtureRequest): - return function_mock(request, "unstructured.file_utils.filetype.ft.guess_mime") + assert _OleFileDetector.file_type(ctx) is expected_value class Describe_TextFileDifferentiator: @@ -1164,33 +886,15 @@ def it_distinguishes_a_JSON_file_from_other_text_files( assert differentiator._is_json is expected_value -class Describe_ZipFileDifferentiator: - """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDifferentiator`.""" - - # -- .applies() --------------------------------------------- - - def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): - """The constructor determines whether this differentiator is applicable. - - It returns an instance only when differentiating a zip file-type is required, which it can - judge from the mime-type provided by the context (`ctx`). 
- """ - ctx = _FileTypeDetectionContext(example_doc_path("simple.docx")) - - differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip") - - assert isinstance(differentiator, _ZipFileDifferentiator) - - def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self): - ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt")) - assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None - - # -- .file_type --------------------------------------------- +class Describe_ZipFileDetector: + """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDetector`.""" @pytest.mark.parametrize( ("file_name", "expected_value"), [ ("simple.docx", FileType.DOCX), + ("winter-sports.epub", FileType.EPUB), + ("simple.odt", FileType.ODT), ("picture.pptx", FileType.PPTX), ("vodafone.xlsx", FileType.XLSX), ("simple.zip", FileType.ZIP), @@ -1201,6 +905,4 @@ def it_distinguishes_the_file_type_of_applicable_zip_files( self, file_name: str, expected_value: FileType | None ): ctx = _FileTypeDetectionContext(example_doc_path(file_name)) - differentiator = _ZipFileDifferentiator(ctx) - - assert differentiator.file_type is expected_value + assert _ZipFileDetector.file_type(ctx) is expected_value diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 0dbfa1eb73..d1e3d3bd18 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12-dev2" # pragma: no cover +__version__ = "0.16.12-dev3" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index d109cd7384..4c8e4d2be8 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -51,7 +51,11 @@ from unstructured.partition.common.metadata import set_element_hierarchy from unstructured.utils import get_call_args_applying_defaults, lazyproperty -LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) +try: + 
importlib.import_module("magic") + LIBMAGIC_AVAILABLE = True +except ImportError: + LIBMAGIC_AVAILABLE = False # pyright: ignore[reportConstantRedefinition] def detect_filetype( @@ -133,43 +137,57 @@ def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType: @property def _file_type(self) -> FileType: """FileType member corresponding to this document source.""" - # -- strategy 1: use content-type asserted by caller -- + # -- An explicit content-type most commonly asserted by the client/SDK and is therefore + # -- inherently unreliable. On the other hand, binary file-types can be detected with 100% + # -- accuracy. So start with binary types and only then consider an asserted content-type, + # -- generally as a last resort. + + # -- strategy 1: most binary types can be detected with 100% accuracy -- + if file_type := self._known_binary_file_type: + return file_type + + # -- strategy 2: use content-type asserted by caller -- if file_type := self._file_type_from_content_type: return file_type - # -- strategy 2: guess MIME-type using libmagic and use that -- + # -- strategy 3: guess MIME-type using libmagic and use that -- if file_type := self._file_type_from_guessed_mime_type: return file_type - # -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX -- + # -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX -- if file_type := self._file_type_from_file_extension: return file_type - # -- strategy 4: give up and report FileType.UNK -- + # -- strategy 5: give up and report FileType.UNK -- return FileType.UNK # == STRATEGIES ============================================================ + @property + def _known_binary_file_type(self) -> FileType | None: + """Detect file-type for binary types we can positively detect.""" + if file_type := _OleFileDetector.file_type(self._ctx): + return file_type + + self._ctx.rule_out_cfb_content_types() + + if file_type := _ZipFileDetector.file_type(self._ctx): + return file_type + + 
self._ctx.rule_out_zip_content_types() + + return None + @property def _file_type_from_content_type(self) -> FileType | None: """Map passed content-type argument to a file-type, subject to certain rules.""" - content_type = self._ctx.content_type # -- when no content-type was asserted by caller, this strategy is not applicable -- - if not content_type: + if not self._ctx.content_type: return None - # -- OLE-based file-format content_type values are sometimes unreliable. These are - # -- DOC, PPT, XLS, and MSG. - if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type): - return differentiator.file_type - - # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable -- - if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type): - return differentiator.file_type - # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it -- - return FileType.from_mime_type(content_type) + return FileType.from_mime_type(self._ctx.content_type) @property def _file_type_from_guessed_mime_type(self) -> FileType | None: @@ -188,24 +206,12 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None: if mime_type is None: return None - if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type): - return differentiator.file_type - if mime_type.endswith("xml"): return FileType.HTML if extension in (".html", ".htm") else FileType.XML if differentiator := _TextFileDifferentiator.applies(self._ctx): return differentiator.file_type - # -- applicable to "application/octet-stream", "application/zip", and all Office 2007+ - # -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT - # -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and - # -- octet-stream MIME-types are fed in because they are ambiguous. 
The MS-Office types are - # -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type - # -- is actually a PPTX file etc. - if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type): - return differentiator.file_type - # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment -- if self._ctx.has_code_mime_type: return FileType.TXT @@ -214,14 +220,8 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None: return FileType.EMPTY # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present -- - if file_type := FileType.from_mime_type(mime_type): - return file_type - - logger.warning( - f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is" - f" {mime_type!r}. This file type is not currently supported in unstructured.", - ) - return None + file_type = FileType.from_mime_type(mime_type) + return file_type if file_type != FileType.UNK else None @lazyproperty def _file_type_from_file_extension(self) -> FileType | None: @@ -236,6 +236,9 @@ def _file_type_from_file_extension(self) -> FileType | None: class _FileTypeDetectionContext: """Provides all arguments to auto-file detection and values derived from them. + NOTE that `._content_type` is mutable via `.rule_out_*_content_types()` methods, so it should + not be assumed to be a constant value across those calls. + This keeps computation of derived values out of the file-detection code but more importantly allows the main filetype-detector to pass the full context to any delegates without coupling itself to which values it might need. @@ -276,7 +279,7 @@ def new( self._validate() return self - @lazyproperty + @property def content_type(self) -> str | None: """MIME-type asserted by caller; not based on inspection of file by this process. @@ -284,6 +287,8 @@ def content_type(self) -> str | None: present on the response. 
These are often ambiguous and sometimes just wrong so get some further verification. All lower-case when not `None`. """ + # -- Note `._content_type` is mutable via `.rule_out_*_content_types()` so this cannot be a + # -- `@lazyproperty`. return self._content_type.lower() if self._content_type else None @lazyproperty @@ -327,12 +332,6 @@ def file_path(self) -> str | None: return os.path.realpath(file_path) if os.path.islink(file_path) else file_path - @lazyproperty - def is_zipfile(self) -> bool: - """True when file is a Zip archive.""" - with self.open() as file: - return zipfile.is_zipfile(file) - @lazyproperty def has_code_mime_type(self) -> bool: """True when `mime_type` plausibly indicates a programming language source-code file.""" @@ -347,9 +346,27 @@ def has_code_mime_type(self) -> bool: return any( lang in mime_type - for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split() + for lang in [ + "c#", + "c++", + "cpp", + "csharp", + "java", + "javascript", + "php", + "python", + "ruby", + "swift", + "typescript", + ] ) + @lazyproperty + def is_zipfile(self) -> bool: + """True when file is a Zip archive.""" + with self.open() as file: + return zipfile.is_zipfile(file) + @lazyproperty def mime_type(self) -> str | None: """The best MIME-type we can get from `magic` (or `filetype` package). @@ -401,6 +418,38 @@ def open(self) -> Iterator[IO[bytes]]: file.seek(0) yield file + def rule_out_cfb_content_types(self) -> None: + """Invalidate content-type when a legacy MS-Office file-type is asserted. + + Used before returning `None`; at that point we know the file is not one of these formats + so if the asserted `content-type` is a legacy MS-Office type we know it's wrong and should + not be used as a fallback later in the detection process.
+ """ + if FileType.from_mime_type(self._content_type) in ( + FileType.DOC, + FileType.MSG, + FileType.PPT, + FileType.XLS, + ): + self._content_type = None + + def rule_out_zip_content_types(self) -> None: + """Invalidate content-type when a Zip-based file-type is asserted. + + Used before returning `None`; at that point we know the file is not one of these formats + so if the asserted `content-type` is a Zip-based type we know it's wrong and should + not be used as a fallback later in the detection process. + """ + if FileType.from_mime_type(self._content_type) in ( + FileType.DOCX, + FileType.EPUB, + FileType.ODT, + FileType.PPTX, + FileType.XLSX, + FileType.ZIP, + ): + self._content_type = None + @lazyproperty def text_head(self) -> str: """The initial characters of the text file for use with text-format differentiation. @@ -440,27 +489,23 @@ def _validate(self) -> None: raise ValueError("either `file_path` or `file` argument must be provided") -class _OleFileDifferentiator: - """Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be. +class _OleFileDetector: + """Detect and differentiate a CFB file, aka. "OLE" file. - Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office - files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as - `"application/x-ole-storage"` which is true but too not specific enough for partitioning - purposes. + Compound File Binary Format (CFB), aka. OLE file, is used by Microsoft for legacy MS Office + files (DOC, PPT, XLS) as well as for Outlook MSG files.
""" def __init__(self, ctx: _FileTypeDetectionContext): self._ctx = ctx @classmethod - def applies( - cls, ctx: _FileTypeDetectionContext, mime_type: str - ) -> _OleFileDifferentiator | None: - """Constructs an instance, but only if this differentiator applies for `mime_type`.""" - return cls(ctx) if cls._is_ole_file(ctx) else None + def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None: + """Specific file-type when file is a CFB file, `None` otherwise.""" + return cls(ctx)._file_type @property - def file_type(self) -> FileType | None: + def _file_type(self) -> FileType | None: """Differentiated file-type for Microsoft Compound File Binary Format (CFBF). Returns one of: @@ -468,34 +513,27 @@ def file_type(self) -> FileType | None: - `FileType.PPT` - `FileType.XLS` - `FileType.MSG` + - `None` when the file is not one of these. """ - # -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return - # -- `None` to trigger fall-back to next strategy. - if not self._is_ole_file(self._ctx): + # -- all CFB files share common magic number, start with that -- + if not self._is_ole_file: return None - # -- check storage contents of the ole file for file type markers - if (ole_file_type := self._check_ole_file_type(self._ctx)) is not None: + # -- check storage contents of the ole file for file-type specific stream names -- + if (ole_file_type := self._ole_file_type) is not None: return ole_file_type - # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it - # -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always - # -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we - # -- rely on filename-extension to identify those. 
- with self._ctx.open() as file: - mime_type = ft.guess_mime(file) - - return FileType.from_mime_type(mime_type) if mime_type else None + return None - @staticmethod - def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool: - """True when file has CFBF magic first 8 bytes.""" - with ctx.open() as file: + @lazyproperty + def _is_ole_file(self) -> bool: + """True when file has CFB magic first 8 bytes.""" + with self._ctx.open() as file: return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" - @staticmethod - def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None: - with ctx.open() as f: + @lazyproperty + def _ole_file_type(self) -> FileType | None: + with self._ctx.open() as f: ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType] root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType] @@ -537,7 +575,20 @@ def file_type(self) -> FileType: """ extension = self._ctx.extension - if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split(): + if extension in [ + ".csv", + ".eml", + ".html", + ".json", + ".markdown", + ".md", + ".org", + ".p7s", + ".rst", + ".rtf", + ".tab", + ".tsv", + ]: return FileType.from_extension(extension) or FileType.TXT # NOTE(crag): for older versions of the OS libmagic package, such as is currently @@ -616,40 +667,28 @@ def _is_json(self) -> bool: return False -class _ZipFileDifferentiator: - """Refine a Zip-packaged file-type that may be ambiguous or swapped.""" +class _ZipFileDetector: + """Detect and differentiate a Zip-archive file.""" def __init__(self, ctx: _FileTypeDetectionContext): self._ctx = ctx @classmethod - def applies( - cls, ctx: _FileTypeDetectionContext, mime_type: str - ) -> _ZipFileDifferentiator | None: - """Constructs an instance, but only if this differentiator applies for `mime_type`. + def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None: + """Most specific file-type available when file is a Zip file, `None` otherwise. 
+ - Separate `mime_type` argument allows it to be applied to either asserted content-type or - guessed mime-type. + MS-Office 2007+ files (DOCX, PPTX, XLSX) are identified by their main part-name. ODT and + EPUB files are identified by the MIME-type in the archive's root `mimetype` member. A Zip + archive with none of these markers is `FileType.ZIP`; a non-Zip file gives `None`. + """ - return ( - cls(ctx) - if mime_type - in ( - "application/octet-stream", - "application/zip", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ) - else None - ) + return cls(ctx)._file_type @lazyproperty - def file_type(self) -> FileType | None: + def _file_type(self) -> FileType | None: """Differentiated file-type for a Zip archive. - Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`, - `FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise. + Returns `None` when the file is not a Zip archive, otherwise the most specific type found: + DOCX, PPTX, XLSX, ODT or EPUB when one of those applies and `FileType.ZIP` otherwise. """ if not self._ctx.is_zipfile: return None @@ -657,20 +696,23 @@ def file_type(self) -> FileType | None: with self._ctx.open() as file: zip = zipfile.ZipFile(file) - # NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xslx - # extension. If the MIME type is application/octet-stream, we check if it's a - # .docx/.xlsx file by looking for expected filenames within the zip file.
- filenames = [f.filename for f in zip.filelist] + filenames = zip.namelist() - if all(f in filenames for f in ("word/document.xml",)): + if "word/document.xml" in filenames: return FileType.DOCX - if all(f in filenames for f in ("xl/workbook.xml",)): + if "xl/workbook.xml" in filenames: return FileType.XLSX - if all(f in filenames for f in ("ppt/presentation.xml",)): + if "ppt/presentation.xml" in filenames: return FileType.PPTX + # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root -- + if "mimetype" in filenames: + with zip.open("mimetype") as f: + mime_type = f.read().decode("utf-8").strip() + return FileType.from_mime_type(mime_type) + return FileType.ZIP From 9a9bf4c4f587ff4c504bf0eb8682bfacb90082e1 Mon Sep 17 00:00:00 2001 From: Ribhu Lahiri Date: Tue, 17 Dec 2024 08:23:17 +0530 Subject: [PATCH 4/5] Added contributing from archived repo (#3616) Added `CONTRIBUTING.md` from the archived repo as mentioned in the issue: https://github.com/Unstructured-IO/unstructured/issues/3540 Co-authored-by: John <43506685+Coniferish@users.noreply.github.com> --- CONTRIBUTING.md | 134 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..9cade1f91d --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,134 @@ +## Contributing to Unstructured + +[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md) + +👍🎉 First off, thank you for taking the time to contribute! 🎉👍 + +The following is a set of guidelines for contributing to the open source ecosystem of preprocessing pipeline APIs and supporting libraries hosted [here](https://github.com/Unstructured-IO). + +This is meant to help the review process go smoothly, save the reviewer(s) time in catching common issues, and avoid submitting PRs that will be rejected by the CI. 
+ +In some cases it's convenient to put up a PR that's not ready for final review. This is fine (and under those circumstances it's not necessary to go through this checklist), but the PR should be put in draft mode so everyone knows it's not ready for review. + +### How to Contribute? + +If you want to contribute, start working through the Unstructured codebase, navigate to the Github "issues" tab and start looking through interesting issues. If you are not sure of where to start, then start by trying one of the smaller/easier issues here i.e. issues with the "good first issue" label and then take a look at the issues with the "contributions welcome" label. These are issues that we believe are particularly well suited for outside contributions, often because we probably won't get to them right now. If you decide to start on an issue, leave a comment so that other people know that you're working on it. If you want to help out, but not alone, use the issue comment thread to coordinate. + + +## Pull-Request Checklist + +The following is a list of tasks to be completed before submitting a pull request for final review. + +### Before creating PR: + +1. Follow coding best practices + 1. [ ] Make sure all new classes/functions/methods have docstrings. + 1. [ ] Make sure all new functions/methods have type hints (optional for tests). + 1. [ ] Make sure all new functions/methods have associated tests. + 1. [ ] Update `CHANGELOG.md` and `__version__.py` if the core code has changed +

+1. Ensure environment is consistent + 1. [ ] Update dependencies in `.in` files if needed (pay special attention to whether the current PR depends on changes to internal repos that are not packaged - if so the commit needs to be bumped). + 1. [ ] If dependencies have changed, recompile dependencies with `make pip-compile`. + 1. [ ] Make sure local virtual environment matches what CI will see - reinstall internal/external dependencies as needed.\ +Follow the [virtualenv install instructions](https://github.com/Unstructured-IO/community#mac--homebrew) if you are unsure about working with virtual environments. +

+1. Run tests and checks locally + 1. [ ] Run tests locally with `make test`. Some repositories have supplemental tests with targets like `make test-integration` or `make test-sample-docs`. If applicable, run these as well. Try to make sure all tests are passing before submitting the PR, unless you are submitting in draft mode. + 1. [ ] Run typing, linting, and formatting checks with `make check`. Some repositories have supplemental checks with targets like `make check-scripts` or `make check-notebooks`. If applicable, run these as well. Try to make sure all checks are passing before submitting the PR, unless you are submitting in draft mode. +

+1. Ensure code is clean + 1. [ ] Remove all debugging artifacts. + 1. [ ] Remove commented out code. + 1. [ ] For actual comments, note that our typical format is `# NOTE(<username>): <comment>` + 1. [ ] Double check that everything has been committed and pushed; it is recommended that the local feature branch is clean. + +### PR Guidelines: + +1. [ ] PR title should follow [conventional commit](https://www.conventionalcommits.org/en/v1.0.0/) standards. + +1. [ ] PR description should give enough detail that the reviewer knows what they are reviewing - sometimes a copy-paste of the added `CHANGELOG.md` items is enough, sometimes more detail is needed. + +1. [ ] If applicable, add a testing section to the PR description that recommends steps a reviewer can take to verify the changes, e.g. a snippet of code they can run locally. + +### License + +Unstructured open source projects are licensed under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). + +Include a license at the top of new `setup.py` files: + +- [Python license example](https://github.com/Unstructured-IO/unstructured/blob/main/setup.py) + + +## Conventions + +For pull requests, our convention is to squash and merge. For PR titles, we use [conventional commit](https://www.freecodecamp.org/news/how-to-write-better-git-commit-messages/#conventional-commits) messages. The format should look like + +- `<type>: <description>`. + +For example, if the PR addresses a new feature, the PR title should look like: + +- `feat: Implements exciting new feature`. + +For feature branches, the naming convention is: + +- `<username>/<description>`. + +For the commit above, coming from the user called `contributor` the branch name would look like: + +- `contributor/exciting-new-feature`. 
+ +Here is a list of some of the most common possible commit types: + +- `feat` – a new feature is introduced with the changes +- `fix` – a bug fix has occurred +- `chore` – changes that do not relate to a fix or feature and don't modify src or test files (for example updating dependencies) +- `refactor` – refactored code that neither fixes a bug nor adds a feature +- `docs` – updates to documentation such as the README or other markdown files + +### Why should you write better commit messages? + +By writing good commits, you are simply future-proofing yourself. You could save yourself and/or coworkers hours of digging around while troubleshooting by providing that helpful description 🙂. + +The extra time it takes to write a thoughtful commit message as a letter to your potential future self is extremely worthwhile. On large scale projects, documentation is imperative for maintenance. + +Collaboration and communication are of utmost importance within engineering teams. The Git commit message is a prime example of this. I highly suggest setting up a convention for commit messages on your team if you do not already have one in place. + + +## Code of Conduct + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +### Enforcement + +Please report unacceptable behavior to support@unstructured.io. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 
+ +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +Thank you! 🤗 + +The Unstructured Team + + +## Learn more + +| Section | Description | +|-|-| +| [Company Website](https://unstructured.io) | Unstructured.io product and company info | +| [Documentation](https://unstructured-io.github.io/unstructured) | Full API documentation | +| [Working with Pull Requests](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) | About pull requests | +| [Code of Conduct](https://www.contributor-covenant.org/version/1/4/code-of-conduct/) | Contributor Covenant Code Of Conduct | +| [Conventional Commits](https://www.freecodecamp.org/news/how-to-write-better-git-commit-messages/) | How to write better git commit messages | +| [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) | Lightweight convention on top of commit messages | +| [First Contributions](https://github.com/firstcontributions/first-contributions/blob/main/README.md) | Beginners' guide to make their first contribution! | + + +## Contributing Guides + +If you're stumped 😓, here are some good examples of contribution guidelines: + +- The GitHub Docs [contribution guidelines](https://github.com/github/docs/blob/main/CONTRIBUTING.md). +- The Ruby on Rails [contribution guidelines](https://github.com/rails/rails/blob/main/CONTRIBUTING.md). +- The Open Government [contribution guidelines](https://github.com/opengovernment/opengovernment/blob/master/CONTRIBUTING.md). +- The MMOCR [contribution guidelines](https://mmocr.readthedocs.io/en/dev-1.x/notes/contribution_guide.html). +- The HuggingFace [contribution guidelines](https://huggingface2.notion.site/Contribution-Guide-19411c29298644df8e9656af45a7686d). 
From 73f6c3989199061845dcfd7f5d8a183b52c035c9 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 17 Dec 2024 14:55:08 -0500 Subject: [PATCH 5/5] feat: allow disabling OCR in hi_res mode (fixes: #2467) --- .../partition/pdf_image/test_pdf.py | 16 ++++++ unstructured/partition/pdf.py | 50 +++++++++++-------- unstructured/partition/utils/constants.py | 1 + 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 9b1b8de6e1..0eeebe768a 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -602,6 +602,22 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode): assert "Layouts of scanned US newspapers from the 20th century" in table[0] +def test_partition_pdf_hi_res_ocr_mode_none(): + filename = example_doc_path("pdf/layout-parser-paper.pdf") + elements = pdf.partition_pdf( + filename=filename, + ocr_mode="none", + strategy=PartitionStrategy.HI_RES, + # FIXME: table structure still requires OCR for no good reason + infer_table_structure=False, + ) + fast_elements = pdf.partition_pdf( + filename=filename, + strategy=PartitionStrategy.FAST, + ) + assert elements != fast_elements + + def test_partition_pdf_with_copy_protection(): filename = example_doc_path("pdf/copy-protected.pdf") elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f87812d40b..c3f41242a5 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -623,17 +623,20 @@ def _partition_pdf_or_image_local( hi_res_model_name=hi_res_model_name, ) - final_document_layout = process_file_with_ocr( - filename, - merged_document_layout, - extracted_layout=extracted_layout, - is_image=is_image, - infer_table_structure=infer_table_structure, - ocr_languages=ocr_languages, - 
ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ocr_layout_dumper=ocr_layout_dumper, - ) + if ocr_mode == OCRMode.NONE.value: + final_document_layout = merged_document_layout + else: + final_document_layout = process_file_with_ocr( + filename, + merged_document_layout, + extracted_layout=extracted_layout, + is_image=is_image, + infer_table_structure=infer_table_structure, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ocr_layout_dumper=ocr_layout_dumper, + ) else: inferred_document_layout = process_data_with_model( file, @@ -678,17 +681,20 @@ def _partition_pdf_or_image_local( if hasattr(file, "seek"): file.seek(0) - final_document_layout = process_data_with_ocr( - file, - merged_document_layout, - extracted_layout=extracted_layout, - is_image=is_image, - infer_table_structure=infer_table_structure, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ocr_layout_dumper=ocr_layout_dumper, - ) + if ocr_mode == OCRMode.NONE.value: + final_document_layout = merged_document_layout + else: + final_document_layout = process_data_with_ocr( + file, + merged_document_layout, + extracted_layout=extracted_layout, + is_image=is_image, + infer_table_structure=infer_table_structure, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ocr_layout_dumper=ocr_layout_dumper, + ) final_document_layout = clean_pdfminer_inner_elements(final_document_layout) diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index 4b4dadeaa1..225583c39e 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -12,6 +12,7 @@ class Source(Enum): class OCRMode(Enum): INDIVIDUAL_BLOCKS = "individual_blocks" FULL_PAGE = "entire_page" + NONE = "none" class PartitionStrategy: