diff --git a/CHANGELOG.md b/CHANGELOG.md index 14c8afe1f3..959f6c581b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev1 +## 0.16.12-dev2 ### Enhancements @@ -8,6 +8,8 @@ ### Fixes +- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. + ## 0.16.11 ### Enhancements diff --git a/pyproject.toml b/pyproject.toml index ea90b03210..f91637cae6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,9 +12,17 @@ verboseOutput = true [tool.ruff] line-length = 100 +target-version = "py39" -# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` -- -lint.select = [ +[tool.ruff.lint] +ignore = [ + "COM812", # -- over aggressively insists on trailing commas where not desirable -- + "PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) -- + "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- + "PT012", # -- pytest.raises() block should contain a single simple statement -- + "SIM117", # -- merge `with` statements for context managers that have same scope -- +] +select = [ "C4", # -- flake8-comprehensions -- "COM", # -- flake8-commas -- "E", # -- pycodestyle errors -- @@ -29,11 +37,3 @@ lint.select = [ "UP034", # -- Avoid extraneous parentheses -- "W", # -- Warnings, including invalid escape-sequence -- ] -lint.ignore = [ - "COM812", # -- over aggressively insists on trailing commas where not desireable -- - "PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) -- - "PT005", # -- flags mock fixtures with names intentionally matching private method name -- - "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- - "PT012", # -- pytest.raises() block should contain a single simple statement -- - "SIM117", # -- merge `with` statements for context managers that have same scope -- -] diff --git 
a/requirements/test.in b/requirements/test.in index c763c091d1..ca9d2d5bfe 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -11,9 +11,7 @@ mypy pydantic pytest-cov pytest-mock -# NOTE(robison) - we need to do additional cleanup to pass -# linting for the latest version of ruff -ruff<0.5.0 +ruff types-Markdown types-requests types-tabulate diff --git a/requirements/test.txt b/requirements/test.txt index 241f941d87..e3762557da 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -171,7 +171,7 @@ requests==2.32.3 # requests-mock requests-mock==1.12.1 # via label-studio-sdk -ruff==0.4.10 +ruff==0.8.3 # via -r ./test.in semantic-version==2.10.0 # via liccheck diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index cea6b44129..9b1b8de6e1 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -245,12 +245,14 @@ def _test(result): _test(result) else: with open(filename, "rb") as test_file: - spooled_temp_file = SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) - spooled_temp_file.seek(0) - result = pdf.partition_pdf( - file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number - ) + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + result = pdf.partition_pdf( + file=spooled_temp_file, + strategy=strategy, + starting_page_number=starting_page_number, + ) _test(result) @@ -757,14 +759,14 @@ def test_partition_pdf_metadata_date( ) else: with open(filename, "rb") as test_file: - spooled_temp_file = SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) - spooled_temp_file.seek(0) - elements = pdf.partition_pdf( - file=spooled_temp_file, - strategy=strategy, - metadata_last_modified=metadata_last_modified, - ) + with SpooledTemporaryFile() as spooled_temp_file: + 
spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = pdf.partition_pdf( + file=spooled_temp_file, + strategy=strategy, + metadata_last_modified=metadata_last_modified, + ) assert {el.metadata.last_modified for el in elements} == {expected_last_modified} @@ -1131,15 +1133,15 @@ def test_partition_pdf_with_ocr_only_strategy( ) else: with open(filename, "rb") as test_file: - spooled_temp_file = SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) - spooled_temp_file.seek(0) - elements = pdf.partition_pdf( - file=spooled_temp_file, - strategy=PartitionStrategy.OCR_ONLY, - languages=["eng"], - is_image=is_image, - ) + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = pdf.partition_pdf( + file=spooled_temp_file, + strategy=PartitionStrategy.OCR_ONLY, + languages=["eng"], + is_image=is_image, + ) assert elements[0].metadata.languages == ["eng"] # check pages diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index 4b30e03379..1330b4a79a 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -77,14 +77,15 @@ def test_partition_docx_with_spooled_file( `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need to ensure the source file is appropriately converted in this case. 
""" - with open(mock_document_file_path, "rb") as test_file: - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + with open(mock_document_file_path, "rb") as test_file: + spooled_temp_file.write(test_file.read()) spooled_temp_file.seek(0) + elements = partition_docx(file=spooled_temp_file) - assert elements == expected_elements - for element in elements: - assert element.metadata.filename is None + + assert elements == expected_elements + assert all(e.metadata.filename is None for e in elements) def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]): @@ -921,16 +922,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( self, opts_args: dict[str, Any] ): - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(b"abcdefg") - opts_args["file"] = spooled_temp_file - opts = DocxPartitionerOptions(**opts_args) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(b"abcdefg") + opts_args["file"] = spooled_temp_file + opts = DocxPartitionerOptions(**opts_args) - docx_file = opts._docx_file + docx_file = opts._docx_file - assert docx_file is not spooled_temp_file - assert isinstance(docx_file, io.BytesIO) - assert docx_file.getvalue() == b"abcdefg" + assert docx_file is not spooled_temp_file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( self, opts_args: dict[str, Any] diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py index e19d87ff98..afa3ea2bf5 100644 --- a/test_unstructured/partition/test_pptx.py +++ b/test_unstructured/partition/test_pptx.py @@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file(): 
Including one that does not have its read-pointer set to the start. """ - with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file: - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file: + spooled_temp_file.write(test_file.read()) + elements = partition_pptx(file=spooled_temp_file) + assert elements == EXPECTED_PPTX_OUTPUT for element in elements: assert element.metadata.filename is None @@ -701,16 +703,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( self, opts_args: dict[str, Any] ): - spooled_temp_file = tempfile.SpooledTemporaryFile() - spooled_temp_file.write(b"abcdefg") - opts_args["file"] = spooled_temp_file - opts = PptxPartitionerOptions(**opts_args) + with tempfile.SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(b"abcdefg") + opts_args["file"] = spooled_temp_file + opts = PptxPartitionerOptions(**opts_args) - pptx_file = opts.pptx_file + pptx_file = opts.pptx_file - assert pptx_file is not spooled_temp_file - assert isinstance(pptx_file, io.BytesIO) - assert pptx_file.getvalue() == b"abcdefg" + assert pptx_file is not spooled_temp_file + assert isinstance(pptx_file, io.BytesIO) + assert pptx_file.getvalue() == b"abcdefg" def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( self, opts_args: dict[str, Any] diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py index 2e951d321f..6697577a1e 100644 --- a/test_unstructured/partition/test_xlsx.py +++ b/test_unstructured/partition/test_xlsx.py @@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename(): def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji(): - f = tempfile.SpooledTemporaryFile() - with 
open("example-docs/emoji.xlsx", "rb") as g: - f.write(g.read()) - elements = partition_xlsx(file=f, include_header=False) + with tempfile.SpooledTemporaryFile() as f: + with open("example-docs/emoji.xlsx", "rb") as g: + f.write(g.read()) + + elements = partition_xlsx(file=f, include_header=False) + assert sum(isinstance(element, Text) for element in elements) == 1 assert len(elements) == 1 assert clean_extra_whitespace(elements[0].text) == "🤠😅" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 095d056ec7..0dbfa1eb73 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12-dev1" # pragma: no cover +__version__ = "0.16.12-dev2" # pragma: no cover diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index d90e589f8f..c595055ec0 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -53,7 +53,6 @@ def ontology_to_unstructured_elements( """ elements_to_return = [] if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT: - if page_number is None and isinstance(ontology_element, ontology.Page): page_number = ontology_element.page_number @@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool: if any(isinstance(ontology_element, class_) for class_ in text_classes): return True - if any(ontology_element.elementType == category for category in text_categories): - return True - - return False + return any(ontology_element.elementType == category for category in text_categories) def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: @@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: if any(isinstance(ontology_element, class_) for class_ in inline_classes): return True - if any(ontology_element.elementType == category for category in 
inline_categories): - return True - - return False + return any(ontology_element.elementType == category for category in inline_categories) def unstructured_elements_to_ontology( @@ -327,10 +320,7 @@ def is_empty(tag): if tag.attrs: return False - if not tag.get_text(strip=True): - return True - - return False + return bool(not tag.get_text(strip=True)) def remove_empty_tags(soup): for tag in soup.find_all(): @@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag( # Scenario 1: Valid Ontology Element if soup.attrs.get("class"): - html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get( - (soup.name, soup.attrs["class"][0]) + html_tag, element_class = ( + soup.name, + HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])), ) # Scenario 2: HTML tag incorrect, CSS class correct