build: remove ruff version upper bound (Unstructured-IO#3829)

**Summary** Remove pin on `ruff` linter and fix the handful of lint errors a newer version catches.
dhdaines · Dec 16, 2024 · 10f0d54 · 10f0d54
1 parent b092fb7
commit 10f0d54
Show file tree

Hide file tree

Showing 10 changed files with 81 additions and 83 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.12-dev1
+## 0.16.12-dev2
 
 ### Enhancements
 
@@ -8,6 +8,8 @@
 
 ### Fixes
 
+- **Upgrade ruff to latest.** Previously the ruff version was pinned to `<0.5`. Remove that pin and fix the handful of lint errors reported by the newer version.
+
 ## 0.16.11
 
 ### Enhancements

diff --git a/pyproject.toml b/pyproject.toml
@@ -12,9 +12,17 @@ verboseOutput = true
 
 [tool.ruff]
 line-length = 100
+target-version = "py39"
 
-# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` --
-lint.select = [
+[tool.ruff.lint]
+ignore = [
+    "COM812",   # -- over-aggressively insists on trailing commas where not desirable --
+    "PT001",    # -- wants empty parens on @pytest.fixture where not used (essentially always) --
+    "PT011",    # -- pytest.raises({exc}) too broad, use match param or more specific exception --
+    "PT012",    # -- pytest.raises() block should contain a single simple statement --
+    "SIM117",   # -- merge `with` statements for context managers that have same scope --
+]
+select = [
     "C4",       # -- flake8-comprehensions --
     "COM",      # -- flake8-commas --
     "E",        # -- pycodestyle errors --
@@ -29,11 +37,3 @@ lint.select = [
     "UP034",    # -- Avoid extraneous parentheses --
     "W",        # -- Warnings, including invalid escape-sequence --
 ]
-lint.ignore = [
-    "COM812",   # -- over aggressively insists on trailing commas where not desireable --
-    "PT001",    # -- wants empty parens on @pytest.fixture where not used (essentially always) --
-    "PT005",    # -- flags mock fixtures with names intentionally matching private method name --
-    "PT011",    # -- pytest.raises({exc}) too broad, use match param or more specific exception --
-    "PT012",    # -- pytest.raises() block should contain a single simple statement --
-    "SIM117",   # -- merge `with` statements for context managers that have same scope --
-]
diff --git a/requirements/test.in b/requirements/test.in
@@ -11,9 +11,7 @@ mypy
 pydantic
 pytest-cov
 pytest-mock
-# NOTE(robison) - we need to do additional cleanup to pass
-# linting for the latest version of ruff
-ruff<0.5.0
+ruff
 types-Markdown
 types-requests
 types-tabulate

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -171,7 +171,7 @@ requests==2.32.3
     #   requests-mock
 requests-mock==1.12.1
     # via label-studio-sdk
-ruff==0.4.10
+ruff==0.8.3
     # via -r ./test.in
 semantic-version==2.10.0
     # via liccheck

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -245,12 +245,14 @@ def _test(result):
             _test(result)
     else:
         with open(filename, "rb") as test_file:
-            spooled_temp_file = SpooledTemporaryFile()
-            spooled_temp_file.write(test_file.read())
-            spooled_temp_file.seek(0)
-            result = pdf.partition_pdf(
-                file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
-            )
+            with SpooledTemporaryFile() as spooled_temp_file:
+                spooled_temp_file.write(test_file.read())
+                spooled_temp_file.seek(0)
+                result = pdf.partition_pdf(
+                    file=spooled_temp_file,
+                    strategy=strategy,
+                    starting_page_number=starting_page_number,
+                )
             _test(result)
 
 
@@ -757,14 +759,14 @@ def test_partition_pdf_metadata_date(
             )
     else:
         with open(filename, "rb") as test_file:
-            spooled_temp_file = SpooledTemporaryFile()
-            spooled_temp_file.write(test_file.read())
-            spooled_temp_file.seek(0)
-            elements = pdf.partition_pdf(
-                file=spooled_temp_file,
-                strategy=strategy,
-                metadata_last_modified=metadata_last_modified,
-            )
+            with SpooledTemporaryFile() as spooled_temp_file:
+                spooled_temp_file.write(test_file.read())
+                spooled_temp_file.seek(0)
+                elements = pdf.partition_pdf(
+                    file=spooled_temp_file,
+                    strategy=strategy,
+                    metadata_last_modified=metadata_last_modified,
+                )
 
     assert {el.metadata.last_modified for el in elements} == {expected_last_modified}
 
@@ -1131,15 +1133,15 @@ def test_partition_pdf_with_ocr_only_strategy(
             )
     else:
         with open(filename, "rb") as test_file:
-            spooled_temp_file = SpooledTemporaryFile()
-            spooled_temp_file.write(test_file.read())
-            spooled_temp_file.seek(0)
-            elements = pdf.partition_pdf(
-                file=spooled_temp_file,
-                strategy=PartitionStrategy.OCR_ONLY,
-                languages=["eng"],
-                is_image=is_image,
-            )
+            with SpooledTemporaryFile() as spooled_temp_file:
+                spooled_temp_file.write(test_file.read())
+                spooled_temp_file.seek(0)
+                elements = pdf.partition_pdf(
+                    file=spooled_temp_file,
+                    strategy=PartitionStrategy.OCR_ONLY,
+                    languages=["eng"],
+                    is_image=is_image,
+                )
 
     assert elements[0].metadata.languages == ["eng"]
     # check pages

diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py
@@ -77,14 +77,15 @@ def test_partition_docx_with_spooled_file(
     `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
     to ensure the source file is appropriately converted in this case.
     """
-    with open(mock_document_file_path, "rb") as test_file:
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(test_file.read())
+    with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+        with open(mock_document_file_path, "rb") as test_file:
+            spooled_temp_file.write(test_file.read())
         spooled_temp_file.seek(0)
+
         elements = partition_docx(file=spooled_temp_file)
-        assert elements == expected_elements
-        for element in elements:
-            assert element.metadata.filename is None
+
+    assert elements == expected_elements
+    assert all(e.metadata.filename is None for e in elements)
 
 
 def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]):
@@ -921,16 +922,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
     def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
         self, opts_args: dict[str, Any]
     ):
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(b"abcdefg")
-        opts_args["file"] = spooled_temp_file
-        opts = DocxPartitionerOptions(**opts_args)
+        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+            spooled_temp_file.write(b"abcdefg")
+            opts_args["file"] = spooled_temp_file
+            opts = DocxPartitionerOptions(**opts_args)
 
-        docx_file = opts._docx_file
+            docx_file = opts._docx_file
 
-        assert docx_file is not spooled_temp_file
-        assert isinstance(docx_file, io.BytesIO)
-        assert docx_file.getvalue() == b"abcdefg"
+            assert docx_file is not spooled_temp_file
+            assert isinstance(docx_file, io.BytesIO)
+            assert docx_file.getvalue() == b"abcdefg"
 
     def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
         self, opts_args: dict[str, Any]

diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py
@@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file():
 
     Including one that does not have its read-pointer set to the start.
     """
-    with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(test_file.read())
+    with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+        with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
+            spooled_temp_file.write(test_file.read())
+
         elements = partition_pptx(file=spooled_temp_file)
+
         assert elements == EXPECTED_PPTX_OUTPUT
         for element in elements:
             assert element.metadata.filename is None
@@ -701,16 +703,16 @@ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
     def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
         self, opts_args: dict[str, Any]
     ):
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(b"abcdefg")
-        opts_args["file"] = spooled_temp_file
-        opts = PptxPartitionerOptions(**opts_args)
+        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+            spooled_temp_file.write(b"abcdefg")
+            opts_args["file"] = spooled_temp_file
+            opts = PptxPartitionerOptions(**opts_args)
 
-        pptx_file = opts.pptx_file
+            pptx_file = opts.pptx_file
 
-        assert pptx_file is not spooled_temp_file
-        assert isinstance(pptx_file, io.BytesIO)
-        assert pptx_file.getvalue() == b"abcdefg"
+            assert pptx_file is not spooled_temp_file
+            assert isinstance(pptx_file, io.BytesIO)
+            assert pptx_file.getvalue() == b"abcdefg"
 
     def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
         self, opts_args: dict[str, Any]

diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py
@@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename():
 
 
 def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji():
-    f = tempfile.SpooledTemporaryFile()
-    with open("example-docs/emoji.xlsx", "rb") as g:
-        f.write(g.read())
-    elements = partition_xlsx(file=f, include_header=False)
+    with tempfile.SpooledTemporaryFile() as f:
+        with open("example-docs/emoji.xlsx", "rb") as g:
+            f.write(g.read())
+
+        elements = partition_xlsx(file=f, include_header=False)
+
     assert sum(isinstance(element, Text) for element in elements) == 1
     assert len(elements) == 1
     assert clean_extra_whitespace(elements[0].text) == "🤠😅"

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.12-dev1"  # pragma: no cover
+__version__ = "0.16.12-dev2"  # pragma: no cover
diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
@@ -53,7 +53,6 @@ def ontology_to_unstructured_elements(
     """
     elements_to_return = []
     if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
-
         if page_number is None and isinstance(ontology_element, ontology.Page):
             page_number = ontology_element.page_number
 
@@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
     if any(isinstance(ontology_element, class_) for class_ in text_classes):
         return True
 
-    if any(ontology_element.elementType == category for category in text_categories):
-        return True
-
-    return False
+    return any(ontology_element.elementType == category for category in text_categories)
 
 
 def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
@@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
     if any(isinstance(ontology_element, class_) for class_ in inline_classes):
         return True
 
-    if any(ontology_element.elementType == category for category in inline_categories):
-        return True
-
-    return False
+    return any(ontology_element.elementType == category for category in inline_categories)
 
 
 def unstructured_elements_to_ontology(
@@ -327,10 +320,7 @@ def is_empty(tag):
         if tag.attrs:
             return False
 
-        if not tag.get_text(strip=True):
-            return True
-
-        return False
+        return bool(not tag.get_text(strip=True))
 
     def remove_empty_tags(soup):
         for tag in soup.find_all():
@@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag(
 
     # Scenario 1: Valid Ontology Element
     if soup.attrs.get("class"):
-        html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get(
-            (soup.name, soup.attrs["class"][0])
+        html_tag, element_class = (
+            soup.name,
+            HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])),
         )
 
     # Scenario 2: HTML tag incorrect, CSS class correct
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
1		-__version__ = "0.16.12-dev1"  # pragma: no cover
	1	+__version__ = "0.16.12-dev2"  # pragma: no cover