Unstructured-IO · dhdaines · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 ### Fixes
 
+- **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid), resulting in unnecessary OCR fallback.
+
 ## 0.16.11
 
 ### Enhancements

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1192,8 +1192,8 @@ def test_partition_pdf_with_fast_finds_headers_footers(
 @pytest.mark.parametrize(
     ("filename", "expected_log"),
     [
+        # This one is *actually* an invalid PDF document
         ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
-        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
     ],
 )
 def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
@@ -1202,6 +1202,20 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
     assert expected_log in caplog.text
 
 
+@pytest.mark.parametrize(
+    ("filename", "expected_log"),
+    [
+        # This one is *not* an invalid PDF document; make sure we
+        # don't try to "repair" it unnecessarily
+        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
+    ],
+)
+def test_properly_patch_pdfminer(filename, expected_log, caplog):
+    caplog.set_level(logging.INFO)
+    assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
+    assert expected_log not in caplog.text
+
+
 def assert_element_extraction(
     elements: list[Element],
     extract_image_block_types: list[str],

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -11,7 +11,6 @@
 
 import numpy as np
 import wrapt
-from pdfminer import psparser
 from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename
 from pi_heif import register_heif_opener
@@ -96,15 +95,18 @@
     PartitionStrategy,
 )
 from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
-from unstructured.patches.pdfminer import parse_keyword
+from unstructured.patches.pdfminer import patch_psparser
 from unstructured.utils import first, requires_dependencies
 
 if TYPE_CHECKING:
     pass
 
-# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
-# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
-psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
+
+# Correct a bug that was introduced by a previous patch to
+# pdfminer.six, causing needless and unsuccessful repairing of PDFs
+# which were not actually broken.
+patch_psparser()
+
 
 RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
 

diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -6,7 +6,7 @@
 from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PSSyntaxError
+from pdfminer.psparser import PSSyntaxError
 
 from unstructured.logger import logger
 from unstructured.utils import requires_dependencies

diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py
@@ -1,18 +1,35 @@
-from typing import Union
+import functools
+from typing import Tuple, Union
 
-from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
+import pdfminer
+from pdfminer.psparser import (
+    END_KEYWORD,
+    KWD,
+    PSEOF,
+    PSBaseParser,
+    PSBaseParserToken,
+    PSKeyword,
+    log,
+)
 
+factory_seek = PSBaseParser.seek
 
-def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
-    """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
-    https://github.com/pdfminer/pdfminer.six/pull/885."""
+
+@functools.wraps(PSBaseParser.seek)
+def seek(self: PSBaseParser, pos: int) -> None:
+    factory_seek(self, pos)
+    self.eof = False
+
+
+@functools.wraps(PSBaseParser._parse_keyword)
+def _parse_keyword(self, s: bytes, i: int) -> int:
     m = END_KEYWORD.search(s, i)
-    if not m:
-        j = len(s)
-        self._curtoken += s[i:]
-    else:
+    if m:
         j = m.start(0)
         self._curtoken += s[i:j]
+    else:
+        self._curtoken += s[i:]
+        return len(s)
     if self._curtoken == b"true":
         token: Union[bool, PSKeyword] = True
     elif self._curtoken == b"false":
@@ -22,3 +39,38 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
     self._add_token(token)
     self._parse1 = self._parse_main
     return j
+
+
+@functools.wraps(PSBaseParser.nexttoken)
+def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
+    if self.eof:
+        # EOF was already reached on a previous call; raise the PSEOF that was deferred then
+        raise PSEOF("Unexpected EOF")
+    while not self._tokens:
+        try:
+            self.fillbuf()
+            self.charpos = self._parse1(self.buf, self.charpos)
+        except PSEOF:
+            # If we hit EOF in the middle of a token, try to parse
+            # it by tacking on whitespace, and delay raising PSEOF
+            # until next time around
+            self.charpos = self._parse1(b"\n", 0)
+            self.eof = True
+            # No token could be recovered at EOF, so re-raise the PSEOF immediately
+            if not self._tokens:
+                raise
+    token = self._tokens.pop(0)
+    log.debug("nexttoken: %r", token)
+    return token
+
+
+def patch_psparser():
+    """Monkey-patch certain versions of pdfminer.six to avoid dropping
+    tokens at EOF (before 20231228) and splitting tokens at buffer
+    boundaries (20231228 and 20240706).
+    """
+    # Presuming the bug will be fixed in the next release
+    if pdfminer.__version__ <= "20240706":
+        PSBaseParser.seek = seek
+        PSBaseParser._parse_keyword = _parse_keyword
+        PSBaseParser.nexttoken = nexttoken