Skip to content

Commit

Permalink
fix: correctly patch EOF handling in pdfminer (fixes: Unstructured-IO…
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Dec 12, 2024
1 parent 3b718ec commit e0f464a
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 15 deletions.
15 changes: 14 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1192,15 +1192,28 @@ def test_partition_pdf_with_fast_finds_headers_footers(
@pytest.mark.parametrize(
("filename", "expected_log"),
[
# This one is *actually* an invalid PDF document
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
assert expected_log in caplog.text

@pytest.mark.parametrize(
("filename", "expected_log"),
[
# This one is *not* an invalid PDF document, make sure we
# don't try to "repair" it unnecessarily
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
],
)
def test_properly_patch_pdfminer(filename, expected_log, caplog):
caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
assert expected_log not in caplog.text


def assert_element_extraction(
elements: list[Element],
Expand Down
7 changes: 2 additions & 5 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import numpy as np
import wrapt
from pdfminer import psparser
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
from pdfminer.utils import open_filename
from pi_heif import register_heif_opener
Expand Down Expand Up @@ -96,16 +95,14 @@
PartitionStrategy,
)
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
from unstructured.patches.pdfminer import parse_keyword
from unstructured.utils import first, requires_dependencies
from unstructured.patches.pdfminer import patch_psparser

if TYPE_CHECKING:
pass

# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore

patch_psparser()
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)


Expand Down
68 changes: 59 additions & 9 deletions unstructured/patches/pdfminer.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,35 @@
from typing import Union
from typing import Union, Tuple

from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
import functools
import pdfminer
from pdfminer.psparser import (
PSBaseParser,
KWD,
PSBaseParserToken,
PSEOF,
END_KEYWORD,
PSKeyword,
log,
)

factory_seek = PSBaseParser.seek

def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
"""Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
https://github.com/pdfminer/pdfminer.six/pull/885."""

@functools.wraps(PSBaseParser.seek)
def seek(self: PSBaseParser, pos: int) -> None:
factory_seek(self, pos)
self.eof = False


@functools.wraps(PSBaseParser._parse_keyword)
def _parse_keyword(self, s: bytes, i: int) -> int:
m = END_KEYWORD.search(s, i)
if not m:
j = len(s)
self._curtoken += s[i:]
else:
if m:
j = m.start(0)
self._curtoken += s[i:j]
else:
self._curtoken += s[i:]
return len(s)
if self._curtoken == b"true":
token: Union[bool, PSKeyword] = True
elif self._curtoken == b"false":
Expand All @@ -22,3 +39,36 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
self._add_token(token)
self._parse1 = self._parse_main
return j


@functools.wraps(PSBaseParser.nexttoken)
def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
if self.eof:
# It's not really unexpected, come on now...
raise PSEOF("Unexpected EOF")
while not self._tokens:
try:
self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos)
except PSEOF:
# If we hit EOF in the middle of a token, try to parse
# it by tacking on whitespace, and delay raising PSEOF
# until next time around
self.charpos = self._parse1(b"\n", 0)
self.eof = True
# Oh, so there wasn't actually a token there? OK.
if not self._tokens:
raise
token = self._tokens.pop(0)
log.debug("nexttoken: %r", token)
return token


def patch_psparser():
"""Monkey-patch certain versions of pdfminer.six to avoid breaking
tokens across buffers."""
# Presuming the bug will be fixed in the next release
if pdfminer.__version__ <= "20240706":
PSBaseParser.seek = seek
PSBaseParser._parse_keyword = _parse_keyword
PSBaseParser.nexttoken = nexttoken

0 comments on commit e0f464a

Please sign in to comment.