Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Correctly patch pdfminer to avoid unnecessarily and unsuccessfully repairing PDFs with long content streams, causing needless and endless OCR #3822

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

### Fixes

- **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid) resulting in unnecessary OCR fallback.

## 0.16.11

### Enhancements
Expand Down
16 changes: 15 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1192,8 +1192,8 @@ def test_partition_pdf_with_fast_finds_headers_footers(
@pytest.mark.parametrize(
("filename", "expected_log"),
[
# This one is *actually* an invalid PDF document
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
Expand All @@ -1202,6 +1202,20 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
assert expected_log in caplog.text


@pytest.mark.parametrize(
("filename", "expected_log"),
[
# This one is *not* an invalid PDF document, make sure we
# don't try to "repair" it unnecessarily
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
],
)
def test_properly_patch_pdfminer(filename, expected_log, caplog):
caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
assert expected_log not in caplog.text


def assert_element_extraction(
elements: list[Element],
extract_image_block_types: list[str],
Expand Down
12 changes: 7 additions & 5 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import numpy as np
import wrapt
from pdfminer import psparser
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
from pdfminer.utils import open_filename
from pi_heif import register_heif_opener
Expand Down Expand Up @@ -96,15 +95,18 @@
PartitionStrategy,
)
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
from unstructured.patches.pdfminer import parse_keyword
from unstructured.patches.pdfminer import patch_psparser
from unstructured.utils import first, requires_dependencies

if TYPE_CHECKING:
pass

# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore

# Correct a bug that was introduced by a previous patch to
# pdfminer.six, causing needless and unsuccessful repairing of PDFs
# which were not actually broken.
patch_psparser()


RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)

Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/pdfminer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PSSyntaxError
from pdfminer.psparser import PSSyntaxError

from unstructured.logger import logger
from unstructured.utils import requires_dependencies
Expand Down
70 changes: 61 additions & 9 deletions unstructured/patches/pdfminer.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,35 @@
from typing import Union
import functools
from typing import Tuple, Union

from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
import pdfminer
from pdfminer.psparser import (
END_KEYWORD,
KWD,
PSEOF,
PSBaseParser,
PSBaseParserToken,
PSKeyword,
log,
)

factory_seek = PSBaseParser.seek

def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
"""Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
https://github.com/pdfminer/pdfminer.six/pull/885."""

@functools.wraps(PSBaseParser.seek)
def seek(self: PSBaseParser, pos: int) -> None:
factory_seek(self, pos)
self.eof = False


@functools.wraps(PSBaseParser._parse_keyword)
def _parse_keyword(self, s: bytes, i: int) -> int:
m = END_KEYWORD.search(s, i)
if not m:
j = len(s)
self._curtoken += s[i:]
else:
if m:
j = m.start(0)
self._curtoken += s[i:j]
else:
self._curtoken += s[i:]
return len(s)
if self._curtoken == b"true":
token: Union[bool, PSKeyword] = True
elif self._curtoken == b"false":
Expand All @@ -22,3 +39,38 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
self._add_token(token)
self._parse1 = self._parse_main
return j


@functools.wraps(PSBaseParser.nexttoken)
def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
if self.eof:
# It's not really unexpected, come on now...
raise PSEOF("Unexpected EOF")
while not self._tokens:
try:
self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos)
except PSEOF:
# If we hit EOF in the middle of a token, try to parse
# it by tacking on whitespace, and delay raising PSEOF
# until next time around
self.charpos = self._parse1(b"\n", 0)
self.eof = True
# Oh, so there wasn't actually a token there? OK.
if not self._tokens:
raise
token = self._tokens.pop(0)
log.debug("nexttoken: %r", token)
return token


def patch_psparser():
"""Monkey-patch certain versions of pdfminer.six to avoid dropping
tokens at EOF (before 20231228) and splitting tokens at buffer
boundaries (20231228 and 20240706).
"""
# Presuming the bug will be fixed in the next release
if pdfminer.__version__ <= "20240706":
PSBaseParser.seek = seek
PSBaseParser._parse_keyword = _parse_keyword
PSBaseParser.nexttoken = nexttoken