diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 47887ea8..b79604cb 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -14,27 +14,8 @@ env: default-python: "3.10" jobs: - - check-code-formatting: - name: Check coding style - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v4 - with: - python-version: ${{ env.default-python }} - - name: Upgrade pip, Install nox - run: | - python -m pip install --upgrade pip - python -m pip install nox - - name: Check coding style - run: | - nox --error-on-missing-interpreters --non-interactive --session format - check-coding-style: - name: Check coding style + name: Format and check coding style runs-on: ubuntu-latest steps: - name: Checkout code @@ -49,7 +30,7 @@ jobs: python -m pip install nox - name: Check coding style run: | - nox --error-on-missing-interpreters --non-interactive --session lint + nox --error-on-missing-interpreters --non-interactive --session format check-static-types: name: Check static types @@ -122,7 +103,6 @@ jobs: name: Publish to PyPi runs-on: ubuntu-latest needs: - - check-code-formatting - check-coding-style - check-static-types - tests diff --git a/.gitignore b/.gitignore index c1642e11..7f27b7ae 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,5 @@ Pipfile Pipfile.lock .noseids .vscode/ -pyproject.toml poetry.lock .eggs diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cf168bf..b743c35f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
## [Unreleased] +### Changed + +- Using absolute instead of relative imports ([#995](https://github.com/pdfminer/pdfminer.six/pull/995)) + ### Deprecated - The third argument (generation number) to `PDFObjRef` ([#972](https://github.com/pdfminer/pdfminer.six/pull/972)) diff --git a/fuzzing/extract_text_fuzzer.py b/fuzzing/extract_text_fuzzer.py index 219badb5..79fe6018 100644 --- a/fuzzing/extract_text_fuzzer.py +++ b/fuzzing/extract_text_fuzzer.py @@ -6,9 +6,9 @@ with atheris.instrument_imports(): from fuzzing.utils import ( - prepare_pdfminer_fuzzing, - is_valid_byte_stream, generate_layout_parameters, + is_valid_byte_stream, + prepare_pdfminer_fuzzing, ) from pdfminer.high_level import extract_text diff --git a/fuzzing/extract_text_to_fp_fuzzer.py b/fuzzing/extract_text_to_fp_fuzzer.py index 302062d9..cb1424fb 100644 --- a/fuzzing/extract_text_to_fp_fuzzer.py +++ b/fuzzing/extract_text_to_fp_fuzzer.py @@ -7,9 +7,9 @@ with atheris.instrument_imports(): from fuzzing.utils import ( - prepare_pdfminer_fuzzing, - is_valid_byte_stream, generate_layout_parameters, + is_valid_byte_stream, + prepare_pdfminer_fuzzing, ) from pdfminer.high_level import extract_text_to_fp from pdfminer.psexceptions import PSException diff --git a/fuzzing/fuzzed_data_provider.py b/fuzzing/fuzzed_data_provider.py index 1992fbab..55889eee 100644 --- a/fuzzing/fuzzed_data_provider.py +++ b/fuzzing/fuzzed_data_provider.py @@ -26,7 +26,10 @@ def ConsumeMemoryFile(self, all_data: bool = False) -> io.BytesIO: return io.BytesIO(self.ConsumeRandomBytes()) def ConsumeOptionalIntList( - self, max_count: int, min: int, max: int + self, + max_count: int, + min: int, + max: int, ) -> Optional[List[int]]: if self.ConsumeBool(): count = self.ConsumeIntInRange(0, max_count) diff --git a/fuzzing/page_extraction_fuzzer.py b/fuzzing/page_extraction_fuzzer.py index 435cdb69..fb91ad05 100755 --- a/fuzzing/page_extraction_fuzzer.py +++ b/fuzzing/page_extraction_fuzzer.py @@ -1,14 +1,15 @@ #!/usr/bin/env python3 
-import atheris import sys +import atheris + from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider with atheris.instrument_imports(): from fuzzing.utils import ( - prepare_pdfminer_fuzzing, - is_valid_byte_stream, generate_layout_parameters, + is_valid_byte_stream, + prepare_pdfminer_fuzzing, ) from pdfminer.high_level import extract_pages from pdfminer.psexceptions import PSException @@ -29,7 +30,7 @@ def fuzz_one_input(data: bytes) -> None: maxpages=fdp.ConsumeIntInRange(0, 10), page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10), laparams=generate_layout_parameters(fdp), - ) + ), ) except (AssertionError, PSException): return diff --git a/fuzzing/utils.py b/fuzzing/utils.py index 6c8e5a0f..9be29334 100644 --- a/fuzzing/utils.py +++ b/fuzzing/utils.py @@ -1,6 +1,5 @@ -""" -Utilities shared across the various PDF fuzzing harnesses -""" +"""Utilities shared across the various PDF fuzzing harnesses""" + import logging from typing import Optional @@ -12,9 +11,7 @@ def prepare_pdfminer_fuzzing() -> None: - """ - Used to disable logging of the pdfminer module - """ + """Used to disable logging of the pdfminer module""" logging.getLogger("pdfminer").setLevel(logging.CRITICAL) diff --git a/noxfile.py b/noxfile.py index 5c9aca14..56b514a4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -2,25 +2,20 @@ import nox - PYTHON_ALL_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"] PYTHON_MODULES = ["fuzzing", "pdfminer", "tools", "tests", "noxfile.py", "setup.py"] @nox.session def format(session): - session.install("black<23") + session.install("ruff==0.5.1") # Format files locally with black, but only check in cicd if "CI" in os.environ: - session.run("black", "--check", *PYTHON_MODULES) + session.run("ruff", "check") + session.run("ruff", "format", "--check") else: - session.run("black", *PYTHON_MODULES) - - -@nox.session -def lint(session): - session.install("flake8") - session.run("flake8", *PYTHON_MODULES, "--count", "--statistics") + session.run("ruff", "check", 
"--fix") + session.run("ruff", "format") @nox.session @@ -49,8 +44,20 @@ def docs(session): session.install("setuptools") session.install("-e", ".[docs]") session.run( - "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html" + "python", + "-m", + "sphinx", + "-b", + "html", + "docs/source", + "docs/build/html", ) session.run( - "python", "-m", "sphinx", "-b", "doctest", "docs/source", "docs/build/doctest" + "python", + "-m", + "sphinx", + "-b", + "doctest", + "docs/source", + "docs/build/doctest", ) diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index 5bd4d50a..e2a177ce 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -1,4 +1,4 @@ -from importlib.metadata import version, PackageNotFoundError +from importlib.metadata import PackageNotFoundError, version try: __version__ = version("pdfminer.six") diff --git a/pdfminer/_saslprep.py b/pdfminer/_saslprep.py index d56ca16b..18e2b733 100644 --- a/pdfminer/_saslprep.py +++ b/pdfminer/_saslprep.py @@ -21,10 +21,10 @@ __all__ = ["saslprep"] import stringprep -from typing import Callable, Tuple import unicodedata +from typing import Callable, Tuple -from .pdfexceptions import PDFValueError +from pdfminer.pdfexceptions import PDFValueError # RFC4013 section 2.3 prohibited output. _PROHIBITED: Tuple[Callable[[str], bool], ...] = ( @@ -66,7 +66,11 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str: in_table_c12 = stringprep.in_table_c12 in_table_b1 = stringprep.in_table_b1 data = "".join( - ["\u0020" if in_table_c12(elt) else elt for elt in data if not in_table_b1(elt)] + [ + "\u0020" if in_table_c12(elt) else elt + for elt in data + if not in_table_b1(elt) + ], ) # RFC3454 section 2, step 2 - Normalize diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index a767667f..cc78e361 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -1,10 +1,9 @@ -""" Python implementation of Arcfour encryption algorithm. 
+"""Python implementation of Arcfour encryption algorithm. See https://en.wikipedia.org/wiki/RC4 This code is in the public domain. """ - from typing import Sequence diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py index dbe3d2a2..233bc744 100644 --- a/pdfminer/ascii85.py +++ b/pdfminer/ascii85.py @@ -1,4 +1,4 @@ -""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version). +"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version). This code is in the public domain. @@ -10,8 +10,7 @@ # ascii85decode(data) def ascii85decode(data: bytes) -> bytes: - """ - In ASCII85 encoding, every four bytes are encoded with five ASCII + """In ASCII85 encoding, every four bytes are encoded with five ASCII letters, using 85 different types of characters (as 256**4 < 85**5). When the length of the original bytes is not a multiple of 4, a special rule is used for round up. @@ -24,7 +23,7 @@ def ascii85decode(data: bytes) -> bytes: out = b"" for i in iter(data): c = bytes((i,)) - if b"!" <= c and c <= b"u": + if c >= b"!" and c <= b"u": n += 1 b = b * 85 + (ord(c) - 33) if n == 5: @@ -48,8 +47,7 @@ def ascii85decode(data: bytes) -> bytes: def asciihexdecode(data: bytes) -> bytes: - """ - ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 + """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the ASCIIHexDecode filter produces one byte of binary data. All white-space characters are ignored. 
A right angle bracket character (>) indicates diff --git a/pdfminer/casting.py b/pdfminer/casting.py index c73dfa6b..a8df21da 100644 --- a/pdfminer/casting.py +++ b/pdfminer/casting.py @@ -1,4 +1,4 @@ -from typing import Optional, Any +from typing import Any, Optional def safe_int(o: Any) -> Optional[int]: diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py index 9a9ca6f2..d55cd7ef 100644 --- a/pdfminer/ccitt.py +++ b/pdfminer/ccitt.py @@ -25,7 +25,7 @@ cast, ) -from .pdfexceptions import PDFException, PDFValueError +from pdfminer.pdfexceptions import PDFException, PDFValueError def get_bytes(data: bytes) -> Iterator[int]: @@ -53,7 +53,7 @@ def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None: p: BitParserState = root b = None for i in range(len(bits)): - if 0 < i: + if i > 0: assert b is not None if p[b] is None: p[b] = [None, None] @@ -84,7 +84,6 @@ def _parse_bit(self, x: object) -> None: class CCITTG4Parser(BitParser): - MODE = [None, None] BitParser.add(MODE, 0, "1") BitParser.add(MODE, +1, "011") @@ -475,9 +474,7 @@ def _do_vertical(self, dx: int) -> None: if x1 == 0: if self._color == 1 and self._refline[x1] != self._color: break - elif x1 == len(self._refline): - break - elif ( + elif x1 == len(self._refline) or ( self._refline[x1 - 1] == self._color and self._refline[x1] != self._color ): @@ -501,9 +498,7 @@ def _do_pass(self) -> None: if x1 == 0: if self._color == 1 and self._refline[x1] != self._color: break - elif x1 == len(self._refline): - break - elif ( + elif x1 == len(self._refline) or ( self._refline[x1 - 1] == self._color and self._refline[x1] != self._color ): @@ -513,9 +508,7 @@ def _do_pass(self) -> None: if x1 == 0: if self._color == 0 and self._refline[x1] == self._color: break - elif x1 == len(self._refline): - break - elif ( + elif x1 == len(self._refline) or ( self._refline[x1 - 1] != self._color and self._refline[x1] == self._color ): @@ -550,7 +543,10 @@ def _do_uncompressed(self, bits: str) -> None: class 
CCITTFaxDecoder(CCITTG4Parser): def __init__( - self, width: int, bytealign: bool = False, reversed: bool = False + self, + width: int, + bytealign: bool = False, + reversed: bool = False, ) -> None: CCITTG4Parser.__init__(self, width, bytealign=bytealign) self.reversed = reversed @@ -563,7 +559,7 @@ def output_line(self, y: int, bits: Sequence[int]) -> None: arr = array.array("B", [0] * ((len(bits) + 7) // 8)) if self.reversed: bits = [1 - b for b in bits] - for (i, b) in enumerate(bits): + for i, b in enumerate(bits): if b: arr[i // 8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8] self._buf += arr.tobytes() @@ -598,7 +594,7 @@ def __init__(self, width: int, bytealign: bool = False) -> None: self.img = pygame.Surface((self.width, 1000)) def output_line(self, y: int, bits: Sequence[int]) -> None: - for (x, b) in enumerate(bits): + for x, b in enumerate(bits): if b: self.img.set_at((x, y), (255, 255, 255)) else: diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 3ee5ae92..87d9870e 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -1,4 +1,4 @@ -""" Adobe character mapping (CMap) support. +"""Adobe character mapping (CMap) support. CMaps provide the mapping between character codes and Unicode code-points to character ids (CIDs). 
@@ -25,23 +25,18 @@ List, MutableMapping, Optional, + Set, TextIO, Tuple, Union, cast, - Set, ) +from pdfminer.encodingdb import name2unicode from pdfminer.pdfexceptions import PDFException, PDFTypeError -from .encodingdb import name2unicode -from .psparser import KWD from pdfminer.psexceptions import PSEOF, PSSyntaxError -from .psparser import PSKeyword -from .psparser import PSLiteral -from .psparser import PSStackParser -from .psparser import literal_name -from .utils import choplist -from .utils import nunpack +from pdfminer.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name +from pdfminer.utils import choplist, nunpack log = logging.getLogger(__name__) @@ -51,7 +46,6 @@ class CMapError(PDFException): class CMapBase: - debug = 0 def __init__(self, **kwargs: object) -> None: @@ -88,7 +82,7 @@ def use_cmap(self, cmap: CMapBase) -> None: assert isinstance(cmap, CMap), str(type(cmap)) def copy(dst: Dict[int, object], src: Dict[int, object]) -> None: - for (k, v) in src.items(): + for k, v in src.items(): if isinstance(v, dict): d: Dict[int, object] = {} dst[k] = d @@ -121,7 +115,7 @@ def dump( if code2cid is None: code2cid = self.code2cid code = () - for (k, v) in sorted(code2cid.items()): + for k, v in sorted(code2cid.items()): c = code + (k,) if isinstance(v, int): out.write("code %r = cid %d\n" % (c, v)) @@ -160,7 +154,7 @@ def get_unichr(self, cid: int) -> str: return self.cid2unichr[cid] def dump(self, out: TextIO = sys.stdout) -> None: - for (k, v) in sorted(self.cid2unichr.items()): + for k, v in sorted(self.cid2unichr.items()): out.write("cid %d = unicode %r\n" % (k, v)) @@ -174,7 +168,7 @@ def get_unichr(self, cid: int) -> str: class FileCMap(CMap): def add_code2cid(self, code: str, cid: int) -> None: assert isinstance(code, str) and isinstance(cid, int), str( - (type(code), type(cid)) + (type(code), type(cid)), ) d = self.code2cid for c in code[:-1]: @@ -205,7 +199,7 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, 
int]) -> None: raise PDFTypeError(code) # A0 = non-breaking space, some weird fonts can have a collision on a cid here. - if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ": + if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ": return self.cid2unichr[cid] = unichr @@ -229,7 +223,6 @@ def __init__(self, name: str, module: Any, vertical: bool) -> None: class CMapDB: - _cmap_cache: Dict[str, PyCMap] = {} _umap_cache: Dict[str, List[PyUnicodeMap]] = {} @@ -253,8 +246,7 @@ def _load_data(cls, name: str) -> Any: return type(str(name), (), pickle.loads(gzfile.read())) finally: gzfile.close() - else: - raise CMapDB.CMapNotFound(name) + raise CMapDB.CMapNotFound(name) @classmethod def get_cmap(cls, name: str) -> CMapBase: @@ -364,7 +356,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_ENDCIDRANGE: objs = [obj for (__, obj) in self.popall()] - for (start_byte, end_byte, cid) in choplist(3, objs): + for start_byte, end_byte, cid in choplist(3, objs): if not isinstance(start_byte, bytes): self._warn_once("The start object of begincidrange is not a byte.") continue @@ -377,7 +369,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if len(start_byte) != len(end_byte): self._warn_once( "The start and end byte of begincidrange have " - "different lengths." + "different lengths.", ) continue start_prefix = start_byte[:-4] @@ -385,7 +377,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if start_prefix != end_prefix: self._warn_once( "The prefix of the start and end byte of " - "begincidrange are not the same." 
+ "begincidrange are not the same.", ) continue svar = start_byte[-4:] @@ -404,7 +396,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_ENDCIDCHAR: objs = [obj for (__, obj) in self.popall()] - for (cid, code) in choplist(2, objs): + for cid, code in choplist(2, objs): if isinstance(code, bytes) and isinstance(cid, int): self.cmap.add_cid2unichr(cid, code) return @@ -415,7 +407,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_ENDBFRANGE: objs = [obj for (__, obj) in self.popall()] - for (start_byte, end_byte, code) in choplist(3, objs): + for start_byte, end_byte, code in choplist(3, objs): if not isinstance(start_byte, bytes): self._warn_once("The start object is not a byte.") continue @@ -431,7 +423,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if len(code) != end - start + 1: self._warn_once( "The difference between the start and end " - "offsets does not match the code length." + "offsets does not match the code length.", ) for cid, unicode_value in zip(range(start, end + 1), code): self.cmap.add_cid2unichr(cid, unicode_value) @@ -452,7 +444,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_ENDBFCHAR: objs = [obj for (__, obj) in self.popall()] - for (cid, code) in choplist(2, objs): + for cid, code in choplist(2, objs): if isinstance(cid, bytes) and isinstance(code, bytes): self.cmap.add_cid2unichr(nunpack(cid), code) return diff --git a/pdfminer/converter.py b/pdfminer/converter.py index f310387d..7563c7af 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -15,38 +15,48 @@ cast, ) +from pdfminer import utils +from pdfminer.image import ImageWriter +from pdfminer.layout import ( + LAParams, + LTAnno, + LTChar, + LTComponent, + LTContainer, + LTCurve, + LTFigure, + LTImage, + LTItem, + LTLayoutContainer, + LTLine, + LTPage, + LTRect, + LTText, + LTTextBox, + LTTextBoxVertical, + LTTextGroup, + LTTextLine, + 
TextGroupElement, +) from pdfminer.pdfcolor import PDFColorSpace -from . import utils -from .image import ImageWriter -from .layout import LAParams, LTComponent, TextGroupElement -from .layout import LTAnno -from .layout import LTChar -from .layout import LTContainer -from .layout import LTCurve -from .layout import LTFigure -from .layout import LTImage -from .layout import LTItem -from .layout import LTLayoutContainer -from .layout import LTLine -from .layout import LTPage -from .layout import LTRect -from .layout import LTText -from .layout import LTTextBox -from .layout import LTTextBoxVertical -from .layout import LTTextGroup -from .layout import LTTextLine -from .pdfdevice import PDFTextDevice -from .pdffont import PDFFont -from .pdffont import PDFUnicodeNotDefined -from .pdfinterp import PDFGraphicState, PDFResourceManager -from .pdfpage import PDFPage -from .pdftypes import PDFStream -from .pdfexceptions import PDFValueError -from .utils import AnyIO, Point, Matrix, Rect, PathSegment, make_compat_str -from .utils import apply_matrix_pt -from .utils import bbox2str -from .utils import enc -from .utils import mult_matrix +from pdfminer.pdfdevice import PDFTextDevice +from pdfminer.pdfexceptions import PDFValueError +from pdfminer.pdffont import PDFFont, PDFUnicodeNotDefined +from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager +from pdfminer.pdfpage import PDFPage +from pdfminer.pdftypes import PDFStream +from pdfminer.utils import ( + AnyIO, + Matrix, + PathSegment, + Point, + Rect, + apply_matrix_pt, + bbox2str, + enc, + make_compat_str, + mult_matrix, +) log = logging.getLogger(__name__) @@ -305,9 +315,7 @@ def _is_binary_stream(outfp: AnyIO) -> bool: return False elif isinstance(outfp, io.BytesIO): return True - elif isinstance(outfp, io.StringIO): - return False - elif isinstance(outfp, io.TextIOBase): + elif isinstance(outfp, io.StringIO) or isinstance(outfp, io.TextIOBase): return False return True @@ -404,7 +412,12 @@ def __init__( 
text_colors: Optional[Dict[str, str]] = None, ) -> None: PDFConverter.__init__( - self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams + self, + rsrcmgr, + outfp, + codec=codec, + pageno=pageno, + laparams=laparams, ) # write() assumes a codec for binary I/O, or no codec for text I/O. @@ -455,7 +468,7 @@ def write_header(self) -> None: def write_footer(self) -> None: page_links = [f'{i}' for i in range(1, self.pageno)] s = '