Skip to content

Commit

Permalink
feat: Added pytesseract wrapper
Browse files: browse the repository at this point in the history
  • Loading branch information
GeorgeFI committed Sep 20, 2023
1 parent 30f7475 commit 0eaf610
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 6 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
"networkx",
"pydantic",
"psutil",
"pytesseract",
]

[project.optional-dependencies]
Expand Down
13 changes: 12 additions & 1 deletion requirements/dev_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ docutils==0.17.1
# sphinx
entrypoints==0.4
# via jupyter-client
exceptiongroup==1.1.3
# via pytest
executing==1.2.0
# via stack-data
fastjsonschema==2.16.2
Expand All @@ -97,6 +99,7 @@ importlib-metadata==5.1.0
# via
# jupyter-cache
# myst-nb
# sphinx
iniconfig==1.1.1
# via pytest
ipykernel==6.19.1
Expand Down Expand Up @@ -222,6 +225,7 @@ packaging==22.0
# matplotlib
# pikepdf
# pydata-sphinx-theme
# pytesseract
# pytest
# setuptools-scm
# spacy
Expand Down Expand Up @@ -252,6 +256,7 @@ pillow==9.3.0
# via
# matplotlib
# pikepdf
# pytesseract
# sec-certs (./../pyproject.toml)
pip-tools==6.11.0
# via sec-certs (./../pyproject.toml)
Expand Down Expand Up @@ -304,6 +309,8 @@ pyrsistent==0.19.2
# via jsonschema
pysankeybeta==1.4.0
# via sec-certs (./../pyproject.toml)
pytesseract==0.3.10
# via sec-certs (./../pyproject.toml)
pytest==7.2.0
# via
# pytest-cov
Expand Down Expand Up @@ -434,7 +441,9 @@ tomli==2.0.1
# pytest
# setuptools-scm
tornado==6.3.3
# via setuptools-scm
# via
# ipykernel
# jupyter-client
tqdm==4.64.1
# via
# sec-certs (./../pyproject.toml)
Expand Down Expand Up @@ -464,10 +473,12 @@ types-urllib3==1.26.25.4
# via types-requests
typing-extensions==4.4.0
# via
# black
# mypy
# myst-nb
# myst-parser
# pydantic
# pypdf
# setuptools-scm
urllib3==1.26.13
# via requests
Expand Down
5 changes: 5 additions & 0 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ packaging==22.0
# ipykernel
# matplotlib
# pikepdf
# pytesseract
# setuptools-scm
# spacy
pandas==1.5.2
Expand All @@ -152,6 +153,7 @@ pillow==9.3.0
# via
# matplotlib
# pikepdf
# pytesseract
# sec-certs (./../pyproject.toml)
pkgconfig==1.5.5
# via sec-certs (./../pyproject.toml)
Expand Down Expand Up @@ -189,6 +191,8 @@ pyrsistent==0.19.2
# via jsonschema
pysankeybeta==1.4.0
# via sec-certs (./../pyproject.toml)
pytesseract==0.3.10
# via sec-certs (./../pyproject.toml)
python-dateutil==2.8.2
# via
# jupyter-client
Expand Down Expand Up @@ -275,6 +279,7 @@ typer==0.7.0
typing-extensions==4.4.0
# via
# pydantic
# pypdf
# setuptools-scm
urllib3==1.26.13
# via requests
Expand Down
7 changes: 7 additions & 0 deletions requirements/test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ distro==1.8.0
# via tabula-py
entrypoints==0.4
# via jupyter-client
exceptiongroup==1.1.3
# via pytest
executing==1.2.0
# via stack-data
fonttools==4.38.0
Expand Down Expand Up @@ -136,6 +138,7 @@ packaging==22.0
# ipykernel
# matplotlib
# pikepdf
# pytesseract
# pytest
# setuptools-scm
# spacy
Expand All @@ -161,6 +164,7 @@ pillow==9.3.0
# via
# matplotlib
# pikepdf
# pytesseract
# sec-certs (./../pyproject.toml)
pkgconfig==1.5.5
# via sec-certs (./../pyproject.toml)
Expand Down Expand Up @@ -200,6 +204,8 @@ pyrsistent==0.19.2
# via jsonschema
pysankeybeta==1.4.0
# via sec-certs (./../pyproject.toml)
pytesseract==0.3.10
# via sec-certs (./../pyproject.toml)
pytest==7.2.0
# via
# pytest-cov
Expand Down Expand Up @@ -295,6 +301,7 @@ typer==0.7.0
typing-extensions==4.4.0
# via
# pydantic
# pypdf
# setuptools-scm
urllib3==1.26.13
# via requests
Expand Down
14 changes: 9 additions & 5 deletions src/sec_certs/utils/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import pdftotext
import pikepdf
import pytesseract

from sec_certs import constants
from sec_certs.constants import (
Expand Down Expand Up @@ -51,13 +52,16 @@ def ocr_pdf_file(pdf_path: Path) -> str:
)
if ppm.returncode != 0:
raise ValueError(f"pdftoppm failed: {ppm.returncode}")

for ppm_path in map(Path, glob.glob(str(tmppath / "image*.ppm"))):
base = ppm_path.with_suffix("")
tes = subprocess.run(
["tesseract", "-l", "eng+deu+fra", ppm_path, base], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
)
if tes.returncode != 0:
raise ValueError(f"tesseract failed: {tes.returncode}")
content = pytesseract.image_to_string(ppm_path, lang="eng+deu+fra")

Check warning on line 58 in src/sec_certs/utils/pdf.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/utils/pdf.py#L58

Added line #L58 was not covered by tests

if content:
with Path(base.with_suffix(".txt")).open("w") as file:
file.write(content)

Check warning on line 62 in src/sec_certs/utils/pdf.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/utils/pdf.py#L60-L62

Added lines #L60 - L62 were not covered by tests
else:
raise ValueError(f"OCR failed for document {ppm_path}. Check document manually")

Check warning on line 64 in src/sec_certs/utils/pdf.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/utils/pdf.py#L64

Added line #L64 was not covered by tests

contents = ""

Expand Down

0 comments on commit 0eaf610

Please sign in to comment.