Skip to content

Commit

Permalink
Merge pull request #5 from explosion/feature/bytes
Browse files Browse the repository at this point in the history
Accept bytes as input
  • Loading branch information
ines authored Nov 20, 2024
2 parents 839887e + 9c37c6c commit 39edf39
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 8 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ doc = layout("./starcraft.pdf")

| Argument | Type | Description |
| --- | --- | --- |
| `path` | `str \| Path` | Path of document to process. |
| `source` | `str \| Path \| bytes` | Path of the document to process, or the document's contents as bytes. |
| **RETURNS** | `Doc` | The processed spaCy `Doc` object. |

#### <kbd>method</kbd> `spaCyLayout.pipe`
Expand All @@ -164,5 +164,5 @@ docs = layout.pipe(paths)

| Argument | Type | Description |
| --- | --- | --- |
| `paths` | `Iterable[str \| Path]` | Paths of documents to process. |
| `sources` | `Iterable[str \| Path \| bytes]` | Paths of the documents to process, or the documents' contents as bytes. |
| **YIELDS** | `Doc` | The processed spaCy `Doc` object. |
17 changes: 12 additions & 5 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from io import BytesIO
from pathlib import Path
from typing import Iterable, Iterator

from docling.datamodel.base_models import InputFormat
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.document_converter import ConversionResult, DocumentConverter, FormatOption
from docling_core.types.doc.labels import DocItemLabel
from spacy.language import Language
Expand Down Expand Up @@ -41,17 +42,23 @@ def __init__(
Span.set_extension(self.attrs.span_layout, default=None, force=True)
Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)

def __call__(self, path: str | Path) -> Doc:
def __call__(self, source: str | Path | bytes) -> Doc:
"""Call parser on a path to create a spaCy Doc object."""
result = self.converter.convert(path)
result = self.converter.convert(self._get_source(source))
return self._result_to_doc(result)

def pipe(self, paths: Iterable[str | Path]) -> Iterator[Doc]:
def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
"""Process multiple documents and create spaCy Doc objects."""
results = self.converter.convert_all(paths)
data = (self._get_source(source) for source in sources)
results = self.converter.convert_all(data)
for result in results:
yield self._result_to_doc(result)

def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
    """Normalize an input so it can be handed to the document converter.

    A ``str`` or ``Path`` is returned unchanged. Bytes-like input is wrapped
    in an in-memory ``DocumentStream`` named ``"source"``.

    Raises:
        TypeError: If ``source`` is neither a path nor a bytes-like object.
    """
    if isinstance(source, (str, Path)):
        return source
    try:
        # memoryview() accepts any bytes-like object and rejects everything
        # else. This matters for None in particular: BytesIO(None) would
        # quietly create an empty stream instead of reporting the bad input.
        buffer = memoryview(source)
    except TypeError:
        raise TypeError(
            "Expected source to be str, Path or bytes, "
            f"got {type(source).__name__}"
        ) from None
    return DocumentStream(name="source", stream=BytesIO(buffer))

def _result_to_doc(self, result: ConversionResult) -> Doc:
inputs = []
for item in result.document.texts:
Expand Down
3 changes: 2 additions & 1 deletion tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# Fixture documents that live next to this test module.
PDF_STARCRAFT = Path(__file__).parent / "data" / "starcraft.pdf"
PDF_SIMPLE = Path(__file__).parent / "data" / "simple.pdf"
DOCX_SIMPLE = Path(__file__).parent / "data" / "simple.docx"
# read_bytes() opens, reads and closes the file, so no handle is left open.
PDF_SIMPLE_BYTES = PDF_SIMPLE.read_bytes()


@pytest.fixture
Expand All @@ -22,7 +23,7 @@ def span_labels():
return [label.value for label in DocItemLabel]


@pytest.mark.parametrize("path", [PDF_STARCRAFT, PDF_SIMPLE])
@pytest.mark.parametrize("path", [PDF_STARCRAFT, PDF_SIMPLE, PDF_SIMPLE_BYTES])
def test_general(path, nlp, span_labels):
layout = spaCyLayout(nlp)
doc = layout(path)
Expand Down

0 comments on commit 39edf39

Please sign in to comment.