Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Allow DoclingDocument as direct input #20

Merged
merged 1 commit into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ doc = layout("./starcraft.pdf")

| Argument | Type | Description |
| --- | --- | --- |
| `source` | `str \| Path \| bytes` | Path of document to process or bytes. |
| `source` | `str \| Path \| bytes \| DoclingDocument` | Path of document to process, bytes, or an already created `DoclingDocument`. |
| **RETURNS** | `Doc` | The processed spaCy `Doc` object. |

#### <kbd>method</kbd> `spaCyLayout.pipe`
Expand Down
26 changes: 15 additions & 11 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import srsly
from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
from spacy.tokens import Doc, Span, SpanGroup

Expand All @@ -13,7 +14,7 @@

if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import ConversionResult, FormatOption
from docling.document_converter import FormatOption
from pandas import DataFrame
from spacy.language import Language

Expand Down Expand Up @@ -66,37 +67,40 @@ def __init__(
Span.set_extension(self.attrs.span_data, default=None, force=True)
Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)

def __call__(self, source: str | Path | bytes) -> Doc:
def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
"""Call parser on a path to create a spaCy Doc object."""
result = self.converter.convert(self._get_source(source))
if isinstance(source, DoclingDocument):
result = source
else:
result = self.converter.convert(self._get_source(source)).document
return self._result_to_doc(result)

def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
"""Process multiple documents and create spaCy Doc objects."""
data = (self._get_source(source) for source in sources)
results = self.converter.convert_all(data)
for result in results:
yield self._result_to_doc(result)
yield self._result_to_doc(result.document)

def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
if isinstance(source, (str, Path)):
return source
return DocumentStream(name="source", stream=BytesIO(source))

def _result_to_doc(self, result: "ConversionResult") -> Doc:
def _result_to_doc(self, document: DoclingDocument) -> Doc:
inputs = []
pages = {
(page.page_no + 1): PageLayout(
(page.page_no): PageLayout(
page_no=page.page_no + 1,
width=page.size.width if page.size else 0,
height=page.size.height if page.size else 0,
)
for page in result.pages
for _, page in document.pages.items()
}
text_items = {item.self_ref: item for item in result.document.texts}
table_items = {item.self_ref: item for item in result.document.tables}
text_items = {item.self_ref: item for item in document.texts}
table_items = {item.self_ref: item for item in document.tables}
# We want to iterate over the tree to get different elements in order
for node, _ in result.document.iterate_items():
for node, _ in document.iterate_items():
if node.self_ref in text_items:
item = text_items[node.self_ref]
if item.text == "":
Expand All @@ -111,7 +115,7 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc:
inputs.append((table_text, item))
doc = self._texts_to_doc(inputs, pages)
doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()]))
doc._.set(self.attrs.doc_markdown, result.document.export_to_markdown())
doc._.set(self.attrs.doc_markdown, document.export_to_markdown())
return doc

def _texts_to_doc(
Expand Down
Loading