Skip to content

Commit

Permalink
Merge pull request #20 from explosion/feature/allow-doclingdocument
Browse files Browse the repository at this point in the history
Allow DoclingDocument as direct input
  • Loading branch information
ines authored Dec 13, 2024
2 parents 64c6f4a + 2042aee commit b9ca1e0
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ doc = layout("./starcraft.pdf")

| Argument | Type | Description |
| --- | --- | --- |
| `source` | `str \| Path \| bytes` | Path of document to process or bytes. |
| `source` | `str \| Path \| bytes \| DoclingDocument` | Path of the document to process, raw bytes, or an already created `DoclingDocument`. |
| **RETURNS** | `Doc` | The processed spaCy `Doc` object. |

#### <kbd>method</kbd> `spaCyLayout.pipe`
Expand Down
26 changes: 15 additions & 11 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import srsly
from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
from spacy.tokens import Doc, Span, SpanGroup

Expand All @@ -13,7 +14,7 @@

if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import ConversionResult, FormatOption
from docling.document_converter import FormatOption
from pandas import DataFrame
from spacy.language import Language

Expand Down Expand Up @@ -66,37 +67,40 @@ def __init__(
Span.set_extension(self.attrs.span_data, default=None, force=True)
Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)

def __call__(self, source: str | Path | bytes) -> Doc:
def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
"""Call parser on a path to create a spaCy Doc object."""
result = self.converter.convert(self._get_source(source))
if isinstance(source, DoclingDocument):
result = source
else:
result = self.converter.convert(self._get_source(source)).document
return self._result_to_doc(result)

def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
"""Process multiple documents and create spaCy Doc objects."""
data = (self._get_source(source) for source in sources)
results = self.converter.convert_all(data)
for result in results:
yield self._result_to_doc(result)
yield self._result_to_doc(result.document)

def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
if isinstance(source, (str, Path)):
return source
return DocumentStream(name="source", stream=BytesIO(source))

def _result_to_doc(self, result: "ConversionResult") -> Doc:
def _result_to_doc(self, document: DoclingDocument) -> Doc:
inputs = []
pages = {
(page.page_no + 1): PageLayout(
(page.page_no): PageLayout(
page_no=page.page_no + 1,
width=page.size.width if page.size else 0,
height=page.size.height if page.size else 0,
)
for page in result.pages
for _, page in document.pages.items()
}
text_items = {item.self_ref: item for item in result.document.texts}
table_items = {item.self_ref: item for item in result.document.tables}
text_items = {item.self_ref: item for item in document.texts}
table_items = {item.self_ref: item for item in document.tables}
# We want to iterate over the tree to get different elements in order
for node, _ in result.document.iterate_items():
for node, _ in document.iterate_items():
if node.self_ref in text_items:
item = text_items[node.self_ref]
if item.text == "":
Expand All @@ -111,7 +115,7 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc:
inputs.append((table_text, item))
doc = self._texts_to_doc(inputs, pages)
doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()]))
doc._.set(self.attrs.doc_markdown, result.document.export_to_markdown())
doc._.set(self.attrs.doc_markdown, document.export_to_markdown())
return doc

def _texts_to_doc(
Expand Down

0 comments on commit b9ca1e0

Please sign in to comment.