-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from explosion/fix/serialization-new
Fix serialization of extension attributes
- Loading branch information
Showing
6 changed files
with
123 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
spacy>=3.7.5 | ||
docling>=2.5.2 | ||
pandas # version range set by Docling | ||
srsly # version range set by spaCy | ||
# Dev requirements | ||
pytest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import dataclasses | ||
from typing import TYPE_CHECKING, Callable | ||
|
||
from docling_core.types.doc.base import CoordOrigin | ||
from pandas import DataFrame | ||
|
||
from .types import DocLayout, PageLayout, SpanLayout | ||
|
||
if TYPE_CHECKING: | ||
from docling_core.types.doc.base import BoundingBox | ||
|
||
# Dict key under which the encoders below store the name of the original
# type, so the matching decoder can recognise and rebuild the object.
TYPE_ATTR = "__type__" | |
# Registry of class name -> class for the custom layout dataclasses that
# encode_obj() serializes and decode_obj() restores via ``from_dict``.
OBJ_TYPES = {"SpanLayout": SpanLayout, "DocLayout": DocLayout, "PageLayout": PageLayout} | |
|
||
|
||
def encode_obj(obj, chain: Callable | None = None):
    """Turn a registered layout dataclass into a type-tagged plain dict.

    Instances of any class registered in ``OBJ_TYPES`` are converted with
    ``dataclasses.asdict`` and tagged under ``TYPE_ATTR`` with their class
    name. Any other value is handed to ``chain`` when one is given, and
    returned unchanged otherwise.
    """
    registered_types = tuple(OBJ_TYPES.values())
    if not isinstance(obj, registered_types):
        # Not one of ours: defer to the next encoder in the chain, if any.
        if chain is None:
            return obj
        return chain(obj)
    payload = dataclasses.asdict(obj)
    payload[TYPE_ATTR] = type(obj).__name__
    return payload
|
||
|
||
def decode_obj(obj, chain: Callable | None = None): | ||
"""Load custom dataclass from serialized dict.""" | ||
if isinstance(obj, dict) and obj.get(TYPE_ATTR) in OBJ_TYPES: | ||
obj_type = obj.pop(TYPE_ATTR) | ||
return OBJ_TYPES[obj_type].from_dict(obj) | ||
return obj if chain is None else chain(obj) | ||
|
||
|
||
def encode_df(obj, chain: Callable | None = None): | ||
"""Convert pandas.DataFrame for serialization.""" | ||
if isinstance(obj, DataFrame): | ||
return {"data": obj.to_dict(), TYPE_ATTR: "DataFrame"} | ||
return obj if chain is None else chain(obj) | ||
|
||
|
||
def decode_df(obj, chain: Callable | None = None): | ||
"""Load pandas.DataFrame from serialized data.""" | ||
if isinstance(obj, dict) and obj.get(TYPE_ATTR) == "DataFrame": | ||
return DataFrame(obj["data"]) | ||
return obj if chain is None else chain(obj) | ||
|
||
|
||
def get_bounding_box(
    bbox: "BoundingBox", page_height: float
) -> tuple[float, float, float, float]:
    """Convert a bounding box into an ``(x, y, width, height)`` tuple.

    ``x`` is the box's ``l`` edge and ``width`` is ``r - l``. When the box
    uses ``CoordOrigin.BOTTOMLEFT``, ``y`` is ``page_height - t`` and
    ``height`` is ``t - b``; for any other origin, ``y`` is ``t`` and
    ``height`` is ``b - t``.
    """
    if bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
        # Bottom-left origin: flip the top edge against the page height.
        top = page_height - bbox.t
        box_height = bbox.t - bbox.b
    else:
        top = bbox.t
        box_height = bbox.b - bbox.t
    box_width = bbox.r - bbox.l
    return (bbox.l, top, box_width, box_height)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters