Skip to content

Commit

Permalink
Fix bounding boxes for top left and add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ines committed Nov 24, 2024
1 parent a6b33bf commit 72c8892
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 9 deletions.
21 changes: 13 additions & 8 deletions spacy_layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import ConversionResult, FormatOption
from docling_core.types.doc.base import BoundingBox
from pandas import DataFrame
from spacy.language import Language

Expand Down Expand Up @@ -148,15 +149,9 @@ def _get_span_layout(
prov = item.prov[0]
page = pages[prov.page_no]
if page.width and page.height:
is_bottom = prov.bbox.coord_origin == CoordOrigin.BOTTOMLEFT
y = page.height - prov.bbox.t if is_bottom else prov.bbox.t
height = prov.bbox.t - prov.bbox.b if is_bottom else prov.bbox.t
x, y, width, height = get_bounding_box(prov.bbox, page.height)
bounding_box = SpanLayout(
x=prov.bbox.l,
y=y,
width=prov.bbox.r - prov.bbox.l,
height=height,
page_no=prov.page_no,
x=x, y=y, width=width, height=height, page_no=prov.page_no
)
return bounding_box

Expand Down Expand Up @@ -186,3 +181,13 @@ def get_tables(self, doc: Doc) -> list[Span]:
for span in doc.spans[self.attrs.span_group]
if span.label_ == DocItemLabel.TABLE
]


def get_bounding_box(
    bbox: "BoundingBox", page_height: float
) -> tuple[float, float, float, float]:
    """Turn a bounding box into an ``(x, y, width, height)`` tuple.

    bbox: Box providing the ``l``, ``r``, ``t``, ``b`` edges and a
        ``coord_origin``.
    page_height: Height of the page the box belongs to. Only used when the
        box's origin is ``CoordOrigin.BOTTOMLEFT``.
    RETURNS: ``(x, y, width, height)``, where ``x`` is ``bbox.l`` and
        ``width`` is ``bbox.r - bbox.l``.
    """
    left = bbox.l
    box_width = bbox.r - left
    if bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
        # Bottom-left origin: flip the top edge against the page height and
        # take the height as top minus bottom.
        return (left, page_height - bbox.t, box_width, bbox.t - bbox.b)
    # Any other origin: keep the top edge as-is and take the height as
    # bottom minus top.
    return (left, bbox.t, box_width, bbox.b - bbox.t)
40 changes: 39 additions & 1 deletion tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

import pytest
import spacy
from docling_core.types.doc.base import BoundingBox, CoordOrigin
from docling_core.types.doc.labels import DocItemLabel

from spacy_layout import spaCyLayout
from spacy_layout.layout import TABLE_PLACEHOLDER
from spacy_layout.layout import TABLE_PLACEHOLDER, get_bounding_box
from spacy_layout.types import DocLayout, SpanLayout

PDF_STARCRAFT = Path(__file__).parent / "data" / "starcraft.pdf"
Expand Down Expand Up @@ -80,3 +81,40 @@ def display_table(df):
doc = layout(PDF_TABLE)
table = doc._.get(layout.attrs.doc_tables)[0]
assert table.text == "Table with columns: Name, Type, Place of birth"


@pytest.mark.parametrize(
    ("box", "page_height", "expected"),
    [
        # Each box is (top, bottom, left, right, origin); each expected value
        # is (x, y, width, height).
        (
            (200.0, 50.0, 100.0, 400.0, CoordOrigin.BOTTOMLEFT),
            1000.0,
            (100.0, 800.0, 300.0, 150.0),
        ),
        (
            (200.0, 250.0, 100.0, 400.0, CoordOrigin.TOPLEFT),
            1000.0,
            (100.0, 200.0, 300.0, 50.0),
        ),
        (
            (
                648.3192749023438,
                633.4112548828125,
                155.50897216796875,
                239.66929626464844,
                CoordOrigin.BOTTOMLEFT,
            ),
            792.0,
            (
                155.50897216796875,
                143.68072509765625,
                84.16032409667969,
                14.90802001953125,
            ),
        ),
    ],
)
def test_bounding_box(box, page_height, expected):
    """Check get_bounding_box against known boxes for both coordinate origins."""
    top_edge, bottom_edge, left_edge, right_edge, coord_origin = box
    bounding_box = BoundingBox(
        l=left_edge,
        t=top_edge,
        r=right_edge,
        b=bottom_edge,
        coord_origin=coord_origin,
    )
    result = get_bounding_box(bounding_box, page_height)
    assert result == expected

0 comments on commit 72c8892

Please sign in to comment.