Skip to content

Commit 1c3bd97

Browse files
committed
Ignore zero-height and zero-width bounding boxes.
Signed-off-by: Benjamin Hahn <[email protected]>
1 parent 5ab8b8c commit 1c3bd97

File tree

4 files changed

+92
-3
lines changed

4 files changed

+92
-3
lines changed

docling_core/types/doc/document.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5347,12 +5347,17 @@ def load_from_doctags( # noqa: C901
53475347
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
53485348
"""Extract <loc_...> coords from the chunk, normalized by / 500."""
53495349
coords = re.findall(r"<loc_(\d+)>", text_chunk)
5350-
if len(coords) > 4:
5350+
if len(coords) < 4:
5351+
return None
5352+
else:
53515353
coords = coords[:4]
5352-
if len(coords) == 4:
53535354
l, t, r, b = map(float, coords)
5355+
eps = 1e-3
5356+
# Ignore bounding boxes with width or height of <1e-3, including cases where l>r or t>b.
5357+
if r - l < eps or b - t < eps:
5358+
return None
5359+
53545360
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
5355-
return None
53565361

53575362
def extract_inner_text(text_chunk: str) -> str:
53585363
"""Strip all <...> tags inside the chunk to get the raw text content."""

test/data/doc/defect_bbox_page.dt

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
Assistant: <doctag>
2+
<text><loc_68><loc_447><loc_445><loc_447>This is valid text with a zero-height bounding box.</text>
3+
<text><loc_68><loc_447><loc_32><loc_499>This is valid text with a negative-width bounding box.</text>
4+
</doctag><end_of_utterance>
test/data/doc/defect_bbox_page.dt.json

Lines changed: 66 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,66 @@
1+
{
2+
"body": {
3+
"children": [
4+
{
5+
"$ref": "#/texts/0"
6+
},
7+
{
8+
"$ref": "#/texts/1"
9+
}
10+
],
11+
"content_layer": "body",
12+
"label": "unspecified",
13+
"name": "_root_",
14+
"self_ref": "#/body"
15+
},
16+
"form_items": [],
17+
"furniture": {
18+
"children": [],
19+
"content_layer": "furniture",
20+
"label": "unspecified",
21+
"name": "_root_",
22+
"self_ref": "#/furniture"
23+
},
24+
"groups": [],
25+
"key_value_items": [],
26+
"name": "Document",
27+
"pages": {
28+
"1": {
29+
"page_no": 1,
30+
"size": {
31+
"height": 1.0,
32+
"width": 1.0
33+
}
34+
}
35+
},
36+
"pictures": [],
37+
"schema_name": "DoclingDocument",
38+
"tables": [],
39+
"texts": [
40+
{
41+
"children": [],
42+
"content_layer": "body",
43+
"label": "text",
44+
"orig": "This is valid text with a zero-height bounding box.",
45+
"parent": {
46+
"$ref": "#/body"
47+
},
48+
"prov": [],
49+
"self_ref": "#/texts/0",
50+
"text": "This is valid text with a zero-height bounding box."
51+
},
52+
{
53+
"children": [],
54+
"content_layer": "body",
55+
"label": "text",
56+
"orig": "This is valid text with a negative-width bounding box.",
57+
"parent": {
58+
"$ref": "#/body"
59+
},
60+
"prov": [],
61+
"self_ref": "#/texts/1",
62+
"text": "This is valid text with a negative-width bounding box."
63+
}
64+
],
65+
"version": "1.8.0"
66+
}

test/test_doctags_load.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -168,3 +168,17 @@ def test_doctags_inline():
168168
exp_file=exp,
169169
actual=deser_doc.export_to_dict(),
170170
)
171+
172+
173+
def test_doctags_handle_defect_bbox():
174+
175+
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
176+
[Path("test/data/doc/defect_bbox_page.dt")], None
177+
)
178+
179+
doc = DoclingDocument.load_from_doctags(doctags_doc)
180+
exp = "test/data/doc/defect_bbox_page.dt.json"
181+
verify(
182+
exp_file=exp,
183+
actual=doc.export_to_dict(),
184+
)

0 commit comments

Comments
 (0)