Skip to content

Commit d12e3fd

Browse files
committed
add Illuminator tool for post docling conversion analysis
This adds the initial version of the Illuminator tool. Illuminator is a post-conversion checker for Docling, designed to highlight problematic tables and merged cells that may have impacted the conversion from PDF to Markdown. Includes: analyze_pdf_with_docling for detecting merged table cells; automatic page number detection via Docling provenance data; a CLI interface supporting single-file or directory input; JSON and terminal output summaries of detected issues; an initial README and requirements.txt for setup. Signed-off-by: Alina Ryan <[email protected]>
1 parent 4c33afc commit d12e3fd

File tree

7 files changed

+401
-0
lines changed

7 files changed

+401
-0
lines changed

notebooks/illuminator/.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Ignore virtual environments
2+
venv/
3+
.env/
4+
5+
# Ignore compiled Python files
6+
__pycache__/
7+
*.pyc
8+
*.pyo
9+
10+
# Ignore output files
11+
results.json
12+
13+
# Ignore system files
14+
.DS_Store
15+
Thumbs.db

notebooks/illuminator/README.md

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# 💡 Illuminator
2+
3+
## 📌 Overview
4+
Illuminator is your post-conversion PDF sanity checker. After converting documents with Docling, Illuminator scans the result and flags merged table cells that could cause layout issues or require manual cleanup.
5+
6+
It's a lightweight tool designed for teams working with structured data, helping you catch subtle formatting problems before they snowball.
7+
8+
Illuminator checks for:
9+
- **⚠️ Merged Table Cells**
10+
- Colspan > 1
11+
- Rowspan > 1
12+
- **📄 Accurate Page Mapping**
13+
- Uses Docling’s provenance metadata (not guesswork!)
14+
- Associates each merged cell with its correct page number
15+
16+
---
17+
18+
## 🔧 Installation
19+
20+
### 1️⃣ **Clone the Repository**
21+
```sh
22+
git clone https://github.com/instructlab/support-utils.git
23+
cd support-utils/beta/illuminator
24+
```
25+
26+
### 2️⃣ Create a Virtual Environment (Optional, Recommended)
27+
```sh
python3 -m venv venv
28+
source venv/bin/activate # On macOS/Linux
29+
venv\Scripts\activate # On Windows
30+
```
31+
32+
### 3️⃣ Install Dependencies
33+
```
34+
pip install -r requirements.txt
35+
```
36+
37+
## 🚀 Usage
38+
### Check a Single PDF or an Entire Folder of PDFs
39+
```
40+
python illuminator.py -f /path/to/pdf/document.pdf
41+
```
42+
43+
### Save Results to a JSON File
44+
By default, results are saved to results.json. To specify a different output file:
45+
```
46+
python illuminator.py -f /path/to/pdf/folder/ -o my_results.json
47+
```
48+
49+
## 📝 Output Format
50+
### 📄 Terminal Output (Example)
51+
52+
📂 File: /home/user/documents/report.pdf
53+
54+
⚠️ Merged Table Cells Detected on Pages: 2, 4
55+
- Page 2: "Total Revenue" (colspan=2, rowspan=1)
56+
- Page 4: "[empty]" (colspan=3, rowspan=1)
57+
58+
📁 Results saved to results.json
59+
60+
61+
### 📁 JSON Output (results.json)
62+
```
63+
{
64+
"/home/user/documents/report.pdf": {
65+
"page_count": 10,
66+
"table_count": 3,
67+
"merged_cell_pages": [2, 4],
68+
"merged_table_cells": [
69+
{
70+
"page": 2,
71+
"row": 0,
72+
"column": 1,
73+
"colspan": 2,
74+
"rowspan": 1,
75+
"text": "Total Revenue"
76+
},
77+
{
78+
"page": 4,
79+
"row": 2,
80+
"column": 0,
81+
"colspan": 3,
82+
"rowspan": 1,
83+
"text": "[empty]"
84+
}
85+
]
86+
}
87+
}
88+
```
89+
90+
## 🤝 Acknowledgments
91+
Built by Alina with ❤️ for better PDF conversion workflows!
92+

notebooks/illuminator/analysis.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
from docling.document_converter import DocumentConverter
2+
from typing import List, Tuple, Dict, Any, Union
3+
from log_utils import logger
4+
import os
5+
6+
def cell_is_merged(cell) -> bool:
    """
    Report whether a table cell covers more than one row or column.

    Args:
        cell: A table cell object exposing ``col_span`` and ``row_span``.

    Returns:
        True when either span is greater than 1; False otherwise.
    """
    # Check the column span first; the row span is only consulted when the
    # cell is a single column wide.
    if cell.col_span > 1:
        return True
    return cell.row_span > 1
20+
21+
def summarize_tables(doc) -> Tuple[int, List[int]]:
    """
    Count the document's tables and collect their provenance page numbers.

    Args:
        doc: A DoclingDocument object; its ``tables`` are read, and for each
            table every entry of ``prov`` contributes its ``page_no``.

    Returns:
        A tuple containing:
            - The number of tables found.
            - The page numbers of all provenance entries, in table order.
              There is one item per provenance entry, so the list can be
              longer or shorter than the number of tables and can repeat
              a page.
    """
    found_tables = doc.tables
    page_numbers = [
        entry.page_no
        for table in found_tables
        for entry in table.prov
    ]
    return len(found_tables), page_numbers
42+
43+
44+
def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, str, List[Any], set]]:
    """
    Converts a PDF with Docling, saves a Markdown version of the converted
    document, and reports the merged table cells found in it.

    Side effect: writes ``<input base name>.md`` (UTF-8) into the current
    working directory.

    Args:
        file_path: Path to the input PDF file.

    Returns:
        A dictionary containing:
            - "table_count": Total number of tables.
            - "page_count": Number of distinct pages that contain a table,
              or the string "Unknown" when no table page could be determined.
            - "merged_cell_pages": Sorted list of page numbers that contain
              at least one merged cell.
            - "merged_table_cells": List of merged cell details
              (page, row, column, colspan, rowspan, text), one entry per
              merged cell.
    """
    converter = DocumentConverter()
    result = converter.convert(file_path)
    doc = result.document

    # Save Markdown output next to where the tool is run.
    markdown_text = doc.export_to_markdown()
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    md_output_path = f"{base_name}.md"
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII characters present in the converted document.
    with open(md_output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    logger.info(f"📝 Markdown saved to {md_output_path}")

    # Proceed with table analysis.
    table_count, table_pages_list = summarize_tables(doc)
    total_pages = len(set(table_pages_list)) or "Unknown"

    issues = {
        "merged_table_cells": [],
        "table_count": table_count,
        "merged_cell_pages": set(),
        "page_count": total_pages,
    }

    for table_item in doc.tables:
        # Read the page from this table's own provenance. The flat list from
        # summarize_tables has one entry per provenance record, so indexing
        # it by table position goes out of step as soon as any table has
        # zero or several provenance entries. A table with several entries
        # is attributed to its first one.
        if table_item.prov:
            page_number = table_item.prov[0].page_no
        else:
            page_number = "Unknown page"

        for row_idx, row in enumerate(table_item.data.grid):
            for col_idx, cell in enumerate(row):
                if not cell_is_merged(cell):
                    continue

                # Docling's grid places a spanning cell in every slot it
                # covers; report it only at its top-left slot so each merged
                # cell is listed once. Cells without offset attributes fall
                # back to the current position and are always reported.
                anchor = (
                    getattr(cell, "start_row_offset_idx", row_idx),
                    getattr(cell, "start_col_offset_idx", col_idx),
                )
                if anchor != (row_idx, col_idx):
                    continue

                issues["merged_table_cells"].append({
                    "page": page_number,
                    "row": row_idx,
                    "column": col_idx,
                    "colspan": cell.col_span,
                    "rowspan": cell.row_span,
                    "text": cell.text or "[empty]",
                })

                if isinstance(page_number, int):
                    issues["merged_cell_pages"].add(page_number)

    # Turn the set into a sorted list for stable, ordered output.
    issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"])
    return issues

notebooks/illuminator/illuminator.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# illuminator.py
2+
import argparse
3+
from utils import get_pdf_files, save_results, generate_summary
4+
from analysis import analyze_pdf_with_docling
5+
from log_utils import logger
6+
7+
def parse_args() -> argparse.Namespace:
    """
    Builds the Illuminator command-line parser and parses the process
    arguments.

    Returns:
        argparse.Namespace containing:
            - file: Path to a PDF file or a directory of PDFs (required).
            - output: Path to save the results JSON file
              (defaults to "results.json").
    """
    cli = argparse.ArgumentParser(description="Docling PDF Checker")

    # Input location: mandatory.
    cli.add_argument(
        "-f",
        "--file",
        required=True,
        help="Path to a PDF file or directory of PDFs",
    )

    # Output location: falls back to results.json when omitted.
    cli.add_argument(
        "-o",
        "--output",
        default="results.json",
        help="Optional path to save JSON results",
    )

    namespace = cli.parse_args()
    return namespace
29+
30+
31+
def main() -> None:
    """
    Runs the Illuminator command-line flow.

    Steps:
        - Parses the command-line arguments.
        - Asks get_pdf_files for the PDFs to process; logs an error and
          stops when it returns nothing.
        - Analyzes each PDF; a failure is logged and that PDF is left out
          of the results instead of aborting the run.
        - Hands the collected results to generate_summary and save_results.
    """
    cli_args = parse_args()

    pdf_paths = get_pdf_files(cli_args.file)
    if not pdf_paths:
        logger.error("❌ No PDFs found to process.")
        return

    # Maps each successfully analyzed PDF path to its analysis result.
    results_by_path = {}
    for pdf_path in pdf_paths:
        logger.info(f"\n🔍 Converting and analyzing: {pdf_path}\n")
        try:
            results_by_path[pdf_path] = analyze_pdf_with_docling(pdf_path)
        except Exception as err:
            # Best effort: keep going with the remaining files.
            logger.error(f"❌ Failed to process {pdf_path}: {err}")

    generate_summary(results_by_path)
    save_results(results_by_path, cli_args.output)


if __name__ == "__main__":
    main()

notebooks/illuminator/log_utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import logging
2+
3+
# Shared logger for the Illuminator tool; other modules import `logger`.
logger = logging.getLogger("illuminator")

# Keep records out of the root logger (which, per the original note,
# docling uses) so they are handled only by the handler attached below.
logger.propagate = False
logger.setLevel(logging.INFO)

# Attach the stream handler only once, so a logger that already has a
# handler does not emit each message twice.
if len(logger.handlers) == 0:
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
docling

0 commit comments

Comments
 (0)