Skip to content

Commit 4ef378f

Browse files
authored
Merge pull request #5 from alinaryan/add-illuminator
add Illuminator tool for post docling conversion analysis
2 parents 4c33afc + d12e3fd commit 4ef378f

File tree

7 files changed

+401
-0
lines changed

7 files changed

+401
-0
lines changed

notebooks/illuminator/.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Ignore virtual environments
2+
venv/
3+
.env/
4+
5+
# Ignore compiled Python files
6+
__pycache__/
7+
*.pyc
8+
*.pyo
9+
10+
# Ignore output files
11+
results.json
12+
13+
# Ignore system files
14+
.DS_Store
15+
Thumbs.db

notebooks/illuminator/README.md

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# 💡 Illuminator
2+
3+
## 📌 Overview
4+
Illuminator is your post-conversion PDF sanity checker. After converting documents with Docling, Illuminator scans the result and flags merged table cells that could cause layout issues or require manual cleanup.
5+
6+
It's a lightweight tool designed for teams working with structured data, helping you catch subtle formatting problems before they snowball.
7+
8+
Illuminator checks for:
9+
- **⚠️ Merged Table Cells**
10+
- Colspan > 1
11+
- Rowspan > 1
12+
- **📄 Accurate Page Mapping**
13+
- Uses Docling’s provenance metadata (not guesswork!)
14+
- Associates each merged cell with its correct page number
15+
16+
---
17+
18+
## 🔧 Installation
19+
20+
### 1️⃣ **Clone the Repository**
21+
```sh
22+
git clone https://github.com/instructlab/support-utils.git
23+
cd support-utils/beta/illuminator
24+
```
25+
26+
### 2️⃣ Create a Virtual Environment (Optional, Recommended)
27+
```sh
python3 -m venv venv
28+
source venv/bin/activate # On macOS/Linux
29+
venv\Scripts\activate # On Windows
30+
```
31+
32+
### 3️⃣ Install Dependencies
33+
```
34+
pip install -r requirements.txt
35+
```
36+
37+
## 🚀 Usage
38+
### Check a Single PDF or an Entire Folder of PDFs
39+
```
40+
python illuminator.py -f /path/to/pdf/document.pdf
41+
```
42+
43+
### Save Results to a JSON File
44+
By default, results are saved to results.json. To specify a different output file:
45+
```
46+
python illuminator.py -f /path/to/pdf/folder/ -o my_results.json
47+
```
48+
49+
## 📝 Output Format
50+
### 📄 Terminal Output (Example)
51+
52+
📂 File: /home/user/documents/report.pdf
53+
54+
⚠️ Merged Table Cells Detected on Pages: 2, 4
55+
- Page 2: "Total Revenue" (colspan=2, rowspan=1)
56+
- Page 4: "[empty]" (colspan=3, rowspan=1)
57+
58+
📁 Results saved to results.json
59+
60+
61+
### 📁 JSON Output (results.json)
62+
```
63+
{
64+
"/home/user/documents/report.pdf": {
65+
"page_count": 10,
66+
"table_count": 3,
67+
"merged_cell_pages": [2, 4],
68+
"merged_table_cells": [
69+
{
70+
"page": 2,
71+
"row": 0,
72+
"column": 1,
73+
"colspan": 2,
74+
"rowspan": 1,
75+
"text": "Total Revenue"
76+
},
77+
{
78+
"page": 4,
79+
"row": 2,
80+
"column": 0,
81+
"colspan": 3,
82+
"rowspan": 1,
83+
"text": "[empty]"
84+
}
85+
]
86+
}
87+
}
88+
```
89+
90+
## 🤝 Acknowledgments
91+
Built by Alina with ❤️ for better PDF conversion workflows!
92+

notebooks/illuminator/analysis.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
from docling.document_converter import DocumentConverter
2+
from typing import List, Tuple, Dict, Any, Union
3+
from log_utils import logger
4+
import os
5+
6+
def cell_is_merged(cell) -> bool:
    """
    Reports whether a table cell covers more than one row or column.

    Args:
        cell: A table cell object exposing ``col_span`` and ``row_span``.

    Returns:
        True when either span is greater than 1; False otherwise.
    """
    # A horizontal merge alone is enough to flag the cell.
    if cell.col_span > 1:
        return True
    # Otherwise the answer depends solely on the vertical span.
    return cell.row_span > 1
20+
21+
def summarize_tables(doc) -> Tuple[int, List[int]]:
    """
    Counts the tables in a document and collects their provenance pages.

    Args:
        doc: A document object exposing ``tables``; each table exposes a
            ``prov`` sequence whose items carry a ``page_no``.

    Returns:
        A tuple containing:
            - The number of tables found.
            - A flat list with one page number per provenance entry, in
              table order. A table with several provenance entries
              contributes several page numbers; a table with none
              contributes nothing.
    """
    tables = doc.tables
    page_numbers = [
        entry.page_no
        for table in tables
        for entry in table.prov
    ]
    return len(tables), page_numbers
42+
43+
44+
def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]:
    """
    Analyzes a PDF for merged table cells using the Docling converter,
    and saves a Markdown version of the converted document.

    The Markdown file is written to the current working directory as
    ``<input base name>.md``, encoded as UTF-8.

    Args:
        file_path: Path to the input PDF file.

    Returns:
        A dictionary containing:
            - "merged_table_cells": List of merged cell details
              (page, row, column, colspan, rowspan, text).
            - "table_count": Total number of tables.
            - "merged_cell_pages": Sorted list of page numbers that
              contain at least one merged cell.
            - "page_count": Number of unique pages that contain tables,
              or the string "Unknown" when no table has provenance.
    """
    converter = DocumentConverter()
    result = converter.convert(file_path)
    doc = result.document

    # Save Markdown output. The encoding is explicit so that non-ASCII
    # text in the converted document is written the same way on every
    # platform instead of depending on the locale default.
    markdown_text = doc.export_to_markdown()
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    md_output_path = f"{base_name}.md"
    with open(md_output_path, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    logger.info(f"📝 Markdown saved to {md_output_path}")

    # Proceed with table analysis
    table_count, table_pages_list = summarize_tables(doc)
    total_pages = len(set(table_pages_list)) or "Unknown"

    issues = {
        "merged_table_cells": [],
        "table_count": table_count,
        "merged_cell_pages": set(),
        "page_count": total_pages
    }

    for table_item in doc.tables:
        # Read the page from this table's own provenance. The flat list
        # returned by summarize_tables holds one entry per provenance item,
        # so indexing it by table position goes out of step as soon as a
        # table has zero or several provenance entries.
        provenance = table_item.prov
        page_number = provenance[0].page_no if provenance else "Unknown page"
        table_data = table_item.data

        # NOTE(review): if Docling's grid repeats a spanning cell at every
        # position it covers, the same merged cell is reported once per
        # position - confirm whether de-duplication is wanted.
        for row_idx, row in enumerate(table_data.grid):
            for col_idx, cell in enumerate(row):
                if not cell_is_merged(cell):
                    continue

                issues["merged_table_cells"].append({
                    "page": page_number,
                    "row": row_idx,
                    "column": col_idx,
                    "colspan": cell.col_span,
                    "rowspan": cell.row_span,
                    "text": cell.text or "[empty]"
                })

                # Only real page numbers are tracked; the "Unknown page"
                # placeholder is skipped so the set stays sortable.
                if isinstance(page_number, int):
                    issues["merged_cell_pages"].add(page_number)

    issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"])
    return issues

notebooks/illuminator/illuminator.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# illuminator.py
2+
import argparse
3+
from utils import get_pdf_files, save_results, generate_summary
4+
from analysis import analyze_pdf_with_docling
5+
from log_utils import logger
6+
7+
def parse_args() -> argparse.Namespace:
    """
    Builds the Illuminator command-line parser and parses the arguments.

    Returns:
        argparse.Namespace containing:
            - file: Required path to a PDF file or a directory of PDFs.
            - output: Path of the JSON results file
              (defaults to "results.json").
    """
    cli = argparse.ArgumentParser(description="Docling PDF Checker")

    # Input: one flag covers both a single PDF and a folder of PDFs.
    cli.add_argument(
        "-f",
        "--file",
        required=True,
        help="Path to a PDF file or directory of PDFs",
    )

    # Output: where the JSON report is written.
    cli.add_argument(
        "-o",
        "--output",
        default="results.json",
        help="Optional path to save JSON results",
    )

    namespace = cli.parse_args()
    return namespace
29+
30+
31+
def main() -> None:
    """
    Runs the Illuminator command-line workflow.

    Steps:
        - Parse the command-line arguments.
        - Resolve the input path into the PDFs to process.
        - Analyze each PDF; a failure is logged and that file is left
          out of the results while the remaining files are still processed.
        - Hand the collected results to generate_summary and save_results.
    """
    cli_args = parse_args()

    pdf_paths = get_pdf_files(cli_args.file)
    if not pdf_paths:
        # Nothing to analyze: report it and stop before writing any output.
        logger.error("❌ No PDFs found to process.")
        return

    results_by_path = {}
    for pdf_path in pdf_paths:
        logger.info(f"\n🔍 Converting and analyzing: {pdf_path}\n")
        try:
            results_by_path[pdf_path] = analyze_pdf_with_docling(pdf_path)
        except Exception as err:
            # One bad file must not abort the whole batch.
            logger.error(f"❌ Failed to process {pdf_path}: {err}")

    generate_summary(results_by_path)
    save_results(results_by_path, cli_args.output)


# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

notebooks/illuminator/log_utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import logging
2+
3+
# Shared logger for the Illuminator tool, reporting at INFO and above.
logger = logging.getLogger("illuminator")

# Keep records away from the root logger (which docling uses).
logger.propagate = False
logger.setLevel(logging.INFO)

# Attach the stream handler only once, so a repeated import or reload
# does not produce duplicated log lines.
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s [%(levelname)s] %(message)s",
            datefmt="%H:%M:%S",
        )
    )
    logger.addHandler(handler)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
docling

0 commit comments

Comments
 (0)