add Illuminator tool for post docling conversion analysis

alinaryan · alinaryan · commit d12e3fda50ad · 2025-04-28T17:42:42.000-04:00
This adds the initial version of the Illuminator tool. Illuminator is a post docling conversion checker designed to highlight problematic tables and merged cells that may have impacted the conversion from pdf to markdown.

Includes:

analyze_pdf_with_docling for detecting merged table cells
automatic page number detection via Docling provenance data
CLI interface supporting single file or directory input
JSON and terminal output summaries of detected issues
initial README and requirements.txt for setup

Signed-off-by: Alina Ryan <aliryan@redhat.com>
diff --git a/notebooks/illuminator/.gitignore b/notebooks/illuminator/.gitignore
@@ -0,0 +1,15 @@
+# Ignore virtual environments
+venv/
+.env/
+
+# Ignore compiled Python files
+__pycache__/
+*.pyc
+*.pyo
+
+# Ignore output files
+results.json
+
+# Ignore system files
+.DS_Store
+Thumbs.db
diff --git a/notebooks/illuminator/README.md b/notebooks/illuminator/README.md
@@ -0,0 +1,92 @@
+# 💡 Illuminator
+
+## 📌 Overview
+Illuminator is your post-conversion PDF sanity checker. After converting documents with Docling, Illuminator scans the result and flags merged table cells that could cause layout issues or require manual cleanup.
+
+It's a lightweight tool designed for teams working with structured data, helping you catch subtle formatting problems before they snowball.
+
+Illuminator checks for:
+- **⚠️ Merged Table Cells**
+  - Colspan > 1
+  - Rowspan > 1
+- **📄 Accurate Page Mapping**
+  - Uses Docling’s provenance metadata (not guesswork!)
+  - Associates each merged cell with its correct page number
+
+---
+
+## 🔧 Installation
+
+### 1️⃣ **Clone the Repository**
+```sh
+git clone https://github.com/instructlab/support-utils.git
+cd support-utils/beta/illuminator
+```
+
+### 2️⃣ Create a Virtual Environment (Optional, Recommended)
+```sh
+python3 -m venv venv
+source venv/bin/activate  # On macOS/Linux
+venv\Scripts\activate     # On Windows
+```
+
+### 3️⃣ Install Dependencies
+```
+pip install -r requirements.txt
+```
+
+## 🚀 Usage
+### Check a Single PDF or an Entire Folder of PDFs
+```
+python illuminator.py -f /path/to/pdf/document.pdf
+```
+
+### Save Results to a JSON File
+By default, results are saved to results.json. To specify a different output file:
+```
+python illuminator.py -f /path/to/pdf/folder/ -o custom_results.json
+```
+
+## 📝 Output Format
+### 📄 Terminal Output (Example)
+
+📂 File: /home/user/documents/report.pdf
+
+⚠️ Merged Table Cells Detected on Pages: 2, 4
+   - Page 2: "Total Revenue" (colspan=2, rowspan=1)
+   - Page 4: "[empty]" (colspan=3, rowspan=1)
+
+📁 Results saved to results.json 
+
+
+## 📁 JSON Output (results.json)
+```
+{
+    "/home/user/documents/report.pdf": {
+        "page_count": 10,
+        "table_count": 3,
+        "merged_cell_pages": [2, 4],
+        "merged_table_cells": [
+            {
+                "page": 2,
+                "row": 0,
+                "column": 1,
+                "colspan": 2,
+                "rowspan": 1,
+                "text": "Total Revenue"
+            },
+            {
+                "page": 4,
+                "row": 2,
+                "column": 0,
+                "colspan": 3,
+                "rowspan": 1,
+                "text": "[empty]"
+            }
+        ]
+    }
+}
+```
+
+## 🤝 Acknowledgments
+Built by Alina with ❤️ for better PDF conversion workflows!
+
diff --git a/notebooks/illuminator/analysis.py b/notebooks/illuminator/analysis.py
@@ -0,0 +1,106 @@
+from docling.document_converter import DocumentConverter
+from typing import List, Tuple, Dict, Any, Union
+from log_utils import logger
+import os
+
+def cell_is_merged(cell) -> bool:
+    """
+    Determines whether a table cell is merged based on its row or column span.
+
+    Args:
+        cell: A table cell object from the Docling document.
+
+    Returns:
+        True if the cell spans multiple rows or columns; False otherwise.
+    """
+    return (
+        cell.col_span > 1 or
+        cell.row_span > 1
+    )
+
+def summarize_tables(doc) -> Tuple[int, List[int]]:
+    """
+    Summarizes tables and extracts the page numbers they appear on.
+
+    Args:
+        doc: A DoclingDocument object.
+
+    Returns:
+        A tuple containing:
+            - The number of tables found.
+            - A list of page numbers on which the tables are located.
+    """
+    tables = doc.tables
+    num_tables = len(tables)
+    pages = []
+
+    for table in tables:
+        for prov in table.prov:
+            pages.append(prov.page_no)
+
+    return num_tables, pages
+
+
+def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]:
+    """
+    Analyzes a PDF for merged table cells using the Docling converter,
+    and saves a Markdown version of the converted document.
+
+    Args:
+        file_path: Path to the input PDF file.
+
+    Returns:
+        A dictionary containing:
+            - Total number of tables.
+            - Set of pages with merged cells.
+            - List of merged cell details (page, position, spans, text).
+            - Total unique pages with tables.
+    """
+    converter = DocumentConverter()
+    result = converter.convert(file_path)
+    doc = result.document
+
+    # ✅ Save Markdown output
+    markdown_text = doc.export_to_markdown()
+    base_name = os.path.splitext(os.path.basename(file_path))[0]
+    md_output_path = f"{base_name}.md"
+    with open(md_output_path, "w") as f:
+        f.write(markdown_text)
+
+    logger.info(f"📝 Markdown saved to {md_output_path}")
+
+    # ⬇️ Proceed with table analysis
+    table_count, table_pages_list = summarize_tables(doc)
+    total_pages = len(set(table_pages_list)) or "Unknown"
+
+    issues = {
+        "merged_table_cells": [],
+        "table_count": table_count,
+        "merged_cell_pages": set(),
+        "page_count": total_pages
+    }
+
+    for i, table_item in enumerate(doc.tables):
+        try:
+            page_number = table_pages_list[i]
+        except IndexError:
+            page_number = "Unknown page"
+        table_data = table_item.data
+
+        for row_idx, row in enumerate(table_data.grid):
+            for col_idx, cell in enumerate(row):
+                if cell_is_merged(cell):
+                    issues["merged_table_cells"].append({
+                        "page": page_number,
+                        "row": row_idx,
+                        "column": col_idx,
+                        "colspan": cell.col_span,
+                        "rowspan": cell.row_span,
+                        "text": cell.text or "[empty]"
+                    })
+
+                    if isinstance(page_number, int):
+                        issues["merged_cell_pages"].add(page_number)
+
+    issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"])
+    return issues
diff --git a/notebooks/illuminator/illuminator.py b/notebooks/illuminator/illuminator.py
@@ -0,0 +1,57 @@
+# illuminator.py
+import argparse
+from utils import get_pdf_files, save_results, generate_summary
+from analysis import analyze_pdf_with_docling
+from log_utils import logger
+
+def parse_args() -> argparse.Namespace:
+    """
+    Parses command-line arguments for the Illuminator tool.
+
+    Returns:
+        argparse.Namespace containing:
+            - file: Optional path to a single PDF.
+            - dir: Optional path to a directory of PDFs.
+            - output: Path to save results JSON file.
+    """
+    parser = argparse.ArgumentParser(description="Docling PDF Checker")
+    parser.add_argument(
+        "-f", "--file",
+        help="Path to a PDF file or directory of PDFs",
+        required=True
+    )
+    parser.add_argument(
+        "-o", "--output",
+        help="Optional path to save JSON results",
+        default="results.json"
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """
+    Main execution flow:
+    - Parses arguments
+    - Loads and analyzes PDFs
+    - Generates and saves results
+    """
+    args = parse_args()
+    pdfs = get_pdf_files(args.file)
+    if not pdfs:
+        logger.error("❌ No PDFs found to process.")
+        return
+
+    all_results = {}
+    for path in pdfs:
+        logger.info(f"\n🔍 Converting and analyzing: {path}\n")
+        try:
+            result = analyze_pdf_with_docling(path)
+            all_results[path] = result
+        except Exception as e:
+            logger.error(f"❌ Failed to process {path}: {e}")
+
+    generate_summary(all_results)
+    save_results(all_results, args.output)
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/notebooks/illuminator/log_utils.py b/notebooks/illuminator/log_utils.py
@@ -0,0 +1,13 @@
+import logging
+
+# Dedicated "illuminator" logger that emits INFO-level messages and above.
+logger = logging.getLogger("illuminator")
+logger.setLevel(logging.INFO)
+
+# Don't let it propagate to the root logger (which docling uses)
+logger.propagate = False
+
+# Add handler if none exists (to avoid duplication)
+if not logger.handlers:
+    # Stream handler formatting records as "HH:MM:SS [LEVEL] message".
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"))
+    logger.addHandler(handler)
diff --git a/notebooks/illuminator/requirements.txt b/notebooks/illuminator/requirements.txt
@@ -0,0 +1 @@
+docling
diff --git a/notebooks/illuminator/utils.py b/notebooks/illuminator/utils.py