Merge pull request #5 from alinaryan/add-illuminator

alimaredia · web-flow · commit 4ef378f64eb5 · 2025-04-29T08:04:48.000-04:00
add Illuminator tool for post docling conversion analysis
diff --git a/notebooks/illuminator/.gitignore b/notebooks/illuminator/.gitignore
@@ -0,0 +1,15 @@
+# Ignore virtual environments
+venv/
+.env/
+
+# Ignore compiled Python files
+__pycache__/
+*.pyc
+*.pyo
+
+# Ignore output files
+results.json
+
+# Ignore system files
+.DS_Store
+Thumbs.db
diff --git a/notebooks/illuminator/README.md b/notebooks/illuminator/README.md
@@ -0,0 +1,92 @@
+# 💡 Illuminator
+
+## 📌 Overview
+Illuminator is your post-conversion PDF sanity checker. After converting documents with Docling, Illuminator scans the result and flags merged table cells that could cause layout issues or require manual cleanup.
+
+It's a lightweight tool designed for teams working with structured data, helping you catch subtle formatting problems before they snowball.
+
+Illuminator checks for:
+- **⚠️ Merged Table Cells**
+  - Colspan > 1
+  - Rowspan > 1
+- **📄 Accurate Page Mapping**
+  - Uses Docling’s provenance metadata (not guesswork!)
+  - Associates each merged cell with its correct page number
+
+---
+
+## 🔧 Installation
+
+### 1️⃣ **Clone the Repository**
+```sh
+git clone https://github.com/instructlab/support-utils.git
+cd support-utils/beta/illuminator
+```
+
+### 2️⃣ Create a Virtual Environment (Optional, Recommended)
+```sh
+python3 -m venv venv
+source venv/bin/activate  # On macOS/Linux
+venv\Scripts\activate     # On Windows
+```
+
+### 3️⃣ Install Dependencies
+```
+pip install -r requirements.txt
+```
+
+## 🚀 Usage
+### Check a Single PDF or an Entire Folder of PDFs
+```
+python illuminator.py -f /path/to/pdf/document.pdf   # a single PDF
+python illuminator.py -f /path/to/pdf/folder/        # every PDF in a folder
+```
+
+### Save Results to a JSON File
+By default, results are saved to results.json. To specify a different output file:
+```
+python illuminator.py -f /path/to/pdf/folder/ -o my_results.json
+```
+
+## 📝 Output Format
+### 📄 Terminal Output (Example)
+
+📂 File: /home/user/documents/report.pdf
+
+⚠️ Merged Table Cells Detected on Pages: 2, 4
+   - Page 2: "Total Revenue" (colspan=2, rowspan=1)
+   - Page 4: "[empty]" (colspan=3, rowspan=1)
+
+📁 Results saved to results.json 
+
+
+### 📁 JSON Output (results.json)
+```
+{
+    "/home/user/documents/report.pdf": {
+        "page_count": 10,
+        "table_count": 3,
+        "merged_cell_pages": [2, 4],
+        "merged_table_cells": [
+            {
+                "page": 2,
+                "row": 0,
+                "column": 1,
+                "colspan": 2,
+                "rowspan": 1,
+                "text": "Total Revenue"
+            },
+            {
+                "page": 4,
+                "row": 2,
+                "column": 0,
+                "colspan": 3,
+                "rowspan": 1,
+                "text": "[empty]"
+            }
+        ]
+    }
+}
+```
+
+## 🤝 Acknowledgments
+Built by Alina with ❤️ for better PDF conversion workflows!
+
diff --git a/notebooks/illuminator/analysis.py b/notebooks/illuminator/analysis.py
@@ -0,0 +1,106 @@
+from docling.document_converter import DocumentConverter
+from typing import List, Tuple, Dict, Any, Union
+from log_utils import logger
+import os
+
+def cell_is_merged(cell) -> bool:
+    """
+    Determines whether a table cell is merged based on its row or column span.
+
+    Args:
+        cell: A table cell object from the Docling document.
+
+    Returns:
+        True if the cell spans multiple rows or columns; False otherwise.
+    """
+    return (
+        cell.col_span > 1 or
+        cell.row_span > 1
+    )
+
+def summarize_tables(doc) -> Tuple[int, List[int]]:
+    """
+    Summarizes tables and extracts the page numbers they appear on.
+
+    Args:
+        doc: A DoclingDocument object.
+
+    Returns:
+        A tuple containing:
+            - The number of tables found.
+            - A list of page numbers on which the tables are located.
+    """
+    tables = doc.tables
+    num_tables = len(tables)
+    pages = []
+
+    for table in tables:
+        for prov in table.prov:
+            pages.append(prov.page_no)
+
+    return num_tables, pages
+
+
+def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]:
+    """
+    Analyzes a PDF for merged table cells using the Docling converter,
+    and saves a Markdown version of the converted document.
+
+    Args:
+        file_path: Path to the input PDF file.
+
+    Returns:
+        A dictionary containing:
+            - Total number of tables.
+            - Set of pages with merged cells.
+            - List of merged cell details (page, position, spans, text).
+            - Total unique pages with tables.
+    """
+    converter = DocumentConverter()
+    result = converter.convert(file_path)
+    doc = result.document
+
+    # ✅ Save Markdown output
+    markdown_text = doc.export_to_markdown()
+    base_name = os.path.splitext(os.path.basename(file_path))[0]
+    md_output_path = f"{base_name}.md"
+    with open(md_output_path, "w") as f:
+        f.write(markdown_text)
+
+    logger.info(f"📝 Markdown saved to {md_output_path}")
+
+    # ⬇️ Proceed with table analysis
+    table_count, table_pages_list = summarize_tables(doc)
+    total_pages = len(set(table_pages_list)) or "Unknown"
+
+    issues = {
+        "merged_table_cells": [],
+        "table_count": table_count,
+        "merged_cell_pages": set(),
+        "page_count": total_pages
+    }
+
+    for i, table_item in enumerate(doc.tables):
+        try:
+            page_number = table_pages_list[i]
+        except IndexError:
+            page_number = "Unknown page"
+        table_data = table_item.data
+
+        for row_idx, row in enumerate(table_data.grid):
+            for col_idx, cell in enumerate(row):
+                if cell_is_merged(cell):
+                    issues["merged_table_cells"].append({
+                        "page": page_number,
+                        "row": row_idx,
+                        "column": col_idx,
+                        "colspan": cell.col_span,
+                        "rowspan": cell.row_span,
+                        "text": cell.text or "[empty]"
+                    })
+
+                    if isinstance(page_number, int):
+                        issues["merged_cell_pages"].add(page_number)
+
+    issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"])
+    return issues
diff --git a/notebooks/illuminator/illuminator.py b/notebooks/illuminator/illuminator.py
@@ -0,0 +1,57 @@
+# illuminator.py
+import argparse
+from utils import get_pdf_files, save_results, generate_summary
+from analysis import analyze_pdf_with_docling
+from log_utils import logger
+
+def parse_args() -> argparse.Namespace:
+    """
+    Parses command-line arguments for the Illuminator tool.
+
+    Returns:
+        argparse.Namespace containing:
+            - file: Optional path to a single PDF.
+            - dir: Optional path to a directory of PDFs.
+            - output: Path to save results JSON file.
+    """
+    parser = argparse.ArgumentParser(description="Docling PDF Checker")
+    parser.add_argument(
+        "-f", "--file",
+        help="Path to a PDF file or directory of PDFs",
+        required=True
+    )
+    parser.add_argument(
+        "-o", "--output",
+        help="Optional path to save JSON results",
+        default="results.json"
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """
+    Main execution flow:
+    - Parses arguments
+    - Loads and analyzes PDFs
+    - Generates and saves results
+    """
+    args = parse_args()
+    pdfs = get_pdf_files(args.file)
+    if not pdfs:
+        logger.error("❌ No PDFs found to process.")
+        return
+
+    all_results = {}
+    for path in pdfs:
+        logger.info(f"\n🔍 Converting and analyzing: {path}\n")
+        try:
+            result = analyze_pdf_with_docling(path)
+            all_results[path] = result
+        except Exception as e:
+            logger.error(f"❌ Failed to process {path}: {e}")
+
+    generate_summary(all_results)
+    save_results(all_results, args.output)
+
+if __name__ == "__main__":
+    main()
diff --git a/notebooks/illuminator/log_utils.py b/notebooks/illuminator/log_utils.py
@@ -0,0 +1,13 @@
+import logging
+
+logger = logging.getLogger("illuminator")
+logger.setLevel(logging.INFO)
+
+# Don't let it propagate to the root logger (which docling uses)
+logger.propagate = False
+
+# Add handler if none exists (to avoid duplication)
+if not logger.handlers:
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"))
+    logger.addHandler(handler)
diff --git a/notebooks/illuminator/requirements.txt b/notebooks/illuminator/requirements.txt
@@ -0,0 +1 @@
+docling
diff --git a/notebooks/illuminator/utils.py b/notebooks/illuminator/utils.py