41 changes: 41 additions & 0 deletions .github/workflows/check-unique-hashes.yml
@@ -0,0 +1,41 @@
name: Check Unique SHA256 Hashes

on:
pull_request:
branches:
- main
paths:
- 'pdf_parsing/parquet_files/*.parquet'
- 'pdf_parsing/check_unique_hashes.py'
- '.github/workflows/check-unique-hashes.yml'

jobs:
check-hashes:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
cache-dependency-glob: '**/pyproject.toml'

- name: Install dependencies
run: |
uv pip install --system pandas pyarrow

- name: Check for duplicate SHA256 hashes
run: |
python3 pdf_parsing/check_unique_hashes.py

- name: Report success
if: success()
run: |
echo "✅ All SHA256 hashes across parquet files are unique!"
80 changes: 80 additions & 0 deletions pdf_parsing/README.md
@@ -0,0 +1,80 @@
# PDF Text Extraction Tool

A Python script that extracts text from PDF files using [pdfplumber](https://github.com/jsvine/pdfplumber) and saves the results to compressed Parquet files.

## Overview

This script builds a directory of Parquet files that store text versions of each file in a directory of PDFs (such as the one downloaded by ../ingestion). We assume the PDF directory is continually being updated, so the script may need to be run repeatedly. Each time the script is run, it:

- **Reads the parquet files storing the text we already have**. Each record contains the SHA256 hash of the original PDF and the text that was extracted from it.
- **Computes the SHA256 hash of each file in the PDF directory**.
- **Determines which files still need to be processed** (those whose hashes do not yet appear in any parquet file).
- **Processes them** into a new parquet file that is added to the parquet directory.

By default the extracted text is stored in `parquet_files`, as that is where it is kept in this git repository. For this project's use, we find that 500 PDF files boil down to about 1.5 megabytes of Parquet data. A minimal sketch of this incremental flow is shown below.
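
The following is a rough sketch of that flow, not the script's actual implementation; the helper names (`sha256_of`, `find_unprocessed`) are illustrative, and the only assumption about the stored schema is the `sha256` column described below:

```python
import hashlib
from pathlib import Path

import pandas as pd


def sha256_of(path: Path) -> str:
    """Hex digest of a file's contents."""
    return hashlib.sha256(path.read_bytes()).hexdigest()


def find_unprocessed(pdf_dir: Path, parquet_dir: Path) -> list[Path]:
    """Return PDFs whose hashes are not yet present in any parquet file."""
    known: set[str] = set()
    for pq in parquet_dir.glob("*.parquet"):
        known.update(pd.read_parquet(pq, columns=["sha256"])["sha256"])
    return [p for p in sorted(pdf_dir.glob("*.pdf")) if sha256_of(p) not in known]
```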

## Usage

**Note**: All commands should be run from the project root directory.

### Basic Usage

Extract text from all PDFs in a directory:

```bash
uv run pdf_parsing/extract_pdf_text.py --pdf-dir /path/to/pdf/directory
```

This creates a new timestamped Parquet file in `pdf_parsing/parquet_files/` by default (e.g., `20251103_143052_pdf_text.parquet`, using a `%Y%m%d_%H%M%S` timestamp).

### Custom Output Directory

Specify a custom output directory:

```bash
uv run pdf_parsing/extract_pdf_text.py --pdf-dir /path/to/pdf/directory --parquet-dir /path/to/output
```

### Limit Processing

Process only a limited number of PDFs (useful for testing or incremental processing):

```bash
uv run pdf_parsing/extract_pdf_text.py --pdf-dir /path/to/pdf/directory --limit 100
```

This will process at most 100 PDFs. Note that already-processed PDFs (skipped files) don't count toward the limit.
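
As a hedged illustration of that counting behaviour (hypothetical helper names; the script's own loop may differ):

```python
from pathlib import Path
from typing import Callable, Optional


def process_with_limit(
    pdf_paths: list[Path],
    known_hashes: set[str],
    hash_fn: Callable[[Path], str],
    process_fn: Callable[[Path], None],
    limit: Optional[int] = None,
) -> int:
    """Process unseen PDFs, counting only newly processed files against the limit."""
    processed = 0
    for pdf in pdf_paths:
        if hash_fn(pdf) in known_hashes:
            continue  # already processed: skipped, and not counted toward the limit
        process_fn(pdf)
        processed += 1
        if limit is not None and processed >= limit:
            break
    return processed
```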

### Spot Check

Verify existing extractions by re-processing N random PDFs:

```bash
uv run pdf_parsing/extract_pdf_text.py --pdf-dir /path/to/pdf/directory --spot-check 10
```

This will:
- Load existing records from all Parquet files in the output directory
- Randomly select up to 10 PDFs that have been previously processed
- Re-extract text from those PDFs
- Compare the newly extracted text with the stored text
- Report pass/fail for each PDF

Spot checking exits with code 0 if all checks pass, or code 1 if any fail.
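
A rough sketch of the comparison step, assuming a `pdf_by_hash` mapping from stored SHA256 hashes back to PDF paths (an assumption for illustration, not part of the script's documented interface):

```python
from pathlib import Path

import pandas as pd
import pdfplumber


def spot_check(parquet_dir: Path, pdf_by_hash: dict[str, Path], n: int) -> bool:
    """Re-extract text from up to n previously processed PDFs and compare."""
    records = pd.concat(pd.read_parquet(p) for p in parquet_dir.glob("*.parquet"))
    candidates = records[records["sha256"].isin(pdf_by_hash)]
    sample = candidates.sample(min(n, len(candidates)))
    ok = True
    for _, row in sample.iterrows():
        with pdfplumber.open(pdf_by_hash[row["sha256"]]) as pdf:
            # Assumes empty pages are stored as empty strings.
            fresh = [page.extract_text() or "" for page in pdf.pages]
        match = fresh == list(row["text"])
        print(f"{row['sha256'][:12]}: {'PASS' if match else 'FAIL'}")
        ok = ok and match
    return ok
```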

## Output Format

The script outputs compressed Parquet files with the following schema; a short reading example follows the field list.

### Fields

- **`sha256`** (string): SHA256 hash of the PDF file (hex digest)
- **`dateprocessed`** (string): ISO 8601 timestamp of when the PDF was processed
- **`text`** (list of strings): Text content, one string per page
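
For example, reading one of the generated files back with pandas (the filename here is illustrative):

```python
import pandas as pd

# Illustrative filename; substitute any file from pdf_parsing/parquet_files/.
df = pd.read_parquet("pdf_parsing/parquet_files/20251103_143052_pdf_text.parquet")

print(df.columns.tolist())       # expected: ['sha256', 'dateprocessed', 'text']
print(df.iloc[0]["sha256"])      # hex digest of the source PDF
print(len(df.iloc[0]["text"]))   # number of pages extracted from that PDF
```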

### File Naming

Each processing run creates a new file named: `YYYYMMDD_HHMMSS_pdf_text.parquet`

Example: `20251103_143052_pdf_text.parquet`
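
A minimal sketch of how such a filename can be generated (the script's own code may differ slightly):

```python
from datetime import datetime

filename = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_pdf_text.parquet"
```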

97 changes: 97 additions & 0 deletions pdf_parsing/check_unique_hashes.py
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""Check that all SHA256 hashes across all parquet files are unique."""

import sys
from pathlib import Path
import pandas as pd


def check_unique_hashes(parquet_dir: Path) -> tuple[bool, dict]:
"""
Check if all SHA256 hashes across all parquet files are unique.

Args:
parquet_dir: Directory containing parquet files

Returns:
Tuple of (all_unique: bool, stats: dict)
"""
parquet_files = sorted(parquet_dir.glob("*.parquet"))

if not parquet_files:
print(f"❌ No parquet files found in {parquet_dir}")
return False, {}

print(f"Found {len(parquet_files)} parquet file(s):")
for f in parquet_files:
print(f" - {f.name}")
print()

# Collect all hashes
all_hashes = []
file_hash_counts = {}

for parquet_file in parquet_files:
df = pd.read_parquet(parquet_file)

if 'sha256' not in df.columns:
print(f"❌ File {parquet_file.name} does not have a 'sha256' column")
return False, {}

hashes = df['sha256'].tolist()
all_hashes.extend(hashes)
file_hash_counts[parquet_file.name] = len(hashes)
print(f" {parquet_file.name}: {len(hashes)} hashes")

print()
total_hashes = len(all_hashes)
unique_hashes = len(set(all_hashes))

stats = {
'total_files': len(parquet_files),
'total_hashes': total_hashes,
'unique_hashes': unique_hashes,
'file_hash_counts': file_hash_counts
}

print(f"Total hashes across all files: {total_hashes}")
print(f"Unique hashes: {unique_hashes}")

if total_hashes == unique_hashes:
print("✅ All SHA256 hashes are unique!")
return True, stats
else:
duplicates = total_hashes - unique_hashes
print(f"❌ Found {duplicates} duplicate hash(es)!")

# Find and report duplicates
hash_counts = {}
for h in all_hashes:
hash_counts[h] = hash_counts.get(h, 0) + 1

duplicate_hashes = {h: count for h, count in hash_counts.items() if count > 1}
print(f"\nDuplicate hashes:")
for hash_val, count in sorted(duplicate_hashes.items()):
print(f" {hash_val}: appears {count} times")

return False, stats


def main():
"""Run the uniqueness check."""
parquet_dir = Path(__file__).parent / "parquet_files"

if not parquet_dir.exists():
print(f"❌ Directory {parquet_dir} does not exist")
sys.exit(1)

all_unique, stats = check_unique_hashes(parquet_dir)

if not all_unique:
sys.exit(1)

sys.exit(0)


if __name__ == "__main__":
main()