feat(models): align OCR data models with PRD specification #18

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

amostt merged 6 commits into master from fix/ocr-data-model-prd-alignment

Oct 30, 2025

backend/app/alembic/versions/0e7dd198b7c7_convert_status_to_enum_type.py

-Original file line number
+Diff line change
@@ -0,0 +1,103 @@
+    """convert_status_to_enum_type
+    Convert ingestions.status column from VARCHAR to PostgreSQL ENUM type.
+    This migration:
+    1. Creates extractionstatus ENUM type with all status values
+    2. Converts existing VARCHAR status column to use the ENUM type
+    3. Maintains data integrity by mapping existing values to ENUM
+    Revision ID: 0e7dd198b7c7
+    Revises: 20038a3ab258
+    Create Date: 2025-10-30 13:25:21.537208
+    """
+    from alembic import op
+    import sqlalchemy as sa
+    import sqlmodel.sql.sqltypes
+    # revision identifiers, used by Alembic.
+    revision = '0e7dd198b7c7'
+    down_revision = '20038a3ab258'
+    branch_labels = None
+    depends_on = None
+    def upgrade():
+        """Convert ingestions.status from VARCHAR to the extractionstatus ENUM.
+
+        Creates the ENUM type, renames legacy 'OCR_PROCESSING' rows to
+        'OCR_IN_PROGRESS', then swaps the column type and restores the
+        'UPLOADED' default as an ENUM value. The statement order matters;
+        see the per-step comments.
+        """
+        # Create extractionstatus ENUM type.
+        # Note that 'OCR_PROCESSING' is intentionally absent from this list;
+        # it is replaced by 'OCR_IN_PROGRESS'.
+        op.execute("""
+            CREATE TYPE extractionstatus AS ENUM (
+                'UPLOADED',
+                'OCR_IN_PROGRESS',
+                'OCR_COMPLETE',
+                'OCR_FAILED',
+                'SEGMENTATION_PROCESSING',
+                'SEGMENTATION_COMPLETE',
+                'TAGGING_PROCESSING',
+                'DRAFT',
+                'IN_REVIEW',
+                'APPROVED',
+                'REJECTED',
+                'FAILED'
+            )
+        """)
+        # Update existing 'OCR_PROCESSING' values to 'OCR_IN_PROGRESS' if any exist.
+        # This must run while the column is still VARCHAR: 'OCR_PROCESSING' is
+        # not a member of the ENUM above, so the cast in Step 2 would reject it.
+        op.execute("""
+            UPDATE ingestions
+            SET status = 'OCR_IN_PROGRESS'
+            WHERE status = 'OCR_PROCESSING'
+        """)
+        # Step 1: Drop the existing default value.
+        # The default is re-added as an ENUM value in Step 3; presumably the
+        # VARCHAR default cannot be carried across the type change automatically.
+        op.execute("""
+            ALTER TABLE ingestions
+            ALTER COLUMN status DROP DEFAULT
+        """)
+        # Step 2: Convert status column to use ENUM type.
+        # NOTE(review): any stored status not listed in the ENUM above will make
+        # this cast fail -- confirm no other legacy values exist in the table.
+        op.execute("""
+            ALTER TABLE ingestions
+            ALTER COLUMN status TYPE extractionstatus
+            USING status::text::extractionstatus
+        """)
+        # Step 3: Re-add the default value as ENUM type
+        op.execute("""
+            ALTER TABLE ingestions
+            ALTER COLUMN status SET DEFAULT 'UPLOADED'::extractionstatus
+        """)
+    def downgrade():
+        """Convert ingestions.status back from the extractionstatus ENUM to VARCHAR.
+
+        Reverses upgrade(): the column becomes VARCHAR with a plain 'UPLOADED'
+        default, the ENUM type is dropped, and 'OCR_IN_PROGRESS' rows are
+        renamed back to 'OCR_PROCESSING'.
+        """
+        # Step 1: Drop the ENUM default (it is typed as extractionstatus and is
+        # replaced by a plain string default in Step 3).
+        op.execute("""
+            ALTER TABLE ingestions
+            ALTER COLUMN status DROP DEFAULT
+        """)
+        # Step 2: Convert status column back to VARCHAR
+        op.execute("""
+            ALTER TABLE ingestions
+            ALTER COLUMN status TYPE VARCHAR
+            USING status::text
+        """)
+        # Step 3: Re-add the VARCHAR default
+        op.execute("""
+            ALTER TABLE ingestions
+            ALTER COLUMN status SET DEFAULT 'UPLOADED'
+        """)
+        # Step 4: Drop the ENUM type.
+        # Done only after Steps 1-2 so the column and its default no longer
+        # reference the type.
+        op.execute("DROP TYPE extractionstatus")
+        # Step 5: Revert OCR_IN_PROGRESS back to OCR_PROCESSING if any exist.
+        # Runs after the column is VARCHAR again, because 'OCR_PROCESSING' is
+        # not a member of the ENUM.
+        # NOTE(review): all other status values are left unchanged -- confirm
+        # the code at the previous revision accepts every one of them.
+        op.execute("""
+            UPDATE ingestions
+            SET status = 'OCR_PROCESSING'
+            WHERE status = 'OCR_IN_PROGRESS'
+        """)

backend/app/models.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -86,8 +86,9 @@ class ExtractionStatus(str, Enum):
  
        """Extraction pipeline status enum."""

        UPLOADED = "UPLOADED"

        OCR_PROCESSING = "OCR_PROCESSING"

        OCR_IN_PROGRESS = "OCR_IN_PROGRESS"

        OCR_COMPLETE = "OCR_COMPLETE"

        OCR_FAILED = "OCR_FAILED"

        SEGMENTATION_PROCESSING = "SEGMENTATION_PROCESSING"

        SEGMENTATION_COMPLETE = "SEGMENTATION_COMPLETE"

        TAGGING_PROCESSING = "TAGGING_PROCESSING"

backend/app/services/ocr.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -6,7 +6,7 @@
  
    import uuid

    from datetime import datetime

    from typing import Any

    from typing import Any, Literal

    import httpx

    from pydantic import BaseModel, Field

    @@ -61,22 +61,35 @@ class BoundingBox(BaseModel):
  
        height: float = Field(..., description="Height of the bounding box")

    class TableStructure(BaseModel):

        """Table layout extracted by Mistral OCR.

        Represents the structure of a table including dimensions and cell contents.

        Both dimensions are required and must be strictly positive; cells are
        kept as untyped dictionaries.

        """

        # Required; gt=0 rejects zero and negative row counts at validation time.
        # NOTE(review): the extract_text hunk in this same diff builds this model
        # with table_data.get("rows", 0) / table_data.get("columns", 0). A 0
        # fallback violates gt=0 and would raise a validation error -- confirm
        # that is the intended handling of a missing key.
        rows: int = Field(..., description="Number of rows in the table", gt=0)

        # Required; same strictly-positive constraint as rows.
        columns: int = Field(..., description="Number of columns in the table", gt=0)

        # Optional; defaults to a fresh empty list per instance (default_factory).
        # Cell dictionaries are not validated here; the keys named in the
        # description (row, col, text, bbox) are a convention, not a schema.
        cells: list[dict[str, Any]] = Field(

            default_factory=list,

            description="Cell data with row, col, text, and bbox information",

        )

    class ContentBlock(BaseModel):

        """A content block extracted from a PDF page.

        Represents text, equations, tables, or images with their layout information.

        """

        block_id: str = Field(..., description="Unique identifier for this content block")

        block_type: str = Field(

            ...,

            description="Type of content: text, equation, table, image, header, paragraph, list",

        )

        block_type: Literal[

            "text", "header", "paragraph", "list", "table", "equation", "image"

        ] = Field(..., description="Type of content block")

        text: str = Field(..., description="Extracted text content")

        bbox: BoundingBox = Field(..., description="Bounding box coordinates")

        confidence: float = Field(..., ge=0.0, le=1.0, description="OCR confidence score")

        latex: str | None = Field(None, description="LaTeX representation for equations")

        table_structure: dict[str, Any] | None = Field(

        table_structure: TableStructure | None = Field(

            None, description="Table structure metadata (rows, columns, cells)"

        )

        image_description: str | None = Field(

    @@ -153,7 +166,9 @@ def __init__(self, api_key: str, base_url: str = "https://api.mistral.ai/v1"):
  
                timeout=httpx.Timeout(60.0),

            )

        def _map_block_type(self, mistral_type: str) -> str:

        def _map_block_type(

            self, mistral_type: str

        ) -> Literal["text", "header", "paragraph", "list", "table", "equation", "image"]:

            """Map Mistral's block type to semantic types for segmentation.

            Args:

    @@ -162,7 +177,12 @@ def _map_block_type(self, mistral_type: str) -> str:
  
            Returns:

                Semantic block type (e.g., "header", "paragraph")

            """

            mapping = {

            mapping: dict[

                str,

                Literal[

                    "text", "header", "paragraph", "list", "table", "equation", "image"

                ],

            ] = {

                "heading": "header",

                "text": "paragraph",

                "equation": "equation",

    @@ -281,6 +301,15 @@ async def extract_text(self, pdf_bytes: bytes) -> OCRResult:
  
                        # If no type provided, default to "text" (fallback/unknown type)

                        # If type is provided, map to semantic type

                        block_type: Literal[

                            "text",

                            "header",

                            "paragraph",

                            "list",

                            "table",

                            "equation",

                            "image",

                        ]

                        if mistral_type is None:

                            block_type = "text"  # Default fallback

                        else:

    @@ -322,11 +351,11 @@ async def extract_text(self, pdf_bytes: bytes) -> OCRResult:
  
                            ),

                            confidence=0.95,

                            latex=None,

                            table_structure={

                                "rows": table_data.get("rows"),

                                "columns": table_data.get("columns"),

                                "cells": table_data.get("cells", []),

                            },

                            table_structure=TableStructure(

                                rows=table_data.get("rows", 0),

                                columns=table_data.get("columns", 0),

                                cells=table_data.get("cells", []),

                            ),

                            image_description=None,

                            markdown_content=None,

                            hierarchy_level=None,

backend/app/tasks/extraction.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -78,11 +78,11 @@ def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
  
                    logger.error(f"Ingestion {ingestion_id} not found in database")

                    raise ValueError(f"Ingestion {ingestion_id} not found")

                # Update status to OCR_PROCESSING

                ingestion.status = ExtractionStatus.OCR_PROCESSING

                # Update status to OCR_IN_PROGRESS

                ingestion.status = ExtractionStatus.OCR_IN_PROGRESS

                db.add(ingestion)

                db.commit()

                logger.info(f"[{ingestion_id}] Status updated to OCR_PROCESSING")

                logger.info(f"[{ingestion_id}] Status updated to OCR_IN_PROGRESS")

                # Download PDF from storage

                logger.info(

backend/tests/services/test_ocr.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -204,7 +204,7 @@ def mock_handler(request: httpx.Request) -> httpx.Response:
  
                )

                assert table_block is not None

                assert table_block.table_structure is not None

                assert table_block.table_structure["rows"] == 2

                assert table_block.table_structure.rows == 2

        @pytest.mark.asyncio

        async def test_extract_text_api_error_400(self):

    @@ -625,17 +625,17 @@ def mock_handler(request: httpx.Request) -> httpx.Response:
  
                # Verify table structure with cell-level detail

                table_struct = table_block.table_structure

                assert table_struct is not None

                assert table_struct["rows"] == 4

                assert table_struct["columns"] == 2

                assert len(table_struct["cells"]) == 4

                assert table_struct.rows == 4

                assert table_struct.columns == 2

                assert len(table_struct.cells) == 4

                # Verify cell data with row/column positions

                cell_a = table_struct["cells"][0]

                cell_a = table_struct.cells[0]

                assert cell_a["row"] == 0

                assert cell_a["col"] == 0

                assert cell_a["text"] == "A."

                cell_b = table_struct["cells"][2]

                cell_b = table_struct.cells[2]

                assert cell_b["row"] == 1

                assert cell_b["col"] == 0

                assert cell_b["text"] == "B."

backend/tests/tasks/test_extraction.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -100,7 +100,7 @@ def test_process_ocr_task_success(
  
            from app.models import Ingestion

            mock_db.get.assert_called_once_with(Ingestion, mock_ingestion.id)

            assert mock_db.commit.call_count == 2  # Status OCR_PROCESSING + OCR_COMPLETE

            assert mock_db.commit.call_count == 2  # Status OCR_IN_PROGRESS + OCR_COMPLETE

            # Verify ingestion status was updated to OCR_COMPLETE

            assert mock_ingestion.status == ExtractionStatus.OCR_COMPLETE

    @@ -234,7 +234,7 @@ def test_process_ocr_task_updates_status_to_processing(
  
            mock_ingestion,

            mock_ocr_result,

        ):

            """Test task updates status to OCR_PROCESSING before starting OCR."""

            """Test task updates status to OCR_IN_PROGRESS before starting OCR."""

            mock_settings.MISTRAL_API_KEY = "test-api-key"

            mock_db = MagicMock()

    @@ -257,7 +257,7 @@ def track_status_change(*args, **kwargs):
  
            process_ocr_task(str(mock_ingestion.id))

            # Verify status progression: OCR_PROCESSING -> OCR_COMPLETE

            # Verify status progression: OCR_IN_PROGRESS -> OCR_COMPLETE

            assert len(status_changes) >= 2

            assert ExtractionStatus.OCR_PROCESSING in status_changes

            assert ExtractionStatus.OCR_IN_PROGRESS in status_changes

            assert status_changes[-1] == ExtractionStatus.OCR_COMPLETE

frontend/src/client/schemas.gen.ts

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -71,7 +71,7 @@ export const Body_login_login_access_tokenSchema = {
  
    export const ExtractionStatusSchema = {

        type: 'string',

        enum: ['UPLOADED', 'OCR_PROCESSING', 'OCR_COMPLETE', 'SEGMENTATION_PROCESSING', 'SEGMENTATION_COMPLETE', 'TAGGING_PROCESSING', 'DRAFT', 'IN_REVIEW', 'APPROVED', 'REJECTED', 'FAILED'],

        enum: ['UPLOADED', 'OCR_IN_PROGRESS', 'OCR_COMPLETE', 'OCR_FAILED', 'SEGMENTATION_PROCESSING', 'SEGMENTATION_COMPLETE', 'TAGGING_PROCESSING', 'DRAFT', 'IN_REVIEW', 'APPROVED', 'REJECTED', 'FAILED'],

        title: 'ExtractionStatus',

        description: 'Extraction pipeline status enum.'

    } as const;

frontend/src/client/types.gen.ts

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -19,7 +19,7 @@ export type Body_login_login_access_token = {
  
    /**

     * Extraction pipeline status enum.

     */

    export type ExtractionStatus = 'UPLOADED' | 'OCR_PROCESSING' | 'OCR_COMPLETE' | 'SEGMENTATION_PROCESSING' | 'SEGMENTATION_COMPLETE' | 'TAGGING_PROCESSING' | 'DRAFT' | 'IN_REVIEW' | 'APPROVED' | 'REJECTED' | 'FAILED';

    export type ExtractionStatus = 'UPLOADED' | 'OCR_IN_PROGRESS' | 'OCR_COMPLETE' | 'OCR_FAILED' | 'SEGMENTATION_PROCESSING' | 'SEGMENTATION_COMPLETE' | 'TAGGING_PROCESSING' | 'DRAFT' | 'IN_REVIEW' | 'APPROVED' | 'REJECTED' | 'FAILED';

    export type HTTPValidationError = {

        detail?: Array<ValidationError>;

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat(models): align OCR data models with PRD specification #18

Uh oh!

Diff view

Diff view

There are no files selected for viewing