Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""convert_status_to_enum_type

Convert ingestions.status column from VARCHAR to PostgreSQL ENUM type.

This migration:
1. Creates extractionstatus ENUM type with all status values
2. Converts existing VARCHAR status column to use the ENUM type
3. Maintains data integrity by mapping existing values to ENUM

Revision ID: 0e7dd198b7c7
Revises: 20038a3ab258
Create Date: 2025-10-30 13:25:21.537208

"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes


# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration this one is applied on top of.
revision = '0e7dd198b7c7'
down_revision = '20038a3ab258'
# No branch labels and no dependencies on other migrations are declared.
branch_labels = None
depends_on = None


def upgrade():
    """Move ``ingestions.status`` from VARCHAR onto the ``extractionstatus`` ENUM.

    The statements run in a fixed order: create the ENUM type, rename legacy
    ``OCR_PROCESSING`` rows to ``OCR_IN_PROGRESS``, drop the column default,
    cast the column to the ENUM, then restore the default as an ENUM value.
    """
    # The ENUM type has to exist before the column can be cast to it.
    create_enum_type = """
        CREATE TYPE extractionstatus AS ENUM (
            'UPLOADED',
            'OCR_IN_PROGRESS',
            'OCR_COMPLETE',
            'OCR_FAILED',
            'SEGMENTATION_PROCESSING',
            'SEGMENTATION_COMPLETE',
            'TAGGING_PROCESSING',
            'DRAFT',
            'IN_REVIEW',
            'APPROVED',
            'REJECTED',
            'FAILED'
        )
    """

    # 'OCR_PROCESSING' is not a member of the ENUM above, so any such rows
    # must be renamed while the column is still VARCHAR or the cast fails.
    rename_legacy_status = """
        UPDATE ingestions
        SET status = 'OCR_IN_PROGRESS'
        WHERE status = 'OCR_PROCESSING'
    """

    # The existing default is removed before the type change and re-added
    # afterwards as an ENUM value.
    drop_default = """
        ALTER TABLE ingestions
        ALTER COLUMN status DROP DEFAULT
    """

    cast_column_to_enum = """
        ALTER TABLE ingestions
        ALTER COLUMN status TYPE extractionstatus
        USING status::text::extractionstatus
    """

    restore_default = """
        ALTER TABLE ingestions
        ALTER COLUMN status SET DEFAULT 'UPLOADED'::extractionstatus
    """

    ordered_statements = (
        create_enum_type,
        rename_legacy_status,
        drop_default,
        cast_column_to_enum,
        restore_default,
    )
    for statement in ordered_statements:
        op.execute(statement)


def downgrade():
    """Convert ``ingestions.status`` back to VARCHAR and drop the ENUM type.

    After the column is VARCHAR again, status values that the pre-migration
    application enum does not define are rewritten to values it does define:
    ``OCR_IN_PROGRESS`` becomes ``OCR_PROCESSING`` and ``OCR_FAILED`` becomes
    ``FAILED``.
    """
    # Step 1: Drop the ENUM default
    op.execute("""
        ALTER TABLE ingestions
        ALTER COLUMN status DROP DEFAULT
    """)

    # Step 2: Convert status column back to VARCHAR
    op.execute("""
        ALTER TABLE ingestions
        ALTER COLUMN status TYPE VARCHAR
        USING status::text
    """)

    # Step 3: Re-add the VARCHAR default
    op.execute("""
        ALTER TABLE ingestions
        ALTER COLUMN status SET DEFAULT 'UPLOADED'
    """)

    # Step 4: Drop the ENUM type (no column references it any more)
    op.execute("DROP TYPE extractionstatus")

    # Step 5: Revert OCR_IN_PROGRESS back to OCR_PROCESSING if any exist.
    # This must run after the VARCHAR conversion because 'OCR_PROCESSING'
    # is not a member of the ENUM.
    op.execute("""
        UPDATE ingestions
        SET status = 'OCR_PROCESSING'
        WHERE status = 'OCR_IN_PROGRESS'
    """)

    # Step 6: Fold OCR_FAILED into FAILED. 'OCR_FAILED' was introduced
    # together with this migration, so the pre-migration status enum has no
    # such member and rows carrying it would be unreadable after a downgrade.
    # NOTE: this is lossy - a later upgrade cannot tell these rows apart
    # from rows that were 'FAILED' all along.
    op.execute("""
        UPDATE ingestions
        SET status = 'FAILED'
        WHERE status = 'OCR_FAILED'
    """)
3 changes: 2 additions & 1 deletion backend/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,9 @@ class ExtractionStatus(str, Enum):
"""Extraction pipeline status enum."""

UPLOADED = "UPLOADED"
OCR_PROCESSING = "OCR_PROCESSING"
OCR_IN_PROGRESS = "OCR_IN_PROGRESS"
OCR_COMPLETE = "OCR_COMPLETE"
OCR_FAILED = "OCR_FAILED"
SEGMENTATION_PROCESSING = "SEGMENTATION_PROCESSING"
SEGMENTATION_COMPLETE = "SEGMENTATION_COMPLETE"
TAGGING_PROCESSING = "TAGGING_PROCESSING"
Expand Down
55 changes: 42 additions & 13 deletions backend/app/services/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import uuid
from datetime import datetime
from typing import Any
from typing import Any, Literal

import httpx
from pydantic import BaseModel, Field
Expand Down Expand Up @@ -61,22 +61,35 @@ class BoundingBox(BaseModel):
height: float = Field(..., description="Height of the bounding box")


class TableStructure(BaseModel):
    """Table layout extracted by Mistral OCR.

    Represents the structure of a table including dimensions and cell contents.
    """

    # Zero is accepted for both dimensions: the extraction code builds this
    # model with rows=0 / columns=0 when the OCR payload omits them, and a
    # strict ``gt=0`` bound would turn that fallback into a validation error.
    rows: int = Field(..., description="Number of rows in the table", ge=0)
    columns: int = Field(..., description="Number of columns in the table", ge=0)
    # Each cell is kept as a free-form dict; no per-cell schema is enforced here.
    cells: list[dict[str, Any]] = Field(
        default_factory=list,
        description="Cell data with row, col, text, and bbox information",
    )


class ContentBlock(BaseModel):
"""A content block extracted from a PDF page.

Represents text, equations, tables, or images with their layout information.
"""

block_id: str = Field(..., description="Unique identifier for this content block")
block_type: str = Field(
...,
description="Type of content: text, equation, table, image, header, paragraph, list",
)
block_type: Literal[
"text", "header", "paragraph", "list", "table", "equation", "image"
] = Field(..., description="Type of content block")
text: str = Field(..., description="Extracted text content")
bbox: BoundingBox = Field(..., description="Bounding box coordinates")
confidence: float = Field(..., ge=0.0, le=1.0, description="OCR confidence score")
latex: str | None = Field(None, description="LaTeX representation for equations")
table_structure: dict[str, Any] | None = Field(
table_structure: TableStructure | None = Field(
None, description="Table structure metadata (rows, columns, cells)"
)
image_description: str | None = Field(
Expand Down Expand Up @@ -153,7 +166,9 @@ def __init__(self, api_key: str, base_url: str = "https://api.mistral.ai/v1"):
timeout=httpx.Timeout(60.0),
)

def _map_block_type(self, mistral_type: str) -> str:
def _map_block_type(
self, mistral_type: str
) -> Literal["text", "header", "paragraph", "list", "table", "equation", "image"]:
"""Map Mistral's block type to semantic types for segmentation.

Args:
Expand All @@ -162,7 +177,12 @@ def _map_block_type(self, mistral_type: str) -> str:
Returns:
Semantic block type (e.g., "header", "paragraph")
"""
mapping = {
mapping: dict[
str,
Literal[
"text", "header", "paragraph", "list", "table", "equation", "image"
],
] = {
"heading": "header",
"text": "paragraph",
"equation": "equation",
Expand Down Expand Up @@ -281,6 +301,15 @@ async def extract_text(self, pdf_bytes: bytes) -> OCRResult:

# If no type provided, default to "text" (fallback/unknown type)
# If type is provided, map to semantic type
block_type: Literal[
"text",
"header",
"paragraph",
"list",
"table",
"equation",
"image",
]
if mistral_type is None:
block_type = "text" # Default fallback
else:
Expand Down Expand Up @@ -322,11 +351,11 @@ async def extract_text(self, pdf_bytes: bytes) -> OCRResult:
),
confidence=0.95,
latex=None,
table_structure={
"rows": table_data.get("rows"),
"columns": table_data.get("columns"),
"cells": table_data.get("cells", []),
},
table_structure=TableStructure(
rows=table_data.get("rows", 0),
columns=table_data.get("columns", 0),
cells=table_data.get("cells", []),
),
image_description=None,
markdown_content=None,
hierarchy_level=None,
Expand Down
6 changes: 3 additions & 3 deletions backend/app/tasks/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
logger.error(f"Ingestion {ingestion_id} not found in database")
raise ValueError(f"Ingestion {ingestion_id} not found")

# Update status to OCR_PROCESSING
ingestion.status = ExtractionStatus.OCR_PROCESSING
# Update status to OCR_IN_PROGRESS
ingestion.status = ExtractionStatus.OCR_IN_PROGRESS
db.add(ingestion)
db.commit()
logger.info(f"[{ingestion_id}] Status updated to OCR_PROCESSING")
logger.info(f"[{ingestion_id}] Status updated to OCR_IN_PROGRESS")

# Download PDF from storage
logger.info(
Expand Down
12 changes: 6 additions & 6 deletions backend/tests/services/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def mock_handler(request: httpx.Request) -> httpx.Response:
)
assert table_block is not None
assert table_block.table_structure is not None
assert table_block.table_structure["rows"] == 2
assert table_block.table_structure.rows == 2

@pytest.mark.asyncio
async def test_extract_text_api_error_400(self):
Expand Down Expand Up @@ -625,17 +625,17 @@ def mock_handler(request: httpx.Request) -> httpx.Response:
# Verify table structure with cell-level detail
table_struct = table_block.table_structure
assert table_struct is not None
assert table_struct["rows"] == 4
assert table_struct["columns"] == 2
assert len(table_struct["cells"]) == 4
assert table_struct.rows == 4
assert table_struct.columns == 2
assert len(table_struct.cells) == 4

# Verify cell data with row/column positions
cell_a = table_struct["cells"][0]
cell_a = table_struct.cells[0]
assert cell_a["row"] == 0
assert cell_a["col"] == 0
assert cell_a["text"] == "A."

cell_b = table_struct["cells"][2]
cell_b = table_struct.cells[2]
assert cell_b["row"] == 1
assert cell_b["col"] == 0
assert cell_b["text"] == "B."
Expand Down
8 changes: 4 additions & 4 deletions backend/tests/tasks/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def test_process_ocr_task_success(
from app.models import Ingestion

mock_db.get.assert_called_once_with(Ingestion, mock_ingestion.id)
assert mock_db.commit.call_count == 2 # Status OCR_PROCESSING + OCR_COMPLETE
assert mock_db.commit.call_count == 2 # Status OCR_IN_PROGRESS + OCR_COMPLETE

# Verify ingestion status was updated to OCR_COMPLETE
assert mock_ingestion.status == ExtractionStatus.OCR_COMPLETE
Expand Down Expand Up @@ -234,7 +234,7 @@ def test_process_ocr_task_updates_status_to_processing(
mock_ingestion,
mock_ocr_result,
):
"""Test task updates status to OCR_PROCESSING before starting OCR."""
"""Test task updates status to OCR_IN_PROGRESS before starting OCR."""
mock_settings.MISTRAL_API_KEY = "test-api-key"

mock_db = MagicMock()
Expand All @@ -257,7 +257,7 @@ def track_status_change(*args, **kwargs):

process_ocr_task(str(mock_ingestion.id))

# Verify status progression: OCR_PROCESSING -> OCR_COMPLETE
# Verify status progression: OCR_IN_PROGRESS -> OCR_COMPLETE
assert len(status_changes) >= 2
assert ExtractionStatus.OCR_PROCESSING in status_changes
assert ExtractionStatus.OCR_IN_PROGRESS in status_changes
assert status_changes[-1] == ExtractionStatus.OCR_COMPLETE
2 changes: 1 addition & 1 deletion frontend/src/client/schemas.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ export const Body_login_login_access_tokenSchema = {

export const ExtractionStatusSchema = {
type: 'string',
enum: ['UPLOADED', 'OCR_PROCESSING', 'OCR_COMPLETE', 'SEGMENTATION_PROCESSING', 'SEGMENTATION_COMPLETE', 'TAGGING_PROCESSING', 'DRAFT', 'IN_REVIEW', 'APPROVED', 'REJECTED', 'FAILED'],
enum: ['UPLOADED', 'OCR_IN_PROGRESS', 'OCR_COMPLETE', 'OCR_FAILED', 'SEGMENTATION_PROCESSING', 'SEGMENTATION_COMPLETE', 'TAGGING_PROCESSING', 'DRAFT', 'IN_REVIEW', 'APPROVED', 'REJECTED', 'FAILED'],
title: 'ExtractionStatus',
description: 'Extraction pipeline status enum.'
} as const;
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/client/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ export type Body_login_login_access_token = {
/**
* Extraction pipeline status enum.
*/
export type ExtractionStatus = 'UPLOADED' | 'OCR_PROCESSING' | 'OCR_COMPLETE' | 'SEGMENTATION_PROCESSING' | 'SEGMENTATION_COMPLETE' | 'TAGGING_PROCESSING' | 'DRAFT' | 'IN_REVIEW' | 'APPROVED' | 'REJECTED' | 'FAILED';
export type ExtractionStatus = 'UPLOADED' | 'OCR_IN_PROGRESS' | 'OCR_COMPLETE' | 'OCR_FAILED' | 'SEGMENTATION_PROCESSING' | 'SEGMENTATION_COMPLETE' | 'TAGGING_PROCESSING' | 'DRAFT' | 'IN_REVIEW' | 'APPROVED' | 'REJECTED' | 'FAILED';

export type HTTPValidationError = {
detail?: Array<ValidationError>;
Expand Down