Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
env/
logs/
.env
.git
__pycache__
*.pyc
*.pyo
cookbook/
examples/
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,14 @@ __pycache__
.env*
.venv/
logs/
.idea/
.claude/

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
37 changes: 37 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# syntax=docker/dockerfile:1

# ---- Build stage: install the Python dependencies into a virtualenv ----
# "AS" is upper-case to match "FROM" (BuildKit's FromAsCasing check warns otherwise).
FROM python:3.11-slim AS builder

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# build-essential is installed only in this stage; the runtime stage below
# starts again from the plain slim image and copies just the virtualenv.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

ENV VIRTUAL_ENV=/opt/venv
RUN python -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# requirements.txt is copied on its own so the dependency layer is rebuilt
# only when the requirements change.
COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade pip && \
    pip install -r requirements.txt

# ---- Runtime stage: application code plus the prebuilt virtualenv ----
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Files excluded by .dockerignore are not part of this copy.
COPY . .

EXPOSE 8080

# NOTE(review): the container runs as root; consider adding a non-root USER
# once it is confirmed the app does not need to write under /app.
CMD ["uvicorn", "api.server:app", "--host", "0.0.0.0", "--port", "8080"]
Empty file added api/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions api/dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os
from functools import lru_cache

import aioboto3

# TODO: replace with a dedicated async S3 client that manages its own connection pool
# and lifecycle (e.g. FastAPI lifespan event) instead of creating a new client per call.
@lru_cache(maxsize=1)
def get_s3_session() -> aioboto3.Session:
    """Return the shared aioboto3 session, constructing it on the first call.

    The function takes no arguments and is wrapped in ``lru_cache``, so every
    later call gets the same ``Session`` object back.
    """
    session = aioboto3.Session()
    return session


def get_s3_bucket() -> str:
    """Return the S3 bucket name configured in ``PAGEINDEX_S3_BUCKET``.

    Surrounding whitespace is stripped before validation, so a value that is
    blank (for example a stray space in an env file) is rejected here instead
    of being handed to S3 as a bucket name.

    Returns:
        The bucket name with leading/trailing whitespace removed.

    Raises:
        RuntimeError: If the variable is missing, empty, or whitespace-only.
    """
    bucket = os.environ.get("PAGEINDEX_S3_BUCKET", "").strip()
    if not bucket:
        raise RuntimeError("PAGEINDEX_S3_BUCKET environment variable is not set")
    return bucket
6 changes: 6 additions & 0 deletions api/routers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from fastapi import APIRouter

from api.routers.pageindex import pageindex_router

# Aggregate router: the PageIndex endpoints are mounted under "/pageindex"
# and tagged "PageIndex".
api_router = APIRouter()
api_router.include_router(
    pageindex_router,
    prefix="/pageindex",
    tags=["PageIndex"],
)
74 changes: 74 additions & 0 deletions api/routers/pageindex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import logging

from fastapi import APIRouter, HTTPException, status
from pydantic import BaseModel, Field

from api.dependencies import get_s3_bucket, get_s3_session
from api.services.pageindex_service import (
EmptyDocumentError,
S3KeyNotFoundError,
S3ReadError,
S3WriteError,
process_markdown,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router that the endpoints defined in this module are registered on.
pageindex_router = APIRouter()


# Sentence appended to the descriptions of the optional request fields below.
_CONFIG_YAML_NOTE = "Defaults to the value in pageindex/config.yaml when not provided."


# Request body for the markdown indexing endpoint: two required S3 keys, a
# page-size knob, and optional pipeline overrides.
class MarkdownPageIndexRequest(BaseModel):
    # Required: where the markdown is read from and where the tree JSON goes.
    input_s3_key: str = Field(
        ...,
        min_length=1,
        description="S3 key of the markdown file to index.",
    )
    output_s3_key: str = Field(
        ...,
        min_length=1,
        description="S3 key where the output tree JSON will be written.",
    )

    # Bounded to the range 500..10000 inclusive; defaults to 2000.
    tokens_per_page: int = Field(
        default=2000,
        ge=500,
        le=10000,
        description="Target token budget per virtual page. Controls section granularity.",
    )

    # Pipeline options: every one is optional, and a field left unset falls
    # back to the pageindex/config.yaml default.
    model: str | None = Field(
        default=None,
        description=f"LLM model name for all pipeline stages. {_CONFIG_YAML_NOTE}",
    )
    if_add_node_id: str | None = Field(
        default=None,
        description=f'"yes" or "no". {_CONFIG_YAML_NOTE}',
    )
    if_add_node_summary: str | None = Field(
        default=None,
        description=f'"yes" or "no". {_CONFIG_YAML_NOTE}',
    )
    if_add_node_text: str | None = Field(
        default=None,
        description=f'"yes" or "no". {_CONFIG_YAML_NOTE}',
    )
    if_add_doc_description: str | None = Field(
        default=None,
        description=f'"yes" or "no". {_CONFIG_YAML_NOTE}',
    )

    extra_config: dict | None = Field(
        default=None,
        description="Escape hatch for any other config.yaml key not exposed above.",
    )
    # TODO: add `content: str | None` to accept raw markdown inline, skipping the S3 read


# Response body of the markdown indexing endpoint.
class MarkdownPageIndexResponse(BaseModel):
    # Presumably the key the tree JSON was written to - confirm against
    # process_markdown.
    output_s3_key: str
    # Presumably the generated document description - confirm against
    # process_markdown.
    doc_description: str
    # Untyped list; presumably the generated tree nodes - TODO confirm shape.
    structure: list


@pageindex_router.post("/markdown", response_model=MarkdownPageIndexResponse)
async def index_markdown(payload: MarkdownPageIndexRequest) -> MarkdownPageIndexResponse:
    """Index a markdown document stored on S3 using the full PDF pipeline (process_no_toc path).

    Reads the markdown from S3, splits it into virtual pages, runs tree generation
    + verification + retry logic, then writes the resulting tree JSON back to S3.
    """
    # Error mapping (service exception -> HTTP status):
    #   S3KeyNotFoundError         -> 404 Not Found
    #   EmptyDocumentError         -> 422 Unprocessable Entity
    #   S3ReadError / S3WriteError -> 502 Bad Gateway
    #   anything else              -> 500 Internal Server Error
    # Each HTTPException is raised "from" the original error so the traceback
    # shows a direct cause instead of "another exception occurred during
    # handling".
    try:
        result = await process_markdown(payload, get_s3_session(), get_s3_bucket())
    except S3KeyNotFoundError as e:
        logger.warning("%s", e)
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e)) from e
    except EmptyDocumentError as e:
        logger.warning("%s", e)
        raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)) from e
    except (S3ReadError, S3WriteError) as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("%s", e)
        raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail=str(e)) from e
    except Exception as e:
        logger.exception("PageIndex pipeline failed for '%s': %s", payload.input_s3_key, e)
        # NOTE(review): detail=str(e) exposes the internal error text to the
        # client; consider a generic message if that is not intended.
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) from e
    return result
15 changes: 15 additions & 0 deletions api/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from fastapi import FastAPI

from api.routers import api_router

def create_app() -> FastAPI:
    """Build the FastAPI application and mount ``api_router`` under ``/api/v1``."""
    application = FastAPI(
        title="PageIndex API",
        version="1.0.0",
        description="HTTP API for indexing markdown documents using the PageIndex PDF pipeline.",
    )
    application.include_router(api_router, prefix="/api/v1")
    return application


# Module-level application instance; the Dockerfile CMD serves it as "api.server:app".
app = create_app()
Empty file added api/services/__init__.py
Empty file.
Loading