Commit 5d63477

Authored by RobuRishabh, mergify[bot], cdoern, github-actions[bot], and franciscojavierarceo

fix: address unresolved review comments from PyPDF File Processor PR#4743 (#5173)
# What does this PR do?

Addresses remaining unresolved review comments from PR #4743 (PyPDF File Processor integration) to ensure the file processing pipeline is consistent, correctly typed, and aligned with API expectations.

**Key changes:**

- **Remove legacy chunking fallback:** Eliminate the `_legacy_chunk_file` method and all fallback paths from `OpenAIVectorStoreMixin`. The system now raises a clear `RuntimeError` if `file_processor_api` is not configured, instead of silently degrading to legacy inline parsing.
- **Wire `file_processor_api` through all vector_io providers:** Add `Api.file_processors` to `optional_api_dependencies` in the registry, pass it through all 12 factory functions, and accept/forward it in all 9 adapter constructors.
- **Make `files_api` required in PyPDF constructors:** Remove the default `None` from both `PyPDFFileProcessorAdapter` and `PyPDFFileProcessor`, and use `deps[Api.files]` (bracket access) in the factory to fail fast if it is somehow missing.
- **Bounded file reads:** Implement chunked reading for direct uploads to cap memory usage, and add a size check on the `file_id` retrieval path against `max_file_size_bytes`.
- **Clear error for missing `file_id`:** Wrap `openai_retrieve_file` in a `try/except` that surfaces a `ValueError("File with id '...' not found")`, with a new test covering this case.
- **Conditional text cleaning:** Make the `.strip()` whitespace-only page filter conditional on the `clean_text` config setting.
- **Dead code cleanup:** Remove the unused `file_processor_api` field from `VectorStoreWithIndex` and the now-unused `make_overlapped_chunks` import from the mixin.

Closes #4743

## Test Plan

### Automated tests

**1. Unit tests (mixin + vector_io)**

```bash
KMP_DUPLICATE_LIB_OK=TRUE uv run --group unit pytest -sv tests/unit/
```

All `test_contextual_retrieval.py` (16 tests) and `test_vector_store_config_registration.py` tests pass; these exercise the refactored `OpenAIVectorStoreMixin`.

**2. PyPDF file processor tests (20/20 pass)**

```bash
uv run --group test pytest -sv tests/integration/file_processors/test_pypdf_processor.py
```

**3. Full integration suite (replay mode)**

```bash
uv run --group test pytest -sv tests/integration/ --stack-config=starter
```

Result: **4 failed, 54 passed, 639 skipped, 1 xfailed**

All 4 failures are pre-existing and unrelated:

- `test_safety_with_image` — Pydantic schema mismatch (`type: 'image'` vs `'image_url'`)
- `test_starter_distribution_config_loads_and_resolves` / `test_postgres_demo_distribution_config_loads` — relative path `FileNotFoundError`
- `test_mcp_tools_list_with_schemas` — no local MCP server (`Connection refused`)

No regressions in vector_io, file_search, or ingestion workflows.

### Manual E2E verification (with starter distro)

```bash
export OLLAMA_URL="http://localhost:11434/v1"
llama stack run starter
```

**1. Verify the route is registered:**

```bash
curl -sS http://localhost:8321/v1/inspect/routes \
  | jq -r '.data[] | select(.route|test("file-processors";"i"))'
```

Expected:

```json
{
  "route": "/v1alpha/file-processors/process",
  "method": "POST",
  "provider_types": [
    "inline::pypdf"
  ]
}
```

**2. Verify the OpenAPI spec contains the endpoint:**

```bash
curl -sS http://localhost:8321/openapi.json | rg -n "/v1alpha/file-processors/process|file-processors" -i
```

**3. Direct file upload:**

```bash
curl -sS -X POST http://localhost:8321/v1alpha/file-processors/process \
  -F "file=@/path/to/sample.pdf" | jq
```

Expected: chunks response with `metadata.processor = "pypdf"`.

**4. Via file_id:**

```bash
FILE_ID="$(curl -sS -X POST http://localhost:8321/v1/files \
  -F "file=@/path/to/sample.pdf" -F "purpose=assistants" | jq -r .id)"
curl -sS -X POST http://localhost:8321/v1alpha/file-processors/process \
  -F "file_id=${FILE_ID}" | jq
```

Expected: chunks response with `metadata.processor = "pypdf"` and `file_id` in the chunk metadata.
---------

Signed-off-by: roburishabh <roburishabh@outlook.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: Charlie Doern <cdoern@redhat.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Francisco Javier Arceo <arceofrancisco@gmail.com>
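The bounded-read change listed under key changes above can be sketched roughly like this (`read_bounded`, the chunk size, and the 100 MB cap are illustrative names and defaults, not the actual implementation):

```python
import io

DEFAULT_MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024  # illustrative 100 MB cap
CHUNK_SIZE = 64 * 1024  # read in 64 KiB pieces to bound per-call memory


def read_bounded(stream: io.BufferedIOBase, max_bytes: int = DEFAULT_MAX_FILE_SIZE_BYTES) -> bytes:
    """Read a stream in fixed-size chunks, failing fast once the cap is exceeded."""
    buf = bytearray()
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            return bytes(buf)
        buf.extend(chunk)
        if len(buf) > max_bytes:
            # Abort before buffering the rest of an oversized upload.
            raise ValueError(f"File exceeds maximum of {max_bytes} bytes")
```

The point of chunked reading is that an oversized upload is rejected after at most `max_bytes + CHUNK_SIZE` bytes are buffered, instead of after the whole file is in memory.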
1 parent 8d9ef64 commit 5d63477

144 files changed: 157208 additions & 38578 deletions


.github/workflows/integration-vector-io-tests.yml (1 addition, 1 deletion)

```diff
@@ -215,7 +215,7 @@ jobs:
           INFINISPAN_PASSWORD: ${{ matrix.vector-io-provider == 'remote::infinispan' && 'password' || '' }}
         run: |
           uv run --no-sync \
-            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers?trust_remote_code=true,vector_io=${{ matrix.vector-io-provider }}" \
+            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers?trust_remote_code=true,vector_io=${{ matrix.vector-io-provider }},file_processors=inline::pypdf" \
             tests/integration/vector_io
 
       - name: Check Storage and Memory Available After Tests
```

docs/docs/providers/file_processors/inline_pypdf.mdx (0 additions, 1 deletion)

```diff
@@ -16,7 +16,6 @@ PyPDF-based file processor for extracting text content from documents.
 |-------|------|----------|---------|-------------|
 | `default_chunk_size_tokens` | `int` | No | 800 | Default chunk size in tokens when chunking_strategy type is 'auto' |
 | `default_chunk_overlap_tokens` | `int` | No | 400 | Default chunk overlap in tokens when chunking_strategy type is 'auto' |
-| `max_file_size_bytes` | `int` | No | 104857600 | Maximum file size in bytes for uploaded files (default 100MB) |
 | `extract_metadata` | `bool` | No | True | Whether to extract PDF metadata (title, author, etc.) |
 | `clean_text` | `bool` | No | True | Whether to clean extracted text (remove extra whitespace, normalize line breaks) |
```

src/llama_stack/providers/inline/file_processor/pypdf/__init__.py (1 addition, 1 deletion)

```diff
@@ -17,7 +17,7 @@ async def get_provider_impl(config: PyPDFFileProcessorConfig, deps: dict[Api, An
 
     assert isinstance(config, PyPDFFileProcessorConfig), f"Unexpected config type: {type(config)}"
 
-    files_api = deps.get(Api.files)
+    files_api = deps[Api.files]
 
     impl = PyPDFFileProcessorAdapter(config, files_api)
     return impl
```
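The point of switching from `deps.get(Api.files)` to `deps[Api.files]` is that bracket access raises `KeyError` at provider construction time instead of handing `None` downstream. A minimal illustration (this `Api` enum is a stand-in for the real one):

```python
from enum import Enum


class Api(Enum):
    # Stand-in for llama_stack's Api enum; only two members needed here.
    files = "files"
    inference = "inference"


def resolve_files_api(deps: dict):
    # Bracket access fails fast at startup if the dependency is missing,
    # rather than passing `None` into the provider.
    return deps[Api.files]
```

With `deps.get(Api.files)` the factory would silently construct a provider holding `None`, deferring the failure to the first request that touches the Files API.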

src/llama_stack/providers/inline/file_processor/pypdf/adapter.py (1 addition, 1 deletion)

```diff
@@ -16,7 +16,7 @@
 class PyPDFFileProcessorAdapter:
     """Adapter for PyPDF file processor."""
 
-    def __init__(self, config: PyPDFFileProcessorConfig, files_api=None) -> None:
+    def __init__(self, config: PyPDFFileProcessorConfig, files_api) -> None:
         self.config = config
         self.files_api = files_api
         self.processor = PyPDFFileProcessor(config, files_api)
```

src/llama_stack/providers/inline/file_processor/pypdf/config.py (0 additions, 6 deletions)

```diff
@@ -28,12 +28,6 @@ class PyPDFFileProcessorConfig(BaseModel):
         description="Default chunk overlap in tokens when chunking_strategy type is 'auto'",
     )
 
-    max_file_size_bytes: int = Field(
-        default=100 * 1024 * 1024,
-        ge=1,
-        description="Maximum file size in bytes for uploaded files (default 100MB)",
-    )
-
     # PDF-specific options
     extract_metadata: bool = Field(default=True, description="Whether to extract PDF metadata (title, author, etc.)")
```

src/llama_stack/providers/inline/file_processor/pypdf/pypdf.py (73 additions, 31 deletions)

```diff
@@ -5,10 +5,12 @@
 # the root directory of this source tree.
 
 import io
+import mimetypes
 import time
 import uuid
 from typing import Any
 
+import chardet
 from fastapi import HTTPException, UploadFile
 from pypdf import PdfReader
 
@@ -33,7 +35,7 @@
 class PyPDFFileProcessor:
     """PyPDF-based file processor for PDF documents."""
 
-    def __init__(self, config: PyPDFFileProcessorConfig, files_api=None) -> None:
+    def __init__(self, config: PyPDFFileProcessorConfig, files_api) -> None:
         self.config = config
         self.files_api = files_api
 
@@ -44,72 +46,73 @@ async def process_file(
         options: dict[str, Any] | None = None,
         chunking_strategy: VectorStoreChunkingStrategy | None = None,
     ) -> ProcessFileResponse:
-        """Process a PDF file and return chunks."""
+        """Process a file and return chunks. Supports PDF and text-based files."""
 
-        # Validate input
         if not file and not file_id:
             raise ValueError("Either file or file_id must be provided")
         if file and file_id:
             raise ValueError("Cannot provide both file and file_id")
 
         start_time = time.time()
 
-        # Get PDF content
+        # Upload size limits are enforced by the router layer (upload_safety.py).
+        # The provider trusts that `file` has already been bounded-read and
+        # `file_id` references a file accepted by the Files API.
         if file:
-            # Read from uploaded file
             content = await file.read()
-            if len(content) > self.config.max_file_size_bytes:
-                raise ValueError(
-                    f"File size {len(content)} bytes exceeds maximum of {self.config.max_file_size_bytes} bytes"
-                )
             filename = file.filename or f"{uuid.uuid4()}.pdf"
         elif file_id:
-            # Get file from file storage using Files API
-            if not self.files_api:
-                raise ValueError("Files API not available - cannot process file_id")
-
-            # Get file metadata
             file_info = await self.files_api.openai_retrieve_file(RetrieveFileRequest(file_id=file_id))
             filename = file_info.filename
 
-            # Get file content
             content_response = await self.files_api.openai_retrieve_file_content(
                 RetrieveFileContentRequest(file_id=file_id)
             )
             content = content_response.body
 
+        mime_type, _ = mimetypes.guess_type(filename)
+        mime_category = mime_type.split("/")[0] if (mime_type and "/" in mime_type) else None
+
+        if mime_type == "application/pdf":
+            return self._process_pdf(content, filename, file_id, chunking_strategy, start_time)
+        elif mime_category == "text":
+            return self._process_text(content, filename, file_id, chunking_strategy, start_time)
+        else:
+            # Attempt text decoding as a fallback for unknown types
+            log.warning("Unknown mime type, attempting text extraction", mime_type=mime_type, filename=filename)
+            return self._process_text(content, filename, file_id, chunking_strategy, start_time)
+
+    def _process_pdf(
+        self,
+        content: bytes,
+        filename: str,
+        file_id: str | None,
+        chunking_strategy: VectorStoreChunkingStrategy | None,
+        start_time: float,
+    ) -> ProcessFileResponse:
+        """Process a PDF file."""
         pdf_bytes = io.BytesIO(content)
         reader = PdfReader(pdf_bytes)
 
         if reader.is_encrypted:
             raise HTTPException(status_code=422, detail="Password-protected PDFs are not supported")
 
-        # Extract text from PDF
         text_content, failed_pages = self._extract_pdf_text(reader)
 
-        # Clean text if configured
         if self.config.clean_text:
             text_content = self._clean_text(text_content)
 
-        # Extract metadata if configured
         pdf_metadata = {}
         if self.config.extract_metadata:
             pdf_metadata = self._extract_pdf_metadata(reader)
 
         document_id = str(uuid.uuid4())
-
-        # Prepare document metadata (include filename and file_id)
-        document_metadata = {
-            "filename": filename,
-            **pdf_metadata,
-        }
+        document_metadata: dict[str, Any] = {"filename": filename, **pdf_metadata}
         if file_id:
             document_metadata["file_id"] = file_id
 
         processing_time_ms = int((time.time() - start_time) * 1000)
-
-        # Create response metadata
-        response_metadata = {
+        response_metadata: dict[str, Any] = {
             "processor": "pypdf",
             "processing_time_ms": processing_time_ms,
             "page_count": pdf_metadata.get("page_count", 0),
@@ -120,13 +123,48 @@ async def process_file(
         if failed_pages:
             response_metadata["failed_pages"] = failed_pages
 
-        # Handle empty text - return empty chunks with metadata
         if not text_content or not text_content.strip():
             return ProcessFileResponse(chunks=[], metadata=response_metadata)
 
-        # Create chunks for non-empty text
         chunks = self._create_chunks(text_content, document_id, chunking_strategy, document_metadata)
+        return ProcessFileResponse(chunks=chunks, metadata=response_metadata)
+
+    def _process_text(
+        self,
+        content: bytes,
+        filename: str,
+        file_id: str | None,
+        chunking_strategy: VectorStoreChunkingStrategy | None,
+        start_time: float,
+    ) -> ProcessFileResponse:
+        """Process a text-based file (txt, csv, md, etc.)."""
+        detected = chardet.detect(content)
+        encoding = detected["encoding"] or "utf-8"
+        try:
+            text_content = content.decode(encoding)
+        except UnicodeDecodeError:
+            text_content = content.decode("utf-8", errors="replace")
 
+        if self.config.clean_text:
+            text_content = self._clean_text(text_content)
+
+        document_id = str(uuid.uuid4())
+        document_metadata: dict[str, Any] = {"filename": filename}
+        if file_id:
+            document_metadata["file_id"] = file_id
+
+        processing_time_ms = int((time.time() - start_time) * 1000)
+        response_metadata: dict[str, Any] = {
+            "processor": "text",
+            "processing_time_ms": processing_time_ms,
+            "extraction_method": "text",
+            "file_size_bytes": len(content),
+        }
+
+        if not text_content or not text_content.strip():
+            return ProcessFileResponse(chunks=[], metadata=response_metadata)
+
+        chunks = self._create_chunks(text_content, document_id, chunking_strategy, document_metadata)
         return ProcessFileResponse(chunks=chunks, metadata=response_metadata)
 
     def _extract_pdf_text(self, reader: PdfReader) -> tuple[str, list[str]]:
@@ -140,8 +178,9 @@ def _extract_pdf_text(self, reader: PdfReader) -> tuple[str, list[str]]:
             except Exception as e:
                 failed_pages.append(f"page {page_num + 1}: {e}")
                 continue
-            if page_text and page_text.strip():
-                text_parts.append(page_text)
+            if page_text:
+                if not self.config.clean_text or page_text.strip():
+                    text_parts.append(page_text)
 
         return "\n".join(text_parts), failed_pages
 
@@ -209,6 +248,9 @@ def _create_chunks(
         elif chunking_strategy.type == "static":
             chunk_size = chunking_strategy.static.max_chunk_size_tokens
             overlap_size = chunking_strategy.static.chunk_overlap_tokens
+        elif chunking_strategy.type == "contextual":
+            chunk_size = chunking_strategy.contextual.max_chunk_size_tokens
+            overlap_size = chunking_strategy.contextual.chunk_overlap_tokens
         else:
             chunk_size = self.config.default_chunk_size_tokens
             overlap_size = self.config.default_chunk_overlap_tokens
```
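The new dispatch in `process_file` above can be distilled to the following stdlib-only sketch (`classify` and `decode_text` are illustrative helper names, not names from the diff; the real code uses `chardet` to detect the encoding before falling back to UTF-8 with replacement characters):

```python
import mimetypes


def classify(filename: str) -> str:
    """Return 'pdf', 'text', or 'unknown' based on the guessed MIME type."""
    mime_type, _ = mimetypes.guess_type(filename)
    if mime_type == "application/pdf":
        return "pdf"
    if mime_type and mime_type.split("/")[0] == "text":
        return "text"
    return "unknown"


def decode_text(content: bytes, encoding: str = "utf-8") -> str:
    # Mirror the provider's fallback: try the given encoding first, then
    # decode as UTF-8 with replacement characters rather than failing.
    try:
        return content.decode(encoding)
    except UnicodeDecodeError:
        return content.decode("utf-8", errors="replace")
```

Unknown types fall through to text extraction (with a warning) rather than erroring, which matches the `else` branch in the diff.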

src/llama_stack/providers/inline/vector_io/faiss/faiss.py (1 addition, 1 deletion)

```diff
@@ -344,7 +344,7 @@ async def query_hybrid(
 
 
 class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
-    """Vector I/O adapter using FAISS for in-memory vector similarity search."""
+    """VectorIO adapter that uses FAISS for similarity search and vector storage."""
 
     def __init__(
         self,
```

src/llama_stack/providers/remote/vector_io/weaviate/weaviate.py (1 addition, 1 deletion)

```diff
@@ -290,7 +290,7 @@ async def query_hybrid(
 
 
 class WeaviateVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
-    """Vector I/O adapter for remote Weaviate instances."""
+    """VectorIO adapter that uses Weaviate for similarity search and vector storage."""
 
     def __init__(
         self,
```
