diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index eec8a0c..1c07a86 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -1,5 +1,5 @@ import { useState, useEffect, useRef } from 'react' -import { FileText, Upload, Loader2, Download, CheckCircle2, XCircle, FileDown, Github, Clock, Repeat2, Scissors, DownloadCloud } from 'lucide-react' +import { FileText, Upload, Loader2, Download, CheckCircle2, XCircle, FileDown, Github, Clock, Repeat2, Scissors, DownloadCloud, Database } from 'lucide-react' import { Button } from './components/ui/button' import { Card, CardContent, CardDescription, CardHeader, CardTitle } from './components/ui/card' import { Select } from './components/ui/select' @@ -11,6 +11,8 @@ import { RecentComparisons } from './components/RecentComparisons' import { ComparisonResults } from './components/ComparisonResults' import { ChunkerSelect } from './components/ChunkerSelect' import { ChunkResults } from './components/ChunkResults' +import { EmbeddingsPanel } from './components/EmbeddingsPanel' + function App() { const [libraries, setLibraries] = useState([]) @@ -948,6 +950,12 @@ function App() { Preview + {chunkingResult?.chunking?.chunks && ( + + + Embeddings + + )} @@ -963,6 +971,16 @@ function App() { )} /> + {chunkingResult?.chunking?.chunks && ( + +
+ +
+
+ )} )} diff --git a/frontend/src/components/EmbeddingsPanel.jsx b/frontend/src/components/EmbeddingsPanel.jsx new file mode 100644 index 0000000..d5020d7 --- /dev/null +++ b/frontend/src/components/EmbeddingsPanel.jsx @@ -0,0 +1,191 @@ +import React, { useState, useEffect } from 'react'; +import { Database, ShieldCheck, ShieldAlert, Cpu, Sparkles, Loader2 } from 'lucide-react'; +import { Button } from './ui/button'; +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from './ui/card'; +import { Select } from './ui/select'; +import { Badge } from './ui/badge'; +import { Alert, AlertDescription } from './ui/alert'; + +export function EmbeddingsPanel({ chunks, isLoading: parentLoading }) { + const [providers, setProviders] = useState([]); + const [selectedProvider, setSelectedProvider] = useState('auto'); + const [embeddings, setEmbeddings] = useState(null); + const [isLoading, setIsLoading] = useState(false); + const [error, setError] = useState(null); + + useEffect(() => { + fetchProviders(); + }, []); + + const fetchProviders = async () => { + try { + const response = await fetch('/embeddings/providers'); + const data = await response.json(); + const rawProviders = data.providers || []; + + const autoOpt = { + name: 'auto', + available: true, + configured: true, + message: 'Automatically pick best available provider' + }; + + setProviders([autoOpt, ...rawProviders]); + } catch (err) { + console.error('Failed to fetch embedding providers:', err); + } + }; + + const handleGenerateEmbeddings = async () => { + if (!chunks || chunks.length === 0) return; + + setIsLoading(true); + setError(null); + setEmbeddings(null); + + try { + // Get exact text content from chunks + const texts = chunks.map(c => c.text); + + const response = await fetch('/embeddings/embed', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + texts: texts, + model: selectedProvider + }) + }); + + const data = await response.json(); + + if (response.ok && data.success) { + setEmbeddings(data); + } else { + setError(data.detail || 'Embedding generation failed'); + } + } catch (err) { + setError('Failed to generate embeddings: ' + err.message); + } finally { + setIsLoading(false); + } + }; + + if (!chunks || chunks.length === 0) { + return null; + } + + return ( + + +
+
+ +
+ Vector Embeddings + Turn chunks into mathematical vectors for search +
+
+ + {chunks.length} Chunks + +
+
+ +
+
+
+ + +
+ +
+ + {error && ( + + + {error} + + )} + + {embeddings && ( +
+
+ + Successfully generated {embeddings.count} embeddings using {embeddings.model_used} +
+ +
+
+ Vector Preview (Showing first 5 dimensions) +
+
+ + + + + + + + + + {embeddings.embeddings.slice(0, 5).map((vec, i) => ( + + + + + + ))} + {embeddings.embeddings.length > 5 && ( + + + + )} + +
ChunkPreview (First 5 values)Dimensions
#{i + 1} + [{vec.slice(0, 5).map(v => v.toFixed(4)).join(', ')}...] + + {vec.length}d +
+ + {embeddings.embeddings.length - 5} more vectors +
+
+
+
+ )} +
+
+
+ ); +} diff --git a/main.py b/main.py index eb1e3f3..16a64a8 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,6 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request +from typing import List, Dict, Union, Optional + from fastapi.responses import HTMLResponse, JSONResponse, FileResponse, StreamingResponse from fastapi.staticfiles import StaticFiles from fastapi.middleware.cors import CORSMiddleware @@ -18,6 +20,8 @@ from services.queue_manager import QueueManager from services.results_manager import ResultsManager from services.chunker_factory import get_chunker_factory +from services.embeddings_factory import get_embeddings_factory + app = FastAPI(title="PDFStract - Unified PDF extraction wrapper", description="Convert PDF files to Markdown using various libraries") @@ -50,8 +54,10 @@ async def global_exception_handler(request: Request, exc: Exception): # Serve static files if they exist (built React app) static_dir = Path("static") if static_dir.exists(): + app.mount("/assets", StaticFiles(directory=str(static_dir / "assets")), name="assets") app.mount("/static", StaticFiles(directory="static"), name="static") + # Ensure upload directory exists UPLOAD_DIR = "uploads" os.makedirs(UPLOAD_DIR, exist_ok=True) @@ -69,6 +75,8 @@ def get_factory(): db_service = DatabaseService() queue_manager = QueueManager(db_service) results_manager = ResultsManager() +embeddings_factory = get_embeddings_factory() + @app.get("/") async def read_root(): @@ -604,6 +612,55 @@ async def convert_and_chunk( # ============================================================================ +# ============================================================================ +# EMBEDDINGS ENDPOINTS +# ============================================================================ + +@app.get("/embeddings/providers") +async def get_embedding_providers(): + """List available embedding providers and their status""" + try: + factory = get_embeddings_factory() + providers = factory.list_all_providers() + return {"providers": providers} + except Exception as e: + + logger.error(f"Error listing embedding providers: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + +from pydantic import BaseModel + +class EmbedRequest(BaseModel): + texts: List[str] + model: str = "auto" + +@app.post("/embeddings/embed") +async def generate_embeddings(request: EmbedRequest): + """ + Generate embeddings for a list of texts. + """ + try: + factory = get_embeddings_factory() + embeddings = await factory.embed_texts_async(request.model, request.texts) + return { + "success": True, + "model_used": request.model, + "embeddings": embeddings, + "count": len(embeddings) + } + except ValueError as e: + + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error(f"Embedding generation failed: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + +# ============================================================================ +# END EMBEDDINGS ENDPOINTS +# ============================================================================ + + + @app.delete("/compare/{task_id}") async def delete_comparison(task_id: str): """Delete comparison task and results""" diff --git a/pdfstract/api.py b/pdfstract/api.py index 40e8e50..a17195f 100644 --- a/pdfstract/api.py +++ b/pdfstract/api.py @@ -445,6 +445,67 @@ def get_converter_info(self, library: str) -> Optional[Dict]: all_libs = self.list_libraries() return next((lib for lib in all_libs if lib["name"] == library), None) + # ===== EMBEDDINGS API ===== + + def embed( + self, + texts: Union[str, List[str]], + model: str = "auto" + ) -> List[List[float]]: + """Generate embeddings for text(s) (synchronous) + + Args: + texts: A single string or a list of strings to embed + model: Name of the embedding model/provider to use + (e.g., 'openai', 'ollama', 'sentence-transformers') + Default: 'auto' (picks first available) + + Returns: + List of embedding vectors (each is a list of floats) + + Example: + >>> pdfstract = PDFStract() + >>> vectors = pdfstract.embed(["Hello world", "PDFStract is great"]) + >>> print(len(vectors[0])) # e.g., 1536 for OpenAI + """ + import asyncio + return asyncio.run(self.embed_async(texts, model)) + + async def embed_async( + self, + texts: Union[str, List[str]], + model: str = "auto" + ) -> List[List[float]]: + """Generate embeddings for text(s) asynchronously + + Args: + texts: A single string or a list of strings to embed + model: Name of the embedding model/provider to use + + Returns: + List of embedding vectors + """ + from services.embeddings_factory import get_embeddings_factory + + if isinstance(texts, str): + texts = [texts] + + if not texts: + return [] + + factory = get_embeddings_factory() + return await factory.embed_texts_async(model, texts) + + def list_available_embeddings(self) -> List[str]: + """List names of available embedding providers + + Returns: + List of provider names + """ + from services.embeddings_factory import get_embeddings_factory + return get_embeddings_factory().list_available_embeddings() + + async def convert_chunk_async( self, pdf_path: Union[str, Path], diff --git a/services/embeddings_factory.py b/services/embeddings_factory.py index 9943b88..6a91e61 100644 --- a/services/embeddings_factory.py +++ b/services/embeddings_factory.py @@ -65,6 +65,23 @@ def get_default_embedding(self) -> Optional[str]: return name return None + def list_all_providers(self) -> List[Dict]: + """List all providers with their availability and configuration status""" + result = [] + for name in self._provider_classes.keys(): + inst = self._load_provider(name) + available = inst.available if inst else False + ok, msg = inst.validate_credentials() if inst else (False, "Provider could not be loaded") + + result.append({ + "name": name, + "available": available, + "configured": ok, + "message": msg + }) + return result + + async def embed_texts_async(self, model: str, texts: List[str]) -> List[List[float]]: provider = self.get_embedding(model) if not provider: