Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion frontend/src/App.jsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { useState, useEffect, useRef } from 'react'
import { FileText, Upload, Loader2, Download, CheckCircle2, XCircle, FileDown, Github, Clock, Repeat2, Scissors, DownloadCloud } from 'lucide-react'
import { FileText, Upload, Loader2, Download, CheckCircle2, XCircle, FileDown, Github, Clock, Repeat2, Scissors, DownloadCloud, Database } from 'lucide-react'
import { Button } from './components/ui/button'
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from './components/ui/card'
import { Select } from './components/ui/select'
Expand All @@ -11,6 +11,8 @@ import { RecentComparisons } from './components/RecentComparisons'
import { ComparisonResults } from './components/ComparisonResults'
import { ChunkerSelect } from './components/ChunkerSelect'
import { ChunkResults } from './components/ChunkResults'
import { EmbeddingsPanel } from './components/EmbeddingsPanel'


function App() {
const [libraries, setLibraries] = useState([])
Expand Down Expand Up @@ -948,6 +950,12 @@ function App() {
<TabsTrigger value="preview" className="flex-1 text-xs">
Preview
</TabsTrigger>
{chunkingResult?.chunking?.chunks && (
<TabsTrigger value="embeddings" className="flex-1 text-xs text-blue-600 dark:text-blue-400 font-semibold gap-1.5">
<Database className="w-3 h-3" />
Embeddings
</TabsTrigger>
)}
</TabsList>
</div>
<TabsContent value="source" className="flex-1 overflow-auto p-4 m-0">
Expand All @@ -963,6 +971,16 @@ function App() {
)}
/>
</TabsContent>
{chunkingResult?.chunking?.chunks && (
<TabsContent value="embeddings" className="flex-1 overflow-auto p-4 m-0">
<div className="max-w-2xl mx-auto py-2">
<EmbeddingsPanel
chunks={chunkingResult.chunking.chunks}
isLoading={isLoading}
/>
</div>
</TabsContent>
)}
</Tabs>
)}
</div>
Expand Down
191 changes: 191 additions & 0 deletions frontend/src/components/EmbeddingsPanel.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import React, { useState, useEffect } from 'react';
import { Database, ShieldCheck, ShieldAlert, Cpu, Sparkles, Loader2 } from 'lucide-react';
import { Button } from './ui/button';
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from './ui/card';
import { Select } from './ui/select';
import { Badge } from './ui/badge';
import { Alert, AlertDescription } from './ui/alert';

export function EmbeddingsPanel({ chunks, isLoading: parentLoading }) {
const [providers, setProviders] = useState([]);
const [selectedProvider, setSelectedProvider] = useState('auto');
const [embeddings, setEmbeddings] = useState(null);
const [isLoading, setIsLoading] = useState(false);
const [error, setError] = useState(null);

useEffect(() => {
fetchProviders();
}, []);

const fetchProviders = async () => {
try {
const response = await fetch('/embeddings/providers');
const data = await response.json();
const rawProviders = data.providers || [];

const autoOpt = {
name: 'auto',
available: true,
configured: true,
message: 'Automatically pick best available provider'
};

setProviders([autoOpt, ...rawProviders]);
} catch (err) {
console.error('Failed to fetch embedding providers:', err);
}
};

const handleGenerateEmbeddings = async () => {
if (!chunks || chunks.length === 0) return;

setIsLoading(true);
setError(null);
setEmbeddings(null);

try {
// Get exact text content from chunks
const texts = chunks.map(c => c.text);

const response = await fetch('/embeddings/embed', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
texts: texts,
model: selectedProvider
})
});

const data = await response.json();

if (response.ok && data.success) {
setEmbeddings(data);
} else {
setError(data.detail || 'Embedding generation failed');
}
} catch (err) {
setError('Failed to generate embeddings: ' + err.message);
} finally {
setIsLoading(false);
}
};

if (!chunks || chunks.length === 0) {
return null;
}

return (
<Card className="mt-6 border-blue-100 dark:border-blue-900 shadow-sm overflow-hidden bg-white dark:bg-slate-950">
<CardHeader className="bg-blue-50/50 dark:bg-blue-950/10 border-b border-blue-50 dark:border-blue-900/50">
<div className="flex items-center justify-between">
<div className="flex items-center gap-2">
<Database className="w-5 h-5 text-blue-600 dark:text-blue-400" />
<div>
<CardTitle className="text-base">Vector Embeddings</CardTitle>
<CardDescription className="text-xs">Turn chunks into mathematical vectors for search</CardDescription>
</div>
</div>
<Badge variant="outline" className="bg-blue-50 dark:bg-blue-900/20 text-blue-700 dark:text-blue-300 border-blue-200 dark:border-blue-800">
{chunks.length} Chunks
</Badge>
</div>
</CardHeader>
<CardContent className="p-6">
<div className="space-y-4">
<div className="grid grid-cols-1 md:grid-cols-2 gap-4 items-end">
<div className="space-y-2">
<label className="text-xs font-semibold text-slate-700 dark:text-slate-300 uppercase tracking-wider">
Embedding Provider
</label>
<Select
value={selectedProvider}
onChange={(e) => setSelectedProvider(e.target.value)}
disabled={isLoading || parentLoading}
className="w-full"
>
{providers.map((p) => (
<option key={p.name} value={p.name} disabled={!p.available}>
{p.name.charAt(0).toUpperCase() + p.name.slice(1)}
{!p.available ? ' (unavailable)' : (!p.configured ? ' (misconfigured)' : '')}
</option>
))}
</Select>
</div>
<Button
onClick={handleGenerateEmbeddings}
disabled={isLoading || parentLoading}
className="w-full bg-blue-600 hover:bg-blue-700 text-white shadow-md transition-all active:scale-[0.98]"
>
{isLoading ? (
<>
<Loader2 className="w-4 h-4 mr-2 animate-spin" />
Generating...
</>
) : (
<>
<Sparkles className="w-4 h-4 mr-2" />
Generate Embeddings
</>
)}
</Button>
</div>

{error && (
<Alert variant="destructive" className="bg-red-50 dark:bg-red-950/20 border-red-100 dark:border-red-900/50 text-red-800 dark:text-red-400">
<ShieldAlert className="w-4 h-4" />
<AlertDescription className="text-xs ml-2">{error}</AlertDescription>
</Alert>
)}

{embeddings && (
<div className="space-y-3 animate-in fade-in slide-in-from-top-2 duration-500">
<div className="flex items-center gap-2 p-3 rounded-lg bg-green-50 dark:bg-green-950/20 border border-green-100 dark:border-green-900/50 text-green-800 dark:text-green-400">
<ShieldCheck className="w-4 h-4" />
<span className="text-xs font-medium">Successfully generated {embeddings.count} embeddings using {embeddings.model_used}</span>
</div>

<div className="rounded-lg border border-slate-200 dark:border-slate-800 overflow-hidden">
<div className="bg-slate-50 dark:bg-slate-900/50 px-3 py-2 border-b border-slate-200 dark:border-slate-800">
<span className="text-[10px] font-bold text-slate-500 uppercase tracking-tighter">Vector Preview (Showing first 5 dimensions)</span>
</div>
<div className="p-0 overflow-x-auto">
<table className="w-full text-xs text-left">
<thead>
<tr className="bg-slate-50/50 dark:bg-slate-900/20 border-b border-slate-100 dark:border-slate-900">
<th className="px-3 py-2 font-semibold text-slate-600 dark:text-slate-400">Chunk</th>
<th className="px-3 py-2 font-semibold text-slate-600 dark:text-slate-400">Preview (First 5 values)</th>
<th className="px-3 py-2 font-semibold text-slate-600 dark:text-slate-400">Dimensions</th>
</tr>
</thead>
<tbody className="divide-y divide-slate-100 dark:divide-slate-900">
{embeddings.embeddings.slice(0, 5).map((vec, i) => (
<tr key={i} className="hover:bg-slate-50/30 dark:hover:bg-slate-900/10">
<td className="px-3 py-2 font-mono text-slate-500">#{i + 1}</td>
<td className="px-3 py-2 font-mono text-blue-600 dark:text-blue-400">
[{vec.slice(0, 5).map(v => v.toFixed(4)).join(', ')}...]
</td>
<td className="px-3 py-2">
<Badge variant="outline" className="text-[10px] h-4">{vec.length}d</Badge>
</td>
</tr>
))}
{embeddings.embeddings.length > 5 && (
<tr>
<td colSpan="3" className="px-3 py-2 text-center text-[10px] text-slate-400 italic bg-slate-50/30 dark:bg-slate-900/5">
+ {embeddings.embeddings.length - 5} more vectors
</td>
</tr>
)}
</tbody>
</table>
</div>
</div>
</div>
)}
</div>
</CardContent>
</Card>
);
}
57 changes: 57 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
from typing import List, Dict, Union, Optional

from fastapi.responses import HTMLResponse, JSONResponse, FileResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
Expand All @@ -18,6 +20,8 @@
from services.queue_manager import QueueManager
from services.results_manager import ResultsManager
from services.chunker_factory import get_chunker_factory
from services.embeddings_factory import get_embeddings_factory


app = FastAPI(title="PDFStract - Unified PDF extraction wrapper", description="Convert PDF files to Markdown using various libraries")

Expand Down Expand Up @@ -50,8 +54,10 @@ async def global_exception_handler(request: Request, exc: Exception):
# Serve static files if they exist (built React app)
static_dir = Path("static")
if static_dir.exists():
app.mount("/assets", StaticFiles(directory=str(static_dir / "assets")), name="assets")
app.mount("/static", StaticFiles(directory="static"), name="static")


# Ensure upload directory exists
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)
Expand All @@ -69,6 +75,8 @@ def get_factory():
db_service = DatabaseService()
queue_manager = QueueManager(db_service)
results_manager = ResultsManager()
embeddings_factory = get_embeddings_factory()


@app.get("/")
async def read_root():
Expand Down Expand Up @@ -604,6 +612,55 @@ async def convert_and_chunk(
# ============================================================================


# ============================================================================
# EMBEDDINGS ENDPOINTS
# ============================================================================

@app.get("/embeddings/providers")
async def get_embedding_providers():
    """List embedding providers and their status.

    Returns:
        ``{"providers": [...]}`` where the list is whatever the embeddings
        factory's ``list_all_providers()`` returns.

    Raises:
        HTTPException: 500 carrying the error text if the lookup fails.
    """
    try:
        factory = get_embeddings_factory()
        providers = factory.list_all_providers()
        return {"providers": providers}
    except Exception as e:
        # .exception() records the traceback as well as the message, so a 500
        # can be diagnosed from the logs; chaining keeps the original cause.
        logger.exception(f"Error listing embedding providers: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e

from pydantic import BaseModel

# Request body for POST /embeddings/embed.
class EmbedRequest(BaseModel):
    # Texts to embed; passed unchanged to the factory's embed_texts_async().
    texts: List[str]
    # Provider/model name forwarded to the factory; "auto" when omitted.
    model: str = "auto"

@app.post("/embeddings/embed")
async def generate_embeddings(request: EmbedRequest):
    """Generate embeddings for a list of texts.

    Args:
        request: Body with ``texts`` (strings to embed) and ``model``
            (provider/model name, ``"auto"`` by default).

    Returns:
        Dict with ``success``, ``model_used``, ``embeddings`` and ``count``
        (the number of vectors returned).

    Raises:
        HTTPException: 400 when the factory raises ``ValueError``;
            500 for any other failure.
    """
    try:
        factory = get_embeddings_factory()
        embeddings = await factory.embed_texts_async(request.model, request.texts)
        return {
            "success": True,
            # NOTE(review): this echoes the *requested* name, so it reads
            # "auto" rather than the provider that was actually picked.
            "model_used": request.model,
            "embeddings": embeddings,
            "count": len(embeddings)
        }
    except ValueError as e:
        # Treated as a client error (bad request), so no traceback is logged.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # .exception() keeps the traceback that a message-only log loses.
        logger.exception(f"Embedding generation failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e

# ============================================================================
# END EMBEDDINGS ENDPOINTS
# ============================================================================



@app.delete("/compare/{task_id}")
async def delete_comparison(task_id: str):
"""Delete comparison task and results"""
Expand Down
61 changes: 61 additions & 0 deletions pdfstract/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,67 @@ def get_converter_info(self, library: str) -> Optional[Dict]:
all_libs = self.list_libraries()
return next((lib for lib in all_libs if lib["name"] == library), None)

# ===== EMBEDDINGS API =====

def embed(
    self,
    texts: Union[str, List[str]],
    model: str = "auto"
) -> List[List[float]]:
    """Generate embeddings for text(s) (synchronous)

    Blocking wrapper around ``embed_async``. It also works when the calling
    thread already has a running event loop (e.g. a Jupyter notebook or an
    async request handler), where a plain ``asyncio.run`` would raise
    ``RuntimeError``.

    Args:
        texts: A single string or a list of strings to embed
        model: Name of the embedding model/provider to use
            (e.g., 'openai', 'ollama', 'sentence-transformers')
            Default: 'auto' (picks first available)

    Returns:
        List of embedding vectors (each is a list of floats)

    Raises:
        Whatever ``embed_async`` raises; exceptions are propagated unchanged.

    Example:
        >>> pdfstract = PDFStract()
        >>> vectors = pdfstract.embed(["Hello world", "PDFStract is great"])
        >>> print(len(vectors[0]))  # e.g., 1536 for OpenAI
    """
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: the simple path is safe.
        return asyncio.run(self.embed_async(texts, model))

    # A loop is already running here, and asyncio.run() refuses to nest.
    # Run the coroutine on a fresh loop in a worker thread and wait for it.
    # The lambda creates the coroutine inside that thread.
    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(
            lambda: asyncio.run(self.embed_async(texts, model))
        )
        return future.result()

async def embed_async(
    self,
    texts: Union[str, List[str]],
    model: str = "auto"
) -> List[List[float]]:
    """Asynchronously turn one or more texts into embedding vectors.

    Args:
        texts: Either one string or a list of strings. A lone string is
            handled as a one-item list.
        model: Embedding model/provider name handed to the factory.

    Returns:
        The embedding vectors produced by the factory, or ``[]`` when
        there is nothing to embed.
    """
    from services.embeddings_factory import get_embeddings_factory

    batch = [texts] if isinstance(texts, str) else texts
    if not batch:
        return []

    return await get_embeddings_factory().embed_texts_async(model, batch)

def list_available_embeddings(self) -> List[str]:
    """Return the names of the embedding providers that can be used.

    Returns:
        Provider names, as reported by the embeddings factory.
    """
    from services.embeddings_factory import get_embeddings_factory

    factory = get_embeddings_factory()
    return factory.list_available_embeddings()


async def convert_chunk_async(
self,
pdf_path: Union[str, Path],
Expand Down
Loading