diff --git a/.gitignore b/.gitignore index 1c558120..a550384a 100755 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,6 @@ test-code/ localtestmcp/ *.csv *.pickle + +# Personal dev notes (not tracked) +docs/dev/ diff --git a/.vscode/settings.json b/.vscode/settings.json index 3e1a508b..1fedac26 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -18,5 +18,10 @@ "**/*.egg-info/**": true, "**/build/**": true, "**/dist/**": true - } + }, + "accessibility.signals.terminalBell": { + "sound": "on", + "announcement": "auto" + }, + "cmake.sourceDirectory": "/Users/yichuan/Desktop/code/LEANN/leann/packages/leann-backend-hnsw" } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md new file mode 100644 index 00000000..f92cb738 --- /dev/null +++ b/docs/CHANGELOG.md @@ -0,0 +1,27 @@ +# Changelog + +All notable changes to LEANN are documented here. Append-only, newest entries at the bottom. + +Format: `## YYYY-MM-DD: ` followed by bullet points. + +## 2026-03-05: IVF backend incremental update support + +- Added `leann-backend-ivf` with FAISS IndexIVFFlat + DirectMap.Hashtable. +- IVF supports in-place `add_vectors` and `remove_ids` without full rebuild. +- `leann build` is now idempotent: re-running on an existing index does incremental update (add new, remove deleted, re-index modified files). +- Fixed incremental build chunking inconsistency and shared metadata dict bug. +- Fixed IVF incremental update duplicate chunks from stale `passages.jsonl`. + +## 2026-03-05: MCP server v2 — build, status, and structured search + +- Added `leann_build` MCP tool: build or incrementally update indexes directly from Claude Code. +- Added `leann_status` MCP tool: inspect index details (backend, embedding model, chunk/file count, size). +- `leann_search` now uses `--json` output with file paths always included, formatted as markdown code blocks. +- Fixed `float32` JSON serialization bug in `leann search --json`. +- Cleaned up MCP tool descriptions (concise, no emoji). 
+ +## 2026-03-05: Documentation — roadmap, vision, and dev guidelines + +- Rewrote `docs/roadmap.md` with current P0/P1 priorities from GitHub issue #237. +- Added `docs/ultimate_goal.md` — long-term vision (personal data platform, best code retrieval MCP, multimodal, local-first). +- Added self-contained documentation principle and dev doc maintenance rules to `CLAUDE.md`. diff --git a/docs/issue-proposals/smart-embedding-default.md b/docs/issue-proposals/smart-embedding-default.md new file mode 100644 index 00000000..41dffa5f --- /dev/null +++ b/docs/issue-proposals/smart-embedding-default.md @@ -0,0 +1,41 @@ +# Smart default embedding model based on platform and corpus size + +## Summary + +Propose platform- and corpus-aware default embedding model selection for `leann build` when `--embedding-model` is not explicitly specified. This would improve out-of-the-box experience for different deployment scenarios (macOS CPU, NVIDIA GPU, etc.) without changing behavior when users pass an explicit model. + +## Motivation + +- **Current default**: `facebook/contriever` (~420MB, 768 dim) — heavy for CPU-only builds on large corpora +- **macOS users** often hit slow builds on 20K+ chunks; lighter models like `all-MiniLM-L6-v2` (~90MB) are much faster +- **NVIDIA GPU users** can leverage stronger models; smaller corpora benefit from quality (e.g. Qwen3-Embedding-0.6B), larger ones from balanced models (e.g. bge-base-en-v1.5) + +## Proposed logic + +| Platform | Chunk count | Default model | +|----------|-------------|---------------| +| **macOS** | ≥ 20,000 | `sentence-transformers/all-MiniLM-L6-v2` | +| **macOS** | < 20,000 | `intfloat/e5-small-v2` | +| **NVIDIA GPU** | < 5,000 | `Qwen/Qwen3-Embedding-0.6B` | +| **NVIDIA GPU** | ≥ 5,000 | `BAAI/bge-base-en-v1.5` | +| **Other** | any | `facebook/contriever` (unchanged) | + +## Implementation notes + +1. **Platform detection**: `torch.cuda.is_available()` for NVIDIA; `sys.platform == "darwin"` for macOS +2. 
def _run_leann(*args, timeout=120):
    """Run a ``leann`` CLI command and return ``(returncode, stdout, stderr)``.

    Args:
        *args: Command-line arguments placed after the ``leann`` executable.
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    The command is passed as an argument list (no shell), so the arguments
    are never interpreted by a shell.
    """
    result = subprocess.run(
        ["leann", *args],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    return result.returncode, result.stdout, result.stderr


def _make_result(request_id, content_text):
    """Build a JSON-RPC success response carrying a single text content item."""
    return {
        "jsonrpc": "2.0",
        "id": request_id,
        "result": {"content": [{"type": "text", "text": content_text}]},
    }


def _make_error(request_id, message, code=-1):
    """Build a JSON-RPC error response.

    ``code`` defaults to ``-1`` (the value this module has always used) so
    existing callers are unaffected; the dispatcher passes the standard
    JSON-RPC codes for protocol-level failures.
    """
    return {
        "jsonrpc": "2.0",
        "id": request_id,
        "error": {"code": code, "message": message},
    }


# Tool registry returned verbatim by the "tools/list" method.
TOOLS = [
    {
        "name": "leann_search",
        "description": (
            "Semantic code search across an indexed codebase. Returns matching code "
            "chunks with file paths, scores, and surrounding context.\n\n"
            "Use this to find relevant code before making changes — understand existing "
            "patterns, locate implementations, and discover related files.\n\n"
            "Examples: 'authentication middleware', 'database connection pooling', "
            "'error handling in API routes', 'how are embeddings computed'"
        ),
        "inputSchema": {
            "type": "object",
            "properties": {
                "index_name": {
                    "type": "string",
                    "description": "Name of the LEANN index to search. Use leann_list to see available indexes.",
                },
                "query": {
                    "type": "string",
                    "description": "Natural language or technical search query.",
                },
                "top_k": {
                    "type": "integer",
                    "default": 5,
                    "minimum": 1,
                    "maximum": 20,
                    "description": "Number of results to return (default 5).",
                },
                "complexity": {
                    "type": "integer",
                    "default": 32,
                    "minimum": 16,
                    "maximum": 128,
                    "description": "Search precision level (default 32, use 64+ for thorough search).",
                },
            },
            "required": ["index_name", "query"],
        },
    },
    {
        "name": "leann_list",
        "description": "List all available LEANN indexes across projects. Shows index names, status, size, and location.",
        "inputSchema": {"type": "object", "properties": {}},
    },
    {
        "name": "leann_build",
        "description": (
            "Build or incrementally update a LEANN index for a codebase. "
            "If the index already exists, only new/modified/deleted files are processed "
            "(incremental update). Use this to keep the index current after code changes.\n\n"
            "Provide file paths or directories to index. For git repos, pass the output "
            "of 'git ls-files' as individual paths."
        ),
        "inputSchema": {
            "type": "object",
            "properties": {
                "index_name": {
                    "type": "string",
                    "description": "Name for the index (e.g., 'my-project'). Defaults to current directory name if omitted.",
                },
                "docs": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of file paths or directories to index.",
                },
                "backend_name": {
                    "type": "string",
                    "enum": ["hnsw", "ivf"],
                    "default": "ivf",
                    "description": "Index backend. 'ivf' supports incremental updates (recommended). 'hnsw' is faster for search but limited incremental support.",
                },
                "force": {
                    "type": "boolean",
                    "default": False,
                    "description": "Force full rebuild instead of incremental update.",
                },
            },
            "required": ["docs"],
        },
    },
    {
        "name": "leann_status",
        # The description lists only what handle_status actually reports.
        "description": (
            "Show detailed status of a LEANN index: backend type, embedding model, "
            "number of chunks, file count, and index size."
        ),
        "inputSchema": {
            "type": "object",
            "properties": {
                "index_name": {
                    "type": "string",
                    "description": "Name of the index to inspect.",
                },
            },
            "required": ["index_name"],
        },
    },
]


def handle_search(request_id, args):
    """Run ``leann search --json`` and format the hits as markdown.

    Args:
        request_id: JSON-RPC id echoed back in the response.
        args: Tool arguments; ``index_name`` and ``query`` are required,
            ``top_k`` and ``complexity`` default to 5 and 32.

    Returns:
        A JSON-RPC result whose text is either the formatted hits, a
        "no results" notice, the raw CLI output (when it is not a JSON list),
        or an error description.
    """
    index_name = args.get("index_name", "")
    query = args.get("query", "")
    if not index_name or not query:
        return _make_result(request_id, "Error: Both index_name and query are required.")

    top_k = args.get("top_k", 5)
    complexity = args.get("complexity", 32)

    rc, stdout, stderr = _run_leann(
        "search",
        index_name,
        query,
        f"--top-k={top_k}",
        f"--complexity={complexity}",
        "--json",
        "--show-metadata",
        "--non-interactive",
    )

    if rc != 0:
        return _make_result(request_id, f"Search failed: {stderr.strip()}")

    raw_fallback = stdout if stdout.strip() else f"Search failed: {stderr.strip()}"

    # Parse JSON results and format for code context.
    try:
        results = json.loads(stdout)
    except json.JSONDecodeError:
        # Fallback to raw output if --json isn't available.
        return _make_result(request_id, raw_fallback)

    if not results:
        return _make_result(request_id, f"No results found for '{query}'.")

    if not isinstance(results, list):
        # Valid JSON but not the expected list of hits: show it unmodified
        # rather than failing while iterating over it.
        return _make_result(request_id, raw_fallback)

    formatted = []
    for i, hit in enumerate(results, 1):
        if not isinstance(hit, dict):
            hit = {"text": str(hit)}
        # "metadata"/"text" may be present but null; treat that like absent.
        hit_meta = hit.get("metadata") or {}
        file_path = hit_meta.get("file_path") or hit_meta.get("source", "unknown")
        try:
            score = float(hit.get("score", 0))
        except (TypeError, ValueError):
            score = 0.0
        text = (hit.get("text") or "").strip()
        formatted.append(f"### Result {i} — {file_path} (score: {score:.3f})\n```\n{text}\n```")

    header = f"Found {len(results)} results for '{query}':\n"
    return _make_result(request_id, header + "\n\n".join(formatted))


def handle_list(request_id):
    """Return the output of ``leann list`` (or its stderr on failure)."""
    rc, stdout, stderr = _run_leann("list")
    if rc != 0:
        return _make_result(request_id, f"Error listing indexes: {stderr.strip()}")
    return _make_result(request_id, stdout)


def handle_build(request_id, args):
    """Run ``leann build`` for the given paths.

    Args:
        request_id: JSON-RPC id echoed back in the response.
        args: Tool arguments; ``docs`` is required (list of paths, a single
            string is accepted too), ``index_name``, ``backend_name``
            (default ``"ivf"``) and ``force`` are optional.

    The build is given a 600 second timeout; ``subprocess.TimeoutExpired``
    propagates to the caller.
    """
    docs = args.get("docs", [])
    if not docs:
        return _make_result(
            request_id, "Error: 'docs' parameter is required (list of file paths or directories)."
        )
    if isinstance(docs, str):
        # A bare string would otherwise be splatted into single characters.
        docs = [docs]

    cmd = ["build"]

    index_name = args.get("index_name")
    if index_name:
        cmd.append(index_name)

    cmd.extend(["--docs", *docs])

    backend = args.get("backend_name", "ivf")
    cmd.append(f"--backend-name={backend}")

    if args.get("force", False):
        cmd.append("--force")

    rc, stdout, stderr = _run_leann(*cmd, timeout=600)

    if rc != 0:
        return _make_result(request_id, f"Build failed:\n{stderr.strip()}\n{stdout.strip()}")

    return _make_result(request_id, stdout if stdout.strip() else "Build completed successfully.")


def handle_status(request_id, args):
    """Summarize an index stored under ``<cwd>/.leann/indexes/<index_name>``.

    Reads ``documents.leann.meta.json`` for backend/embedding settings and,
    when present, ``documents.leann.passages.jsonl`` to count chunks (one per
    non-empty line) and distinct source files. The reported size is the sum
    of the regular files directly inside the index directory.
    """
    index_name = args.get("index_name", "")
    if not index_name:
        return _make_result(request_id, "Error: index_name is required.")

    from pathlib import Path

    # Check standard location.
    leann_dir = Path.cwd() / ".leann" / "indexes" / index_name
    meta_path = leann_dir / "documents.leann.meta.json"
    passages_path = leann_dir / "documents.leann.passages.jsonl"

    if not meta_path.exists():
        return _make_result(request_id, f"Index '{index_name}' not found at {leann_dir}")

    try:
        with open(meta_path, encoding="utf-8") as f:
            index_meta = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        return _make_result(request_id, f"Error reading index metadata: {e}")
    if not isinstance(index_meta, dict):
        return _make_result(
            request_id, "Error reading index metadata: expected a JSON object."
        )

    # Count passages. Per-passage metadata is kept in its own variable so it
    # cannot overwrite the index-level metadata loaded above.
    num_chunks = 0
    file_paths = set()
    if passages_path.exists():
        with open(passages_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                num_chunks += 1
                try:
                    passage = json.loads(line)
                except json.JSONDecodeError:
                    # Best effort: the line still counts as a chunk.
                    continue
                if not isinstance(passage, dict):
                    continue
                passage_meta = passage.get("metadata") or {}
                if not isinstance(passage_meta, dict):
                    continue
                fp = passage_meta.get("file_path") or passage_meta.get("source", "")
                if fp:
                    file_paths.add(fp)

    # Calculate total index size.
    total_size = sum(
        entry.stat().st_size for entry in leann_dir.iterdir() if entry.is_file()
    )
    size_mb = total_size / (1024 * 1024)

    backend = index_meta.get("backend_name", "unknown")
    embedding_model = index_meta.get("embedding_model", "unknown")
    embedding_mode = index_meta.get("embedding_mode", "unknown")
    dimensions = index_meta.get("dimensions", "unknown")

    status_lines = [
        f"Index: {index_name}",
        f"Backend: {backend}",
        f"Embedding: {embedding_model} ({embedding_mode})",
        f"Dimensions: {dimensions}",
        f"Chunks: {num_chunks}",
        f"Files indexed: {len(file_paths)}",
        f"Size: {size_mb:.1f} MB",
        f"Location: {leann_dir}",
    ]

    return _make_result(request_id, "\n".join(status_lines))


def handle_request(request):
    """Dispatch one JSON-RPC request dict and return the response dict.

    Returns ``None`` when no response must be sent (notifications, i.e.
    messages without an ``id``). Unknown methods that do carry an ``id`` get
    a "method not found" error so the client is not left waiting.
    """
    method = request.get("method")
    request_id = request.get("id")

    if method == "initialize":
        return {
            "jsonrpc": "2.0",
            "id": request_id,
            "result": {
                "capabilities": {"tools": {}},
                "protocolVersion": "2024-11-05",
                "serverInfo": {"name": "leann-mcp", "version": "2.0.0"},
            },
        }

    if method == "notifications/initialized":
        return None

    if method == "tools/list":
        return {
            "jsonrpc": "2.0",
            "id": request_id,
            "result": {"tools": TOOLS},
        }

    if method == "tools/call":
        # Validate the envelope up front so a malformed call yields an error
        # response instead of an uncaught KeyError/AttributeError.
        params = request.get("params")
        if not isinstance(params, dict) or not isinstance(params.get("name"), str):
            return _make_error(
                request_id, "Invalid params: tools/call requires a tool 'name'.", code=-32602
            )
        tool_name = params["name"]
        args = params.get("arguments") or {}
        if not isinstance(args, dict):
            return _make_error(
                request_id, "Invalid params: 'arguments' must be an object.", code=-32602
            )

        try:
            if tool_name == "leann_search":
                return handle_search(request_id, args)
            elif tool_name == "leann_list":
                return handle_list(request_id)
            elif tool_name == "leann_build":
                return handle_build(request_id, args)
            elif tool_name == "leann_status":
                return handle_status(request_id, args)
            else:
                return _make_error(request_id, f"Unknown tool: {tool_name}")
        except subprocess.TimeoutExpired:
            return _make_result(request_id, "Error: Command timed out.")
        except Exception as e:
            # Top-level boundary: report any handler failure to the client
            # rather than letting it escape the dispatcher.
            return _make_error(request_id, str(e))

    if request_id is None:
        # Unrecognized notification: nothing to answer.
        return None
    return _make_error(request_id, f"Method not found: {method}", code=-32601)
b/packages/leann-mcp/README.md index 5e2055e1..8af45560 100644 --- a/packages/leann-mcp/README.md +++ b/packages/leann-mcp/README.md @@ -37,8 +37,10 @@ claude mcp list | cat Once connected, you'll have access to these powerful semantic search tools in Claude Code: +- **`leann_search`** - Semantic code search with file paths, scores, and context - **`leann_list`** - List all available indexes across your projects -- **`leann_search`** - Perform semantic searches across code and documents +- **`leann_build`** - Build or incrementally update an index (keeps it current as code changes) +- **`leann_status`** - Show index details: backend, embedding model, chunk count, file count, size ## 🎯 Quick Start Example