diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml index a14000ec..c8858536 100644 --- a/.github/workflows/build-reusable.yml +++ b/.github/workflows/build-reusable.yml @@ -659,14 +659,36 @@ jobs: run: | source .venv/bin/activate || source .venv/Scripts/activate python - <<'PY' + import numpy as np import leann import leann_backend_hnsw as h import leann_backend_diskann as d import leann_backend_ivf as ivf from leann import LeannBuilder, LeannSearcher - b = LeannBuilder(backend_name="hnsw") + + b = LeannBuilder( + backend_name="hnsw", + dimensions=2, + is_compact=False, + is_recompute=False, + ) b.add_text("hello arch") - b.build_index("arch_demo.leann") - s = LeannSearcher("arch_demo.leann") - print("search:", s.search("hello", top_k=1)) + b.build_index_from_arrays( + "arch_demo.leann", + ["0"], + np.asarray([[1.0, 0.0]], dtype=np.float32), + ) + + with LeannSearcher( + "arch_demo.leann", + recompute_embeddings=False, + enable_warmup=False, + ) as s: + s.backend_impl.compute_query_embedding = lambda *args, **kwargs: np.asarray( + [[1.0, 0.0]], dtype=np.float32 + ) + result = s.search("hello", top_k=1) + + assert result and result[0].text == "hello arch" + print("arch smoke ok") PY diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml new file mode 100644 index 00000000..ede0dedf --- /dev/null +++ b/.github/workflows/docs-build.yml @@ -0,0 +1,28 @@ +name: Docs Build + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +jobs: + mkdocs: + name: MkDocs Strict Build + runs-on: ubuntu-22.04 + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install documentation dependencies + run: python -m pip install -r docs/requirements.txt + + - name: Build documentation + run: python -m mkdocs build --strict --config-file mkdocs.yml diff --git 
a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..127ad87b --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,14 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +mkdocs: + configuration: mkdocs.yml + fail_on_warning: true + +python: + install: + - requirements: docs/requirements.txt diff --git a/README.md b/README.md index 1f60dbff..1394459f 100755 --- a/README.md +++ b/README.md @@ -1110,7 +1110,10 @@ leann ask my-docs "Where are prompts configured?" # Detect file changes since last build/watch checkpoint leann watch my-docs -# List all your indexes +# Keep an index current as files are saved +leann watch my-docs --live + +# List discoverable indexes leann list # Remove an index @@ -1152,23 +1155,27 @@ leann search INDEX_NAME QUERY [OPTIONS] Options: --top-k N Number of results (default: 5) --complexity N Search complexity (default: 64) + --max-depth N Maximum app-index discovery depth (default: 4) --recompute / --no-recompute Enable/disable embedding recomputation (default: enabled). Should not do a `no-recompute` search in a `recompute` build. --pruning-strategy {global,local,proportional} ``` **Watch Command:** ```bash -leann watch INDEX_NAME +leann watch INDEX_NAME [OPTIONS] # Compares the current file system state against the last checkpoint (Merkle tree snapshot) -# and reports which files have been added, removed, or modified, along with their chunk IDs. +# and updates the index when files have been added, removed, or modified. 
# -# - Automatically saves a new checkpoint after detecting changes +# - Use --once --dry-run to report changes without updating the index +# - Use --live to trigger updates from filesystem events instead of waiting for the polling interval +# - --debounce-ms controls live event batching (default: 500) +# - Automatically saves a new checkpoint after successful index updates # - Each subsequent run compares against the most recent checkpoint # - File change detection uses SHA-256 content hashing via a Merkle tree # # Example output: -# === Changes since last checkpoint === +# === Changes detected === # modified (1): # - /path/to/file.py # chunks: 42, 43, 44 @@ -1187,13 +1194,22 @@ Options: **List Command:** ```bash -leann list +leann list [OPTIONS] + +Options: + --max-depth N Maximum app-index discovery depth (default: 4) -# Lists all indexes across all projects with status indicators: +# Lists discoverable indexes from the current project and registered project roots: # ✅ - Index is complete and ready to use # ❌ - Index is incomplete or corrupted # 📁 - CLI-created index (in .leann/indexes/) # 📄 - App-created index (*.leann.meta.json files) +# +# App-created index discovery is bounded and skips heavyweight directories +# such as .git, node_modules, virtualenvs, build/cache folders, and directory symlinks. +# +# Commands that resolve app-created indexes by name also accept --max-depth, +# including search, watch, rebuild, migrate-ids, warmup, daemon, and react. 
``` **Remove Command:** @@ -1201,10 +1217,11 @@ leann list leann remove INDEX_NAME [OPTIONS] Options: - --force, -f Force removal without confirmation + --force, -f Force removal without confirmation + --max-depth N Maximum app-index discovery depth (default: 4) # Smart removal: automatically finds and safely removes indexes -# - Shows all matching indexes across projects +# - Shows all matching indexes found by bounded discovery across registered projects # - Requires confirmation for cross-project removal # - Interactive selection when multiple matches found # - Supports both CLI and app-created indexes @@ -1272,6 +1289,9 @@ results = searcher.search("banana‑crocodile", use_grep=True, top_k=1) ## Benchmarks +For the current benchmark inventory and reporting template, see the +[Benchmark Guide](docs/benchmarks.md). + **[DiskANN vs HNSW Performance Comparison →](benchmarks/diskann_vs_hnsw_speed_comparison.py)** - Compare search performance between both backends **[Simple Example: Compare LEANN vs FAISS →](benchmarks/compare_faiss_vs_leann.py)** - See storage savings in action diff --git a/benchmarks/README.md b/benchmarks/README.md index 258f260c..3cd0c210 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,135 +1,131 @@ -# 🧪 LEANN Benchmarks & Testing +# LEANN Benchmarks -This directory contains performance benchmarks and comprehensive tests for the LEANN system, including backend comparisons and sanity checks across different configurations. +This directory contains benchmark scripts and data-dependent evaluation suites for LEANN. The +canonical benchmark inventory now lives in [docs/benchmarks.md](../docs/benchmarks.md). 
-## 📁 Test Files +## Quick Starts -### `diskann_vs_hnsw_speed_comparison.py` -Performance comparison between DiskANN and HNSW backends: -- ✅ **Search latency** comparison with both backends using recompute -- ✅ **Index size** and **build time** measurements -- ✅ **Score validity** testing (ensures no -inf scores) -- ✅ **Configurable dataset sizes** for different scales +Run a small backend comparison: ```bash -# Quick comparison with 500 docs, 10 queries -python benchmarks/diskann_vs_hnsw_speed_comparison.py - -# Large-scale comparison with 2000 docs, 20 queries -python benchmarks/diskann_vs_hnsw_speed_comparison.py 2000 20 -``` - -### `test_distance_functions.py` -Tests all supported distance functions across DiskANN backend: -- ✅ **MIPS** (Maximum Inner Product Search) -- ✅ **L2** (Euclidean Distance) -- ✅ **Cosine** (Cosine Similarity) - -```bash -uv run python tests/sanity_checks/test_distance_functions.py +uv run python benchmarks/diskann_vs_hnsw_speed_comparison.py ``` -### `test_l2_verification.py` -Specifically verifies that L2 distance is correctly implemented by: -- Building indices with L2 vs Cosine metrics -- Comparing search results and score ranges -- Validating that different metrics produce expected score patterns +Compare LEANN storage usage against a FAISS baseline: ```bash -uv run python tests/sanity_checks/test_l2_verification.py +uv run python benchmarks/compare_faiss_vs_leann.py ``` -### `test_sanity_check.py` -Comprehensive end-to-end verification including: -- Distance function testing -- Embedding model compatibility -- Search result correctness validation -- Backend integration testing +Run the synthetic, model-free code-context expansion benchmark: ```bash -uv run python tests/sanity_checks/test_sanity_check.py +uv run python benchmarks/benchmark_code_context.py \ + --modules 1000 \ + --warmups 1 \ + --repeats 5 \ + --data-source synthetic-code-context \ + --data-revision 2026-06-03-fixture \ + --format markdown \ + 
--json-output benchmark-results/code-context.json \ + --markdown-output benchmark-results/code-context.md ``` -## 🎯 What These Tests Verify +This benchmark fixture exercises direct related-code matches, direct immediate-base inherited +calls, and bounded one-hop exact-call expansion. Its JSON and Markdown artifacts report data +source/revision, local environment provenance, direct related counts, inherited related counts, +one-hop related counts, expected inherited/one-hop coverage, unique related passages, +cap/truncation accounting, command provenance, and timing summaries. -### ✅ Distance Function Support -- All three distance metrics (MIPS, L2, Cosine) work correctly -- Score ranges are appropriate for each metric type -- Different metrics can produce different rankings (as expected) +The curated documentation report for this benchmark lives in +[docs/code_context_benchmark.md](../docs/code_context_benchmark.md). -### ✅ Backend Integration -- DiskANN backend properly initializes and builds indices -- Graph construction completes without errors -- Search operations return valid results +Run the main retrieval evaluation driver: -### ✅ Embedding Pipeline -- Real-time embedding computation works -- Multiple embedding models are supported -- ZMQ server communication functions correctly - -### ✅ End-to-End Functionality -- Index building → searching → result retrieval pipeline -- Metadata preservation through the entire flow -- Error handling and graceful degradation - -## 🔍 Expected Output - -When all tests pass, you should see: - -``` -📊 测试结果总结: - mips : ✅ 通过 - l2 : ✅ 通过 - cosine : ✅ 通过 - -🎉 测试完成! +```bash +uv run benchmarks/run_evaluation.py ``` -## 🐛 Troubleshooting +This command treats an incomplete `benchmarks/data/` tree, including a README-only checkout or an +`indices/` tree without any `.index` artifact, as missing benchmark data and downloads the public +data pack. 
The repository-provided prebuilt indexes are large; check local disk capacity before +relying on the automatic download path. -### Common Issues +For larger retrieval runs after data has been downloaded: -**Import Errors**: Ensure you're running from the project root: ```bash -cd /path/to/leann -uv run python tests/sanity_checks/test_distance_functions.py +uv run benchmarks/run_evaluation.py benchmarks/data/indices/rpj_wiki/rpj_wiki \ + --dataset rpj_wiki \ + --data-source LEANN-RAG/leann-rag-evaluation-data \ + --data-revision 2026-06-02-download \ + --num-queries 2000 \ + --top-k 3 \ + --complexity 120 \ + --format markdown \ + --json-output benchmark-results/retrieval-rpj-wiki.json \ + --markdown-output benchmark-results/retrieval-rpj-wiki.md ``` -**Memory Issues**: Reduce graph complexity for resource-constrained systems: -```python -builder = LeannBuilder( - backend_name="diskann", - graph_degree=8, # Reduced from 16 - complexity=16 # Reduced from 32 -) -``` +Retrieval evaluation is search-only by default. Use `--run-llm` only when generation should be +included as opt-in metadata outside retrieval latency. Retrieval artifacts include per-query result +IDs, golden IDs, and duplicate-text counters so reviewers can audit text-overlap recall when passage +ID schemes differ across indexes. CLI-generated artifacts also record SHA256 hashes for the query +and ground-truth files plus shell-quoted script command arguments. -**ZMQ Port Conflicts**: The tests use different ports to avoid conflicts, but you may need to kill existing processes: -```bash -pkill -f "embedding_server" -``` +Retrieval, query-log, and code-context artifacts share the same timing-statistics helper. p95 is the +lower nearest-rank observed sample from sorted timings, so it is deterministic and comparable across +reviewable benchmark reports. The standalone BM25 and DiskANN baseline scripts use the same +observed-sample percentile helpers for their latency report fields. 
-## 📊 Performance Expectations +Run one retrieval manifest across multiple prebuilt backend indexes: -### Typical Timing (3 documents, consumer hardware): -- **Index Building**: 2-5 seconds per distance function -- **Search Query**: 50-200ms -- **Recompute Mode**: 5-15 seconds (higher accuracy) - -### Memory Usage: -- **Index Storage**: ~1-2 MB per distance function -- **Runtime Memory**: ~500MB (including model loading) +```bash +uv run python benchmarks/compare_retrieval_backends.py benchmark-results/retrieval-comparison.json \ + --format markdown \ + --json-output benchmark-results/retrieval-comparison-summary.json \ + --markdown-output benchmark-results/retrieval-comparison-summary.md +``` -## 🔗 Integration with CI/CD +The comparison manifest keeps dataset, query file, ground truth, `top_k`, complexity, and batch +size identical across runs. The combined JSON/Markdown artifacts report per-run backend, +passage-ID scheme, embedding model/mode, recall@k, hit rate@k, latency, storage bytes, query file +hash, ground-truth file hash, shell-quoted script command arguments, missing result-ID counts, and +duplicate result/golden text counters. +See [docs/benchmarks.md](../docs/benchmarks.md) for the manifest fields and a complete example. 
-These tests are designed to be run in automated environments: +Summarize query logs with optional ground truth and storage accounting: -```yaml -# GitHub Actions example -- name: Run Sanity Checks - run: | - uv run python tests/sanity_checks/test_distance_functions.py - uv run python tests/sanity_checks/test_l2_verification.py +```bash +uv run python benchmarks/summarize_query_log.py queries.jsonl \ + --ground-truth ground_truth.json \ + --index-path .leann/indexes/my-index/documents.leann \ + --data-source my-dataset \ + --data-revision 2026-06-02-download \ + --format markdown \ + --json-output benchmark-results/my-index-summary.json \ + --markdown-output benchmark-results/my-index-summary.md ``` -The tests are deterministic and should produce consistent results across different platforms. +`--format` controls stdout. Use `--json-output` and `--markdown-output` to write durable review +artifacts in the same run. Query-log summaries include latency mean/median/p95/min/max when +available plus counters for records with missing result IDs, total missing result IDs, missing +latency, missing search mode, missing backend metadata, and shell-quoted script command arguments. + +The `benchmark-results/` directory is ignored by git and is the default scratch location for local +summary artifacts. Commit only curated reports with full dataset, hardware, and command context. + +## Suites + +- `bm25_diskann_baselines/`: Natural Questions BM25 and DiskANN search-only baselines that require + externally synced artifacts, report latency with the shared observed-sample percentile helpers, + and can write JSON reports with query hashes, timing scope, recursive storage bytes, settings, + command arguments, and environment provenance. +- `contextbench/`: trace-driven code-assistant benchmark tooling. +- `enron_emails/`: retrieval and generation evaluation on the Enron email corpus. +- `financebench/`: financial document question-answering benchmark. 
+- `laion/`: multimodal image/text retrieval benchmark. +- `update/`: update-latency and sequential-vs-offline update strategy benchmarks. + +Benchmark result files committed under subdirectories are reference outputs from their documented +settings and hardware. Do not treat them as automatically refreshed results for the current branch +unless the relevant command has been rerun. diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 7fd0da58..32c38f93 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -209,6 +209,30 @@ When adding new features or making significant changes: 3. Update README.md if needed 4. Include usage examples +### Documentation Build + +The hosted documentation is built from `mkdocs.yml` with the dependencies in +`docs/requirements.txt`. Before changing docs navigation, ReadTheDocs configuration, or benchmark +documentation, run the same strict build used by CI: + +```bash +python3 -m venv .venv +. .venv/bin/activate +python -m pip install -r docs/requirements.txt +python -m mkdocs build --strict --config-file mkdocs.yml +``` + +For local preview: + +```bash +python -m mkdocs serve --config-file mkdocs.yml +``` + +The build writes a local `site/` directory. Treat it as generated output and do not commit it. If +you add a new documentation page, add it to the `nav` section in `mkdocs.yml`. If you add a MkDocs +plugin, theme, or extension, add the dependency to `docs/requirements.txt` so local builds, +ReadTheDocs, and CI use the same docs dependency set. 
+ ## 🤔 Getting Help - **Discord**: Join our community for discussions diff --git a/docs/ast_chunking_guide.md b/docs/ast_chunking_guide.md index 34d7ccba..4580eb52 100644 --- a/docs/ast_chunking_guide.md +++ b/docs/ast_chunking_guide.md @@ -135,7 +135,7 @@ python -m apps.document_rag --enable-code-chunking --chunk-size 256 --ast-chunk- ## References - [astchunk GitHub Repository](https://github.com/yilinjz/astchunk) -- [LEANN MCP Integration](../packages/leann-mcp/README.md) +- [LEANN MCP Integration](https://github.com/yichuan-w/LEANN/blob/main/packages/leann-mcp/README.md) - [Research Paper](https://arxiv.org/html/2506.15655v1) --- diff --git a/docs/benchmarks.md b/docs/benchmarks.md new file mode 100644 index 00000000..bb4f0cc5 --- /dev/null +++ b/docs/benchmarks.md @@ -0,0 +1,252 @@ +# Benchmarks + +This page inventories the benchmark entry points currently available in this repository. It is a +documentation scaffold for the P0 benchmark roadmap item; it does not claim that every suite is +fully automated in CI or that historical numbers have been freshly reproduced on the current +branch. + +## Quick Local Checks + +These scripts are the best first stop when checking backend behavior on a developer machine. + +| Area | Entry point | What it measures | Notes | +| --- | --- | --- | --- | +| Backend latency and size | `benchmarks/diskann_vs_hnsw_speed_comparison.py` | DiskANN vs HNSW search latency, build time, and index size | Accepts optional document and query counts. | +| Storage comparison | `benchmarks/compare_faiss_vs_leann.py` | LEANN storage savings against a FAISS baseline | Good for a quick sanity check of the storage story. | +| Embedding throughput | `benchmarks/benchmark_embeddings.py` | Embedding provider throughput | Depends on the configured local/provider model. | +| No-recompute baseline | `benchmarks/benchmark_no_recompute.py` | Search behavior when stored embeddings are available | Useful when isolating recompute overhead. 
| +| Code-context expansion | `benchmarks/benchmark_code_context.py` | Read-time related-code index build and expansion latency | Synthetic, model-free, and covers direct, direct inherited, and one-hop exact-call expansion. | +| Retrieval backend comparison | `benchmarks/compare_retrieval_backends.py` | Recall, hit rate, latency, and storage for multiple prebuilt indexes under one manifest | Evaluation-only; does not build indexes. | + +Example: + +```bash +uv run python benchmarks/diskann_vs_hnsw_speed_comparison.py +uv run python benchmarks/compare_faiss_vs_leann.py +uv run python benchmarks/benchmark_code_context.py --modules 1000 --format markdown +``` + +For a reviewable code-context artifact: + +```bash +uv run python benchmarks/benchmark_code_context.py \ + --modules 1000 \ + --warmups 1 \ + --repeats 5 \ + --data-source synthetic-code-context \ + --data-revision 2026-06-03-fixture \ + --format markdown \ + --json-output benchmark-results/code-context.json \ + --markdown-output benchmark-results/code-context.md +``` + +The code-context benchmark creates synthetic Python modules where each seed passage resolves a +shared import target, a same-module helper, a direct immediate-base inherited method, and the +helper's one-hop exact call target. Its JSON and Markdown outputs include data source/revision, +local environment provenance, direct related-item counts, inherited related-item counts, one-hop +related-item counts, expected inherited/one-hop coverage, unique related passages, +cap/truncation accounting, and timing summaries. + +## Curated Reports + +Curated benchmark reports should be committed only when they include the full command, data source, +revision, environment, and result context needed for review. Generated artifacts under +`benchmark-results/` should remain local unless they are intentionally promoted into a documented +report. 
+ +## P0 Benchmark Acceptance Checklist + +The P0 benchmark roadmap item is not complete until at least one reviewable benchmark pack includes +real retrieval results, not just synthetic helper metrics. A benchmark pack should contain: + +- A retrieval evaluation artifact from `benchmarks/run_evaluation.py` with JSON and Markdown + outputs. +- A query-log summary from `benchmarks/summarize_query_log.py` when replay logs are used for + comparison. +- Dataset name, corpus size, query count, ground-truth source, benchmark data source, and exact + commands. For repository-provided retrieval data, record the Hugging Face dataset + `LEANN-RAG/leann-rag-evaluation-data` and the downloaded revision or download date. +- Backend and index settings, including backend name, compact/recompute mode, `top_k`, + `complexity`, BM25/vector weighting, and passage ID scheme. +- Embedding model, embedding mode, provider options that affect embeddings, and whether embeddings + were recomputed during search. +- Hardware, operating system, Python version, LEANN commit SHA, and whether GPU acceleration was + used. +- Recall@k or hit rate@k where ground truth exists, latency distribution, and local index storage + bytes. + +Generated JSON/Markdown under `benchmark-results/` stays local or is attached to CI/PR artifacts. +Commit only curated reports that include the full context above and call out limitations clearly. + +A minimal first P0 pack should compare the main supported retrieval path against at least one +baseline on a real dataset with ground truth. A broader pack should cover HNSW and IVF, include +BM25/hybrid settings when relevant, and separate retrieval latency from any optional LLM generation. 
+Use the single-index retrieval and query-log artifact tools for the first reports, but do not treat +manual one-off runs as a complete backend comparison; a reviewable cross-backend pack should keep +dataset, query slice, ground truth, `top_k`, complexity, embedding model, and output schema +identical across rows. + +## Retrieval Evaluation + +`benchmarks/run_evaluation.py` is the main retrieval evaluation driver. It can download evaluation +data when the local `benchmarks/data/` tree is missing required queries, ground truth, or at least +one real `.index` artifact under `indices/`, and it can reuse a previously downloaded index path +for larger runs. The repository-provided prebuilt indexes are large, so check local disk capacity +before relying on the automatic download path; use `--queries-file`, `--ground-truth`, and a +prebuilt or locally built index path when running a smaller custom pack. Evaluation is +retrieval-only by default; use `--run-llm` only when generation should be measured separately from +retrieval latency. + +```bash +uv run benchmarks/run_evaluation.py +uv run benchmarks/run_evaluation.py benchmarks/data/indices/rpj_wiki/rpj_wiki \ + --dataset rpj_wiki \ + --data-source LEANN-RAG/leann-rag-evaluation-data \ + --data-revision 2026-06-02-download \ + --num-queries 2000 \ + --top-k 3 \ + --complexity 120 \ + --format markdown \ + --json-output benchmark-results/retrieval-rpj-wiki.json \ + --markdown-output benchmark-results/retrieval-rpj-wiki.md +``` + +Use this path for recall-oriented comparisons. 
The JSON artifact records the dataset, data source +and revision, index path, backend, embedding model/mode, query count, `top_k`, complexity, +hardware/platform, local LEANN commit, branch, dirty-worktree flag, query-file SHA256 and +ground-truth-file SHA256 when file paths are available, shell-quoted script command arguments, +storage bytes, whether embeddings were recomputed, per-query result IDs, per-query golden IDs, +missing result-ID counts, and duplicate result/golden text counters. Recall is text-overlap based +for compatibility with indexes that use different passage ID schemes; the duplicate-text counters +are included so reviewers can identify queries where text-set matching may collapse distinct +passages. For custom datasets, pass +`--queries-file`, `--ground-truth`, `--dataset`, `--data-source`, and `--data-revision` explicitly +so the artifact does not depend on path-name inference. + +## Retrieval Backend Comparisons + +`benchmarks/compare_retrieval_backends.py` runs the single-index retrieval evaluator across +multiple prebuilt index paths from one JSON manifest. It is evaluation-only: build or download the +indexes first, then use the manifest to prove that every row uses the same dataset, query slice, +ground truth, `top_k`, complexity, and batch size. 
+ +Example manifest: + +```json +{ + "dataset": "rpj_wiki", + "data_source": "LEANN-RAG/leann-rag-evaluation-data", + "data_revision": "2026-06-02-download", + "queries_file": "benchmarks/data/queries/nq_open.jsonl", + "ground_truth_file": "benchmarks/data/ground_truth/rpj_wiki/flat_results_nq_k3.json", + "num_queries": 2000, + "top_k": 3, + "complexity": 120, + "batch_size": 0, + "runs": [ + { + "name": "hnsw", + "backend": "hnsw", + "index_path": "benchmarks/data/indices/rpj_wiki/rpj_wiki_hnsw" + }, + { + "name": "ivf", + "backend": "ivf", + "index_path": "benchmarks/data/indices/rpj_wiki/rpj_wiki_ivf" + } + ] +} +``` + +Run it with: + +```bash +uv run python benchmarks/compare_retrieval_backends.py benchmark-results/retrieval-comparison.json \ + --format markdown \ + --json-output benchmark-results/retrieval-comparison-summary.json \ + --markdown-output benchmark-results/retrieval-comparison-summary.md +``` + +The combined artifact includes one normalized row per run with backend, passage ID scheme, +embedding model/mode, recall@k, hit rate@k, evaluated query count, missing ground-truth and golden +passage counts, missing result-ID counts, duplicate result/golden text counters, latency +mean/median/p95, storage bytes, local LEANN commit, dirty-worktree flag, and SHA256 hashes for the +query and ground-truth files. CLI-generated comparison artifacts also record the shell-quoted script +command arguments. Keep the full per-index evaluation summaries in the JSON artifact for reviewer +drill-down. + +Reviewable benchmark artifacts use one shared timing-statistics helper for mean, median, p95, min, +and max. p95 is the lower nearest-rank observed sample from the sorted timings, not an interpolated +value, so retrieval, query-log, and code-context reports can be compared without reinterpreting the +percentile rule. The standalone BM25 and DiskANN baseline scripts use the same observed-sample +percentile helpers for their latency report fields. 
+ +## Query Log Summaries + +`leann search --query-log PATH` and `leann ask --query-log PATH` write replay-oriented JSONL +records. New records include `duration_ms`, result IDs, backend settings, and search mode. Use +`benchmarks/summarize_query_log.py` to turn those logs into reviewable benchmark summaries: + +```bash +uv run python benchmarks/summarize_query_log.py queries.jsonl \ + --ground-truth ground_truth.json \ + --index-path .leann/indexes/my-index/documents.leann \ + --data-source my-dataset \ + --data-revision 2026-06-02-download \ + --k 10 \ + --format markdown \ + --json-output benchmark-results/my-index-summary.json \ + --markdown-output benchmark-results/my-index-summary.md +``` + +Ground truth can be either a JSON object mapping query text to relevant passage IDs, or JSONL rows +with `query` plus `relevant_ids`, `gold_ids`, `expected_ids`, or `ids`. The summarizer reports +recall@k, hit rate@k, result counts, latency mean/median/p95/min/max when present, +backend/search-mode counts, records with missing result IDs, total missing result IDs, +missing-latency/backend/search-mode counters, local index storage bytes when index paths are +available, data source/revision, shell-quoted script command arguments, and local environment +metadata. +`--format` controls stdout; `--json-output` and `--markdown-output` optionally write reviewable +artifacts in the same run. Ground-truth query text must exactly match the `query` value in the +query log. + +Generated benchmark artifacts record the Python script path and arguments with POSIX-style shell +quoting. Curated reports should still include the full outer shell command, such as `uv run ...`, +when the wrapper or dependency mode affects reproducibility. + +Query logs may contain sensitive query text and, when enabled, query embeddings. 
Summary artifacts +omit embeddings and result text, but they still include query counts, backend/search-mode details, +latency summaries, recall statistics, missing-ID/missing-metadata counters, and storage file paths. + +`benchmark-results/` is the default local scratch directory for generated benchmark summaries and +is gitignored. Commit only curated benchmark reports with enough context to satisfy the reporting +template below, or attach generated artifacts to a pull request or CI job. + +## Benchmark Suites + +| Suite | Location | Status | Purpose | +| --- | --- | --- | --- | +| BM25 and DiskANN baselines | `benchmarks/bm25_diskann_baselines/` | Data-dependent | Measures Natural Questions search-only latency for BM25 and DiskANN using externally synced artifacts; scripts can write JSON reports with query hashes, timing scope, recursive storage bytes, settings, command arguments, and environment provenance. | +| ContextBench | `benchmarks/contextbench/` | Manual/agent workflow | Runs repository-preparation and trace-driven code-assistant evaluations. | +| Enron emails | `benchmarks/enron_emails/` | Data-dependent | Evaluates retrieval and generation on the Enron email corpus. | +| FinanceBench | `benchmarks/financebench/` | Data-dependent | Evaluates retrieval-augmented generation on financial question answering. | +| LAION multimodal | `benchmarks/laion/` | Data-dependent and multimodal | Evaluates image retrieval and multimodal generation with CLIP/Qwen-style models. | +| Update benchmarks | `benchmarks/update/` | Reproducible with local data, hardware-sensitive | Measures update latency and sequential-vs-offline update strategies. | + +Historical or sample result files in benchmark subdirectories should be treated as reference runs +with their documented hardware and settings, not as automatically refreshed results for the current +checkout. + +## Reporting Template + +When adding benchmark results, include: + +- LEANN commit SHA and backend. 
+- Dataset name, corpus size, and query count. +- Embedding model and embedding mode. +- Search settings such as `top_k`, `complexity`, `beam_width`, `prune_ratio`, and BM25/vector + weighting when applicable. +- Hardware, operating system, and whether GPU acceleration was used. +- Build time, index size, search latency distribution, and recall@k where ground truth exists. + +This keeps benchmark updates self-contained and reviewable. diff --git a/docs/features.md b/docs/features.md index 0a7f9dc3..aff626ea 100644 --- a/docs/features.md +++ b/docs/features.md @@ -14,7 +14,7 @@ - **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency - **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional) - **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead -- **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](../examples/mlx_demo.py)) +- **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](https://github.com/yichuan-w/LEANN/blob/main/examples/mlx_demo.py)) ## 🎨 Developer Experience diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..e7ce83d1 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,25 @@ +# LEANN Documentation + +LEANN is a local-first vector database and retrieval-augmented generation system designed to keep +large personal corpora searchable on a laptop with much lower storage overhead than traditional +vector databases. + +## Start Here + +- [Features](features.md): overview of supported retrieval, indexing, and application features. +- [Configuration guide](configuration-guide.md): embedding models, backend choices, low-resource + setup, and performance tuning. 
+- [Benchmarks](benchmarks.md): reproducible benchmark entry points and heavier evaluation suites. +- [FAQ](faq.md): common setup and runtime questions. + +## Core Guides + +- [Metadata filtering](metadata_filtering.md): filter search results by structured metadata. +- [AST chunking](ast_chunking_guide.md): code-aware chunking for source repositories. +- [MCP setup](openclaw-setup.md): connect LEANN to code assistants through MCP. +- [ReAct agent](react_agent.md): multi-step local/web retrieval workflows. + +## Project + +- [Roadmap](roadmap.md): current P0/P1 priorities. +- [Contributing](CONTRIBUTING.md): development workflow and pull request guidance. diff --git a/docs/normalized_embeddings.md b/docs/normalized_embeddings.md index e8734895..164efaf8 100644 --- a/docs/normalized_embeddings.md +++ b/docs/normalized_embeddings.md @@ -72,4 +72,6 @@ Using the wrong distance metric with normalized embeddings can lead to: - **Incorrect ranking** of search results - **Suboptimal performance** compared to using the correct metric -For more details on why this happens, see our analysis in the [embedding detection code](../packages/leann-core/src/leann/api.py) which automatically handles normalized embeddings and MIPS distance metric issues. +For more details on why this happens, see our analysis in the +[embedding detection code](https://github.com/yichuan-w/LEANN/blob/main/packages/leann-core/src/leann/api.py), +which automatically handles normalized embeddings and MIPS distance metric issues. diff --git a/docs/react_agent.md b/docs/react_agent.md index 671b3621..7f3b2a52 100644 --- a/docs/react_agent.md +++ b/docs/react_agent.md @@ -230,6 +230,6 @@ This is the first implementation (1/N) of Deep-Research integration. 
Future enha ## Related Documentation -- [Basic Usage Guide](../README.md) +- [Basic Usage Guide](https://github.com/yichuan-w/LEANN/blob/main/README.md) - [CLI Reference](configuration-guide.md) - [Embedding Models](normalized_embeddings.md) diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..39b1c350 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +mkdocs>=1.6,<2 diff --git a/docs/roadmap.md b/docs/roadmap.md index 2f871146..a062c062 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -35,8 +35,8 @@ The primary near-term goal: make LEANN the go-to MCP server for code-aware AI as ### Documentation -- [ ] ReadTheDocs — hosted documentation site (#234) -- [ ] Benchmarks — recall@k, latency, and storage comparisons across backends +- [ ] ReadTheDocs — MkDocs/ReadTheDocs scaffold added; hosted public site remains (#234) +- [ ] Benchmarks — artifact writers, benchmark inventory, and curated synthetic code-context report added; reproducible recall@k, latency, and storage comparisons across real datasets/backends remain --- diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..fc586d53 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,27 @@ +site_name: LEANN +site_description: Local-first low-storage vector search and RAG.
+repo_url: https://github.com/yichuan-w/LEANN +docs_dir: docs + +nav: + - Home: index.md + - Features: features.md + - Configuration: configuration-guide.md + - Benchmarks: + - Overview: benchmarks.md + - Metadata Filtering: metadata_filtering.md + - Grep Search: grep_search.md + - AST Chunking: ast_chunking_guide.md + - Normalized Embeddings: normalized_embeddings.md + - ColQwen Guide: COLQWEN_GUIDE.md + - FlashLib Backend: flashlib_backend_guide.md + - MCP Setup: openclaw-setup.md + - ReAct Agent: react_agent.md + - Slack Setup: slack-setup-guide.md + - User Scripts: user-scripts.md + - Thinking Budget: THINKING_BUDGET_FEATURE.md + - FAQ: faq.md + - Roadmap: roadmap.md + - Ultimate Goal: ultimate_goal.md + - Contributing: CONTRIBUTING.md + - Release Process: RELEASE.md