diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index a14000ec..c8858536 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -659,14 +659,36 @@ jobs:
         run: |
           source .venv/bin/activate || source .venv/Scripts/activate
           python - <<'PY'
+          import numpy as np
           import leann
           import leann_backend_hnsw as h
           import leann_backend_diskann as d
           import leann_backend_ivf as ivf
           from leann import LeannBuilder, LeannSearcher
-          b = LeannBuilder(backend_name="hnsw")
+
+          b = LeannBuilder(
+              backend_name="hnsw",
+              dimensions=2,  # same width as the 2-element vector supplied below
+              is_compact=False,
+              is_recompute=False,
+          )
           b.add_text("hello arch")
-          b.build_index("arch_demo.leann")
-          s = LeannSearcher("arch_demo.leann")
-          print("search:", s.search("hello", top_k=1))
+          b.build_index_from_arrays(
+              "arch_demo.leann",
+              ["0"],  # presumably the id of the single passage added above; confirm against the builder API
+              np.asarray([[1.0, 0.0]], dtype=np.float32),  # one float32 row of length 2
+          )
+
+          with LeannSearcher(
+              "arch_demo.leann",
+              recompute_embeddings=False,
+              enable_warmup=False,
+          ) as s:
+              s.backend_impl.compute_query_embedding = lambda *args, **kwargs: np.asarray(
+                  [[1.0, 0.0]], dtype=np.float32  # stub: every query maps to the same vector as the indexed row
+              )
+              result = s.search("hello", top_k=1)
+
+          assert result and result[0].text == "hello arch"  # the only indexed text must be the top hit
+          print("arch smoke ok")
           PY
diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml
new file mode 100644
index 00000000..ede0dedf
--- /dev/null
+++ b/.github/workflows/docs-build.yml
@@ -0,0 +1,28 @@
+name: Docs Build
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:  # also allow manual runs
+
+jobs:
+  mkdocs:
+    name: MkDocs Strict Build
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: read  # read-only repository access for this job
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'  # quoted so YAML keeps the version a string
+
+      - name: Install documentation dependencies
+        run: python -m pip install -r docs/requirements.txt
+
+      - name: Build documentation
+        run: python -m mkdocs build --strict --config-file mkdocs.yml  # --strict: warnings fail the build
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 00000000..127ad87b
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,14 @@
+version: 2  # Read the Docs configuration file schema version
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"  # quoted so YAML keeps the version a string
+
+mkdocs:
+  configuration: mkdocs.yml
+  fail_on_warning: true  # treat MkDocs warnings as build failures
+
+python:
+  install:
+    - requirements: docs/requirements.txt
diff --git a/README.md b/README.md
index 1f60dbff..1394459f 100755
--- a/README.md
+++ b/README.md
@@ -1110,7 +1110,10 @@ leann ask my-docs "Where are prompts configured?"
 # Detect file changes since last build/watch checkpoint
 leann watch my-docs
 
-# List all your indexes
+# Keep an index current as files are saved
+leann watch my-docs --live
+
+# List discoverable indexes
 leann list
 
 # Remove an index
@@ -1152,23 +1155,27 @@ leann search INDEX_NAME QUERY [OPTIONS]
 Options:
   --top-k N                     Number of results (default: 5)
   --complexity N                Search complexity (default: 64)
+  --max-depth N                 Maximum app-index discovery depth (default: 4)
   --recompute / --no-recompute  Enable/disable embedding recomputation (default: enabled). Should not do a `no-recompute` search in a `recompute` build.
   --pruning-strategy {global,local,proportional}
 ```
 
 **Watch Command:**
 ```bash
-leann watch INDEX_NAME
+leann watch INDEX_NAME [OPTIONS]
 
 # Compares the current file system state against the last checkpoint (Merkle tree snapshot)
-# and reports which files have been added, removed, or modified, along with their chunk IDs.
+# and updates the index when files have been added, removed, or modified.
 #
-# - Automatically saves a new checkpoint after detecting changes
+# - Use --once --dry-run to report changes without updating the index
+# - Use --live to trigger updates from filesystem events instead of waiting for the polling interval
+# - --debounce-ms controls live event batching (default: 500)
+# - Automatically saves a new checkpoint after successful index updates
 # - Each subsequent run compares against the most recent checkpoint
 # - File change detection uses SHA-256 content hashing via a Merkle tree
 #
 # Example output:
-#   === Changes since last checkpoint ===
+#   === Changes detected ===
 #   modified (1):
 #     - /path/to/file.py
 #       chunks: 42, 43, 44
@@ -1187,13 +1194,22 @@ Options:
 
 **List Command:**
 ```bash
-leann list
+leann list [OPTIONS]
+
+Options:
+  --max-depth N                 Maximum app-index discovery depth (default: 4)
 
-# Lists all indexes across all projects with status indicators:
+# Lists discoverable indexes from the current project and registered project roots:
 # ✅ - Index is complete and ready to use
 # ❌ - Index is incomplete or corrupted
 # 📁 - CLI-created index (in .leann/indexes/)
 # 📄 - App-created index (*.leann.meta.json files)
+#
+# App-created index discovery is bounded and skips heavyweight directories
+# such as .git, node_modules, virtualenvs, build/cache folders, and directory symlinks.
+#
+# Commands that resolve app-created indexes by name also accept --max-depth,
+# including search, watch, rebuild, migrate-ids, warmup, daemon, and react.
 ```
 
 **Remove Command:**
@@ -1201,10 +1217,11 @@ leann list
 leann remove INDEX_NAME [OPTIONS]
 
 Options:
-  --force, -f    Force removal without confirmation
+  --force, -f                    Force removal without confirmation
+  --max-depth N                  Maximum app-index discovery depth (default: 4)
 
 # Smart removal: automatically finds and safely removes indexes
-# - Shows all matching indexes across projects
+# - Shows all matching indexes found by bounded discovery across registered projects
 # - Requires confirmation for cross-project removal
 # - Interactive selection when multiple matches found
 # - Supports both CLI and app-created indexes
@@ -1272,6 +1289,9 @@ results = searcher.search("banana‑crocodile", use_grep=True, top_k=1)
 
 ## Benchmarks
 
+For the current benchmark inventory and reporting template, see the
+[Benchmark Guide](docs/benchmarks.md).
+
 **[DiskANN vs HNSW Performance Comparison →](benchmarks/diskann_vs_hnsw_speed_comparison.py)** - Compare search performance between both backends
 
 **[Simple Example: Compare LEANN vs FAISS →](benchmarks/compare_faiss_vs_leann.py)** - See storage savings in action
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 258f260c..3cd0c210 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,135 +1,131 @@
-# 🧪 LEANN Benchmarks & Testing
+# LEANN Benchmarks
 
-This directory contains performance benchmarks and comprehensive tests for the LEANN system, including backend comparisons and sanity checks across different configurations.
+This directory contains benchmark scripts and data-dependent evaluation suites for LEANN. The
+canonical benchmark inventory now lives in [docs/benchmarks.md](../docs/benchmarks.md).
 
-## 📁 Test Files
+## Quick Starts
 
-### `diskann_vs_hnsw_speed_comparison.py`
-Performance comparison between DiskANN and HNSW backends:
-- ✅ **Search latency** comparison with both backends using recompute
-- ✅ **Index size** and **build time** measurements
-- ✅ **Score validity** testing (ensures no -inf scores)
-- ✅ **Configurable dataset sizes** for different scales
+Run a small backend comparison:
 
 ```bash
-# Quick comparison with 500 docs, 10 queries
-python benchmarks/diskann_vs_hnsw_speed_comparison.py
-
-# Large-scale comparison with 2000 docs, 20 queries
-python benchmarks/diskann_vs_hnsw_speed_comparison.py 2000 20
-```
-
-### `test_distance_functions.py`
-Tests all supported distance functions across DiskANN backend:
-- ✅ **MIPS** (Maximum Inner Product Search)
-- ✅ **L2** (Euclidean Distance)
-- ✅ **Cosine** (Cosine Similarity)
-
-```bash
-uv run python tests/sanity_checks/test_distance_functions.py
+uv run python benchmarks/diskann_vs_hnsw_speed_comparison.py
 ```
 
-### `test_l2_verification.py`
-Specifically verifies that L2 distance is correctly implemented by:
-- Building indices with L2 vs Cosine metrics
-- Comparing search results and score ranges
-- Validating that different metrics produce expected score patterns
+Compare LEANN storage usage against a FAISS baseline:
 
 ```bash
-uv run python tests/sanity_checks/test_l2_verification.py
+uv run python benchmarks/compare_faiss_vs_leann.py
 ```
 
-### `test_sanity_check.py`
-Comprehensive end-to-end verification including:
-- Distance function testing
-- Embedding model compatibility
-- Search result correctness validation
-- Backend integration testing
+Run the synthetic, model-free code-context expansion benchmark:
 
 ```bash
-uv run python tests/sanity_checks/test_sanity_check.py
+uv run python benchmarks/benchmark_code_context.py \
+  --modules 1000 \
+  --warmups 1 \
+  --repeats 5 \
+  --data-source synthetic-code-context \
+  --data-revision 2026-06-03-fixture \
+  --format markdown \
+  --json-output benchmark-results/code-context.json \
+  --markdown-output benchmark-results/code-context.md
 ```
 
-## 🎯 What These Tests Verify
+This benchmark fixture exercises direct related-code matches, direct immediate-base inherited
+calls, and bounded one-hop exact-call expansion. Its JSON and Markdown artifacts report data
+source/revision, local environment provenance, direct related counts, inherited related counts,
+one-hop related counts, expected inherited/one-hop coverage, unique related passages,
+cap/truncation accounting, command provenance, and timing summaries.
 
-### ✅ Distance Function Support
-- All three distance metrics (MIPS, L2, Cosine) work correctly
-- Score ranges are appropriate for each metric type
-- Different metrics can produce different rankings (as expected)
+The curated documentation report for this benchmark lives in
+[docs/code_context_benchmark.md](../docs/code_context_benchmark.md).
 
-### ✅ Backend Integration
-- DiskANN backend properly initializes and builds indices
-- Graph construction completes without errors
-- Search operations return valid results
+Run the main retrieval evaluation driver:
 
-### ✅ Embedding Pipeline
-- Real-time embedding computation works
-- Multiple embedding models are supported
-- ZMQ server communication functions correctly
-
-### ✅ End-to-End Functionality
-- Index building → searching → result retrieval pipeline
-- Metadata preservation through the entire flow
-- Error handling and graceful degradation
-
-## 🔍 Expected Output
-
-When all tests pass, you should see:
-
-```
-📊 测试结果总结:
-  mips      : ✅ 通过
-  l2        : ✅ 通过
-  cosine    : ✅ 通过
-
-🎉 测试完成!
+```bash
+uv run benchmarks/run_evaluation.py
 ```
 
-## 🐛 Troubleshooting
+This command treats an incomplete `benchmarks/data/` tree, including a README-only checkout or an
+`indices/` tree without any `.index` artifact, as missing benchmark data and downloads the public
+data pack. The repository-provided prebuilt indexes are large; check local disk capacity before
+relying on the automatic download path.
 
-### Common Issues
+For larger retrieval runs after data has been downloaded:
 
-**Import Errors**: Ensure you're running from the project root:
 ```bash
-cd /path/to/leann
-uv run python tests/sanity_checks/test_distance_functions.py
+uv run benchmarks/run_evaluation.py benchmarks/data/indices/rpj_wiki/rpj_wiki \
+  --dataset rpj_wiki \
+  --data-source LEANN-RAG/leann-rag-evaluation-data \
+  --data-revision 2026-06-02-download \
+  --num-queries 2000 \
+  --top-k 3 \
+  --complexity 120 \
+  --format markdown \
+  --json-output benchmark-results/retrieval-rpj-wiki.json \
+  --markdown-output benchmark-results/retrieval-rpj-wiki.md
 ```
 
-**Memory Issues**: Reduce graph complexity for resource-constrained systems:
-```python
-builder = LeannBuilder(
-    backend_name="diskann",
-    graph_degree=8,  # Reduced from 16
-    complexity=16    # Reduced from 32
-)
-```
+Retrieval evaluation is search-only by default. Use `--run-llm` only when generation should be
+measured separately from retrieval latency. Retrieval artifacts include per-query result
+IDs, golden IDs, and duplicate-text counters so reviewers can audit text-overlap recall when passage
+ID schemes differ across indexes. CLI-generated artifacts also record SHA256 hashes for the query
+and ground-truth files plus shell-quoted script command arguments.
 
-**ZMQ Port Conflicts**: The tests use different ports to avoid conflicts, but you may need to kill existing processes:
-```bash
-pkill -f "embedding_server"
-```
+Retrieval, query-log, and code-context artifacts share the same timing-statistics helper. p95 is the
+lower nearest-rank observed sample from sorted timings, so it is deterministic and comparable across
+reviewable benchmark reports. The standalone BM25 and DiskANN baseline scripts use the same
+observed-sample percentile helpers for their latency report fields.
 
-## 📊 Performance Expectations
+Run one retrieval manifest across multiple prebuilt backend indexes:
 
-### Typical Timing (3 documents, consumer hardware):
-- **Index Building**: 2-5 seconds per distance function
-- **Search Query**: 50-200ms
-- **Recompute Mode**: 5-15 seconds (higher accuracy)
-
-### Memory Usage:
-- **Index Storage**: ~1-2 MB per distance function
-- **Runtime Memory**: ~500MB (including model loading)
+```bash
+uv run python benchmarks/compare_retrieval_backends.py benchmark-results/retrieval-comparison.json \
+  --format markdown \
+  --json-output benchmark-results/retrieval-comparison-summary.json \
+  --markdown-output benchmark-results/retrieval-comparison-summary.md
+```
 
-## 🔗 Integration with CI/CD
+The comparison manifest keeps dataset, query file, ground truth, `top_k`, complexity, and batch
+size identical across runs. The combined JSON/Markdown artifacts report per-run backend,
+passage-ID scheme, embedding model/mode, recall@k, hit rate@k, latency, storage bytes, query file
+hash, ground-truth file hash, shell-quoted script command arguments, missing result-ID counts, and
+duplicate result/golden text counters.
+See [docs/benchmarks.md](../docs/benchmarks.md) for the manifest fields and a complete example.
 
-These tests are designed to be run in automated environments:
+Summarize query logs with optional ground truth and storage accounting:
 
-```yaml
-# GitHub Actions example
-- name: Run Sanity Checks
-  run: |
-    uv run python tests/sanity_checks/test_distance_functions.py
-    uv run python tests/sanity_checks/test_l2_verification.py
+```bash
+uv run python benchmarks/summarize_query_log.py queries.jsonl \
+  --ground-truth ground_truth.json \
+  --index-path .leann/indexes/my-index/documents.leann \
+  --data-source my-dataset \
+  --data-revision 2026-06-02-download \
+  --format markdown \
+  --json-output benchmark-results/my-index-summary.json \
+  --markdown-output benchmark-results/my-index-summary.md
 ```
 
-The tests are deterministic and should produce consistent results across different platforms.
+`--format` controls stdout. Use `--json-output` and `--markdown-output` to write durable review
+artifacts in the same run. Query-log summaries include latency mean/median/p95/min/max when
+available, shell-quoted script command arguments, and counters for records with missing result
+IDs, total missing result IDs, missing latency, missing search mode, and missing backend metadata.
+
+The `benchmark-results/` directory is ignored by git and is the default scratch location for local
+summary artifacts. Commit only curated reports with full dataset, hardware, and command context.
+
+## Suites
+
+- `bm25_diskann_baselines/`: Natural Questions BM25 and DiskANN search-only baselines that require
+  externally synced artifacts, report latency with the shared observed-sample percentile helpers,
+  and can write JSON reports with query hashes, timing scope, recursive storage bytes, settings,
+  command arguments, and environment provenance.
+- `contextbench/`: trace-driven code-assistant benchmark tooling.
+- `enron_emails/`: retrieval and generation evaluation on the Enron email corpus.
+- `financebench/`: financial document question-answering benchmark.
+- `laion/`: multimodal image/text retrieval benchmark.
+- `update/`: update-latency and sequential-vs-offline update strategy benchmarks.
+
+Benchmark result files committed under subdirectories are reference outputs from their documented
+settings and hardware. Do not treat them as automatically refreshed results for the current branch
+unless the relevant command has been rerun.
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
index 7fd0da58..32c38f93 100644
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@@ -209,6 +209,30 @@ When adding new features or making significant changes:
 3. Update README.md if needed
 4. Include usage examples
 
+### Documentation Build
+
+The hosted documentation is built from `mkdocs.yml` with the dependencies in
+`docs/requirements.txt`. Before changing docs navigation, ReadTheDocs configuration, or benchmark
+documentation, run the same strict build used by CI:
+
+```bash
+python3 -m venv .venv
+. .venv/bin/activate
+python -m pip install -r docs/requirements.txt
+python -m mkdocs build --strict --config-file mkdocs.yml
+```
+
+For local preview:
+
+```bash
+python -m mkdocs serve --config-file mkdocs.yml
+```
+
+The build writes a local `site/` directory. Treat it as generated output and do not commit it. If
+you add a new documentation page, add it to the `nav` section in `mkdocs.yml`. If you add a MkDocs
+plugin, theme, or extension, add the dependency to `docs/requirements.txt` so local builds,
+ReadTheDocs, and CI use the same docs dependency set.
+
 ## 🤔 Getting Help
 
 - **Discord**: Join our community for discussions
diff --git a/docs/ast_chunking_guide.md b/docs/ast_chunking_guide.md
index 34d7ccba..4580eb52 100644
--- a/docs/ast_chunking_guide.md
+++ b/docs/ast_chunking_guide.md
@@ -135,7 +135,7 @@ python -m apps.document_rag --enable-code-chunking --chunk-size 256 --ast-chunk-
 ## References
 
 - [astchunk GitHub Repository](https://github.com/yilinjz/astchunk)
-- [LEANN MCP Integration](../packages/leann-mcp/README.md)
+- [LEANN MCP Integration](https://github.com/yichuan-w/LEANN/blob/main/packages/leann-mcp/README.md)
 - [Research Paper](https://arxiv.org/html/2506.15655v1)
 
 ---
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
new file mode 100644
index 00000000..bb4f0cc5
--- /dev/null
+++ b/docs/benchmarks.md
@@ -0,0 +1,252 @@
+# Benchmarks
+
+This page inventories the benchmark entry points currently available in this repository. It is a
+documentation scaffold for the P0 benchmark roadmap item; it does not claim that every suite is
+fully automated in CI or that historical numbers have been freshly reproduced on the current
+branch.
+
+## Quick Local Checks
+
+These scripts are the best first stop when checking backend behavior on a developer machine.
+
+| Area | Entry point | What it measures | Notes |
+| --- | --- | --- | --- |
+| Backend latency and size | `benchmarks/diskann_vs_hnsw_speed_comparison.py` | DiskANN vs HNSW search latency, build time, and index size | Accepts optional document and query counts. |
+| Storage comparison | `benchmarks/compare_faiss_vs_leann.py` | LEANN storage savings against a FAISS baseline | Good for a quick sanity check of the storage story. |
+| Embedding throughput | `benchmarks/benchmark_embeddings.py` | Embedding provider throughput | Depends on the configured local/provider model. |
+| No-recompute baseline | `benchmarks/benchmark_no_recompute.py` | Search behavior when stored embeddings are available | Useful when isolating recompute overhead. |
+| Code-context expansion | `benchmarks/benchmark_code_context.py` | Read-time related-code index build and expansion latency | Synthetic, model-free, and covers direct, direct inherited, and one-hop exact-call expansion. |
+| Retrieval backend comparison | `benchmarks/compare_retrieval_backends.py` | Recall, hit rate, latency, and storage for multiple prebuilt indexes under one manifest | Evaluation-only; does not build indexes. |
+
+Example:
+
+```bash
+uv run python benchmarks/diskann_vs_hnsw_speed_comparison.py
+uv run python benchmarks/compare_faiss_vs_leann.py
+uv run python benchmarks/benchmark_code_context.py --modules 1000 --format markdown
+```
+
+For a reviewable code-context artifact:
+
+```bash
+uv run python benchmarks/benchmark_code_context.py \
+  --modules 1000 \
+  --warmups 1 \
+  --repeats 5 \
+  --data-source synthetic-code-context \
+  --data-revision 2026-06-03-fixture \
+  --format markdown \
+  --json-output benchmark-results/code-context.json \
+  --markdown-output benchmark-results/code-context.md
+```
+
+The code-context benchmark creates synthetic Python modules where each seed passage resolves a
+shared import target, a same-module helper, a direct immediate-base inherited method, and the
+helper's one-hop exact call target. Its JSON and Markdown outputs include data source/revision,
+local environment provenance, direct related-item counts, inherited related-item counts, one-hop
+related-item counts, expected inherited/one-hop coverage, unique related passages,
+cap/truncation accounting, and timing summaries.
+
+## Curated Reports
+
+Curated benchmark reports should be committed only when they include the full command, data source,
+revision, environment, and result context needed for review. Generated artifacts under
+`benchmark-results/` should remain local unless they are intentionally promoted into a documented
+report.
+
+## P0 Benchmark Acceptance Checklist
+
+The P0 benchmark roadmap item is not complete until at least one reviewable benchmark pack includes
+real retrieval results, not just synthetic helper metrics. A benchmark pack should contain:
+
+- A retrieval evaluation artifact from `benchmarks/run_evaluation.py` with JSON and Markdown
+  outputs.
+- A query-log summary from `benchmarks/summarize_query_log.py` when replay logs are used for
+  comparison.
+- Dataset name, corpus size, query count, ground-truth source, benchmark data source, and exact
+  commands. For repository-provided retrieval data, record the Hugging Face dataset
+  `LEANN-RAG/leann-rag-evaluation-data` and the downloaded revision or download date.
+- Backend and index settings, including backend name, compact/recompute mode, `top_k`,
+  `complexity`, BM25/vector weighting, and passage ID scheme.
+- Embedding model, embedding mode, provider options that affect embeddings, and whether embeddings
+  were recomputed during search.
+- Hardware, operating system, Python version, LEANN commit SHA, and whether GPU acceleration was
+  used.
+- Recall@k or hit rate@k where ground truth exists, latency distribution, and local index storage
+  bytes.
+
+Generated JSON/Markdown under `benchmark-results/` stays local or is attached to CI/PR artifacts.
+Commit only curated reports that include the full context above and call out limitations clearly.
+
+A minimal first P0 pack should compare the main supported retrieval path against at least one
+baseline on a real dataset with ground truth. A broader pack should cover HNSW and IVF, include
+BM25/hybrid settings when relevant, and separate retrieval latency from any optional LLM generation.
+Use the single-index retrieval and query-log artifact tools for the first reports, but do not treat
+manual one-off runs as a complete backend comparison; a reviewable cross-backend pack should keep
+dataset, query slice, ground truth, `top_k`, complexity, embedding model, and output schema
+identical across rows.
+
+## Retrieval Evaluation
+
+`benchmarks/run_evaluation.py` is the main retrieval evaluation driver. It can download evaluation
+data when the local `benchmarks/data/` tree is missing required queries, ground truth, or at least
+one real `.index` artifact under `indices/`, and it can reuse a previously downloaded index path
+for larger runs. The repository-provided prebuilt indexes are large, so check local disk capacity
+before relying on the automatic download path; use `--queries-file`, `--ground-truth`, and a
+prebuilt or locally built index path when running a smaller custom pack. Evaluation is
+retrieval-only by default; use `--run-llm` only when generation should be measured separately from
+retrieval latency.
+
+```bash
+uv run benchmarks/run_evaluation.py
+uv run benchmarks/run_evaluation.py benchmarks/data/indices/rpj_wiki/rpj_wiki \
+  --dataset rpj_wiki \
+  --data-source LEANN-RAG/leann-rag-evaluation-data \
+  --data-revision 2026-06-02-download \
+  --num-queries 2000 \
+  --top-k 3 \
+  --complexity 120 \
+  --format markdown \
+  --json-output benchmark-results/retrieval-rpj-wiki.json \
+  --markdown-output benchmark-results/retrieval-rpj-wiki.md
+```
+
+Use this path for recall-oriented comparisons. The JSON artifact records the dataset, data source
+and revision, index path, backend, embedding model/mode, query count, `top_k`, complexity,
+hardware/platform, local LEANN commit, branch, dirty-worktree flag, query-file SHA256 and
+ground-truth-file SHA256 when file paths are available, shell-quoted script command arguments,
+storage bytes, whether embeddings were recomputed, per-query result IDs, per-query golden IDs,
+missing result-ID counts, and duplicate result/golden text counters. Recall is text-overlap based
+for compatibility with indexes that use different passage ID schemes; the duplicate-text counters
+are included so reviewers can identify queries where text-set matching may collapse distinct
+passages. For custom datasets, pass
+`--queries-file`, `--ground-truth`, `--dataset`, `--data-source`, and `--data-revision` explicitly
+so the artifact does not depend on path-name inference.
+
+## Retrieval Backend Comparisons
+
+`benchmarks/compare_retrieval_backends.py` runs the single-index retrieval evaluator across
+multiple prebuilt index paths from one JSON manifest. It is evaluation-only: build or download the
+indexes first, then use the manifest to prove that every row uses the same dataset, query slice,
+ground truth, `top_k`, complexity, and batch size.
+
+Example manifest:
+
+```json
+{
+  "dataset": "rpj_wiki",
+  "data_source": "LEANN-RAG/leann-rag-evaluation-data",
+  "data_revision": "2026-06-02-download",
+  "queries_file": "benchmarks/data/queries/nq_open.jsonl",
+  "ground_truth_file": "benchmarks/data/ground_truth/rpj_wiki/flat_results_nq_k3.json",
+  "num_queries": 2000,
+  "top_k": 3,
+  "complexity": 120,
+  "batch_size": 0,
+  "runs": [
+    {
+      "name": "hnsw",
+      "backend": "hnsw",
+      "index_path": "benchmarks/data/indices/rpj_wiki/rpj_wiki_hnsw"
+    },
+    {
+      "name": "ivf",
+      "backend": "ivf",
+      "index_path": "benchmarks/data/indices/rpj_wiki/rpj_wiki_ivf"
+    }
+  ]
+}
+```
+
+Run it with:
+
+```bash
+uv run python benchmarks/compare_retrieval_backends.py benchmark-results/retrieval-comparison.json \
+  --format markdown \
+  --json-output benchmark-results/retrieval-comparison-summary.json \
+  --markdown-output benchmark-results/retrieval-comparison-summary.md
+```
+
+The combined artifact includes one normalized row per run with backend, passage ID scheme,
+embedding model/mode, recall@k, hit rate@k, evaluated query count, missing ground-truth and golden
+passage counts, missing result-ID counts, duplicate result/golden text counters, latency
+mean/median/p95, storage bytes, local LEANN commit, dirty-worktree flag, and SHA256 hashes for the
+query and ground-truth files. CLI-generated comparison artifacts also record the shell-quoted script
+command arguments. Keep the full per-index evaluation summaries in the JSON artifact for reviewer
+drill-down.
+
+Reviewable benchmark artifacts use one shared timing-statistics helper for mean, median, p95, min,
+and max. p95 is the lower nearest-rank observed sample from the sorted timings, not an interpolated
+value, so retrieval, query-log, and code-context reports can be compared without reinterpreting the
+percentile rule. The standalone BM25 and DiskANN baseline scripts use the same observed-sample
+percentile helpers for their latency report fields.
+
+## Query Log Summaries
+
+`leann search --query-log <path>` and `leann ask --query-log <path>` write replay-oriented JSONL
+records. New records include `duration_ms`, result IDs, backend settings, and search mode. Use
+`benchmarks/summarize_query_log.py` to turn those logs into reviewable benchmark summaries:
+
+```bash
+uv run python benchmarks/summarize_query_log.py queries.jsonl \
+  --ground-truth ground_truth.json \
+  --index-path .leann/indexes/my-index/documents.leann \
+  --data-source my-dataset \
+  --data-revision 2026-06-02-download \
+  --k 10 \
+  --format markdown \
+  --json-output benchmark-results/my-index-summary.json \
+  --markdown-output benchmark-results/my-index-summary.md
+```
+
+Ground truth can be either a JSON object mapping query text to relevant passage IDs, or JSONL rows
+with `query` plus `relevant_ids`, `gold_ids`, `expected_ids`, or `ids`. The summarizer reports
+recall@k, hit rate@k, result counts, latency mean/median/p95/min/max when present,
+backend/search-mode counts, records with missing result IDs, total missing result IDs,
+missing-latency/backend/search-mode counters, local index storage bytes when index paths are
+available, data source/revision, shell-quoted script command arguments, and local environment
+metadata.
+`--format` controls stdout; `--json-output` and `--markdown-output` optionally write reviewable
+artifacts in the same run. Ground-truth query text must exactly match the `query` value in the
+query log.
+
+Generated benchmark artifacts record the Python script path and arguments with POSIX-style shell
+quoting. Curated reports should still include the full outer shell command, such as `uv run ...`,
+when the wrapper or dependency mode affects reproducibility.
+
+Query logs may contain sensitive query text and, when enabled, query embeddings. Summary artifacts
+omit embeddings and result text, but they still include query counts, backend/search-mode details,
+latency summaries, recall statistics, missing-ID/missing-metadata counters, and storage file paths.
+
+`benchmark-results/` is the default local scratch directory for generated benchmark summaries and
+is gitignored. Commit only curated benchmark reports with enough context to satisfy the reporting
+template below, or attach generated artifacts to a pull request or CI job.
+
+## Benchmark Suites
+
+| Suite | Location | Status | Purpose |
+| --- | --- | --- | --- |
+| BM25 and DiskANN baselines | `benchmarks/bm25_diskann_baselines/` | Data-dependent | Measures Natural Questions search-only latency for BM25 and DiskANN using externally synced artifacts; scripts can write JSON reports with query hashes, timing scope, recursive storage bytes, settings, command arguments, and environment provenance. |
+| ContextBench | `benchmarks/contextbench/` | Manual/agent workflow | Runs repository-preparation and trace-driven code-assistant evaluations. |
+| Enron emails | `benchmarks/enron_emails/` | Data-dependent | Evaluates retrieval and generation on the Enron email corpus. |
+| FinanceBench | `benchmarks/financebench/` | Data-dependent | Evaluates retrieval-augmented generation on financial question answering. |
+| LAION multimodal | `benchmarks/laion/` | Data-dependent and multimodal | Evaluates image retrieval and multimodal generation with CLIP/Qwen-style models. |
+| Update benchmarks | `benchmarks/update/` | Reproducible with local data, hardware-sensitive | Measures update latency and sequential-vs-offline update strategies. |
+
+Historical or sample result files in benchmark subdirectories should be treated as reference runs
+with their documented hardware and settings, not as automatically refreshed results for the current
+checkout.
+
+## Reporting Template
+
+When adding benchmark results, include:
+
+- LEANN commit SHA and backend.
+- Dataset name, corpus size, and query count.
+- Embedding model and embedding mode.
+- Search settings such as `top_k`, `complexity`, `beam_width`, `prune_ratio`, and BM25/vector
+  weighting when applicable.
+- Hardware, operating system, and whether GPU acceleration was used.
+- Build time, index size, search latency distribution, and recall@k where ground truth exists.
+
+This keeps benchmark updates self-contained and reviewable.
diff --git a/docs/features.md b/docs/features.md
index 0a7f9dc3..aff626ea 100644
--- a/docs/features.md
+++ b/docs/features.md
@@ -14,7 +14,7 @@
 - **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency
 - **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional)
 - **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead
-- **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](../examples/mlx_demo.py))
+- **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](https://github.com/yichuan-w/LEANN/blob/main/examples/mlx_demo.py))
 
 ## 🎨 Developer Experience
 
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..e7ce83d1
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,25 @@
+# LEANN Documentation
+
+LEANN is a local-first vector database and retrieval-augmented generation system designed to keep
+large personal corpora searchable on a laptop with much lower storage overhead than traditional
+vector databases.
+
+## Start Here
+
+- [Features](features.md): overview of supported retrieval, indexing, and application features.
+- [Configuration guide](configuration-guide.md): embedding models, backend choices, low-resource
+  setup, and performance tuning.
+- [Benchmarks](benchmarks.md): reproducible benchmark entry points and heavier evaluation suites.
+- [FAQ](faq.md): common setup and runtime questions.
+
+## Core Guides
+
+- [Metadata filtering](metadata_filtering.md): filter search results by structured metadata.
+- [AST chunking](ast_chunking_guide.md): code-aware chunking for source repositories.
+- [MCP setup](openclaw-setup.md): connect LEANN to code assistants through MCP.
+- [ReAct agent](react_agent.md): multi-step local/web retrieval workflows.
+
+## Project
+
+- [Roadmap](roadmap.md): current P0/P1 priorities.
+- [Contributing](CONTRIBUTING.md): development workflow and pull request guidance.
diff --git a/docs/normalized_embeddings.md b/docs/normalized_embeddings.md
index e8734895..164efaf8 100644
--- a/docs/normalized_embeddings.md
+++ b/docs/normalized_embeddings.md
@@ -72,4 +72,6 @@ Using the wrong distance metric with normalized embeddings can lead to:
 - **Incorrect ranking** of search results
 - **Suboptimal performance** compared to using the correct metric
 
-For more details on why this happens, see our analysis in the [embedding detection code](../packages/leann-core/src/leann/api.py) which automatically handles normalized embeddings and MIPS distance metric issues.
+For more details on why this happens, see our analysis in the
+[embedding detection code](https://github.com/yichuan-w/LEANN/blob/main/packages/leann-core/src/leann/api.py),
+which automatically handles normalized embeddings and MIPS distance metric issues.
diff --git a/docs/react_agent.md b/docs/react_agent.md
index 671b3621..7f3b2a52 100644
--- a/docs/react_agent.md
+++ b/docs/react_agent.md
@@ -230,6 +230,6 @@ This is the first implementation (1/N) of Deep-Research integration. Future enha
 
 ## Related Documentation
 
-- [Basic Usage Guide](../README.md)
+- [Basic Usage Guide](https://github.com/yichuan-w/LEANN/blob/main/README.md)
 - [CLI Reference](configuration-guide.md)
 - [Embedding Models](normalized_embeddings.md)
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..39b1c350
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1 @@
+mkdocs>=1.6,<2
diff --git a/docs/roadmap.md b/docs/roadmap.md
index 2f871146..a062c062 100644
--- a/docs/roadmap.md
+++ b/docs/roadmap.md
@@ -35,8 +35,8 @@ The primary near-term goal: make LEANN the go-to MCP server for code-aware AI as
 
 ### Documentation
 
-- [ ] ReadTheDocs — hosted documentation site (#234)
-- [ ] Benchmarks — recall@k, latency, and storage comparisons across backends
+- [ ] ReadTheDocs — MkDocs/ReadTheDocs scaffold added; publishing the hosted public site is still to be done (#234)
+- [ ] Benchmarks — artifact writers, benchmark inventory, and curated synthetic code-context report added; reproducible recall@k, latency, and storage comparisons across real datasets and backends are still to be done
 
 ---
 
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 00000000..fc586d53
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,27 @@
+site_name: LEANN
+site_description: Local-first, low-storage vector search and RAG.
+repo_url: https://github.com/yichuan-w/LEANN
+docs_dir: docs
+
+nav:
+  - Home: index.md
+  - Features: features.md
+  - Configuration: configuration-guide.md
+  - Benchmarks:
+      - Overview: benchmarks.md
+  - Metadata Filtering: metadata_filtering.md
+  - Grep Search: grep_search.md
+  - AST Chunking: ast_chunking_guide.md
+  - Normalized Embeddings: normalized_embeddings.md
+  - ColQwen Guide: COLQWEN_GUIDE.md
+  - FlashLib Backend: flashlib_backend_guide.md
+  - MCP Setup: openclaw-setup.md
+  - ReAct Agent: react_agent.md
+  - Slack Setup: slack-setup-guide.md
+  - User Scripts: user-scripts.md
+  - Thinking Budget: THINKING_BUDGET_FEATURE.md
+  - FAQ: faq.md
+  - Roadmap: roadmap.md
+  - Ultimate Goal: ultimate_goal.md
+  - Contributing: CONTRIBUTING.md
+  - Release Process: RELEASE.md