Merged

24 commits
bdfa32f
Add retrieval_bench package for ViDoRe V3 and BRIGHT leaderboard eval…
oliverholworthy Mar 4, 2026
59bdba5
Add SPDX headers to python files
oliverholworthy Mar 4, 2026
d463301
Update version of retrieval-bench
oliverholworthy Mar 4, 2026
97cf8ee
Reformat and remove unused imports
oliverholworthy Mar 5, 2026
b0bcf84
Split long lines
oliverholworthy Mar 5, 2026
5d5db61
Remove trailing whitespace
oliverholworthy Mar 5, 2026
4c3353d
Ignore long lines in retrieval_bench
oliverholworthy Mar 5, 2026
1dcbb67
Remove unused torch functional imports
oliverholworthy Mar 5, 2026
7220f81
Reformat line in tool_helpers.py
oliverholworthy Mar 5, 2026
38dfe54
Extract common singleton methods to shared module
oliverholworthy Mar 5, 2026
fde1612
Replace assertions with runtime errors in singletons
oliverholworthy Mar 5, 2026
be32ed5
Move infer_bright_task_key to shared module
oliverholworthy Mar 5, 2026
1448625
Raise exceptions in agentic code instead of assertions
oliverholworthy Mar 6, 2026
fd2a636
Rename logging class names keeping attribution
oliverholworthy Mar 6, 2026
c521ac1
Add logger in case cache fails
oliverholworthy Mar 6, 2026
bbc9d5a
Use shared _hash_corpus_ids10 in bm25s_retriever
oliverholworthy Mar 6, 2026
d8a0d7e
Update call to _try_preload_corpus_to_gpu in colembed vl singleton
oliverholworthy Mar 6, 2026
c4730eb
log debug message if trace fails to load
oliverholworthy Mar 6, 2026
36728d3
Add note on trust_remote_code
oliverholworthy Mar 6, 2026
037bde6
Move load_dotenv into pipeline class
oliverholworthy Mar 6, 2026
fddaebd
Move bench specific .gitignore entries to nested .gitignore
oliverholworthy Mar 6, 2026
f59a4ce
Add SPDX header to README.md
oliverholworthy Mar 6, 2026
c0efb90
Rename top-level retrieval-bench directory
oliverholworthy Mar 6, 2026
a8568ec
Merge branch 'main' into oholworthy/agentic-pipeline-evaluation
jdye64 Mar 9, 2026
15 changes: 15 additions & 0 deletions retrieval-bench/.gitignore
@@ -0,0 +1,15 @@
# Transient directories generated by retrieval_bench
cache/
results/
traces/
nemo_agentic_logs/
shards/

# Environment
.env
.venv/

# Python
__pycache__/
*.pyc
*.egg-info/
115 changes: 115 additions & 0 deletions retrieval-bench/README.md
@@ -0,0 +1,115 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->

# retrieval-bench

Pipeline evaluation framework for document retrieval.

This package depends on `vidore-benchmark` for shared framework modules that are
not vendored in this slim repository snapshot.

## Security note

All model backends load HuggingFace models with `trust_remote_code=True`, which
executes Python code shipped with the model repository. Only use model IDs from
sources you trust.
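
One defensive pattern is to gate model IDs through an explicit allowlist before any `trust_remote_code=True` load. A minimal sketch — `TRUSTED_MODELS`, `check_trusted`, and the repo ID are illustrative names, not part of retrieval-bench:

```python
# Illustrative allowlist guard -- not part of retrieval-bench.
# Check model IDs before any trust_remote_code=True load.
TRUSTED_MODELS = {
    "my-org/my-embed-model",  # hypothetical trusted repo ID
}


def check_trusted(model_id: str) -> str:
    """Return model_id unchanged if allowlisted; raise otherwise."""
    if model_id not in TRUSTED_MODELS:
        raise ValueError(f"Refusing to load untrusted model: {model_id!r}")
    return model_id
```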

## Installation

```bash
pip install -e .
```

### Model dependencies

Most models work with `transformers==4.51.0`:

```bash
pip install transformers==4.51.0
pip install flash-attn==2.6.3 --no-build-isolation
```

The `nemotron-colembed-vl-8b-v2` model requires a newer transformers release:

```bash
pip install transformers==5.0.0rc0
pip install flash-attn==2.6.3 --no-build-isolation
```

## Dense retrieval

Retrieval-only evaluation with a pluggable backend. The `--backend` flag
selects which retriever to use.

```bash
retrieval-bench evaluate dense-retrieval \
--dataset-name bright/biology \
--backend llama-nv-embed-reasoning-3b

retrieval-bench evaluate dense-retrieval \
--dataset-name vidore/vidore_v3_hr \
--backend llama-nemotron-embed-vl-1b-v2 \
--language english

retrieval-bench evaluate dense-retrieval \
--dataset-name vidore/vidore_v3_hr \
--backend nemotron-colembed-vl-8b-v2 \
--language english
```

Backend-specific overrides can be passed via `--pipeline-args` JSON:

```bash
retrieval-bench evaluate dense-retrieval \
--dataset-name bright/biology \
--backend llama-nv-embed-reasoning-3b \
--pipeline-args '{"model_id":"~/checkpoints/my_model","scoring_batch_size":2048}'
```
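
Internally, such overrides amount to merging a JSON object over backend defaults. A sketch under assumed names (`DEFAULTS` and `parse_pipeline_args` are hypothetical, not the package's actual helpers):

```python
import json

# Hypothetical backend defaults, for illustration only.
DEFAULTS = {"model_id": "my-org/my-embed-model", "scoring_batch_size": 512}


def parse_pipeline_args(raw=None):
    """Merge a --pipeline-args JSON string over the backend defaults."""
    args = dict(DEFAULTS)
    if raw:
        args.update(json.loads(raw))
    return args
```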

## Agentic retrieval

Dense retrieval augmented with an LLM agent that iteratively refines results.

```bash
retrieval-bench evaluate agentic-retrieval \
--dataset-name bright/biology \
--backend llama-nv-embed-reasoning-3b \
--llm-model your-llm-model \
--num-concurrent 10

retrieval-bench evaluate agentic-retrieval \
--dataset-name vidore/vidore_v3_hr \
--backend llama-nemotron-embed-vl-1b-v2 \
--llm-model your-llm-model
```

By default the pipeline reads `OPENAI_API_KEY` and `OPENAI_BASE_URL` from
environment variables. Override them via `--pipeline-args`:

```bash
retrieval-bench evaluate agentic-retrieval \
--dataset-name bright/biology \
--backend llama-nv-embed-reasoning-3b \
--llm-model your-llm-model \
--pipeline-args '{"api_key":"os.environ/MY_KEY","base_url":"os.environ/MY_URL"}'
```
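
The `os.environ/NAME` form follows litellm's convention of resolving secrets from the environment at call time rather than embedding them in command lines. A minimal sketch of how such a value might be resolved — `resolve_secret` is an illustrative name, not the package's API:

```python
import os


def resolve_secret(value):
    """Resolve 'os.environ/NAME' references from the environment;
    pass any other value through unchanged."""
    prefix = "os.environ/"
    if isinstance(value, str) and value.startswith(prefix):
        return os.environ[value[len(prefix):]]
    return value
```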

### Available backends

| Backend | Retriever |
|---------|-----------|
| `llama-nv-embed-reasoning-3b` | NeMo Reasoning dense retriever |
| `llama-nemoretriever-colembed-3b-v1` | ColEmbed late-interaction retriever |
| `llama-nemotron-embed-vl-1b-v2` | Nemotron Embed VL 1B v2 (multimodal dense) |
| `nemotron-colembed-vl-8b-v2` | Nemotron ColEmbed VL 8B v2 (multimodal late-interaction) |

## Utilities

```bash
retrieval-bench evaluate utils list-datasets
retrieval-bench evaluate utils list-backends
retrieval-bench evaluate utils report-results --results-dir results/my_run
retrieval-bench evaluate utils compare-results --results-dirs results/run_a --results-dirs results/run_b
```
64 changes: 64 additions & 0 deletions retrieval-bench/pyproject.toml
@@ -0,0 +1,64 @@
[project]
name = "retrieval-bench"
version = "0.1.0"
description = "Retrieval pipeline benchmarking toolkit for ViDoRe V3 and BRIGHT leaderboards."
authors = []
readme = "README.md"
requires-python = ">=3.9,<3.14"
license = { text = "Apache-2.0" }
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Science/Research",
"Intended Audience :: Developers",
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"vidore-benchmark @ git+https://github.com/illuin-tech/vidore-benchmark.git@a93c1d916e305a1116ea557e1170de7adeb531e4",
"aiofiles",
"bm25s",
"datasets>=4.5.0",
"jinja2",
"litellm",
"Pillow",
"pydantic>=2.0.0,<3.0.0",
"pytrec_eval",
"PyStemmer",
"python-dotenv>=1.0.1,<2.0.0",
"rich",
"torch",
"transformers>=4.56.0",
"typer>=0.12.3,<1.0.0",
"torchvision>=0.23.0",
]

[project.optional-dependencies]
dev = ["pytest>=8.2.1", "ruff>=0.4.5"]

[project.scripts]
retrieval-bench = "retrieval_bench.cli.main:app"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
packages = ["src/retrieval_bench"]

[tool.pytest.ini_options]
filterwarnings = ["ignore::Warning"]
markers = ["slow: marks test as slow"]
testpaths = ["tests"]

[tool.ruff]
line-length = 120

[tool.ruff.lint]
select = ["E", "F", "W", "I", "N"]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]
29 changes: 29 additions & 0 deletions retrieval-bench/src/retrieval_bench/__init__.py
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Top-level package exports.

Note: This package previously re-exported many optional retriever implementations.
Those imports can be heavyweight and may pull optional dependencies at import time.
For pipeline evaluation workflows, we keep this module lightweight and only export
the pipeline-evaluation API surface.
"""

from .pipeline_evaluation import ( # noqa: F401
BasePipeline,
aggregate_results,
evaluate_retrieval,
get_available_datasets,
load_vidore_dataset,
print_dataset_info,
)

__all__ = [
"BasePipeline",
"evaluate_retrieval",
"aggregate_results",
"load_vidore_dataset",
"get_available_datasets",
"print_dataset_info",
]