Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 25 additions & 24 deletions benchmarks/contextbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ This directory keeps a small local runner around the upstream ContextBench repo.

## Kept Files

- `contextbench_official_repo/`: upstream ContextBench code and data.
- `scripts/*.py`: local preparation, run, and evaluation scripts.
- `mitmproxy_addons/trace_recorder.py`: HTTP trace recorder used while Claude runs.
- `requirements-run.txt`: extra Python dependencies for these local scripts.

To run ContextBench locally, download or clone the upstream ContextBench repo
into `contextbench_official_repo/`; it is not generated by these scripts.

Generated directories such as `.venv/`, `.mitmproxy-venv/`, `traces/`,
`logs/`, `scripts/contextbench_work_dir_*`, and `scripts/contextbench_eval_repos/`
can be deleted and regenerated.
Expand All @@ -26,12 +28,6 @@ pip install -r requirements-run.txt

## 2. Install Runtime CLIs

Install LEANN:

```bash
uv tool install leann-core --with leann
```

Install `mitmdump` in a separate environment:

```bash
Expand All @@ -49,16 +45,35 @@ The run script also expects:

## 3. Prepare Repos And LEANN Indexes

This step clones/prepares each selected ContextBench repository and builds a
`.leann/` index inside the task worktree. The run scripts preserve that index
when resetting task repositories.

```bash
cd scripts
WORK_ROOT=contextbench_work_dir_claude python prepare_repos_with_leann.py
python prepare_repos_with_leann.py
```

Useful options:

- `SELECTED_IDS=id1,id2`: prepare only specific instances.

## 4. Run Selected Tasks

`LEANN_MODE` controls whether Claude receives the LEANN MCP search hint:

- `LEANN_MODE=mcp`: use the `.leann/` index through the configured MCP server.
- `LEANN_MODE=none`: baseline run without LEANN.

`LEANN_MODE` defaults to `mcp`; set `LEANN_MODE=none` for baseline runs. Any
other value is reported with a warning and treated as `mcp`. If
`LEANN_MODE=mcp` but a task has no `.leann/` directory, the runner continues
without LEANN for that task.

Run with LEANN MCP:

```bash
cd scripts
LEANN_ENABLED=1 \
LEANN_MODE=mcp \
WORK_ROOT=contextbench_work_dir_claude \
OUTPUT_FILE=all_predictions_claude.jsonl \
python batch_run_selected.py
Expand All @@ -67,18 +82,12 @@ python batch_run_selected.py
Run without LEANN:

```bash
LEANN_ENABLED=0 \
LEANN_MODE=none \
WORK_ROOT=contextbench_work_dir_claude \
OUTPUT_FILE=all_predictions_claude_baseline.jsonl \
python batch_run_selected.py
```

Run specific IDs without editing the script:

```bash
SELECTED_IDS=id1,id2 python batch_run_selected.py
```

## 5. Evaluate Results

Context retrieval metrics:
Expand All @@ -93,11 +102,3 @@ PYTHONPATH=. python -m contextbench.evaluate \
--out "../scripts/contextbench_official_eval_claude.jsonl" \
2>&1 | tee "../scripts/contextbench_official_eval_claude.log"
```

## 6. Clean Generated Files

```bash
rm -rf .venv .mitmproxy-venv .eval-venv .leann .pycache_tmp logs traces
rm -rf scripts/.leann scripts/scripts
rm -rf scripts/contextbench_eval_repos scripts/contextbench_work_dir_claude scripts/contextbench_work_dir_claude_overlap160
```
37 changes: 23 additions & 14 deletions benchmarks/contextbench/scripts/auto_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ def _get_int_env(name: str, default: int, min_value: int = 1) -> int:
PREFETCH_STRICT = os.environ.get("PREFETCH_STRICT", "1").strip() != "0"


def _resolve_leann_mode() -> str:
    """Return the LEANN integration mode requested via ``LEANN_MODE``.

    The environment value is stripped of surrounding whitespace and
    lower-cased before it is checked.

    Returns:
        ``"mcp"`` when the variable is unset or blank, the normalized value
        when it is ``"mcp"`` or ``"none"``, and ``"mcp"`` (after printing a
        warning) for any other value.
    """
    mode = os.environ.get("LEANN_MODE", "").strip().lower()
    if not mode:
        # Unset or whitespace-only: use the default mode.
        return "mcp"
    if mode in {"mcp", "none"}:
        return mode
    # Unrecognized value: warn (showing the normalized value) and use the default.
    print(f"⚠️ Invalid LEANN_MODE={mode!r}; falling back to 'mcp'")
    return "mcp"


# Get uncommitted changes compared to the current HEAD commit.
def get_git_diff(repo_dir: Path) -> str:
try:
Expand Down Expand Up @@ -66,9 +76,12 @@ def setup_task_environment(

repo = Repo(target_dir)
repo.git.reset("--hard")
# Preserve LEANN indexes if present.
# Clear leftovers before checkout so untracked files cannot block it.
repo.git.clean("-fdx", "-e", ".leann/")
repo.git.checkout(task["base_commit"])
repo.git.reset("--hard")
# After checkout, ensure the base commit worktree is clean while preserving LEANN indexes.
repo.git.clean("-fdx", "-e", ".leann/")
(target_dir / "PROBLEM.md").write_text(task["problem_statement"], encoding="utf-8")
return target_dir

Expand Down Expand Up @@ -321,22 +334,18 @@ def build_strict_mcp_config_without_server(server_name: str) -> Optional[str]:

# Decide whether LEANN/MCP integration is available for this task repo.
def resolve_leann_integration(target_dir: Path) -> dict[str, str]:
leann_enabled = os.environ.get("LEANN_ENABLED", "1") != "0"
use_mcp = os.environ.get("LEANN_USE_MCP", "1") != "0"
requested_mode = _resolve_leann_mode()
mcp_server_name = os.environ.get("LEANN_MCP_SERVER", "leann-server")
leann_index_exists = (target_dir / ".leann").exists()

mode = "none"
if leann_enabled and leann_index_exists:
if use_mcp:
mode = "mcp"
print(
f" -> 🔍 LEANN MCP enabled (server: {mcp_server_name}, "
f"forced top_k={LEANN_TOP_K})"
)
else:
print(" -> ⚠️ LEANN_USE_MCP=0 but CLI mode is disabled; continuing without LEANN")
elif leann_enabled:
if requested_mode == "mcp" and leann_index_exists:
mode = "mcp"
print(
f" -> 🔍 LEANN MCP enabled (server: {mcp_server_name}, "
f"forced top_k={LEANN_TOP_K})"
)
elif requested_mode == "mcp":
print(" -> ⚠️ LEANN enabled but no .leann index found; continuing without LEANN")

print(mode)
Expand Down Expand Up @@ -394,7 +403,7 @@ def run_claude_autonomous(
cfg_path = resolve_claude_mcp_config(server_name)
if not cfg_path:
raise RuntimeError(
f"LEANN_USE_MCP=1 but MCP server '{server_name}' was not found in Claude config. "
f"LEANN_MODE=mcp but MCP server '{server_name}' was not found in Claude config. "
"Set CLAUDE_MCP_CONFIG_PATH or configure this server in Claude settings."
)
print(f" -> ✅ Claude MCP config found for '{server_name}': {cfg_path}")
Expand Down
71 changes: 37 additions & 34 deletions benchmarks/contextbench/scripts/batch_run_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time
from pathlib import Path

from auto_run import prefetch_task_repositories
from auto_run import prefetch_task_repositories, run_single_task
from datasets import load_dataset

ROOT = Path(__file__).resolve().parents[1]
Expand Down Expand Up @@ -46,7 +46,7 @@ def main():

api_key = os.environ.get("ANTHROPIC_API_KEY")
if api_key:
print(f"🔑 Using API key from environment: {api_key[:20]}...")
print(f"🔑 Using API key from environment: {api_key[:10]}...")
else:
print("🔐 ANTHROPIC_API_KEY not set; using Claude CLI logged-in session.")

Expand Down Expand Up @@ -88,6 +88,8 @@ def main():
else:
print("⏭️ PREFETCH_REPOS=0; skipping prefetch step.")

success_count = 0
failure_count = 0
for i, task in enumerate(selected_tasks):
instance_id = task["instance_id"]
repo_url = task["repo_url"]
Expand All @@ -96,38 +98,39 @@ def main():
print(f"📦 [{i + 1}/{len(selected_tasks)}] Running: {instance_id}")
print(f" repo: {repo_url} source: {task.get('source', '?')}")

# try:
# patch, elapsed, traj_data, usage = run_single_task(
# instance_id=instance_id,
# repo_url=repo_url,
# work_root=WORK_ROOT,
# mitm_script_path=str(MITM_SCRIPT),
# trace_dir=TRACE_DIR,
# model=MODEL,
# task=task,
# )

# result_entry = {
# "instance_id": instance_id,
# "model_patch": patch if patch else "",
# "model_name_or_path": "claude-code-cli",
# "elapsed_seconds": round(elapsed, 1),
# "traj_data": traj_data,
# "token_usage": usage,
# }

# with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
# f.write(json.dumps(result_entry) + "\n")
# print(f"✅ Result saved for {instance_id}")
# success_count += 1

# except Exception as e:
# print(f"❌ Error processing {instance_id}: {e}")
# failure_count += 1
# finally:
# cleanup_residuals()
# print("💤 Cooldown...")
# time.sleep(20)
# To run the tasks, uncomment the following code.
# try:
# patch, elapsed, traj_data, usage = run_single_task(
# instance_id=instance_id,
# repo_url=repo_url,
# work_root=WORK_ROOT,
# mitm_script_path=str(MITM_SCRIPT),
# trace_dir=TRACE_DIR,
# model=MODEL,
# task=task,
# )

# result_entry = {
# "instance_id": instance_id,
# "model_patch": patch if patch else "",
# "model_name_or_path": "claude-code-cli",
# "elapsed_seconds": round(elapsed, 1),
# "traj_data": traj_data,
# "token_usage": usage,
# }

# with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
# f.write(json.dumps(result_entry) + "\n")
# print(f"✅ Result saved for {instance_id}")
# success_count += 1

# except Exception as e:
# print(f"❌ Error processing {instance_id}: {e}")
# failure_count += 1
# finally:
# cleanup_residuals()
# print("💤 Cooldown...")
# time.sleep(20)

# print(
# f"\n✅ Finished {len(selected_tasks)} random tasks: "
Expand Down
66 changes: 33 additions & 33 deletions benchmarks/contextbench/scripts/batch_run_selected.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
MITM_SCRIPT = ROOT / "mitmproxy_addons" / "trace_recorder.py"
TRACE_DIR = ROOT / "traces" / "raw"

# Instances to run. Set instance_ids here or pass via SELECTED_IDS env var (comma-separated).
# Instances to run. Set instance IDs here, or pass them via the SELECTED_IDS env var (comma-separated).
SELECTED_IDS = [
# "SWE-Bench-Pro__python__maintenance__bugfix__19a1fba2",
# "SWE-Bench-Pro__python__maintenance__bugfix__2464eadb",
Expand All @@ -42,37 +42,37 @@
# "SWE-Bench-Pro__python__maintenance__bugfix__3cfd9a02",
# "SWE-Bench-Pro__python__maintenance__bugfix__4c132bfd",
# "SWE-Bench-Pro__python__maintenance__bugfix__7c2efe8a",
"SWE-Bench-Pro__go__maintenance__bugfix__40a717e5",
"SWE-Bench-Pro__go__maintenance__bugfix__52d866b3",
"SWE-Bench-Pro__go__maintenance__bugfix__720b4d92",
"SWE-Bench-Pro__go__maintenance__bugfix__997c7afd",
"SWE-Bench-Pro__javascript__maintenance__bugfix__82518720",
"SWE-Bench-Pro__javascript__maintenance__bugfix__e31ec45c",
"SWE-Bench-Pro__python__maintenance__bugfix__07bb383a",
"SWE-Bench-Pro__python__maintenance__bugfix__0bac5789",
"SWE-Bench-Pro__python__maintenance__bugfix__18d7bbbc",
"SWE-Bench-Pro__python__maintenance__bugfix__1cf3e889",
"SWE-Bench-Pro__python__maintenance__bugfix__20dad82b",
"SWE-Bench-Pro__python__maintenance__bugfix__20f502e0",
"SWE-Bench-Pro__python__maintenance__bugfix__509a20d9",
"SWE-Bench-Pro__python__maintenance__bugfix__53ca6a30",
"SWE-Bench-Pro__python__maintenance__bugfix__552343cd",
"SWE-Bench-Pro__python__maintenance__bugfix__5b2cf9bb",
"SWE-Bench-Pro__python__maintenance__bugfix__66e05eaa",
"SWE-Bench-Pro__python__maintenance__bugfix__6ebb54dc",
"SWE-Bench-Pro__python__maintenance__bugfix__87bfb374",
"SWE-Bench-Pro__python__maintenance__bugfix__89932d58",
"SWE-Bench-Pro__python__maintenance__bugfix__942d0b14",
"SWE-Bench-Pro__python__maintenance__bugfix__983f2896",
"SWE-Bench-Pro__python__maintenance__bugfix__a984b409",
"SWE-Bench-Pro__python__maintenance__bugfix__aa07d0c3",
"SWE-Bench-Pro__python__maintenance__bugfix__cf01f471",
"SWE-Bench-Pro__python__maintenance__bugfix__d2506f10",
"SWE-Bench-Pro__python__maintenance__bugfix__e579f2f0",
"SWE-Bench-Pro__python__maintenance__bugfix__eafb1f0b",
"SWE-Bench-Pro__python__maintenance__bugfix__ef8756b1",
"SWE-Bench-Pro__python__maintenance__bugfix__f87209f8",
"SWE-Bench-Pro__python__maintenance__bugfix__ff79bafd",
# "SWE-Bench-Pro__go__maintenance__bugfix__40a717e5",
# "SWE-Bench-Pro__go__maintenance__bugfix__52d866b3",
# "SWE-Bench-Pro__go__maintenance__bugfix__720b4d92",
# "SWE-Bench-Pro__go__maintenance__bugfix__997c7afd",
# "SWE-Bench-Pro__javascript__maintenance__bugfix__82518720",
# "SWE-Bench-Pro__javascript__maintenance__bugfix__e31ec45c",
# "SWE-Bench-Pro__python__maintenance__bugfix__07bb383a",
# "SWE-Bench-Pro__python__maintenance__bugfix__0bac5789",
# "SWE-Bench-Pro__python__maintenance__bugfix__18d7bbbc",
# "SWE-Bench-Pro__python__maintenance__bugfix__1cf3e889",
# "SWE-Bench-Pro__python__maintenance__bugfix__20dad82b",
# "SWE-Bench-Pro__python__maintenance__bugfix__20f502e0",
# "SWE-Bench-Pro__python__maintenance__bugfix__509a20d9",
# "SWE-Bench-Pro__python__maintenance__bugfix__53ca6a30",
# "SWE-Bench-Pro__python__maintenance__bugfix__552343cd",
# "SWE-Bench-Pro__python__maintenance__bugfix__5b2cf9bb",
# "SWE-Bench-Pro__python__maintenance__bugfix__66e05eaa",
# "SWE-Bench-Pro__python__maintenance__bugfix__6ebb54dc",
# "SWE-Bench-Pro__python__maintenance__bugfix__87bfb374",
# "SWE-Bench-Pro__python__maintenance__bugfix__89932d58",
# "SWE-Bench-Pro__python__maintenance__bugfix__942d0b14",
# "SWE-Bench-Pro__python__maintenance__bugfix__983f2896",
# "SWE-Bench-Pro__python__maintenance__bugfix__a984b409",
# "SWE-Bench-Pro__python__maintenance__bugfix__aa07d0c3",
# "SWE-Bench-Pro__python__maintenance__bugfix__cf01f471",
# "SWE-Bench-Pro__python__maintenance__bugfix__d2506f10",
# "SWE-Bench-Pro__python__maintenance__bugfix__e579f2f0",
# "SWE-Bench-Pro__python__maintenance__bugfix__eafb1f0b",
# "SWE-Bench-Pro__python__maintenance__bugfix__ef8756b1",
# "SWE-Bench-Pro__python__maintenance__bugfix__f87209f8",
# "SWE-Bench-Pro__python__maintenance__bugfix__ff79bafd",
]

if os.environ.get("SELECTED_IDS"):
Expand Down Expand Up @@ -105,7 +105,7 @@ def main():

api_key = os.environ.get("ANTHROPIC_API_KEY")
if api_key:
print(f"🔑 Using API key from environment: {api_key[:20]}...")
print(f"🔑 Using API key from environment: {api_key[:10]}...")
else:
print("🔐 ANTHROPIC_API_KEY not set; using Claude CLI logged-in session.")

Expand Down
Loading
Loading