diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index fdf2cd4a08..de69c8ce99 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -147,9 +147,9 @@ After all jobs are complete, you can check the results in `/eval-res "pass@1": { "num_entries": 500, "gen_seconds": 7172, - "issues_resolved": 45.0, - "no_patch": 0.4, - "patch_cant_apply": 0.8 + "issues_resolved": 48.4, + "no_patch": 1.0, + "patch_cant_apply": 1.6 } } } diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py index 368b69529f..4a75bbf7d4 100644 --- a/nemo_skills/inference/eval/swebench.py +++ b/nemo_skills/inference/eval/swebench.py @@ -249,7 +249,9 @@ def __init__(self, cfg: SweBenchGenerationConfig): # make venv & install swe-agent dependencies "uv venv --python 3.12 --managed-python venv && " "source venv/bin/activate && " - "uv pip install -e ." + "uv pip install -e . && " + # force downgrade rich - newer versions cause the swe-agent logger to hang in some instances + "uv pip install rich==14.2.0" ) elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands: diff --git a/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py b/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py index 30e6566742..2a94f19ee0 100644 --- a/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py +++ b/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py @@ -22,12 +22,12 @@ from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402 METRIC_RANGES = { - # +/- 4 pts from scores measured on 2025-10-08 (avg of 6 runs for OpenHands, 3 for SWE-agent) + # +/- 4 pts from scores measured on 2026-01-30 (avg of 3 runs) "openhands": { - ("swe-bench", "pass@1", "issues_resolved"): (41.9, 47.9), + ("swe-bench", "pass@1", "issues_resolved"): (44.3, 52.3), }, "swe_agent": { - ("swe-bench", "pass@1", "issues_resolved"): (45.5, 52.4), + ("swe-bench", "pass@1", "issues_resolved"): (45.5, 53.5), }, }