From dc50480aed4fc671bee518fe859484dd41c9953d Mon Sep 17 00:00:00 2001 From: Nikolai Ludwig Date: Fri, 30 Jan 2026 17:00:15 +0400 Subject: [PATCH 1/4] Pin rich==14.2.0 for SWE-agent Signed-off-by: Nikolai Ludwig --- nemo_skills/inference/eval/swebench.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py index 368b69529f..3181dae38c 100644 --- a/nemo_skills/inference/eval/swebench.py +++ b/nemo_skills/inference/eval/swebench.py @@ -249,7 +249,8 @@ def __init__(self, cfg: SweBenchGenerationConfig): # make venv & install swe-agent dependencies "uv venv --python 3.12 --managed-python venv && " "source venv/bin/activate && " - "uv pip install -e ." + "uv pip install -e . && " + "uv pip install rich==14.2.0" ) elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands: From a1b957cd67f2c817cd84aad3fe33f11da4adcb61 Mon Sep 17 00:00:00 2001 From: Nikolai Ludwig Date: Fri, 30 Jan 2026 17:13:18 +0400 Subject: [PATCH 2/4] Add comment explaining the rich downgrade Signed-off-by: Nikolai Ludwig --- nemo_skills/inference/eval/swebench.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py index 3181dae38c..4a75bbf7d4 100644 --- a/nemo_skills/inference/eval/swebench.py +++ b/nemo_skills/inference/eval/swebench.py @@ -250,6 +250,7 @@ def __init__(self, cfg: SweBenchGenerationConfig): "uv venv --python 3.12 --managed-python venv && " "source venv/bin/activate && " "uv pip install -e . 
&& " + # force-downgrade rich to the pinned version - newer versions cause the swe-agent logger to hang in some instances "uv pip install rich==14.2.0" ) From 259efd6857cbcc429ab67702393fcc2b855ea97a Mon Sep 17 00:00:00 2001 From: Nikolai Ludwig Date: Fri, 30 Jan 2026 17:19:15 +0400 Subject: [PATCH 3/4] Adjust expected SWE-bench results for OpenHands Signed-off-by: Nikolai Ludwig --- docs/evaluation/code.md | 6 +++--- tests/slurm-tests/qwen3coder_30b_swebench/check_results.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md index fdf2cd4a08..de69c8ce99 100644 --- a/docs/evaluation/code.md +++ b/docs/evaluation/code.md @@ -147,9 +147,9 @@ After all jobs are complete, you can check the results in `/eval-res "pass@1": { "num_entries": 500, "gen_seconds": 7172, - "issues_resolved": 45.0, - "no_patch": 0.4, - "patch_cant_apply": 0.8 + "issues_resolved": 48.4, + "no_patch": 1.0, + "patch_cant_apply": 1.6 } } } diff --git a/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py b/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py index 30e6566742..9a678bacb6 100644 --- a/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py +++ b/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py @@ -22,9 +22,9 @@ from utils import assert_all, get_nested_value, load_json, soft_assert # noqa: E402 METRIC_RANGES = { - # +/- 4 pts from scores measured on 2025-10-08 (avg of 6 runs for OpenHands, 3 for SWE-agent) + # +/- 4 pts from scores measured on 2026-01-30 (avg of 3 runs) "openhands": { - ("swe-bench", "pass@1", "issues_resolved"): (41.9, 47.9), + ("swe-bench", "pass@1", "issues_resolved"): (44.3, 52.3), }, "swe_agent": { ("swe-bench", "pass@1", "issues_resolved"): (45.5, 52.4), From d9913c957916cc30fb05a68e7f1d885e5bc4a7a8 Mon Sep 17 00:00:00 2001 From: Nikolai Ludwig Date: Fri, 30 Jan 2026 20:37:08 +0400 Subject: [PATCH 4/4] Adjust upper bound for SWE-agent Signed-off-by: Nikolai Ludwig --- 
tests/slurm-tests/qwen3coder_30b_swebench/check_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py b/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py index 9a678bacb6..2a94f19ee0 100644 --- a/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py +++ b/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py @@ -27,7 +27,7 @@ ("swe-bench", "pass@1", "issues_resolved"): (44.3, 52.3), }, "swe_agent": { - ("swe-bench", "pass@1", "issues_resolved"): (45.5, 52.4), + ("swe-bench", "pass@1", "issues_resolved"): (45.5, 53.5), }, }