NVIDIA-NeMo · Kipok · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/docs/evaluation/code.md b/docs/evaluation/code.md
@@ -147,9 +147,9 @@ After all jobs are complete, you can check the results in `<OUTPUT_DIR>/eval-res
     "pass@1": {
       "num_entries": 500,
       "gen_seconds": 7172,
-      "issues_resolved": 45.0,
-      "no_patch": 0.4,
-      "patch_cant_apply": 0.8
+      "issues_resolved": 48.4,
+      "no_patch": 1.0,
+      "patch_cant_apply": 1.6
     }
   }
 }

diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py
@@ -249,7 +249,9 @@ def __init__(self, cfg: SweBenchGenerationConfig):
                 # make venv & install swe-agent dependencies
                 "uv venv --python 3.12 --managed-python venv && "
                 "source venv/bin/activate && "
-                "uv pip install -e ."
+                "uv pip install -e . && "
+                # pin rich to 14.2.0 (forced downgrade): newer versions cause the swe-agent logger to hang in some instances
+                "uv pip install rich==14.2.0"
             )
 
         elif self.cfg.agent_framework == SupportedAgentFrameworks.openhands:

diff --git a/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py b/tests/slurm-tests/qwen3coder_30b_swebench/check_results.py
@@ -22,12 +22,12 @@
 from utils import assert_all, get_nested_value, load_json, soft_assert  # noqa: E402
 
 METRIC_RANGES = {
-    # +/- 4 pts from scores measured on 2025-10-08 (avg of 6 runs for OpenHands, 3 for SWE-agent)
+    # +/- 4 pts from scores measured on 2026-01-30 (avg of 3 runs)
     "openhands": {
-        ("swe-bench", "pass@1", "issues_resolved"): (41.9, 47.9),
+        ("swe-bench", "pass@1", "issues_resolved"): (44.3, 52.3),
     },
     "swe_agent": {
-        ("swe-bench", "pass@1", "issues_resolved"): (45.5, 52.4),
+        ("swe-bench", "pass@1", "issues_resolved"): (45.5, 53.5),
     },
 }