3 changes: 2 additions & 1 deletion README.md
@@ -216,7 +216,8 @@ All cache data is stored under `~/.openbench`. The cache command helps you monit
| `--log-format` | `BENCH_LOG_FORMAT` | `eval` | Output logging format (eval/json) |
| `--hub-repo` | `BENCH_HUB_REPO` | `None` | Push results to a Hugging Face Hub dataset |
| `--keep-livemcp-root` | `BENCH_KEEP_LIVEMCP_ROOT` | `False` | Allow preservation of root data after livemcpbench eval runs |
-| `--code-agent` | `BENCH_CODE_AGENT` | `opencode` | Select code agent for exercism tasks |
+| `--code-agent` | `BENCH_CODE_AGENT` | `codex` | Select code agent for Exercism tasks (codex/aider/opencode/claude_code/roo) |
+| `--hidden-tests` | `BENCH_HIDDEN_TESTS` | `False` | Run Exercism agents with hidden tests |

## Grader Information

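The new default and flag from the table above can be combined on the command line or through their environment variables; a usage sketch (flag semantics as documented in the rows just added):

```bash
# Run Exercism with the new default agent (codex) and hidden tests
bench eval exercism --code-agent codex --hidden-tests

# Equivalent configuration via environment variables
BENCH_CODE_AGENT=codex BENCH_HIDDEN_TESTS=true bench eval exercism
```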
15 changes: 12 additions & 3 deletions docs/evals/exercism.mdx
@@ -21,8 +21,9 @@ The benchmark places agents in realistic development environments where they mus
- **Multiple Code Agent Harnesses**: Evaluate models across different code agent frameworks on identical tasks for fair comparison:
- **Aider** - AI-powered pair programming tool with git integration
- **OpenCode** - OpenAI-compatible code generation tool
-- **Claude** - Claude-based code editor with file system access
+- **Claude Code** - Claude-based code editor with file system access
- **Roo** - VS Code extension with interactive development
+- **Codex** - OpenAI Codex CLI coding agent

- **Realistic Development Environment**: Agents work in full file system workspaces with multiple files, build configurations, and test suites

@@ -32,7 +33,7 @@ The benchmark places agents in realistic development environments where they mus

## Usage

-Run Exercism evaluation across all languages with the default code agent (opencode):
+Run Exercism evaluation across all languages with the default code agent (codex):

```bash
bench eval exercism
@@ -41,6 +42,7 @@ bench eval exercism
Specify a different code agent:

```bash
+bench eval exercism --code-agent codex
bench eval exercism --code-agent aider
bench eval exercism --code-agent claude
bench eval exercism --code-agent roo
@@ -52,6 +54,14 @@ Evaluate with a specific model:
bench eval exercism --code-agent opencode --model groq/gpt-oss-120b
```

+Hide the Exercism test suites from the agent (tests still run after the edit loop):
+
+```bash
+bench eval exercism --hidden-tests
+```
+
+With `--hidden-tests`, the agent works inside a sanitized copy of the repo with the language-specific tests excluded, while the final verification still runs against the full workspace to ensure correctness.
+
Run evaluation for a single language:

```bash
@@ -112,4 +122,3 @@ The multi-agent design allows for unique comparative analysis:
- Compare how different models perform on identical tasks
- Evaluate which code agent harness works best for different languages and models
- Identify model strengths and weaknesses across programming paradigms
-
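The sanitized-copy behavior described for `--hidden-tests` can be pictured with a short sketch; the helper below is hypothetical (not the benchmark's actual implementation) and assumes glob patterns identify each language's test files:

```python
import shutil
from pathlib import Path

def sanitized_copy(workspace: Path, dest: Path, test_globs: list[str]) -> Path:
    """Copy an exercise workspace, excluding files that match test globs."""
    hidden = {p for g in test_globs for p in workspace.rglob(g)}
    shutil.copytree(
        workspace,
        dest,
        # copytree calls this per directory; returned names are skipped
        ignore=lambda d, names: [n for n in names if Path(d, n) in hidden],
    )
    return dest

# e.g. the agent edits sanitized_copy(Path("exercise"), Path("/tmp/hidden"), ["*_test.py"]),
# while final verification still runs in the original workspace, tests included.
```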
6 changes: 3 additions & 3 deletions docs/release-notes.mdx
@@ -104,8 +104,8 @@ Harnesses

Examples
```bash
-# Python with aider
-bench eval exercism_python --code-agent aider --model groq/llama-3.3-70b
+# Python with codex (default)
+bench eval exercism_python --code-agent codex --model openai/gpt-5

# Go with Roo (requires OpenRouter model IDs)
bench eval exercism_go --code-agent roo \
@@ -187,7 +187,7 @@ bench eval arabic_exams_general_knowledge --model groq/llama-3.3-70b

Exercism (alpha)
```bash
-bench eval exercism_python --code-agent aider --model groq/llama-3.3-70b
+bench eval exercism_python --code-agent codex --model openai/gpt-5
```

MultiChallenge
17 changes: 0 additions & 17 deletions docs/snippets/benchmarks.data.mdx
@@ -8230,23 +8230,6 @@ export const evalGroupsData = [
      "ethics_virtue"
    ]
  },
-  {
-    "name": "Exercism",
-    "description": "Aggregate of 5 Exercism coding tasks",
-    "category": "eval-group",
-    "tags": [
-      "eval-group"
-    ],
-    "id": "exercism",
-    "benchmark_count": 5,
-    "benchmarks": [
-      "exercism_go",
-      "exercism_java",
-      "exercism_javascript",
-      "exercism_python",
-      "exercism_rust"
-    ]
-  },
  {
    "name": "GLUE",
    "description": "Aggregate of 10 GLUE NLU tasks",
4 changes: 3 additions & 1 deletion packages/openbench-core/pyproject.toml
@@ -15,7 +15,9 @@ authors = [
dependencies = [
    "datasets>=3.6.0",
    "groq>=0.33.0",
-    "inspect-ai==0.3.125",
+    "inspect-ai==0.3.141",
+    "inspect_swe>=0.2.26",
+    "anthropic>=0.69.0",
    "openai>=2.0.0",
    "pillow>=10.0.0",
    "jsonschema>=4.23.0",
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -16,7 +16,9 @@ authors = [
dependencies = [
    "datasets>=3.6.0",
    "groq>=0.33.0",
-    "inspect-ai==0.3.125",
+    "inspect-ai==0.3.141",
+    "inspect_swe>=0.2.26",
+    "anthropic>=0.69.0",
    "openai>=2.0.0",
    "pillow>=10.0.0",
    "jsonschema>=4.23.0",
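Both manifests pin the same new stack. For a local environment tracking this change, the equivalent manual install would look roughly like this (a sketch; the project may prefer uv or another installer):

```bash
pip install "inspect-ai==0.3.141" "inspect_swe>=0.2.26" "anthropic>=0.69.0"
```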
19 changes: 19 additions & 0 deletions src/openbench/_cli/eval_command.py
@@ -652,6 +652,14 @@ def run_eval(
            envvar="BENCH_CODE_AGENT",
        ),
    ] = None,
+    hidden_tests: Annotated[
+        bool,
+        typer.Option(
+            "--hidden-tests",
+            help="Run code agents in a sanitized copy of the repo with Exercism tests hidden",
+            envvar="BENCH_HIDDEN_TESTS",
+        ),
+    ] = False,
) -> List[EvalLog] | None:
    """
    Run a benchmark on a model.
@@ -697,6 +705,17 @@
                    "For --code-agent roo, --model must be an OpenRouter model id prefixed with 'openrouter/'. "
                    "Example: --model openrouter/anthropic/claude-sonnet-4-20250514"
                )
+    # claude code only supports anthropic models
+    if code_agent and code_agent.lower() == "claude_code":
+        for model_name in model:
+            if not model_name.startswith("anthropic/"):
+                raise typer.BadParameter(
+                    "For claude_code, --model must be an Anthropic model id prefixed with 'anthropic/'. "
+                )
+
+    # Propagate hidden test preference to tasks that support it
+    if hidden_tests:
+        task_args["hide_tests"] = True

    # Validate model names
    for model_name in model:
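In practice, the new validation and flag behave like this (illustrative commands; the Anthropic-prefix requirement comes from the check above):

```bash
# Accepted: claude_code with an Anthropic model ID
bench eval exercism --code-agent claude_code --model anthropic/claude-sonnet-4-5-20250929

# Rejected with BadParameter: model lacks the 'anthropic/' prefix
bench eval exercism --code-agent claude_code --model groq/llama-3.3-70b

# Sets task_args["hide_tests"] = True for tasks that support it
bench eval exercism --hidden-tests
```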
8 changes: 6 additions & 2 deletions src/openbench/agents/__init__.py
@@ -8,16 +8,20 @@
from .base import BaseCodeAgent
from .aider import AiderAgent
from .opencode import OpenCodeAgent
-from .claude import ClaudeAgent
+from .gemini import GeminiAgent
from .roo import RooAgent
+from .claude import ClaudeCodeAgent
+from .codex import CodexAgent
from .manager import AgentManager
from .docker_manager import DockerManager

__all__ = [
"BaseCodeAgent",
"AiderAgent",
"OpenCodeAgent",
"ClaudeAgent",
"GeminiAgent",
"ClaudeCodeAgent",
"CodexAgent",
"RooAgent",
"AgentManager",
"DockerManager",
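A minimal consumer of the renamed exports (illustrative; it relies only on the constructors and methods shown in this diff):

```python
from openbench.agents import ClaudeCodeAgent, CodexAgent

agent = ClaudeCodeAgent()
print(agent.get_default_model())  # anthropic/claude-sonnet-4-5-20250929
print(agent.resolve_model(""))    # empty input falls back to the default model
```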
154 changes: 43 additions & 111 deletions src/openbench/agents/claude.py
@@ -1,137 +1,69 @@
"""
-Claude code agent implementation.
+Claude Code agent backed by inspect_swe.
"""

+from __future__ import annotations
+
-import os
from typing import List

+from inspect_ai.agent import AgentState
+from inspect_ai.model import ChatMessageUser, ModelOutput

+from openbench.utils.cli_commands import format_execution_output

from .base import BaseCodeAgent
-from openbench.utils.cli_commands import (
-    generate_env_setup_script,
-    write_prompt_to_file,
-    write_and_execute_script,
-    read_log_file,
-    format_execution_output,
-    get_claude_script_template,
-)
-from openbench.utils.docker import ClaudeCommands
+from inspect_swe import claude_code


-class ClaudeAgent(BaseCodeAgent):
-    """Claude-based code editor with file system access."""
+class ClaudeCodeAgent(BaseCodeAgent):
+    """Claude Code CLI agent via inspect_swe."""

-    def __init__(self):
-        super().__init__("claude")
+    def __init__(self) -> None:
+        super().__init__("claude_code")

    async def execute(self, workdir: str, prompt_text: str, model: str) -> str:
-        """Execute Claude Code CLI command.
+        """Execute Claude Code."""

-        Args:
-            workdir: Working directory path for the task
-            prompt_text: The prompt to send to claude code
-            model: Model string to use with claude code
-
-        Returns:
-            Formatted output string with claude code execution results
-        """
        try:
-            # Check for required API key
-            anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
-            if not anthropic_api_key:
-                return "ERROR: ANTHROPIC_API_KEY is not set"
-
-            # Write prompt to avoid shell quoting issues
-            if not await write_prompt_to_file(prompt_text, "claude_code_prompt.txt"):
-                return "ERROR: failed to write prompt file"
-
-            # Get environment setup script
-            env_setup = generate_env_setup_script()
-
-            # Create claude execution script
-            script_content = get_claude_script_template().format(
-                workdir=workdir, env_setup=env_setup, model=model
-            )
-
-            # Execute the script
-            result = await write_and_execute_script(
-                script_content,
-                "claude_script.sh",
-                timeout=1800,  # 30 minutes
-            )
-
-            # Read claude-specific log
-            additional_logs = []
-            claude_log = await read_log_file(
-                "/tmp/claude-code-output.log", "CLAUDE CODE", tail_lines=200
-            )
-            if claude_log:
-                additional_logs.append(claude_log)
-
-            return format_execution_output(result, additional_logs)
-
-        except Exception as e:
-            return f"ERROR: Failed to run claude code: {str(e)}"
+            claude_agent = claude_code(cwd=workdir, model=model)
+            state = AgentState(messages=[ChatMessageUser(content=prompt_text)])
+            completed_state = await claude_agent(state)
+            stdout_text = _format_agent_output(completed_state.output)
+            result = {
+                "returncode": 0,
+                "success": True,
+                "stdout": stdout_text,
+                "stderr": "",
+            }
+            return format_execution_output(result)
+        except Exception as exc:  # pragma: no cover - defensive
+            return f"ERROR: claude_code execution failed: {exc}"

    def resolve_model(self, state_model: str) -> str:
-        """Resolve the appropriate model string for Claude.
-
-        Args:
-            state_model: Model from TaskState.model
-
-        Returns:
-            Resolved model string for Claude (removes anthropic/ prefix)
-        """
-        # Claude CLI uses Anthropic models directly (remove prefix)
-        if state_model.startswith("anthropic/"):
-            return state_model[len("anthropic/") :]
-        return state_model
-
-    def get_setup_commands(self) -> List[str]:
-        """Get setup commands required by Claude.
-
-        Returns:
-            Empty list (no special setup required)
-        """
-        return []
+        """Resolve the appropriate model string for Claude Code."""
+        stripped = (state_model or "").strip()
+        return stripped if stripped else self.get_default_model()

    def get_default_model(self) -> str:
-        """Get the default model for Claude.
-
-        Returns:
-            Default model string
-        """
-        return "anthropic/claude-sonnet-4-20250514"
+        return "anthropic/claude-sonnet-4-5-20250929"

    def get_description(self) -> str:
-        """Get description of Claude.
-
-        Returns:
-            Description string
-        """
-        return "Claude cli code agent"
+        return "Claude Code agent."

    def get_dockerfile_commands(self) -> List[str]:
-        """Get Dockerfile commands to install Claude Code CLI.
-
-        Returns:
-            List of Dockerfile RUN commands
-        """
-        return ClaudeCommands.DOCKERFILE_COMMANDS
-
-    def get_base_packages(self) -> List[str]:
-        """Get base packages required by Claude.
+        return []

-        Returns:
-            List of apt package names
-        """
-        return ClaudeCommands.BASE_PACKAGES
-
-    def get_env_requirements(self) -> List[str]:
-        """Get environment variables required by Claude.
+def _format_agent_output(output: ModelOutput) -> str:
+    """Render agent output as plain text."""
+    if not output or not output.choices:
+        return "Agent completed without emitting assistant output."

-        Returns:
-            List of environment variable names
-        """
-        return ["ANTHROPIC_API_KEY"]  # Claude specifically requires Anthropic API key
+    parts: List[str] = []
+    for idx, choice in enumerate(output.choices, start=1):
+        message = choice.message
+        text = (
+            message.text.strip() if message and message.text else ""
+        ) or "(no text output)"
+        parts.append(f"[Choice {idx}] {text}")
+    return "\n\n".join(parts)