diff --git a/README.md b/README.md index 8dc08e90..76a7d0d6 100644 --- a/README.md +++ b/README.md @@ -216,7 +216,8 @@ All cache data is stored under `~/.openbench`. The cache command helps you monit | `--log-format` | `BENCH_LOG_FORMAT` | `eval` | Output logging format (eval/json) | | `--hub-repo` | `BENCH_HUB_REPO` | `None` | Push results to a Hugging Face Hub dataset | | `--keep-livemcp-root` | `BENCH_KEEP_LIVEMCP_ROOT` | `False` | Allow preservation of root data after livemcpbench eval runs | -| `--code-agent` | `BENCH_CODE_AGENT` | `opencode` | Select code agent for exercism tasks | +| `--code-agent` | `BENCH_CODE_AGENT` | `codex` | Select code agent for Exercism tasks (codex/aider/opencode/claude_code/roo) | +| `--hidden-tests` | `BENCH_HIDDEN_TESTS` | `False` | Run Exercism agents with hidden tests | ## Grader Information diff --git a/docs/evals/exercism.mdx b/docs/evals/exercism.mdx index e215471e..29e9c46d 100644 --- a/docs/evals/exercism.mdx +++ b/docs/evals/exercism.mdx @@ -21,8 +21,9 @@ The benchmark places agents in realistic development environments where they mus - **Multiple Code Agent Harnesses**: Evaluate models across different code agent frameworks on identical tasks for fair comparison: - **Aider** - AI-powered pair programming tool with git integration - **OpenCode** - OpenAI-compatible code generation tool - - **Claude** - Claude-based code editor with file system access + - **Claude Code** - Claude-based code editor with file system access - **Roo** - VS Code extension with interactive development + - **Codex** - OpenAI Codex cli coding agent - **Realistic Development Environment**: Agents work in full file system workspaces with multiple files, build configurations, and test suites @@ -32,7 +33,7 @@ The benchmark places agents in realistic development environments where they mus ## Usage -Run Exercism evaluation across all languages with the default code agent (opencode): +Run Exercism evaluation across all languages with the default code agent (codex): ```bash bench eval exercism @@ -41,6 +42,7 @@ bench eval exercism Specify a different code agent: ```bash +bench eval exercism --code-agent codex bench eval exercism --code-agent aider bench eval exercism --code-agent claude bench eval exercism --code-agent roo @@ -52,6 +54,14 @@ Evaluate with a specific model: bench eval exercism --code-agent opencode --model groq/gpt-oss-120b ``` +Hide the Exercism test suites from the agent (tests still run after the edit loop): + +```bash +bench eval exercism --hidden-tests +``` + +With `--hidden-tests`, the agent works inside a sanitized copy of the repo with the language-specific tests excluded, while the final verification still runs against the full workspace to ensure correctness. 
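+
+The flag composes with the other Exercism options; for example (model shown for illustration):
+
+```bash
+bench eval exercism_python --code-agent codex --model openai/gpt-5 --hidden-tests
+```
+
+The `BENCH_HIDDEN_TESTS` environment variable (see the options table in the README) toggles the same behavior.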
+ Run evaluation for a single language: ```bash @@ -112,4 +122,3 @@ The multi-agent design allows for unique comparative analysis: - Compare how different models perform on identical tasks - Evaluate which code agent harness works best for different languages and models - Identify model strengths and weaknesses across programming paradigms - diff --git a/docs/release-notes.mdx b/docs/release-notes.mdx index a34a3168..91ff1c3f 100644 --- a/docs/release-notes.mdx +++ b/docs/release-notes.mdx @@ -104,8 +104,8 @@ Harnesses Examples ```bash -# Python with aider -bench eval exercism_python --code-agent aider --model groq/llama-3.3-70b +# Python with codex (default) +bench eval exercism_python --code-agent codex --model openai/gpt-5 # Go with Roo (requires OpenRouter model IDs) bench eval exercism_go --code-agent roo \ @@ -187,7 +187,7 @@ bench eval arabic_exams_general_knowledge --model groq/llama-3.3-70b Exercism (alpha) ```bash -bench eval exercism_python --code-agent aider --model groq/llama-3.3-70b +bench eval exercism_python --code-agent codex --model openai/gpt-5 ``` MultiChallenge diff --git a/docs/snippets/benchmarks.data.mdx b/docs/snippets/benchmarks.data.mdx index 5c36b2a1..d734fc77 100644 --- a/docs/snippets/benchmarks.data.mdx +++ b/docs/snippets/benchmarks.data.mdx @@ -8230,23 +8230,6 @@ export const evalGroupsData = [ "ethics_virtue" ] }, - { - "name": "Exercism", - "description": "Aggregate of 5 Exercism coding tasks", - "category": "eval-group", - "tags": [ - "eval-group" - ], - "id": "exercism", - "benchmark_count": 5, - "benchmarks": [ - "exercism_go", - "exercism_java", - "exercism_javascript", - "exercism_python", - "exercism_rust" - ] - }, { "name": "GLUE", "description": "Aggregate of 10 GLUE NLU tasks", diff --git a/packages/openbench-core/pyproject.toml b/packages/openbench-core/pyproject.toml index 3801a92d..67b583d3 100644 --- a/packages/openbench-core/pyproject.toml +++ b/packages/openbench-core/pyproject.toml @@ -15,7 +15,9 @@ authors = [ dependencies = [ "datasets>=3.6.0", "groq>=0.33.0", - "inspect-ai==0.3.125", + "inspect-ai==0.3.141", + "inspect_swe>=0.2.26", + "anthropic>=0.69.0", "openai>=2.0.0", "pillow>=10.0.0", "jsonschema>=4.23.0", diff --git a/pyproject.toml b/pyproject.toml index 564f59ac..a03f2d0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,9 @@ authors = [ dependencies = [ "datasets>=3.6.0", "groq>=0.33.0", - "inspect-ai==0.3.125", + "inspect-ai==0.3.141", + "inspect_swe>=0.2.26", + "anthropic>=0.69.0", "openai>=2.0.0", "pillow>=10.0.0", "jsonschema>=4.23.0", diff --git a/src/openbench/_cli/eval_command.py b/src/openbench/_cli/eval_command.py index 9d6a6651..d9f93969 100644 --- a/src/openbench/_cli/eval_command.py +++ b/src/openbench/_cli/eval_command.py @@ -652,6 +652,14 @@ def run_eval( envvar="BENCH_CODE_AGENT", ), ] = None, + hidden_tests: Annotated[ + bool, + typer.Option( + "--hidden-tests", + help="Run code agents in a sanitized copy of the repo with Exercism tests hidden", + envvar="BENCH_HIDDEN_TESTS", + ), + ] = False, ) -> List[EvalLog] | None: """ Run a benchmark on a model. @@ -697,6 +705,17 @@ def run_eval( "For --code-agent roo, --model must be an OpenRouter model id prefixed with 'openrouter/'. 
" "Example: --model openrouter/anthropic/claude-sonnet-4-20250514" ) + # claude code only supports anthropic models + if code_agent and code_agent.lower() == "claude_code": + for model_name in model: + if not model_name.startswith("anthropic/"): + raise typer.BadParameter( + "For claude_code, --model must be an Anthropic model id prefixed with 'anthropic/'. " + ) + + # Propagate hidden test preference to tasks that support it + if hidden_tests: + task_args["hide_tests"] = True # Validate model names for model_name in model: diff --git a/src/openbench/agents/__init__.py b/src/openbench/agents/__init__.py index 7f22c92b..674fe48e 100644 --- a/src/openbench/agents/__init__.py +++ b/src/openbench/agents/__init__.py @@ -8,8 +8,10 @@ from .base import BaseCodeAgent from .aider import AiderAgent from .opencode import OpenCodeAgent -from .claude import ClaudeAgent +from .gemini import GeminiAgent from .roo import RooAgent +from .claude import ClaudeCodeAgent +from .codex import CodexAgent from .manager import AgentManager from .docker_manager import DockerManager @@ -17,7 +19,9 @@ "BaseCodeAgent", "AiderAgent", "OpenCodeAgent", - "ClaudeAgent", + "GeminiAgent", + "ClaudeCodeAgent", + "CodexAgent", "RooAgent", "AgentManager", "DockerManager", diff --git a/src/openbench/agents/claude.py b/src/openbench/agents/claude.py index 030073bf..5fb9d375 100644 --- a/src/openbench/agents/claude.py +++ b/src/openbench/agents/claude.py @@ -1,137 +1,69 @@ """ -Claude code agent implementation. +Claude Code agent backed by inspect_swe. """ from __future__ import annotations -import os from typing import List +from inspect_ai.agent import AgentState +from inspect_ai.model import ChatMessageUser, ModelOutput + +from openbench.utils.cli_commands import format_execution_output + from .base import BaseCodeAgent -from openbench.utils.cli_commands import ( - generate_env_setup_script, - write_prompt_to_file, - write_and_execute_script, - read_log_file, - format_execution_output, - get_claude_script_template, -) -from openbench.utils.docker import ClaudeCommands +from inspect_swe import claude_code -class ClaudeAgent(BaseCodeAgent): - """Claude-based code editor with file system access.""" +class ClaudeCodeAgent(BaseCodeAgent): + """Claude Code CLI agent via inspect_swe.""" - def __init__(self): - super().__init__("claude") + def __init__(self) -> None: + super().__init__("claude_code") async def execute(self, workdir: str, prompt_text: str, model: str) -> str: - """Execute Claude Code CLI command. 
+ """Execute Claude Code.""" - Args: - workdir: Working directory path for the task - prompt_text: The prompt to send to claude code - model: Model string to use with claude code - - Returns: - Formatted output string with claude code execution results - """ try: - # Check for required API key - anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") - if not anthropic_api_key: - return "ERROR: ANTHROPIC_API_KEY is not set" - - # Write prompt to avoid shell quoting issues - if not await write_prompt_to_file(prompt_text, "claude_code_prompt.txt"): - return "ERROR: failed to write prompt file" - - # Get environment setup script - env_setup = generate_env_setup_script() - - # Create claude execution script - script_content = get_claude_script_template().format( - workdir=workdir, env_setup=env_setup, model=model - ) - - # Execute the script - result = await write_and_execute_script( - script_content, - "claude_script.sh", - timeout=1800, # 30 minutes - ) - - # Read claude-specific log - additional_logs = [] - claude_log = await read_log_file( - "/tmp/claude-code-output.log", "CLAUDE CODE", tail_lines=200 - ) - if claude_log: - additional_logs.append(claude_log) - - return format_execution_output(result, additional_logs) - - except Exception as e: - return f"ERROR: Failed to run claude code: {str(e)}" + claude_agent = claude_code(cwd=workdir, model=model) + state = AgentState(messages=[ChatMessageUser(content=prompt_text)]) + completed_state = await claude_agent(state) + stdout_text = _format_agent_output(completed_state.output) + result = { + "returncode": 0, + "success": True, + "stdout": stdout_text, + "stderr": "", + } + return format_execution_output(result) + except Exception as exc: # pragma: no cover - defensive + return f"ERROR: claude_code execution failed: {exc}" def resolve_model(self, state_model: str) -> str: - """Resolve the appropriate model string for Claude. - - Args: - state_model: Model from TaskState.model - - Returns: - Resolved model string for Claude (removes anthropic/ prefix) - """ - # Claude CLI uses Anthropic models directly (remove prefix) - if state_model.startswith("anthropic/"): - return state_model[len("anthropic/") :] - return state_model - - def get_setup_commands(self) -> List[str]: - """Get setup commands required by Claude. - - Returns: - Empty list (no special setup required) - """ - return [] + """Resolve the appropriate model string for Claude Code.""" + stripped = (state_model or "").strip() + return stripped if stripped else self.get_default_model() def get_default_model(self) -> str: - """Get the default model for Claude. - - Returns: - Default model string - """ - return "anthropic/claude-sonnet-4-20250514" + return "anthropic/claude-sonnet-4-5-20250929" def get_description(self) -> str: - """Get description of Claude. - - Returns: - Description string - """ - return "Claude cli code agent" + return "Claude Code agent." def get_dockerfile_commands(self) -> List[str]: - """Get Dockerfile commands to install Claude Code CLI. - - Returns: - List of Dockerfile RUN commands - """ - return ClaudeCommands.DOCKERFILE_COMMANDS - - def get_base_packages(self) -> List[str]: - """Get base packages required by Claude. + return [] - Returns: - List of apt package names - """ - return ClaudeCommands.BASE_PACKAGES - def get_env_requirements(self) -> List[str]: - """Get environment variables required by Claude. 
+def _format_agent_output(output: ModelOutput) -> str: + """Render agent output as plain text.""" + if not output or not output.choices: + return "Agent completed without emitting assistant output." - Returns: - List of environment variable names - """ - return ["ANTHROPIC_API_KEY"] # Claude specifically requires Anthropic API key + parts: List[str] = [] + for idx, choice in enumerate(output.choices, start=1): + message = choice.message + text = ( + message.text.strip() if message and message.text else "" + ) or "(no text output)" + parts.append(f"[Choice {idx}] {text}") + return "\n\n".join(parts) diff --git a/src/openbench/agents/codex.py b/src/openbench/agents/codex.py new file mode 100644 index 00000000..6b72ee7a --- /dev/null +++ b/src/openbench/agents/codex.py @@ -0,0 +1,67 @@ +""" +Codex CLI agent backed by inspect_swe. +""" + +from __future__ import annotations + +from typing import List + +from inspect_ai.agent import AgentState +from inspect_ai.model import ChatMessageUser, ModelOutput + +from openbench.utils.cli_commands import format_execution_output + +from .base import BaseCodeAgent +from inspect_swe import codex_cli + + +class CodexAgent(BaseCodeAgent): + """Codex CLI agent via inspect_swe.""" + + def __init__(self) -> None: + super().__init__("codex") + + async def execute(self, workdir: str, prompt_text: str, model: str) -> str: + """Execute Codex CLI agent.""" + try: + codex_agent = codex_cli(cwd=workdir, model=model, model_config=model) + state = AgentState(messages=[ChatMessageUser(content=prompt_text)]) + completed_state = await codex_agent(state) + stdout_text = _format_agent_output(completed_state.output) + result = { + "returncode": 0, + "success": True, + "stdout": stdout_text, + "stderr": "", + } + return format_execution_output(result) + except Exception as exc: # pragma: no cover - defensive + return f"ERROR: codex execution failed: {exc}" + + def resolve_model(self, state_model: str) -> str: + stripped = (state_model or "").strip() + return stripped if stripped else self.get_default_model() + + def get_default_model(self) -> str: + return "openai/gpt-5" + + def get_description(self) -> str: + return "Codex CLI agent" + + def get_dockerfile_commands(self) -> List[str]: + return [] + + +def _format_agent_output(output: ModelOutput) -> str: + """Render inspect_swe agent output as plain text.""" + if not output or not output.choices: + return "Agent completed without emitting assistant output." + + parts: List[str] = [] + for idx, choice in enumerate(output.choices, start=1): + message = choice.message + text = ( + message.text.strip() if message and message.text else "" + ) or "(no text output)" + parts.append(f"[Choice {idx}] {text}") + return "\n\n".join(parts) diff --git a/src/openbench/agents/gemini.py b/src/openbench/agents/gemini.py new file mode 100644 index 00000000..db84242a --- /dev/null +++ b/src/openbench/agents/gemini.py @@ -0,0 +1,130 @@ +""" +Gemini CLI agent implementation. 
+""" + +from __future__ import annotations + +import os +from typing import List + +from .base import BaseCodeAgent +from openbench.utils.cli_commands import ( + generate_env_setup_script, + write_prompt_to_file, + write_and_execute_script, + read_log_file, + format_execution_output, + get_gemini_script_template, +) +from openbench.utils.docker import GeminiCommands + + +class GeminiAgent(BaseCodeAgent): + """Google Gemini CLI code generation tool.""" + + def __init__(self): + super().__init__("gemini") + + async def execute(self, workdir: str, prompt_text: str, model: str) -> str: + """Execute Gemini CLI command. + + Args: + workdir: Working directory path for the task + prompt_text: The prompt to send to gemini + model: Model string to use with gemini + + Returns: + Formatted output string with gemini execution results + """ + try: + if not await write_prompt_to_file(prompt_text, "gemini_prompt.txt"): + return "ERROR: failed to write prompt file" + + # Get environment setup script + env_setup = generate_env_setup_script() + + # Create gemini execution script + script_content = get_gemini_script_template().format( + workdir=workdir, env_setup=env_setup, model=model + ) + + # Execute the script + result = await write_and_execute_script( + script_content, + "gemini_script.sh", + timeout=1800, # 30 minutes + ) + + additional_logs = [] + gemini_log = await read_log_file( + "/tmp/gemini-output.log", "GEMINI", tail_lines=200 + ) + if gemini_log: + additional_logs.append(gemini_log) + + return format_execution_output(result, additional_logs) + + except Exception as e: + return f"ERROR: Failed to run gemini: {str(e)}" + + def resolve_model(self, state_model: str) -> str: + """Resolve the appropriate model string for Gemini CLI. + + Args: + state_model: Model from TaskState.model + + Returns: + Resolved model string for Gemini CLI + """ + if state_model.startswith("google/"): + return state_model[7:] + + return state_model + + def get_setup_commands(self) -> List[str]: + """Get setup commands required by Gemini CLI. + + Returns: + Empty list (no special setup required) + """ + return [] + + def get_default_model(self) -> str: + """Get the default model for Gemini CLI. + + Returns: + Default model string + """ + return os.getenv("BENCH_MODEL", "google/gemini-2.5-pro") + + def get_description(self) -> str: + """Get description of Gemini CLI. + + Returns: + Description string + """ + return "gemini cli code agent" + + def get_dockerfile_commands(self) -> List[str]: + """Get Dockerfile commands to install Gemini CLI. + + Returns: + List of Dockerfile RUN commands + """ + return GeminiCommands.DOCKERFILE_COMMANDS + + def get_base_packages(self) -> List[str]: + """Get base packages required by Gemini CLI. + + Returns: + List of apt package names + """ + return GeminiCommands.BASE_PACKAGES + + def get_env_requirements(self) -> List[str]: + """Get environment variables required by Gemini CLI. 
+ + Returns: + List of environment variable names + """ + return ["GEMINI_API_KEY"] diff --git a/src/openbench/agents/manager.py b/src/openbench/agents/manager.py index 93b12635..fd266651 100644 --- a/src/openbench/agents/manager.py +++ b/src/openbench/agents/manager.py @@ -9,8 +9,10 @@ from .base import BaseCodeAgent from .aider import AiderAgent from .opencode import OpenCodeAgent -from .claude import ClaudeAgent +from .gemini import GeminiAgent from .roo import RooAgent +from .claude import ClaudeCodeAgent +from .codex import CodexAgent class AgentManager: @@ -19,7 +21,9 @@ class AgentManager: _agents: Dict[str, Callable[[], BaseCodeAgent]] = { "aider": AiderAgent, "opencode": OpenCodeAgent, - "claude": ClaudeAgent, + "gemini": GeminiAgent, + "claude_code": ClaudeCodeAgent, + "codex": CodexAgent, "roo": RooAgent, } @@ -178,7 +182,7 @@ def get_help_text(cls) -> str: Formatted help text describing all available code agents """ agent_names = cls.get_supported_agents() - default_agent = "opencode" + default_agent = "codex" help_text = f"CLI code agent to use for code evaluation. Options: {', '.join(agent_names)} (default: {default_agent})" return help_text diff --git a/src/openbench/config.py b/src/openbench/config.py index 0d146ab2..9070ee7d 100644 --- a/src/openbench/config.py +++ b/src/openbench/config.py @@ -6703,17 +6703,6 @@ def get_eval_metadata(path_like: str) -> BenchmarkMetadata | None: "arabic_exams_social_science_primary_school", ], ), - "exercism": EvalGroup( - name="Exercism", - description="Aggregate of 5 Exercism coding tasks", - benchmarks=[ - "exercism_go", - "exercism_java", - "exercism_javascript", - "exercism_python", - "exercism_rust", - ], - ), "anli": EvalGroup( name="ANLI", description="Aggregate of 3 ANLI rounds", diff --git a/src/openbench/evals/exercism/exercism.py b/src/openbench/evals/exercism/exercism.py index c641f948..acf84918 100644 --- a/src/openbench/evals/exercism/exercism.py +++ b/src/openbench/evals/exercism/exercism.py @@ -16,6 +16,7 @@ from openbench.scorers.exercism import exercism_scorer from openbench.agents import AgentManager from openbench.agents.docker_manager import DockerManager +from openbench.utils.text import EXERCISM_HIDDEN_TEST_PROMPT TASK_DIR = Path(__file__).parent @@ -25,7 +26,8 @@ @task def exercism( languages: Optional[List[str]] = None, - code_agent: str = "opencode", + code_agent: str = "codex", + hide_tests: bool = False, ) -> Task: """ Exercism: Multi-language coding benchmark. @@ -37,8 +39,10 @@ def exercism( languages: List of programming languages to include (python, go, javascript, java, rust). If None, includes all supported languages. code_agent: CLI code agent to use for code evaluation. - Defaults to 'opencode'. Can also be set via --code-agent flag. - Valid options: aider, opencode, claude, roo + Defaults to 'codex'. Can also be set via --code-agent flag. + Valid options: codex, aider, opencode, claude_code, roo + hide_tests: When True, run the agent in a sanitized copy that excludes + Exercism test suites. Tests still execute against the full repo. 
Returns: Task configured for Exercism evaluation @@ -60,6 +64,9 @@ def exercism( if not hasattr(sample, "metadata") or sample.metadata is None: sample.metadata = {} sample.metadata["code_agent"] = code_agent + sample.metadata["hide_tests"] = hide_tests + if hide_tests: + sample.input = EXERCISM_HIDDEN_TEST_PROMPT # Determine task name based on languages if languages and len(languages) == 1: @@ -81,55 +88,57 @@ def exercism( @task -def exercism_python(code_agent: str = "opencode") -> Task: +def exercism_python(code_agent: str = "codex", hide_tests: bool = False) -> Task: """ Exercism: Python coding tasks only. Returns: Task configured for Python-only Exercism evaluation """ - return exercism(languages=["python"], code_agent=code_agent) + return exercism(languages=["python"], code_agent=code_agent, hide_tests=hide_tests) @task -def exercism_javascript(code_agent: str = "opencode") -> Task: +def exercism_javascript(code_agent: str = "codex", hide_tests: bool = False) -> Task: """ Exercism: JavaScript coding tasks only. Returns: Task configured for JavaScript-only Exercism evaluation """ - return exercism(languages=["javascript"], code_agent=code_agent) + return exercism( + languages=["javascript"], code_agent=code_agent, hide_tests=hide_tests + ) @task -def exercism_go(code_agent: str = "opencode") -> Task: +def exercism_go(code_agent: str = "codex", hide_tests: bool = False) -> Task: """ Exercism: Go coding tasks only. Returns: Task configured for Go-only Exercism evaluation """ - return exercism(languages=["go"], code_agent=code_agent) + return exercism(languages=["go"], code_agent=code_agent, hide_tests=hide_tests) @task -def exercism_java(code_agent: str = "opencode") -> Task: +def exercism_java(code_agent: str = "codex", hide_tests: bool = False) -> Task: """ Exercism: Java coding tasks only. Returns: Task configured for Java-only Exercism evaluation """ - return exercism(languages=["java"], code_agent=code_agent) + return exercism(languages=["java"], code_agent=code_agent, hide_tests=hide_tests) @task -def exercism_rust(code_agent: str = "opencode") -> Task: +def exercism_rust(code_agent: str = "codex", hide_tests: bool = False) -> Task: """ Exercism: Rust coding tasks only. Returns: Task configured for Rust-only Exercism evaluation """ - return exercism(languages=["rust"], code_agent=code_agent) + return exercism(languages=["rust"], code_agent=code_agent, hide_tests=hide_tests) diff --git a/src/openbench/provider_config.py b/src/openbench/provider_config.py index 91e9ef16..c766617f 100644 --- a/src/openbench/provider_config.py +++ b/src/openbench/provider_config.py @@ -188,6 +188,7 @@ def get_all_env_vars(self) -> List[str]: display_name="Google AI", api_key_env="GOOGLE_API_KEY", base_url="https://generativelanguage.googleapis.com/v1beta", + additional_env_vars=["GEMINI_API_KEY"], supports_vision=True, supports_function_calling=True, ), diff --git a/src/openbench/solvers/exercism_solver.py b/src/openbench/solvers/exercism_solver.py index afed638e..5c7b2f96 100644 --- a/src/openbench/solvers/exercism_solver.py +++ b/src/openbench/solvers/exercism_solver.py @@ -1,24 +1,28 @@ """ Unified CLI solver for exercism tasks that supports multiple code agents. -This solver provides a unified interface for different CLI code agents (aider, opencode, claude, roo) +This solver provides a unified interface for different CLI code agents +(codex, aider, opencode, claude_code, roo) and selects the appropriate tool based on the --code-agent flag or task arguments. 
Supported code agents: +- codex: Codex CLI agent powered by inspect_swe (default) - aider: AI-powered pair programming tool with git integration - opencode: OpenAI-compatible code generation tool -- claude: Claude-based code editor with file system access +- claude_code: Claude Code editor powered by inspect_swe - roo: Roo extension for VS Code with interactive development Usage: - openbench eval exercism --code-agent aider --model groq/llama-3.1-70b + openbench eval exercism --code-agent codex --model openai/gpt-5 openbench eval exercism --code-agent opencode --model openai/gpt-4o-mini - openbench eval exercism --code-agent claude --model anthropic/claude-sonnet-4-20250514 + openbench eval exercism --code-agent claude_code --model anthropic/claude-sonnet-4-5-20250929 + openbench eval exercism --code-agent aider --model groq/llama-3.1-70b openbench eval exercism --code-agent roo --model openrouter/anthropic/claude-sonnet-4-20250514 """ from __future__ import annotations +from typing import Any, Dict from inspect_ai.solver import Solver, TaskState, solver from openbench.utils.cli_commands import ( @@ -26,6 +30,8 @@ run_setup_commands, run_final_test, format_solver_output, + prepare_hidden_workspace, + sync_agent_workspace, ) from openbench.agents import AgentManager @@ -39,8 +45,8 @@ def exercism_solver() -> Solver: the appropriate tool based on the code agent specified in task arguments. The code agent can be specified via: - - CLI flag: --code-agent aider|opencode|claude|roo - - Defaults to 'aider' if not specified + - CLI flag: --code-agent codex|aider|opencode|claude_code|roo + - Defaults to 'codex' if not specified Returns: Solver function that handles the task execution @@ -63,13 +69,13 @@ async def solve(state: TaskState, generate) -> TaskState: # type: ignore[overri if not isinstance(setup_commands, list): setup_commands = [] - code_agent = state.metadata.get("code_agent", "aider") + code_agent = state.metadata.get("code_agent", "codex") # Validate code agent input if isinstance(code_agent, list) and len(code_agent) > 0: code_agent = code_agent[0] elif not isinstance(code_agent, str): - code_agent = "aider" + code_agent = "codex" code_agent = code_agent.lower() @@ -88,19 +94,49 @@ async def solve(state: TaskState, generate) -> TaskState: # type: ignore[overri ) return state - workdir = f"/workspace/{language}/{task_name}" + full_workdir = f"/workspace/{language}/{task_name}" + agent_workdir = full_workdir + hide_tests = bool(state.metadata.get("hide_tests")) + sync_context: Dict[str, Any] | None = None + + if hide_tests: + prep_result = await prepare_hidden_workspace(language, task_name) + if not prep_result.get("success"): + stderr = prep_result.get("stderr") or "unknown error" + state.output.completion = ( + f"ERROR: Failed to prepare hidden workspace: {stderr}" + ) + return state + agent_workdir = str(prep_result.get("agent_dir", full_workdir)) + sync_context = { + "hidden_paths": prep_result.get("hidden_paths", {}), + } + prompt_text = state.input_text # Run any language-specific setup commands inside the task directory - setup_out = await run_setup_commands(setup_commands, workdir) + setup_out = await run_setup_commands(setup_commands, agent_workdir) agent = AgentManager.get_agent(code_agent) model = agent.resolve_model_with_fallback(str(state.model)) - code_agent_out = await agent.execute(workdir, prompt_text, model) + code_agent_out = await agent.execute(agent_workdir, prompt_text, model) - test_out = await run_final_test(test_command, workdir) + if hide_tests and 
sync_context is not None: + sync_result = await sync_agent_workspace( + agent_workdir, + full_workdir, + dict(sync_context.get("hidden_paths", {})), + ) + if not sync_result.get("success"): + stderr = sync_result.get("stderr") or "unknown error" + state.output.completion = ( + f"ERROR: Failed to sync hidden workspace: {stderr}" + ) + return state + + test_out = await run_final_test(test_command, full_workdir) state.output.completion = format_solver_output( code_agent, setup_out, code_agent_out, test_out diff --git a/src/openbench/utils/cli_commands.py b/src/openbench/utils/cli_commands.py index f9a94c2a..d865b39a 100644 --- a/src/openbench/utils/cli_commands.py +++ b/src/openbench/utils/cli_commands.py @@ -1,18 +1,26 @@ """ Utility functions for CLI-based solvers that run tasks inside Docker sandboxes. -This module provides common functionality for different CLI code agents (aider, opencode, roo) +This module provides common functionality for different CLI code agents +(aider, opencode, claude_code, codex, roo) including repository management, environment setup, and command execution. """ from __future__ import annotations +import json +import os import re +import shlex from typing import Any, Dict, List, Optional from inspect_ai.util import sandbox from openbench.provider_config import ProviderManager +from openbench.utils.text import DISCOVER_TEST_FILES_SCRIPT + + +AGENT_WORKSPACE_ROOT = "/workspace/.agent_workspaces" # ============================================================================= @@ -47,6 +55,139 @@ async def ensure_repo_and_task(language: str, task_name: str) -> bool: return False +def _normalize_relative_path(path: str) -> str: + """Normalize a relative path for rsync exclude usage.""" + if not path: + return "" + normalized = os.path.normpath(path) + if normalized == ".": + return "" + return normalized.lstrip("./") + + +def _build_exclude_flags(hidden_paths: Dict[str, List[str]]) -> List[str]: + """Convert hidden path lists into rsync --exclude flags.""" + flags: List[str] = [] + + for file_path in hidden_paths.get("files", []): + rel = _normalize_relative_path(file_path) + if not rel: + continue + flags.append(f"--exclude={shlex.quote(rel)}") + + for dir_path in hidden_paths.get("dirs", []): + rel = _normalize_relative_path(dir_path) + if not rel: + continue + rel_dir = rel.rstrip("/") + flags.append(f"--exclude={shlex.quote(rel_dir)}") + flags.append(f"--exclude={shlex.quote(rel_dir + '/**')}") + + return flags + + +async def discover_hidden_paths(full_dir: str) -> Dict[str, Any]: + """Detect test files and directories.""" + script = DISCOVER_TEST_FILES_SCRIPT.format(root_dir=full_dir) + + result = await sandbox().exec( + cmd=["bash", "-lc", script], + timeout=120, + ) + + if result.returncode != 0: + return { + "success": False, + "stdout": result.stdout or "", + "stderr": result.stderr or "failed to detect hidden paths", + } + + stdout = result.stdout.strip() or "{}" + try: + hidden_paths = json.loads(stdout) + hidden_paths.setdefault("dirs", []) + hidden_paths.setdefault("files", []) + except json.JSONDecodeError as exc: + return { + "success": False, + "stdout": stdout, + "stderr": f"failed to parse hidden path output: {exc}", + } + + return { + "success": True, + "hidden_paths": hidden_paths, + } + + +async def prepare_hidden_workspace(language: str, task_name: str) -> Dict[str, Any]: + """Create a sanitized workspace copy with tests removed.""" + full_dir = f"/workspace/{language}/{task_name}" + agent_dir = os.path.join(AGENT_WORKSPACE_ROOT, language, 
task_name) + discovery = await discover_hidden_paths(full_dir) + if not discovery.get("success"): + return discovery + + hidden_paths: Dict[str, List[str]] = discovery.get("hidden_paths", {}) + exclude_flags = _build_exclude_flags(hidden_paths) + exclude_args = " ".join(exclude_flags) + + if exclude_args: + rsync_cmd = f'rsync -a --delete {exclude_args} "{full_dir}/" "{agent_dir}/"' + else: + rsync_cmd = f'rsync -a --delete "{full_dir}/" "{agent_dir}/"' + + script_lines = [ + "set -euo pipefail", + f"rm -rf {shlex.quote(agent_dir)}", + f"mkdir -p {shlex.quote(agent_dir)}", + rsync_cmd, + ] + + result = await sandbox().exec( + cmd=["bash", "-lc", "\n".join(script_lines)], + timeout=240, + ) + + return { + "success": result.returncode == 0, + "agent_dir": agent_dir, + "hidden_paths": hidden_paths, + "stdout": result.stdout or "", + "stderr": result.stderr or "", + } + + +async def sync_agent_workspace( + agent_dir: str, full_dir: str, hidden_paths: Dict[str, List[str]] +) -> Dict[str, Any]: + """Propagate changes from the sanitized workspace back into the full repo for grading.""" + exclude_flags = _build_exclude_flags(hidden_paths) + exclude_args = " ".join(exclude_flags) + if exclude_args: + rsync_cmd = f'rsync -a --delete {exclude_args} "{agent_dir}/" "{full_dir}/"' + else: + rsync_cmd = f'rsync -a --delete "{agent_dir}/" "{full_dir}/"' + + script_lines = [ + "set -euo pipefail", + f"test -d {shlex.quote(agent_dir)}", + f"test -d {shlex.quote(full_dir)}", + rsync_cmd, + ] + + result = await sandbox().exec( + cmd=["bash", "-lc", "\n".join(script_lines)], + timeout=240, + ) + + return { + "success": result.returncode == 0, + "stdout": result.stdout or "", + "stderr": result.stderr or "", + } + + async def run_setup_commands(setup_commands: List[str], workdir: str) -> str: """Run optional language-specific setup commands inside the task directory. @@ -392,11 +533,11 @@ def get_opencode_script_template() -> str: """ -def get_claude_script_template() -> str: - """Get the Claude Code execution script template. +def get_gemini_script_template() -> str: + """Get the Gemini CLI execution script template. 
Returns: - Claude Code script template with placeholders + Gemini script template with placeholders """ return """#!/bin/bash set +e @@ -404,18 +545,21 @@ def get_claude_script_template() -> str: cd {workdir} # Read the prompt from file -PROMPT=$(cat /tmp/claude_code_prompt.txt) +PROMPT=$(cat /tmp/gemini_prompt.txt) {env_setup} -echo "Running Claude Code with prompt: $PROMPT" -echo "Model: {model}" +# Map GOOGLE_API_KEY to GEMINI_API_KEY if GEMINI_API_KEY is not set +if [ -z "$GEMINI_API_KEY" ] && [ -n "$GOOGLE_API_KEY" ]; then + export GEMINI_API_KEY="$GOOGLE_API_KEY" + echo "Mapped GOOGLE_API_KEY to GEMINI_API_KEY" +fi + +echo "Running Gemini CLI with prompt: $PROMPT" echo "Working directory: $(pwd)" -echo "$PROMPT" | claude -p --model "{model}" \ - --permission-mode acceptEdits \ - --allowedTools "Bash(*)" "Read" "Edit" \ - 2>&1 | tee /tmp/claude-code-output.log +# Run Gemini with --yolo flag to enable automatic file editing +gemini --model {model} -p "$PROMPT" --yolo 2>&1 | tee /tmp/gemini-output.log """ @@ -592,8 +736,10 @@ def format_solver_output( code_agent_section_map = { "aider": "AIDER_OUTPUT", "opencode": "OPENCODE_OUTPUT", - "claude": "CLAUDE_CODE_OUTPUT", + "claude_code": "CLAUDE_CODE_OUTPUT", + "codex": "CODEX_CLI_OUTPUT", "roo": "ROO_CLI_EXECUTION", + "gemini": "GEMINI_OUTPUT", } code_agent_section = code_agent_section_map.get( diff --git a/src/openbench/utils/docker.py b/src/openbench/utils/docker.py index c210d986..6f4e6047 100644 --- a/src/openbench/utils/docker.py +++ b/src/openbench/utils/docker.py @@ -61,6 +61,7 @@ "jq", "xz-utils", "bash", + "rsync", ] # Language runtime packages for exercism support @@ -105,10 +106,10 @@ class OpenCodeCommands: BASE_PACKAGES = ["curl", "gnupg", "ca-certificates"] -class ClaudeCommands: - """Docker commands for Claude agent installation.""" +class GeminiCommands: + """Docker commands for Gemini CLI agent installation.""" - DOCKERFILE_COMMANDS = ["RUN npm install -g @anthropic-ai/claude-code"] + DOCKERFILE_COMMANDS = ["RUN npm install -g @google/gemini-cli@latest"] BASE_PACKAGES = ["curl", "gnupg", "ca-certificates"] @@ -250,7 +251,9 @@ def get_agent_docker_commands(agent_name: str) -> List[str]: commands_map = { "aider": AiderCommands.DOCKERFILE_COMMANDS, "opencode": OpenCodeCommands.DOCKERFILE_COMMANDS, - "claude": ClaudeCommands.DOCKERFILE_COMMANDS, + "gemini": GeminiCommands.DOCKERFILE_COMMANDS, + "claude_code": [], + "codex": [], "roo": RooCommands.get_dockerfile_commands(), } @@ -275,7 +278,9 @@ def get_agent_base_packages(agent_name: str) -> List[str]: packages_map = { "aider": AiderCommands.BASE_PACKAGES, "opencode": OpenCodeCommands.BASE_PACKAGES, - "claude": ClaudeCommands.BASE_PACKAGES, + "gemini": GeminiCommands.BASE_PACKAGES, + "claude_code": [], + "codex": [], "roo": RooCommands.BASE_PACKAGES, } diff --git a/src/openbench/utils/text.py b/src/openbench/utils/text.py index 2cb13ec0..3f8dc5c9 100644 --- a/src/openbench/utils/text.py +++ b/src/openbench/utils/text.py @@ -208,6 +208,19 @@ def normalize_mcq_answer(extracted_answer: str) -> str: Please return only the generated summary text, without any additional titles or preambles. """ + +EXERCISM_HIDDEN_TEST_PROMPT = """ +Your job is to complete a coding exercise described in the markdown files inside the `docs` directory. + +A file with the implementation stubbed out has been created for you, along with the surrounding project scaffolding. + +To successfully complete the exercise, implement the required functionality so the solution satisfies the specification. 
+ +Tests have been hidden from you and you will not be able to run them. Complete the exercise to the best of your ability based on the instructions. + +You should start by reading the files in the `docs` directory so that you understand the exercise, and then examine the stubbed out implementation. +""".strip() + LIVEMCPBENCH_VERDICT_PATTERN = re.compile( r"Thoughts:\s*(.+?)\s*Status:\s*(\w+)", re.DOTALL ) @@ -217,7 +230,7 @@ def normalize_mcq_answer(extracted_answer: str) -> str: {question} -Remember to show your work clearly and end with ‘ANSWER: X’ where X is your final numerical answer. +Remember to show your work clearly and end with 'ANSWER: X' where X is your final numerical answer. """ @@ -589,3 +602,40 @@ def get_chatml_tok_cnt(chat_messages_str: str) -> int: """ # Expected SHA-256 hash for factscore db FACTSCORE_DB_SHA256 = "31cf7b6b4465459844bb00f3a6ac75560fc7d1525112205a21859323dc5d33d7" + +# Python script template for discovering test files and directories in exercism tasks +DISCOVER_TEST_FILES_SCRIPT = """python3 - <<'PY' +import json +import os + +root = {root_dir!r} +root = os.path.abspath(root) +dir_matches = set() +file_matches = set() + +for current, dirnames, filenames in os.walk(root): + rel_current = os.path.relpath(current, root) + rel_current = "" if rel_current == "." else rel_current + + keep_dirs = [] + for dirname in dirnames: + rel_path = os.path.join(rel_current, dirname) if rel_current else dirname + if "test" in dirname.lower(): + dir_matches.add(rel_path) + else: + keep_dirs.append(dirname) + dirnames[:] = keep_dirs + + for filename in filenames: + lowered = filename.lower() + if "test" in lowered or lowered.endswith(".spec.js"): + rel_path = os.path.join(rel_current, filename) if rel_current else filename + file_matches.add(rel_path) + +result = {{ + "dirs": sorted(dir_matches), + "files": sorted(file_matches), +}} + +print(json.dumps(result)) +PY""" diff --git a/tests/test_exercism_cli.py b/tests/test_exercism_cli.py new file mode 100644 index 00000000..505e6fc5 --- /dev/null +++ b/tests/test_exercism_cli.py @@ -0,0 +1,470 @@ +"""Tests for Exercism-specific CLI helpers.""" + +from __future__ import annotations + +import asyncio +import subprocess +from types import SimpleNamespace + +import pytest + +from openbench.utils import cli_commands + + +class _RecordingSandbox: + def __init__(self) -> None: + self.commands: list[list[str]] = [] + + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + self.commands.append(cmd) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + +class _LocalSandbox: + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + completed = await asyncio.to_thread( + subprocess.run, + cmd, + capture_output=True, + text=True, + env=env, + ) + return SimpleNamespace( + returncode=completed.returncode, + stdout=completed.stdout, + stderr=completed.stderr, + ) + + +@pytest.mark.asyncio +async def test_discover_hidden_paths_detects_spec_js(tmp_path, monkeypatch): + """Ensure .spec.js files are treated as hidden test files.""" + task_dir = tmp_path / "javascript" / "two-fer" + (task_dir / "src").mkdir(parents=True) + (task_dir / "src" / "two_fer.spec.js").write_text("// spec file") + + local_sandbox = _LocalSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: local_sandbox) + + result = await cli_commands.discover_hidden_paths(str(task_dir)) + + assert result["success"] is True + hidden_files = result["hidden_paths"]["files"] + assert "src/two_fer.spec.js" in hidden_files + + 
+@pytest.mark.asyncio +async def test_prepare_hidden_workspace_builds_excludes(monkeypatch): + """prepare_hidden_workspace should exclude discovered test artifacts.""" + + async def fake_discover(path: str): + assert path == "/workspace/javascript/two-fer" + return { + "success": True, + "hidden_paths": {"files": ["src/two_fer.spec.js"], "dirs": []}, + } + + recording_sandbox = _RecordingSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: recording_sandbox) + monkeypatch.setattr(cli_commands, "discover_hidden_paths", fake_discover) + + result = await cli_commands.prepare_hidden_workspace("javascript", "two-fer") + + assert result["success"] is True + assert result["agent_dir"].endswith("/javascript/two-fer") + # Verify rsync command contains exclude flag for the spec file + rsync_script = recording_sandbox.commands[-1][2] + assert "--exclude=src/two_fer.spec.js" in rsync_script + + +@pytest.mark.asyncio +async def test_sync_agent_workspace_respects_hidden_paths(monkeypatch): + """sync_agent_workspace should keep hidden paths excluded during sync.""" + recording_sandbox = _RecordingSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: recording_sandbox) + + hidden_paths = {"files": ["src/two_fer.spec.js"], "dirs": ["testdata"]} + + result = await cli_commands.sync_agent_workspace( + "/tmp/agent", "/tmp/full", hidden_paths + ) + + assert result["success"] is True + script = recording_sandbox.commands[-1][2] + assert "--exclude=src/two_fer.spec.js" in script + assert "--exclude=testdata" in script + assert "--exclude='testdata/**'" in script + + +# ============================================================================= +# Unit Tests for Helper Functions +# ============================================================================= + + +def test_normalize_relative_path_edge_cases(): + """Test path normalization edge cases.""" + from openbench.utils.cli_commands import _normalize_relative_path + + # Empty string + assert _normalize_relative_path("") == "" + + # Current directory + assert _normalize_relative_path(".") == "" + assert _normalize_relative_path("./") == "" + + # Leading ./ should be stripped + assert _normalize_relative_path("./src/file.py") == "src/file.py" + + # Nested paths with .. 
+ assert _normalize_relative_path("./foo/../bar") == "bar" + + # Already normalized paths + assert _normalize_relative_path("src/test/file.py") == "src/test/file.py" + + # Trailing slashes + assert _normalize_relative_path("./src/") == "src" + + +def test_build_exclude_flags_empty_inputs(): + """Test exclude flag building with empty inputs.""" + from openbench.utils.cli_commands import _build_exclude_flags + + # Empty dict + assert _build_exclude_flags({}) == [] + + # Empty files and dirs + assert _build_exclude_flags({"files": [], "dirs": []}) == [] + + +def test_build_exclude_flags_files_only(): + """Test exclude flags with only files.""" + from openbench.utils.cli_commands import _build_exclude_flags + + flags = _build_exclude_flags({"files": ["test_file.py", "src/test.js"], "dirs": []}) + + assert len(flags) == 2 + assert "--exclude=test_file.py" in flags + assert "--exclude=src/test.js" in flags + + +def test_build_exclude_flags_dirs_only(): + """Test exclude flags with only directories.""" + from openbench.utils.cli_commands import _build_exclude_flags + + flags = _build_exclude_flags({"files": [], "dirs": ["tests", "src/test_data"]}) + + # Each dir should have 2 flags (dir itself and dir/**) + assert len(flags) == 4 + assert "--exclude=tests" in flags + assert "--exclude='tests/**'" in flags + + +def test_build_exclude_flags_skips_empty_paths(): + """Test that empty paths are skipped.""" + from openbench.utils.cli_commands import _build_exclude_flags + + flags = _build_exclude_flags( + {"files": ["", "valid.py", "."], "dirs": ["", ".", "valid_dir"]} + ) + + # Should only include valid paths + assert "--exclude=valid.py" in flags + assert "--exclude=valid_dir" in flags + + +# ============================================================================= +# Additional discover_hidden_paths Tests +# ============================================================================= + + +@pytest.mark.asyncio +async def test_discover_hidden_paths_detects_test_dirs(tmp_path, monkeypatch): + """Ensure directories with 'test' in name are detected.""" + task_dir = tmp_path / "python" / "task" + (task_dir / "tests").mkdir(parents=True) + (task_dir / "src").mkdir(parents=True) + (task_dir / "src" / "test_utils").mkdir(parents=True) + + local_sandbox = _LocalSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: local_sandbox) + + result = await cli_commands.discover_hidden_paths(str(task_dir)) + + assert result["success"] is True + hidden_dirs = result["hidden_paths"]["dirs"] + assert "tests" in hidden_dirs + assert "src/test_utils" in hidden_dirs + + +@pytest.mark.asyncio +async def test_discover_hidden_paths_case_insensitive(tmp_path, monkeypatch): + """Ensure detection is case-insensitive.""" + task_dir = tmp_path / "task" + task_dir.mkdir(parents=True) + (task_dir / "TEST_file.py").write_text("") + (task_dir / "Test_Dir").mkdir() + + local_sandbox = _LocalSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: local_sandbox) + + result = await cli_commands.discover_hidden_paths(str(task_dir)) + + assert result["success"] is True + assert "TEST_file.py" in result["hidden_paths"]["files"] + assert "Test_Dir" in result["hidden_paths"]["dirs"] + + +@pytest.mark.asyncio +async def test_discover_hidden_paths_handles_script_failure(monkeypatch): + """Test error handling when script fails.""" + + class FailingSandbox: + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + return SimpleNamespace(returncode=1, stdout="", stderr="Python error") + + 
monkeypatch.setattr(cli_commands, "sandbox", lambda: FailingSandbox()) + + result = await cli_commands.discover_hidden_paths("/some/path") + + assert result["success"] is False + assert "stderr" in result + assert "Python error" in result["stderr"] + + +@pytest.mark.asyncio +async def test_discover_hidden_paths_handles_invalid_json(monkeypatch): + """Test error handling when JSON parsing fails.""" + + class BadJsonSandbox: + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + return SimpleNamespace(returncode=0, stdout="not valid json{", stderr="") + + monkeypatch.setattr(cli_commands, "sandbox", lambda: BadJsonSandbox()) + + result = await cli_commands.discover_hidden_paths("/some/path") + + assert result["success"] is False + assert "failed to parse" in result["stderr"] + + +@pytest.mark.asyncio +async def test_discover_hidden_paths_no_test_files(tmp_path, monkeypatch): + """Test with directory containing no test files.""" + task_dir = tmp_path / "clean_task" + task_dir.mkdir(parents=True) + (task_dir / "main.py").write_text("") + (task_dir / "utils.py").write_text("") + + local_sandbox = _LocalSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: local_sandbox) + + result = await cli_commands.discover_hidden_paths(str(task_dir)) + + assert result["success"] is True + assert result["hidden_paths"]["files"] == [] + assert result["hidden_paths"]["dirs"] == [] + + +# ============================================================================= +# prepare_hidden_workspace Error Handling Tests +# ============================================================================= + + +@pytest.mark.asyncio +async def test_prepare_hidden_workspace_propagates_discovery_failure(monkeypatch): + """Test that discovery failures are propagated.""" + + async def failing_discover(path: str): + return {"success": False, "stdout": "", "stderr": "Discovery failed"} + + monkeypatch.setattr(cli_commands, "discover_hidden_paths", failing_discover) + monkeypatch.setattr(cli_commands, "sandbox", lambda: _RecordingSandbox()) + + result = await cli_commands.prepare_hidden_workspace("python", "task") + + assert result["success"] is False + assert result["stderr"] == "Discovery failed" + + +@pytest.mark.asyncio +async def test_prepare_hidden_workspace_no_excludes(monkeypatch): + """Test workspace preparation when no test files found.""" + + async def clean_discover(path: str): + return {"success": True, "hidden_paths": {"files": [], "dirs": []}} + + recording_sandbox = _RecordingSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: recording_sandbox) + monkeypatch.setattr(cli_commands, "discover_hidden_paths", clean_discover) + + result = await cli_commands.prepare_hidden_workspace("python", "task") + + assert result["success"] is True + # Verify rsync command doesn't have --exclude flags + rsync_script = recording_sandbox.commands[-1][2] + assert "--exclude" not in rsync_script + + +# ============================================================================= +# run_setup_commands Tests +# ============================================================================= + + +@pytest.mark.asyncio +async def test_run_setup_commands_empty_list(monkeypatch): + """Test with no setup commands.""" + result = await cli_commands.run_setup_commands([], "/workspace/task") + assert result == "No setup commands" + + +@pytest.mark.asyncio +async def test_run_setup_commands_success(monkeypatch): + """Test successful setup command execution.""" + + class SuccessSandbox: + async def exec(self, 
cmd, timeout=0, env=None): # type: ignore[override] + return SimpleNamespace(returncode=0, stdout="Setup complete", stderr="") + + monkeypatch.setattr(cli_commands, "sandbox", lambda: SuccessSandbox()) + + result = await cli_commands.run_setup_commands( + ["pip install -r requirements.txt"], "/workspace/task" + ) + + assert "Exit Code: 0" in result + assert "Success: True" in result + assert "Setup complete" in result + + +@pytest.mark.asyncio +async def test_run_setup_commands_failure(monkeypatch): + """Test setup command failure.""" + + class FailingSandbox: + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + return SimpleNamespace(returncode=1, stdout="", stderr="Package not found") + + monkeypatch.setattr(cli_commands, "sandbox", lambda: FailingSandbox()) + + result = await cli_commands.run_setup_commands( + ["pip install nonexistent"], "/workspace/task" + ) + + assert "Exit Code: 1" in result + assert "Success: False" in result + assert "Package not found" in result + + +# ============================================================================= +# run_final_test Tests +# ============================================================================= + + +@pytest.mark.asyncio +async def test_run_final_test_python_filename_normalization(monkeypatch): + """Test Python test filename normalization (hyphens to underscores).""" + + class RecordingCommandSandbox: + def __init__(self): + self.last_cmd = None + + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + self.last_cmd = cmd + return SimpleNamespace(returncode=0, stdout="OK", stderr="") + + sandbox = RecordingCommandSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: sandbox) + + await cli_commands.run_final_test( + "pytest two-fer-test.py", "/workspace/python/two-fer" + ) + + # Verify hyphens were converted to underscores + assert "two_fer_test.py" in sandbox.last_cmd[2] + + +@pytest.mark.asyncio +async def test_run_final_test_non_python_unchanged(monkeypatch): + """Test that non-Python tests don't get filename changes.""" + + class RecordingCommandSandbox: + def __init__(self): + self.last_cmd = None + + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + self.last_cmd = cmd + return SimpleNamespace(returncode=0, stdout="OK", stderr="") + + sandbox = RecordingCommandSandbox() + monkeypatch.setattr(cli_commands, "sandbox", lambda: sandbox) + + await cli_commands.run_final_test( + "npm test two-fer-test.js", "/workspace/javascript/two-fer" + ) + + # Verify filename was NOT changed (no python in path) + assert "two-fer-test.js" in sandbox.last_cmd[2] + + +@pytest.mark.asyncio +async def test_run_final_test_exception_handling(monkeypatch): + """Test exception handling in test execution.""" + + class ExceptionSandbox: + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + raise RuntimeError("Sandbox crashed") + + monkeypatch.setattr(cli_commands, "sandbox", lambda: ExceptionSandbox()) + + result = await cli_commands.run_final_test("pytest test.py", "/workspace/task") + + assert "ERROR: test run failed" in result + assert "Sandbox crashed" in result + + +# ============================================================================= +# ensure_repo_and_task Tests +# ============================================================================= + + +@pytest.mark.asyncio +async def test_ensure_repo_and_task_success(monkeypatch): + """Test successful repo and task verification.""" + + class SuccessSandbox: + async def exec(self, cmd, 
timeout=0, env=None): # type: ignore[override] + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(cli_commands, "sandbox", lambda: SuccessSandbox()) + + result = await cli_commands.ensure_repo_and_task("python", "two-fer") + assert result is True + + +@pytest.mark.asyncio +async def test_ensure_repo_and_task_missing_task(monkeypatch): + """Test with missing task directory.""" + + class FailingSandbox: + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + # Fail on the task existence check + return SimpleNamespace(returncode=1, stdout="", stderr="") + + monkeypatch.setattr(cli_commands, "sandbox", lambda: FailingSandbox()) + + result = await cli_commands.ensure_repo_and_task("python", "nonexistent") + assert result is False + + +@pytest.mark.asyncio +async def test_ensure_repo_and_task_exception(monkeypatch): + """Test exception handling.""" + + class ExceptionSandbox: + async def exec(self, cmd, timeout=0, env=None): # type: ignore[override] + raise Exception("Connection failed") + + monkeypatch.setattr(cli_commands, "sandbox", lambda: ExceptionSandbox()) + + result = await cli_commands.ensure_repo_and_task("python", "task") + assert result is False diff --git a/tests/test_groq_provider.py b/tests/test_groq_provider.py index 3ff16422..09ef41e0 100644 --- a/tests/test_groq_provider.py +++ b/tests/test_groq_provider.py @@ -143,31 +143,31 @@ def test_stream_parameter_extraction_true(self): with patch("openbench.model._providers.groq.AsyncGroq"): config = GenerateConfig() model_args = {"stream": True, "other_arg": "value"} - + groq_api = GroqAPI( - model_name="test-model", - api_key="test-key", + model_name="test-model", + api_key="test-key", config=config, - **model_args + **model_args, ) - + # Stream parameter should be extracted and set assert groq_api.stream is True - + def test_stream_parameter_extraction_false(self): """Test that stream=False is properly extracted from model args.""" with patch("httpx.AsyncClient"): with patch("openbench.model._providers.groq.AsyncGroq"): config = GenerateConfig() model_args = {"stream": False, "other_arg": "value"} - + groq_api = GroqAPI( - model_name="test-model", - api_key="test-key", + model_name="test-model", + api_key="test-key", config=config, - **model_args + **model_args, ) - + # Stream parameter should be extracted and set assert groq_api.stream is False @@ -177,14 +177,14 @@ def test_stream_parameter_default_true(self): with patch("openbench.model._providers.groq.AsyncGroq"): config = GenerateConfig() model_args = {"other_arg": "value"} - + groq_api = GroqAPI( - model_name="test-model", - api_key="test-key", + model_name="test-model", + api_key="test-key", config=config, - **model_args + **model_args, ) - + # Stream parameter should default to True assert groq_api.stream is True @@ -194,14 +194,14 @@ def test_stream_parameter_removed_from_client_args(self): with patch("openbench.model._providers.groq.AsyncGroq") as mock_groq: config = GenerateConfig() model_args = {"stream": True, "other_arg": "value"} - + GroqAPI( - model_name="test-model", - api_key="test-key", + model_name="test-model", + api_key="test-key", config=config, - **model_args + **model_args, ) - + # Verify AsyncGroq was called without the stream parameter mock_groq.assert_called_once() call_kwargs = mock_groq.call_args[1] @@ -214,14 +214,14 @@ def test_completion_params_includes_stream_when_enabled(self): with patch("openbench.model._providers.groq.AsyncGroq"): config = GenerateConfig() model_args = {"stream": True} - + 
                groq_api = GroqAPI(
-                    model_name="test-model",
-                    api_key="test-key",
+                    model_name="test-model",
+                    api_key="test-key",
                    config=config,
-                    **model_args
+                    **model_args,
                )
-
+
                params = groq_api.completion_params(config)
                assert params["stream"] is True

@@ -231,14 +231,14 @@ def test_completion_params_includes_stream_by_default(self):
            with patch("openbench.model._providers.groq.AsyncGroq"):
                config = GenerateConfig()
                model_args = {}  # No stream parameter specified
-
+
                groq_api = GroqAPI(
-                    model_name="test-model",
-                    api_key="test-key",
+                    model_name="test-model",
+                    api_key="test-key",
                    config=config,
-                    **model_args
+                    **model_args,
                )
-
+
                params = groq_api.completion_params(config)
                assert params["stream"] is True

@@ -248,12 +248,12 @@ async def test_handle_streaming_response_content_accumulation(self):
            with patch("openbench.model._providers.groq.AsyncGroq"):
                config = GenerateConfig()
                groq_api = GroqAPI(
-                    model_name="test-model",
-                    api_key="test-key",
+                    model_name="test-model",
+                    api_key="test-key",
                    config=config,
-                    stream=True
+                    stream=True,
                )
-
+
                # Create mock stream chunks
                chunk1 = MagicMock()
                chunk1.choices = [MagicMock()]
@@ -266,7 +266,7 @@ async def test_handle_streaming_response_content_accumulation(self):
                chunk1.model = "test-model"
                chunk1.system_fingerprint = "test-fingerprint"
                chunk1.created = 1234567890
-
+
                chunk2 = MagicMock()
                chunk2.choices = [MagicMock()]
                chunk2.choices[0].delta.content = "world!"
@@ -275,13 +275,13 @@ async def test_handle_streaming_response_content_accumulation(self):
                chunk2.choices[0].delta.executed_tools = None
                chunk2.choices[0].finish_reason = "stop"
                chunk2.id = "test-id"
-
+
                # Mock async iterator
                mock_stream = AsyncMock()
                mock_stream.__aiter__.return_value = [chunk1, chunk2]
-
+
                result = await groq_api._handle_streaming_response(mock_stream, [])
-
+
                # Verify content was accumulated
                assert result.choices[0].message.content == "Hello world!"
                assert result.choices[0].finish_reason == "stop"
@@ -294,12 +294,12 @@ async def test_handle_streaming_response_reasoning_accumulation(self):
            with patch("openbench.model._providers.groq.AsyncGroq"):
                config = GenerateConfig()
                groq_api = GroqAPI(
-                    model_name="test-model",
-                    api_key="test-key",
+                    model_name="test-model",
+                    api_key="test-key",
                    config=config,
-                    stream=True
+                    stream=True,
                )
-
+
                # Create mock stream chunks with reasoning
                chunk1 = MagicMock()
                chunk1.choices = [MagicMock()]
@@ -310,7 +310,7 @@ async def test_handle_streaming_response_reasoning_accumulation(self):
                chunk1.choices[0].finish_reason = None
                chunk1.id = "test-id"
                chunk1.model = "test-model"
-
+
                chunk2 = MagicMock()
                chunk2.choices = [MagicMock()]
                chunk2.choices[0].delta.content = None
@@ -318,14 +318,17 @@ async def test_handle_streaming_response_reasoning_accumulation(self):
                chunk2.choices[0].delta.tool_calls = None
                chunk2.choices[0].delta.executed_tools = None
                chunk2.choices[0].finish_reason = "stop"
-
+
                mock_stream = AsyncMock()
                mock_stream.__aiter__.return_value = [chunk1, chunk2]
-
+
                result = await groq_api._handle_streaming_response(mock_stream, [])
-
+
                # Verify reasoning was accumulated
-                assert result.choices[0].message.reasoning == "First reasoning second reasoning"
+                assert (
+                    result.choices[0].message.reasoning
+                    == "First reasoning second reasoning"
+                )

    async def test_handle_streaming_response_tool_calls_accumulation(self):
        """Test that streaming response properly accumulates tool calls."""
@@ -333,12 +336,12 @@ async def test_handle_streaming_response_tool_calls_accumulation(self):
            with patch("openbench.model._providers.groq.AsyncGroq"):
                config = GenerateConfig()
                groq_api = GroqAPI(
-                    model_name="test-model",
-                    api_key="test-key",
+                    model_name="test-model",
+                    api_key="test-key",
                    config=config,
-                    stream=True
+                    stream=True,
                )
-
+
                # Create mock stream chunks with tool calls
                tool_call1 = MagicMock()
                tool_call1.index = 0
@@ -346,14 +349,14 @@ async def test_handle_streaming_response_tool_calls_accumulation(self):
                tool_call1.type = "function"
                tool_call1.function.name = "test_func"
                tool_call1.function.arguments = '{"param":'
-
+
                tool_call2 = MagicMock()
                tool_call2.index = 0
                tool_call2.id = None
                tool_call2.type = None
                tool_call2.function.name = None
                tool_call2.function.arguments = ' "value"}'
-
+
                chunk1 = MagicMock()
                chunk1.choices = [MagicMock()]
                chunk1.choices[0].delta.content = None
@@ -363,7 +366,7 @@ async def test_handle_streaming_response_tool_calls_accumulation(self):
                chunk1.choices[0].finish_reason = None
                chunk1.id = "test-id"
                chunk1.model = "test-model"
-
+
                chunk2 = MagicMock()
                chunk2.choices = [MagicMock()]
                chunk2.choices[0].delta.content = None
@@ -371,12 +374,12 @@ async def test_handle_streaming_response_tool_calls_accumulation(self):
                chunk2.choices[0].delta.tool_calls = [tool_call2]
                chunk2.choices[0].delta.executed_tools = None
                chunk2.choices[0].finish_reason = "tool_calls"
-
+
                mock_stream = AsyncMock()
                mock_stream.__aiter__.return_value = [chunk1, chunk2]
-
+
                result = await groq_api._handle_streaming_response(mock_stream, [])
-
+
                # Verify tool call was accumulated
                assert len(result.choices[0].message.tool_calls) == 1
                tool_call = result.choices[0].message.tool_calls[0]
@@ -391,18 +394,18 @@ async def test_handle_streaming_response_empty_stream(self):
            with patch("openbench.model._providers.groq.AsyncGroq"):
                config = GenerateConfig()
                groq_api = GroqAPI(
-                    model_name="test-model",
-                    api_key="test-key",
+                    model_name="test-model",
+                    api_key="test-key",
                    config=config,
-                    stream=True
+                    stream=True,
                )
-
+
                # Empty stream
                mock_stream = AsyncMock()
                mock_stream.__aiter__.return_value = []
-
+
                result = await groq_api._handle_streaming_response(mock_stream, [])
-
+
                # Should return valid but empty response
                assert result.choices[0].message.content == ""
                assert result.choices[0].message.reasoning is None
@@ -414,12 +417,12 @@ async def test_handle_streaming_response_usage_extraction(self):
            with patch("openbench.model._providers.groq.AsyncGroq"):
                config = GenerateConfig()
                groq_api = GroqAPI(
-                    model_name="test-model",
-                    api_key="test-key",
+                    model_name="test-model",
+                    api_key="test-key",
                    config=config,
-                    stream=True
+                    stream=True,
                )
-
+
                # Create chunk with x_groq usage
                chunk = MagicMock()
                chunk.choices = []
@@ -428,11 +431,11 @@ async def test_handle_streaming_response_usage_extraction(self):
                chunk.x_groq.usage.prompt_tokens = 10
                chunk.x_groq.usage.completion_tokens = 20
                chunk.x_groq.usage.total_tokens = 30
-
+
                mock_stream = AsyncMock()
                mock_stream.__aiter__.return_value = [chunk]
-
+
                result = await groq_api._handle_streaming_response(mock_stream, [])
-
+
                # Verify usage was extracted
                assert result.usage == chunk.x_groq.usage
diff --git a/uv.lock b/uv.lock
index 8783e487..1ac87d42 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.13'",
@@ -194,7 +194,7 @@ wheels = [

 [[package]]
 name = "anthropic"
-version = "0.73.0"
+version = "0.74.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -206,9 +206,9 @@ dependencies = [
     { name = "sniffio" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f0/07/f550112c3f5299d02f06580577f602e8a112b1988ad7c98ac1a8f7292d7e/anthropic-0.73.0.tar.gz", hash = "sha256:30f0d7d86390165f86af6ca7c3041f8720bb2e1b0e12a44525c8edfdbd2c5239", size = 425168, upload-time = "2025-11-14T18:47:52.635Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/7b/609eea5c54ae69b1a4a94169d4b0c86dc5c41b43509989913f6cdc61b81d/anthropic-0.74.1.tar.gz", hash = "sha256:04c087b2751385c524f6d332d066a913870e4de8b3e335fb0a0c595f1f88dc6e", size = 428981, upload-time = "2025-11-19T22:17:31.533Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/15/b1/5d4d3f649e151e58dc938cf19c4d0cd19fca9a986879f30fea08a7b17138/anthropic-0.73.0-py3-none-any.whl", hash = "sha256:0d56cd8b3ca3fea9c9b5162868bdfd053fbc189b8b56d4290bd2d427b56db769", size = 367839, upload-time = "2025-11-14T18:47:51.195Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/45/6b18d0692302b8cbc01a10c35b43953d3c4172fbd4f83337b8ed21a8eaa4/anthropic-0.74.1-py3-none-any.whl", hash = "sha256:b07b998d1cee7f41d9f02530597d7411672b362cc2417760a40c0167b81c6e65", size = 371473, upload-time = "2025-11-19T22:17:29.998Z" },
 ]

 [[package]]
@@ -1423,13 +1423,14 @@ wheels = [

 [[package]]
 name = "inspect-ai"
-version = "0.3.125"
+version = "0.3.141"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aioboto3" },
     { name = "aiohttp" },
     { name = "anyio" },
     { name = "beautifulsoup4" },
+    { name = "boto3" },
     { name = "click" },
     { name = "debugpy" },
     { name = "docstring-parser" },
@@ -1458,11 +1459,31 @@ dependencies = [
     { name = "tenacity" },
     { name = "textual" },
     { name = "typing-extensions" },
+    { name = "universal-pathlib" },
     { name = "zipp" },
 ]
"sha256:2aeb491dc8cf03689e3080b642016d10aa5431867e1b8dbedcfe439a8f256fe8", size = 11900429, upload-time = "2025-08-25T13:32:36.35Z" } +sdist = { url = "https://files.pythonhosted.org/packages/69/d0/fcb7209b9e1b4a49a534de8bc7c12954830906756ff079782f0a2c80ae76/inspect_ai-0.3.141.tar.gz", hash = "sha256:4d82e289c8e4ea241a99c780a54ff5e3f546abdbe2a6c24dbbf4f53ffcf09631", size = 42787335, upload-time = "2025-10-27T11:57:05.748Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/43/afbed4ccb75c9864599ed571b5e0a06bb105e2d75f72e35d0f89e6761b12/inspect_ai-0.3.141-py3-none-any.whl", hash = "sha256:22339ee9619619770bca451274ff23af39a7e309ea412f2051adfdf2e7d7731d", size = 34133970, upload-time = "2025-10-27T11:56:56.99Z" }, +] + +[[package]] +name = "inspect-swe" +version = "0.2.26" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "inspect-ai" }, + { name = "nest-asyncio" }, + { name = "platformdirs" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/35/d0234d0588c99c75ebdbbd02a6ce6fe82c958a0236589132a3f4a45dcb96/inspect_swe-0.2.26.tar.gz", hash = "sha256:811fcb299170980ec77f774a2c6c43b2875e9728c6054d700a93d83335932103", size = 16854, upload-time = "2025-11-15T20:37:25.064Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a9/d8/dc509a17c29db0132e9ad96072a1052eec30d0f3f8f89ccbbf100cddabb0/inspect_ai-0.3.125-py3-none-any.whl", hash = "sha256:ac1113744b63a5b74364160446a5a9671ca4b393113ba4fcc4fee09380e38cca", size = 3290751, upload-time = "2025-08-25T13:32:31.31Z" }, + { url = "https://files.pythonhosted.org/packages/2b/02/82706180d3d7796633eb558c5239a336ee182d91dbebdcc952e86aa552b8/inspect_swe-0.2.26-py3-none-any.whl", hash = "sha256:44fa73ef08be843b565117f43ee1dd3e737f27a39427d7a4c5d36d244ccb6236", size = 23490, upload-time = "2025-11-15T20:37:23.699Z" }, ] [[package]] @@ -2501,7 +2522,7 @@ wheels = [ [[package]] name = "openai" -version = "2.8.0" +version = "2.8.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2513,9 +2534,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/04/0c/b9321e12f89e236f5e9a46346c30fb801818e22ba33b798a5aca84be895c/openai-2.8.0.tar.gz", hash = "sha256:4851908f6d6fcacbd47ba659c5ac084f7725b752b6bfa1e948b6fbfc111a6bad", size = 602412, upload-time = "2025-11-13T18:15:25.847Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d5/e4/42591e356f1d53c568418dc7e30dcda7be31dd5a4d570bca22acb0525862/openai-2.8.1.tar.gz", hash = "sha256:cb1b79eef6e809f6da326a7ef6038719e35aa944c42d081807bfa1be8060f15f", size = 602490, upload-time = "2025-11-17T22:39:59.549Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5b/e1/0a6560bab7fb7b5a88d35a505b859c6d969cb2fa2681b568eb5d95019dec/openai-2.8.0-py3-none-any.whl", hash = "sha256:ba975e347f6add2fe13529ccb94d54a578280e960765e5224c34b08d7e029ddf", size = 1022692, upload-time = "2025-11-13T18:15:23.621Z" }, + { url = "https://files.pythonhosted.org/packages/55/4f/dbc0c124c40cb390508a82770fb9f6e3ed162560181a85089191a851c59a/openai-2.8.1-py3-none-any.whl", hash = "sha256:c6c3b5a04994734386e8dad3c00a393f56d3b68a27cd2e8acae91a59e4122463", size = 1022688, upload-time = "2025-11-17T22:39:57.675Z" }, ] [[package]] @@ -2523,9 +2544,11 @@ name = "openbench" version = "0.5.2" source = { editable = "." 
} dependencies = [ + { name = "anthropic" }, { name = "datasets" }, { name = "groq" }, { name = "inspect-ai" }, + { name = "inspect-swe" }, { name = "jsonschema" }, { name = "mcp" }, { name = "numpy" }, @@ -2591,9 +2614,11 @@ tau-bench = [ [package.metadata] requires-dist = [ + { name = "anthropic", specifier = ">=0.69.0" }, { name = "datasets", specifier = ">=3.6.0" }, { name = "groq", specifier = ">=0.33.0" }, - { name = "inspect-ai", specifier = "==0.3.125" }, + { name = "inspect-ai", specifier = "==0.3.141" }, + { name = "inspect-swe", specifier = ">=0.2.26" }, { name = "jsonschema", specifier = ">=4.23.0" }, { name = "mcp", specifier = ">=1.13.1" }, { name = "numpy", specifier = "==2.2.6" }, @@ -2797,6 +2822,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" }, ] +[[package]] +name = "pathlib-abc" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/cb/448649d7f25d228bf0be3a04590ab7afa77f15e056f8fa976ed05ec9a78f/pathlib_abc-0.5.2.tar.gz", hash = "sha256:fcd56f147234645e2c59c7ae22808b34c364bb231f685ddd9f96885aed78a94c", size = 33342, upload-time = "2025-10-10T18:37:20.524Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/29/c028a0731e202035f0e2e0bfbf1a3e46ad6c628cbb17f6f1cc9eea5d9ff1/pathlib_abc-0.5.2-py3-none-any.whl", hash = "sha256:4c9d94cf1b23af417ce7c0417b43333b06a106c01000b286c99de230d95eefbb", size = 19070, upload-time = "2025-10-10T18:37:19.437Z" }, +] + [[package]] name = "pillow" version = "11.3.0" @@ -4303,6 +4337,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/37/87/1f677586e8ac487e29672e4b17455758fce261de06a0d086167bb760361a/uc_micro_py-1.0.3-py3-none-any.whl", hash = "sha256:db1dffff340817673d7b466ec86114a9dc0e9d4d9b5ba229d9d60e5c12600cd5", size = 6229, upload-time = "2024-02-09T16:52:00.371Z" }, ] +[[package]] +name = "universal-pathlib" +version = "0.3.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fsspec" }, + { name = "pathlib-abc" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/db/6874223d251a2e146dae57a27ca8cb1f71e7e135aa51ad394173ffe18fc0/universal_pathlib-0.3.6.tar.gz", hash = "sha256:d8640454ff08305fc639f7980e8bad4a7d38e82f6389ff993fb0e7b2a4969de9", size = 249113, upload-time = "2025-11-13T17:05:29.882Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/5d/fc1f5478eb486a59549e0dbea5827633bbba01139b549968d4936154b756/universal_pathlib-0.3.6-py3-none-any.whl", hash = "sha256:ff10a86e5340ad986b6f04847bb64ba397dff7467450234ffa2ab5ff135641d8", size = 78715, upload-time = "2025-11-13T17:05:28.101Z" }, +] + [[package]] name = "uritemplate" version = "4.2.0"