From 57f9b70665e3f8d928a9fe15008c2d46b0dffdc5 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Tue, 12 May 2026 22:13:26 +0530 Subject: [PATCH 01/35] refactor: core sandbox infra to use openenv.core.harness.sandbox --- envs/opencode_env/__init__.py | 7 ++- envs/opencode_env/harness.py | 7 ++- envs/opencode_env/sandbox/__init__.py | 51 +++---------------- envs/opencode_env/sandbox/build_template.py | 10 ++-- src/openenv/core/harness/sandbox/__init__.py | 31 +++++++++++ .../openenv/core/harness}/sandbox/base.py | 10 ++-- .../core/harness/sandbox/e2b_backend.py | 11 +++- .../core/harness}/sandbox/interception.py | 8 +-- tests/envs/test_opencode_env.py | 2 +- 9 files changed, 74 insertions(+), 63 deletions(-) create mode 100644 src/openenv/core/harness/sandbox/__init__.py rename {envs/opencode_env => src/openenv/core/harness}/sandbox/base.py (87%) rename envs/opencode_env/sandbox/e2b.py => src/openenv/core/harness/sandbox/e2b_backend.py (94%) rename {envs/opencode_env => src/openenv/core/harness}/sandbox/interception.py (98%) diff --git a/envs/opencode_env/__init__.py b/envs/opencode_env/__init__.py index 223be6f7b..17cd145b3 100644 --- a/envs/opencode_env/__init__.py +++ b/envs/opencode_env/__init__.py @@ -30,7 +30,12 @@ RolloutResult, RolloutTurn, ) -from .sandbox import E2BSandboxBackend, SandboxBackend, SandboxHandle +from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle + +try: + from openenv.core.harness.sandbox import E2BSandboxBackend +except ImportError: # e2b not installed + E2BSandboxBackend = None # type: ignore[assignment,misc] from .task import OpenCodeTask __all__ = [ diff --git a/envs/opencode_env/harness.py b/envs/opencode_env/harness.py index da4410dd4..dc0eb55be 100644 --- a/envs/opencode_env/harness.py +++ b/envs/opencode_env/harness.py @@ -52,7 +52,7 @@ opencode_config_path, system_prompt_path, ) -from .sandbox.base import BgJob, SandboxBackend, SandboxHandle +from 
openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle from .task import OpenCodeTask @@ -64,7 +64,10 @@ # Where the proxy source lives on disk (in this repo). Uploaded into the # sandbox at /home/user/proxy/interception.py before each rollout, unless # the sandbox was created from a template that already has it baked in. -_PROXY_SOURCE_PATH = Path(__file__).parent / "sandbox" / "interception.py" +_PROXY_SOURCE_PATH = ( + Path(__file__).resolve().parents[2] + / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py" +) Verifier = Callable[[SandboxHandle, OpenCodeTask], VerifyResult] diff --git a/envs/opencode_env/sandbox/__init__.py b/envs/opencode_env/sandbox/__init__.py index 321f81547..a3496a2b1 100644 --- a/envs/opencode_env/sandbox/__init__.py +++ b/envs/opencode_env/sandbox/__init__.py @@ -4,50 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Sandbox backends for the OpenCode harness. +"""Sandbox backends — re-exported from ``openenv.core.harness.sandbox``. -The primitive ships with :class:`E2BSandboxBackend` as the default; any backend -that satisfies the :class:`SandboxBackend` / :class:`SandboxHandle` protocols -can be swapped in. - -The ``e2b`` import is wrapped in ``try/except`` so this package can be loaded -in environments where ``e2b`` isn't installed (CI smoke tests, lint runs). -Instantiating ``E2BSandboxBackend`` without ``e2b`` raises a clear error. +The canonical source for sandbox protocols and implementations now lives in +``src/openenv/core/harness/sandbox/``. This package re-exports everything +so that ``from opencode_env.sandbox import ...`` keeps working, but all new +code should import from ``openenv.core.harness.sandbox`` directly. 
""" -from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle - -try: - from .e2b import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle # noqa: F401 -except ImportError as _e2b_err: # pragma: no cover - - class _RequiresE2B: - """Stub raised when ``e2b`` is not installed. - - Lets the package import cleanly so unit tests, ``openenv validate``, - and the docs build can run without the heavy ``e2b`` dependency. - Actually constructing one of these classes raises a clear ImportError. - """ - - _e2b_import_error = _e2b_err - - def __init__(self, *_args, **_kwargs): - raise ImportError( - "e2b is not installed; install it via " - "`pip install 'openenv-opencode-env[dev]'` or " - "`pip install e2b` to use E2BSandboxBackend. " - f"Original import error: {self._e2b_import_error}" - ) - - E2BBgJob = E2BSandboxBackend = E2BSandboxHandle = _RequiresE2B # type: ignore[assignment] - - -__all__ = [ - "BgJob", - "ExecResult", - "SandboxBackend", - "SandboxHandle", - "E2BBgJob", - "E2BSandboxBackend", - "E2BSandboxHandle", -] +from openenv.core.harness.sandbox import * # noqa: F401,F403 +from openenv.core.harness.sandbox import __all__ # noqa: F401 diff --git a/envs/opencode_env/sandbox/build_template.py b/envs/opencode_env/sandbox/build_template.py index 01c32d537..084a95e64 100644 --- a/envs/opencode_env/sandbox/build_template.py +++ b/envs/opencode_env/sandbox/build_template.py @@ -44,8 +44,10 @@ from e2b import Template, default_build_logger -_ENV_DIR = Path(__file__).resolve().parent -_PROXY_SOURCE = _ENV_DIR / "interception.py" +_REPO_ROOT = Path(__file__).resolve().parents[3] +_PROXY_SOURCE = ( + _REPO_ROOT / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py" +) def _load_env(path: Path) -> None: @@ -91,7 +93,7 @@ def build_template(name: str, *, skip_cache: bool = False) -> str: .make_dir("/home/user/task") .make_dir("/home/user/workdir") .make_dir("/home/user/proxy") - .copy("interception.py", "/home/user/proxy/interception.py") + 
.copy(str(_PROXY_SOURCE), "/home/user/proxy/interception.py") .set_workdir("/home/user/workdir") ) if skip_cache: @@ -121,7 +123,7 @@ def main(argv: list[str] | None = None) -> int: ) args = p.parse_args(argv) - _load_env(_ENV_DIR / ".env") + _load_env(_REPO_ROOT / "envs" / "opencode_env" / "sandbox" / ".env") if not os.environ.get("E2B_API_KEY"): print("ERROR: E2B_API_KEY required.", file=sys.stderr) return 2 diff --git a/src/openenv/core/harness/sandbox/__init__.py b/src/openenv/core/harness/sandbox/__init__.py new file mode 100644 index 000000000..d0324a7d7 --- /dev/null +++ b/src/openenv/core/harness/sandbox/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Sandbox backends for harness-driven rollouts. + +Provides the :class:`SandboxBackend` / :class:`SandboxHandle` protocols and +concrete implementations. Any harness adapter can use any backend — the +sandbox layer is orthogonal to the agent CLI choice. + +The ``e2b`` import is wrapped in ``try/except`` so this package loads cleanly +in environments where ``e2b`` isn't installed (CI smoke tests, lint runs). 
+""" + +from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle + +__all__ = [ + "BgJob", + "ExecResult", + "SandboxBackend", + "SandboxHandle", +] + +try: + from .e2b_backend import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle + + __all__.extend(["E2BBgJob", "E2BSandboxBackend", "E2BSandboxHandle"]) +except ImportError: + pass # e2b not installed — stubs live in envs/opencode_env/sandbox/__init__.py diff --git a/envs/opencode_env/sandbox/base.py b/src/openenv/core/harness/sandbox/base.py similarity index 87% rename from envs/opencode_env/sandbox/base.py rename to src/openenv/core/harness/sandbox/base.py index 76869149a..4b2620799 100644 --- a/envs/opencode_env/sandbox/base.py +++ b/src/openenv/core/harness/sandbox/base.py @@ -6,12 +6,12 @@ """Sandbox backend protocol. -A ``SandboxBackend`` produces ``SandboxHandle`` instances that the harness uses -to stage files, run the OpenCode install, launch the agent as a background -process, and later tear the sandbox down. +A ``SandboxBackend`` produces ``SandboxHandle`` instances that harnesses use +to stage files, install agent CLIs, launch the agent as a background process, +and later tear the sandbox down. -Backends can be implemented against any provider (E2B, Docker, Modal, Prime) -as long as they satisfy the Protocols defined here. +Backends can be implemented against any provider (E2B, CubeSandbox, Docker, +Modal) as long as they satisfy the Protocols defined here. """ from __future__ import annotations diff --git a/envs/opencode_env/sandbox/e2b.py b/src/openenv/core/harness/sandbox/e2b_backend.py similarity index 94% rename from envs/opencode_env/sandbox/e2b.py rename to src/openenv/core/harness/sandbox/e2b_backend.py index b567a9e65..f344346ba 100644 --- a/envs/opencode_env/sandbox/e2b.py +++ b/src/openenv/core/harness/sandbox/e2b_backend.py @@ -4,7 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-"""E2B implementation of :class:`SandboxBackend`.""" +"""E2B implementation of :class:`SandboxBackend`. + +Works with both E2B cloud (api.e2b.dev) and self-hosted E2B-compatible +backends like CubeSandbox. For CubeSandbox, set:: + + E2B_API_URL=http://your-cubesandbox:3000 + E2B_API_KEY=dummy # any non-empty string +""" from __future__ import annotations @@ -15,7 +22,7 @@ from e2b import Sandbox from e2b.sandbox_sync.commands.command_handle import CommandHandle -from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle +from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxBackend, SandboxHandle class E2BBgJob: diff --git a/envs/opencode_env/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py similarity index 98% rename from envs/opencode_env/sandbox/interception.py rename to src/openenv/core/harness/sandbox/interception.py index 131d41024..dc3dbe5be 100644 --- a/envs/opencode_env/sandbox/interception.py +++ b/src/openenv/core/harness/sandbox/interception.py @@ -6,15 +6,15 @@ """Transparent OpenAI-compatible forwarding proxy with logprob capture. -The proxy is a small FastAPI app that OpenCode talks to instead of the upstream -LLM endpoint. It: +The proxy is a small FastAPI app that agent CLIs (OpenCode, Claude Code, +Codex, Pi, etc.) talk to instead of the upstream LLM endpoint. It: 1. Forwards every ``POST /v1/chat/completions`` request to the real upstream URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream returns per-token logprobs. 2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines trace file. -3. Returns the upstream response to OpenCode verbatim (minus the ``logprobs`` +3. Returns the upstream response to the agent verbatim (minus the ``logprobs`` field, which we strip so the CLI never sees anything unexpected). The proxy is stateless beyond the trace file. One proxy instance runs per @@ -22,7 +22,7 @@ Run standalone:: - OPENCODE_UPSTREAM_API_KEY=... 
python -m opencode_env.interception \\ + UPSTREAM_API_KEY=... python -m openenv.core.harness.sandbox.interception \\ --upstream-url https://vllm.example/v1 \\ --trace /tmp/trace.jsonl \\ --port 7000 diff --git a/tests/envs/test_opencode_env.py b/tests/envs/test_opencode_env.py index 812ade194..6014c9199 100644 --- a/tests/envs/test_opencode_env.py +++ b/tests/envs/test_opencode_env.py @@ -309,7 +309,7 @@ def _exec_with_retry(self, *args, **kwargs): def test_interception_cli_reads_upstream_key_from_env( monkeypatch: pytest.MonkeyPatch, ) -> None: - from opencode_env.sandbox import interception + from openenv.core.harness.sandbox import interception captured = {} From 024e9042c3833fbd253f7c9981eb6909f29fb0f1 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Tue, 12 May 2026 23:00:13 +0530 Subject: [PATCH 02/35] feat: CLIAgentDriver Abstraction --- envs/opencode_env/__init__.py | 19 +- envs/opencode_env/client.py | 2 +- envs/opencode_env/config.py | 4 +- envs/opencode_env/harness.py | 422 +++-------- envs/opencode_env/opencode_runtime.py | 4 +- envs/opencode_env/sandbox/__init__.py | 12 +- envs/opencode_env/sandbox/build_template.py | 5 +- envs/opencode_env/server/app.py | 10 +- envs/opencode_env/server/gradio_ui.py | 95 ++- .../server/opencode_environment.py | 19 +- src/openenv/core/harness/agents/__init__.py | 107 +++ src/openenv/core/harness/agents/base.py | 251 ++++++ src/openenv/core/harness/agents/cli_driver.py | 716 ++++++++++++++++++ src/openenv/core/harness/agents/opencode.py | 191 +++++ 14 files changed, 1467 insertions(+), 390 deletions(-) create mode 100644 src/openenv/core/harness/agents/__init__.py create mode 100644 src/openenv/core/harness/agents/base.py create mode 100644 src/openenv/core/harness/agents/cli_driver.py create mode 100644 src/openenv/core/harness/agents/opencode.py diff --git a/envs/opencode_env/__init__.py b/envs/opencode_env/__init__.py index 17cd145b3..dcd48a01c 100644 --- 
a/envs/opencode_env/__init__.py +++ b/envs/opencode_env/__init__.py @@ -8,35 +8,30 @@ Two layers in this package: -1. **Harness primitive** — :class:`OpenCodeSessionFactory` / +1. **Harness primitive** -- :class:`OpenCodeSessionFactory` / :class:`OpenCodeSession` / :class:`OpenCodeConfig` / - :class:`E2BSandboxBackend`. Used in-process to drive one rollout - inside an E2B sandbox. See ``harness.py``. + :class:`E2BSandboxBackend`. Built on the generic + :class:`CLIAgentDriver` from ``openenv.core.harness.agents``. -2. **Deployable env** — :class:`OpenCodeEnv` (MCP client) talks to the +2. **Deployable env** -- :class:`OpenCodeEnv` (MCP client) talks to the FastAPI server at ``server/app.py`` over HTTP. Use this when the sandbox + agent live behind an HTTP boundary (e.g. an HF Space). See ``client.py`` and ``server/``. """ from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction +from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle from .client import OpenCodeEnv from .config import OpenCodeConfig, Provider from .harness import OpenCodeSession, OpenCodeSessionFactory -from .models import ( - CommandResult, - OpenCodeState, - RolloutResult, - RolloutTurn, -) -from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle +from .models import CommandResult, OpenCodeState, RolloutResult, RolloutTurn +from .task import OpenCodeTask try: from openenv.core.harness.sandbox import E2BSandboxBackend except ImportError: # e2b not installed E2BSandboxBackend = None # type: ignore[assignment,misc] -from .task import OpenCodeTask __all__ = [ # Deployed-env client diff --git a/envs/opencode_env/client.py b/envs/opencode_env/client.py index a00afc4e1..52e76e2d5 100644 --- a/envs/opencode_env/client.py +++ b/envs/opencode_env/client.py @@ -51,7 +51,7 @@ def run_rollout( self, *, # Endpoint — pass either the shorthand selector OR explicit fields. 
- endpoint: str = "", # "vllm" | "openai" | "hf_router" + endpoint: str = "", # "vllm" | "openai" | "hf_router" base_url: str = "", api_key: str = "", model: str = "", diff --git a/envs/opencode_env/config.py b/envs/opencode_env/config.py index 57273b9eb..2b6bae0a2 100644 --- a/envs/opencode_env/config.py +++ b/envs/opencode_env/config.py @@ -34,9 +34,7 @@ class OpenCodeConfig(BaseModel): # --- OpenCode CLI --------------------------------------------------------- opencode_version: str = "latest" - disabled_tools: list[str] = Field( - default_factory=lambda: ["webfetch", "question"] - ) + disabled_tools: list[str] = Field(default_factory=lambda: ["webfetch", "question"]) enabled_tools: list[str] | None = None system_prompt: str | None = None extra_opencode_json: dict[str, Any] = Field(default_factory=dict) diff --git a/envs/opencode_env/harness.py b/envs/opencode_env/harness.py index dc0eb55be..600aafa82 100644 --- a/envs/opencode_env/harness.py +++ b/envs/opencode_env/harness.py @@ -4,42 +4,32 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""OpenCode session factory + session implementation. +"""OpenCode session factory + session — backed by CLIAgentDriver. -Implements the :class:`ResourceSessionFactory` / :class:`ResourceSession` -contracts from ``openenv.core.harness`` (PR #471). The session wraps one -sandbox running the ``opencode`` CLI agent. +This module exposes :class:`OpenCodeSession` and +:class:`OpenCodeSessionFactory` built on top of the generic +:class:`CLIAgentDriver` / :class:`CLIAgentSession` / +:class:`CLIAgentSessionFactory` from ``openenv.core.harness.agents``. -Two operating modes: - - - ``mode="black_box"`` — opencode talks directly to ``config.base_url``. - No proxy, no logprob capture. Use for smoke tests / SFT / eval. - - ``mode="transparent_proxy"`` (default) — an in-sandbox FastAPI proxy - sits between opencode and the upstream LLM. 
It injects ``logprobs=true`` - on every request and writes per-turn ``(messages, completion_tokens, - per_token_logps)`` to ``proxy_trace.jsonl`` for GRPO consumption. - -Single driver path: opencode is started as a background subprocess via -``opencode run --format json --dangerously-skip-permissions ...`` and we -poll its exit code. The previous ``opencode serve`` driver was removed — -opencode CLI is the only path now. +OpenCode-specific configuration (``opencode.json`` generation, provider +mapping, tool enable/disable) is handled by +:mod:`opencode_env.opencode_runtime` builders wired into the +:data:`OPENCODE_SPEC` via callable hooks. """ from __future__ import annotations -import json -import shlex from pathlib import Path -from typing import Any, Callable, Literal - -from openenv.core.env_server.mcp_types import Tool -from openenv.core.harness import ( - Message, - ResourceSession, - ResourceSessionFactory, - ToolResult, - VerifyResult, +from typing import Any, Literal + +from openenv.core.harness import ResourceSessionFactory +from openenv.core.harness.agents.cli_driver import ( + CLIAgentDriver, + CLIAgentSession, + Verifier, ) +from openenv.core.harness.agents.opencode import OPENCODE_SPEC +from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle from .config import OpenCodeConfig from .opencode_runtime import ( @@ -52,7 +42,6 @@ opencode_config_path, system_prompt_path, ) -from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle from .task import OpenCodeTask @@ -61,28 +50,24 @@ _PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl" _PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log" -# Where the proxy source lives on disk (in this repo). Uploaded into the -# sandbox at /home/user/proxy/interception.py before each rollout, unless -# the sandbox was created from a template that already has it baked in. 
_PROXY_SOURCE_PATH = ( Path(__file__).resolve().parents[2] - / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py" + / "src" + / "openenv" + / "core" + / "harness" + / "sandbox" + / "interception.py" ) -Verifier = Callable[[SandboxHandle, OpenCodeTask], VerifyResult] - - -class OpenCodeSession(ResourceSession): +class OpenCodeSession(CLIAgentSession): """One live OpenCode rollout inside a sandbox. - The session is created already-running: :meth:`OpenCodeSessionFactory.create` - calls :meth:`start_agent` before returning. Typical usage:: - - session = factory.create(task) - session.wait_for_completion() - result = session.verify([]) - session.close() + Extends :class:`CLIAgentSession` with OpenCode-specific convenience + methods (``fetch_trace``, ``wait_for_completion`` with config-aware + timeout). Fully backward-compatible with code that used the old + ``OpenCodeSession`` API. """ def __init__( @@ -95,100 +80,43 @@ def __init__( base_url_override: str | None = None, proxy_trace_path: str | None = None, proxy_bg_job: BgJob | None = None, + agent_bg_job: BgJob | None = None, ) -> None: - self.sandbox = sandbox - self.config = config - self.task = task - self._verifier = verifier - self._base_url_override = base_url_override - self._bg_job: BgJob | None = None - self._proxy_trace_path = proxy_trace_path - self._proxy_bg_job = proxy_bg_job - - # ------------------------------------------------------------------ - # ResourceSession contract (PR #471) - # ------------------------------------------------------------------ - def initial_messages(self) -> list[Message]: - return [{"role": "user", "content": self.task.instruction}] - - def list_tools(self) -> list[Tool]: - # OpenCode owns its own tool loop — none are exposed to the harness. 
- return [] - - def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult: - return ToolResult( - error=( - "OpenCodeSession does not expose external tool calls; the " - "CLI agent owns its own tool loop." - ) + super().__init__( + spec=OPENCODE_SPEC, + sandbox=sandbox, + task=task, + config=config, + verifier=verifier, + base_url_override=base_url_override, + proxy_trace_path=proxy_trace_path, + proxy_bg_job=proxy_bg_job, + agent_bg_job=agent_bg_job, ) - def verify( - self, - transcript: list[Message], - final_state: Any | None = None, - ) -> VerifyResult: - if self._verifier is None: - return VerifyResult(env_reward=None, done=True) - return self._verifier(self.sandbox, self.task) - - def close(self) -> None: - if self._bg_job is not None: - try: - self._bg_job.kill() - except Exception: - pass - self._bg_job = None - if self._proxy_bg_job is not None: - try: - self._proxy_bg_job.kill() - except Exception: - pass - self._proxy_bg_job = None - self.sandbox.kill() - - # ------------------------------------------------------------------ - # OpenCode-specific session API - # ------------------------------------------------------------------ - def start_agent(self) -> None: - """Launch ``opencode run`` as a background subprocess in the sandbox.""" - if self._bg_job is not None: - return - cmd = build_run_cmd(self.config) - envs = build_env_vars(self.config, base_url_override=self._base_url_override) - self._bg_job = self.sandbox.start_bg(cmd, envs=envs) + def fetch_trace(self) -> str: + """Return the raw ``opencode run`` log (JSONL when ``run_format=json``).""" + return self.sandbox.read_text(agent_log_path(self.config)) def wait_for_completion(self, timeout_s: float | None = None) -> int: """Block until the agent exits, returning its exit code.""" budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s - if self._bg_job is None: + if self._agent_bg_job is None: raise RuntimeError("Agent not started; call start_agent() first.") - 
return self._bg_job.wait(timeout=budget) + return self._agent_bg_job.wait(timeout=budget) - def fetch_trace(self) -> str: - """Return the raw ``opencode run`` log (JSON-lines when ``run_format=json``).""" - return self.sandbox.read_text(agent_log_path(self.config)) - - def fetch_proxy_trace(self) -> list[dict[str, Any]]: - """Return per-turn proxy-captured records (Mode B only). + def start_agent(self) -> None: + """Launch ``opencode run`` as a background subprocess in the sandbox. - Each entry has ``request``, ``response``, ``completion_tokens``, - ``completion_token_ids``, ``per_token_logps``, ``finish_reason``, - and ``latency_s``. Returns ``[]`` in Mode A. + Provided for backward compatibility — the factory now starts the + agent during ``create()``, so calling this manually is a no-op + if the agent is already running. """ - if self._proxy_trace_path is None: - return [] - try: - content = self.sandbox.read_text(self._proxy_trace_path) - except Exception: - return [] - records: list[dict[str, Any]] = [] - for line in content.splitlines(): - line = line.strip() - if not line: - continue - records.append(json.loads(line)) - return records + if self._agent_bg_job is not None: + return + cmd = build_run_cmd(self.config) + envs = build_env_vars(self.config, base_url_override=self._base_url_override) + self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs) class OpenCodeSessionFactory(ResourceSessionFactory): @@ -197,6 +125,10 @@ class OpenCodeSessionFactory(ResourceSessionFactory): The factory owns sandbox provisioning, opencode install, config injection, and (Mode B) proxy startup. Each :meth:`create` call returns a fresh sandbox with a running agent. + + Internally delegates to :class:`CLIAgentDriver` for the generic + sandbox lifecycle (readiness probing, install retry, proxy startup). + OpenCode-specific config generation uses ``opencode_runtime`` builders. 
""" def __init__( @@ -218,6 +150,18 @@ def __init__( self._install_timeout_s = install_timeout_s self._setup_timeout_s = setup_timeout_s + # Build a CLIAgentDriver for the shared lifecycle. + self._driver = CLIAgentDriver( + spec=OPENCODE_SPEC, + sandbox_backend=sandbox_backend, + mode=mode, + install_timeout_s=install_timeout_s, + setup_timeout_s=setup_timeout_s, + proxy_top_logprobs=config.proxy_top_logprobs, + proxy_max_tokens_cap=config.proxy_max_tokens_cap, + proxy_disable_thinking=config.proxy_disable_thinking, + ) + def create( self, task: Any, @@ -225,6 +169,7 @@ def create( episode_id: str | None = None, ) -> OpenCodeSession: import logging + _log = logging.getLogger(__name__) oc_task = OpenCodeTask.coerce(task) @@ -232,17 +177,16 @@ def create( _log.info( "factory.create: creating sandbox timeout=%ds mode=%s", - sandbox_timeout, self._mode, + sandbox_timeout, + self._mode, ) sandbox = self._backend.create( timeout_s=sandbox_timeout, metadata={"episode_id": episode_id} if episode_id else None, ) - sid = ( - getattr(sandbox, "sandbox_id", None) - or getattr(getattr(sandbox, "raw", None), "sandbox_id", "?") - ) + sid = getattr(sandbox, "sandbox_id", "?") _log.info("factory.create: sandbox=%s — bootstrapping…", sid) + try: self._bootstrap_sandbox(sandbox, oc_task) except Exception as exc: @@ -256,18 +200,20 @@ def create( if self._mode == "transparent_proxy": _log.info( "factory.create: starting interception proxy on :%d → %s", - _PROXY_PORT, self._config.base_url, + _PROXY_PORT, + self._config.base_url, ) - proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy( - sandbox + proxy_bg_job, base_url_override, proxy_trace_path = ( + self._driver._start_proxy( + sandbox, + base_url=self._config.base_url, + api_key=self._config.api_key, + model=self._config.model, + ) ) _log.info("factory.create: proxy up at %s", base_url_override) - # Rewrite opencode.json so opencode points at the proxy. 
Force - # ``openai_compatible`` so opencode hits ``/v1/chat/completions`` - # (which the proxy serves) rather than provider-specific paths. - from .config import OpenCodeConfig as _OCC - - proxy_cfg = _OCC( + # Rewrite opencode.json so opencode points at the proxy. + proxy_cfg = OpenCodeConfig( **{ **self._config.model_dump(), "provider": "openai_compatible", @@ -292,92 +238,8 @@ def create( return session # ------------------------------------------------------------------ - def _wait_for_sandbox_ready( - self, - sandbox: SandboxHandle, - *, - attempts: int = 15, - delay_s: float = 1.0, - ) -> None: - """Probe the sandbox until ``echo ok`` succeeds. - - E2B (and other backends) sometimes return the handle before the - guest is fully ready. Issue ``echo ok`` with short timeouts until - it succeeds. Returns silently on success; raises ``RuntimeError`` - on prolonged failure. - """ - import time - - last_err = "" - for _ in range(attempts): - try: - r = sandbox.exec("echo ok", timeout=5) - if r.exit_code == 0 and "ok" in (r.stdout or ""): - return - last_err = (r.stderr or r.stdout or "").strip() or f"exit={r.exit_code}" - except Exception as exc: # noqa: BLE001 - last_err = f"{type(exc).__name__}: {exc}" - time.sleep(delay_s) - raise RuntimeError( - f"sandbox did not become ready within {attempts * delay_s:.0f}s " - f"(last error: {last_err})" - ) - - def _exec_with_retry( - self, - sandbox: SandboxHandle, - cmd: str, - *, - timeout: float, - attempts: int = 3, - backoff_s: float = 3.0, - label: str = "cmd", - ): - """Run ``sandbox.exec`` with exponential backoff on transient failure. - - Transient = ``exit_code != 0`` AND empty stderr (SIGKILL / network - blip signature) OR an exception during exec. Final failure is raised - as ``RuntimeError`` carrying the last exit code + stderr. 
- """ - import time - - last_stdout = "" - last_stderr = "" - last_exit = 0 - for i in range(attempts): - try: - r = sandbox.exec(cmd, timeout=timeout) - if r.exit_code == 0: - return r - last_stdout = r.stdout or "" - last_stderr = r.stderr or "" - last_exit = r.exit_code - if last_stderr.strip(): - break - except Exception as exc: # noqa: BLE001 - last_stderr = f"{type(exc).__name__}: {exc}" - last_exit = -1 - if i + 1 < attempts: - time.sleep(backoff_s * (2**i)) - raise RuntimeError( - f"{label} failed after {attempts} attempts " - f"(exit={last_exit}, stderr={last_stderr!r}, stdout_tail={last_stdout[-400:]!r})" - ) - - def _opencode_already_installed(self, sandbox: SandboxHandle) -> bool: - """Cheap probe — returns True if opencode is on disk in the sandbox. - - Used to skip the slow ``curl install`` step when running against a - prebaked template that already ships opencode. - """ - try: - r = sandbox.exec( - "/home/user/.opencode/bin/opencode --version", - timeout=10, - ) - return r.exit_code == 0 - except Exception: - return False + # Bootstrap — delegates to CLIAgentDriver utilities + # ------------------------------------------------------------------ def _bootstrap_sandbox( self, @@ -387,12 +249,11 @@ def _bootstrap_sandbox( """Install opencode, write config + task files, run optional setup.""" # Stage 1: wait for the sandbox to be responsive. - self._wait_for_sandbox_ready(sandbox) + self._driver._wait_for_sandbox_ready(sandbox) - # Stage 2: install opencode (skipped if a prebaked template already - # has it). curl|bash is flaky — retry with backoff. - if not self._opencode_already_installed(sandbox): - self._exec_with_retry( + # Stage 2: install opencode (skipped if pre-baked). 
+ if not self._driver._agent_already_installed(sandbox): + self._driver._exec_with_retry( sandbox, build_install_cmd(self._config), timeout=self._install_timeout_s, @@ -401,6 +262,7 @@ def _bootstrap_sandbox( label="opencode install", ) + # Stage 3: write opencode.json + task files. sandbox.write_text( opencode_config_path(self._config), build_opencode_json(self._config), @@ -416,8 +278,9 @@ def _bootstrap_sandbox( for remote_path, content in task.upload_files.items(): sandbox.write_text(remote_path, content) + # Stage 4: extra setup if self._config.extra_setup_shell: - self._exec_with_retry( + self._driver._exec_with_retry( sandbox, self._config.extra_setup_shell, timeout=self._setup_timeout_s, @@ -437,95 +300,14 @@ def _start_proxy( self, sandbox: SandboxHandle, ) -> tuple[BgJob, str, str]: - """Install proxy deps + start the proxy as a bg job inside the sandbox. - - Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``. - Skips the pip install + source-upload steps when the prebaked - template already has them in place. - """ - proxy_already_present = sandbox.exists( - "/home/user/proxy/interception.py" + """Start proxy — delegates to driver.""" + return self._driver._start_proxy( + sandbox, + base_url=self._config.base_url, + api_key=self._config.api_key, + model=self._config.model, ) - if not proxy_already_present: - # Install proxy deps (idempotent on retries). - self._exec_with_retry( - sandbox, - "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' " - "'httpx>=0.27' 2>&1 | tail -20", - timeout=180, - attempts=3, - backoff_s=2.0, - label="proxy deps install", - ) - # Upload the proxy module into the sandbox. 
- sandbox.write_text( - "/home/user/proxy/interception.py", - _PROXY_SOURCE_PATH.read_text(), - ) - sandbox.write_text("/home/user/proxy/__init__.py", "") - - proxy_args = [ - "python", - "interception.py", - "--upstream-url", - self._config.base_url, - "--trace", - _PROXY_TRACE_PATH, - "--port", - str(_PROXY_PORT), - "--top-logprobs", - str(self._config.proxy_top_logprobs), - ] - if self._config.proxy_max_tokens_cap is not None: - proxy_args.extend( - ["--max-tokens-cap", str(self._config.proxy_max_tokens_cap)] - ) - if self._config.proxy_disable_thinking: - proxy_args.append("--disable-thinking") - # Force the upstream model id on every forwarded request — opencode's - # internal title-gen call sometimes strips the provider prefix. - if self._config.model: - proxy_args.extend(["--model-override", self._config.model]) - - quoted_proxy_args = " ".join(shlex.quote(arg) for arg in proxy_args) - proxy_cmd = ( - "cd /home/user/proxy && " - f"{quoted_proxy_args} " - f"> {shlex.quote(_PROXY_LOG_PATH)} 2>&1" - ) - proxy_env = {"OPENCODE_UPSTREAM_API_KEY": self._config.api_key} - proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env) - - # Wait for the proxy to start listening. Cold uvicorn boot inside - # E2B can take anywhere from <1s to ~30s depending on cache state. - import time - - attempts = 120 - interval_s = 0.5 - for _ in range(attempts): - r = sandbox.exec( - f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz", - timeout=5, - ) - if r.exit_code == 0: - break - time.sleep(interval_s) - else: - log = "" - try: - log = sandbox.read_text(_PROXY_LOG_PATH) - except Exception: - pass - proxy_job.kill() - raise RuntimeError( - f"proxy did not start within {attempts * interval_s:.0f}s. 
" - f"log:\n{log[-2000:]}" - ) - - base_url_override = f"http://127.0.0.1:{_PROXY_PORT}/v1" - return proxy_job, base_url_override, _PROXY_TRACE_PATH - __all__ = [ "OpenCodeSession", diff --git a/envs/opencode_env/opencode_runtime.py b/envs/opencode_env/opencode_runtime.py index 07fd5322d..75fed41e3 100644 --- a/envs/opencode_env/opencode_runtime.py +++ b/envs/opencode_env/opencode_runtime.py @@ -111,7 +111,9 @@ def build_run_cmd(config: OpenCodeConfig) -> str: ).strip() -def build_env_vars(config: OpenCodeConfig, *, base_url_override: str | None = None) -> dict[str, str]: +def build_env_vars( + config: OpenCodeConfig, *, base_url_override: str | None = None +) -> dict[str, str]: """Return env vars to set on the OpenCode process. When a proxy is wrapping ``config.base_url`` the factory passes the proxy's diff --git a/envs/opencode_env/sandbox/__init__.py b/envs/opencode_env/sandbox/__init__.py index a3496a2b1..8a2477104 100644 --- a/envs/opencode_env/sandbox/__init__.py +++ b/envs/opencode_env/sandbox/__init__.py @@ -4,13 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Sandbox backends — re-exported from ``openenv.core.harness.sandbox``. +"""Sandbox backends live in ``openenv.core.harness.sandbox``. -The canonical source for sandbox protocols and implementations now lives in -``src/openenv/core/harness/sandbox/``. This package re-exports everything -so that ``from opencode_env.sandbox import ...`` keeps working, but all new -code should import from ``openenv.core.harness.sandbox`` directly. +This package exists only for the ``build_template`` helper used by E2B +template builds. Import sandbox protocols and backends from +``openenv.core.harness.sandbox`` directly. 
""" - -from openenv.core.harness.sandbox import * # noqa: F401,F403 -from openenv.core.harness.sandbox import __all__ # noqa: F401 diff --git a/envs/opencode_env/sandbox/build_template.py b/envs/opencode_env/sandbox/build_template.py index 084a95e64..6e0ba4f75 100644 --- a/envs/opencode_env/sandbox/build_template.py +++ b/envs/opencode_env/sandbox/build_template.py @@ -41,7 +41,7 @@ import sys from pathlib import Path -from e2b import Template, default_build_logger +from e2b import default_build_logger, Template _REPO_ROOT = Path(__file__).resolve().parents[3] @@ -128,8 +128,7 @@ def main(argv: list[str] | None = None) -> int: print("ERROR: E2B_API_KEY required.", file=sys.stderr) return 2 - print(f"Building template '{args.name}' " - f"(proxy source: {_PROXY_SOURCE})") + print(f"Building template '{args.name}' (proxy source: {_PROXY_SOURCE})") print(f"Skip cache: {args.skip_cache}") print() diff --git a/envs/opencode_env/server/app.py b/envs/opencode_env/server/app.py index 200c7f2d7..0757ef229 100644 --- a/envs/opencode_env/server/app.py +++ b/envs/opencode_env/server/app.py @@ -56,19 +56,13 @@ def _load_env_file() -> None: try: from openenv.core.env_server.http_server import create_app - from openenv.core.env_server.mcp_types import ( - CallToolAction, - CallToolObservation, - ) + from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation from .gradio_ui import opencode_gradio_builder from .opencode_environment import OpenCodeEnvironment except ImportError: # pragma: no cover from openenv.core.env_server.http_server import create_app - from openenv.core.env_server.mcp_types import ( - CallToolAction, - CallToolObservation, - ) + from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation from server.gradio_ui import opencode_gradio_builder # type: ignore from server.opencode_environment import OpenCodeEnvironment # type: ignore diff --git a/envs/opencode_env/server/gradio_ui.py b/envs/opencode_env/server/gradio_ui.py 
index 79a696d75..d1ee6e403 100644 --- a/envs/opencode_env/server/gradio_ui.py +++ b/envs/opencode_env/server/gradio_ui.py @@ -31,10 +31,14 @@ import gradio as gr try: - from .catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint + from .catalog import catalog_summary, ENDPOINT_KINDS, resolve_endpoint from .opencode_environment import OpenCodeEnvironment except ImportError: # pragma: no cover - from server.catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint # type: ignore + from server.catalog import ( # type: ignore + catalog_summary, + ENDPOINT_KINDS, + resolve_endpoint, + ) from server.opencode_environment import OpenCodeEnvironment # type: ignore @@ -144,7 +148,9 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]: cmd if len(cmd) <= 80 else cmd[:77] + "...", str(it.get("exit_code", "")), f"{it.get('duration_s', 0):.2f}s", - (it.get("stderr") or "").splitlines()[-1][:80] if it.get("exit_code") else "", + (it.get("stderr") or "").splitlines()[-1][:80] + if it.get("exit_code") + else "", ] ) return rows @@ -175,7 +181,8 @@ def _logprobs_md(turns: list[dict[str, Any]]) -> str: finishes[f] = finishes.get(f, 0) + 1 if finishes: lines.append( - "**finish_reasons**: " + " ".join(f"`{k}={v}`" for k, v in finishes.items()) + "**finish_reasons**: " + + " ".join(f"`{k}={v}`" for k, v in finishes.items()) ) productive_rows = [t for t in turns if t.get("completion_tokens")] if productive_rows: @@ -249,12 +256,12 @@ def _catalog_banner() -> str: def opencode_gradio_builder( - web_manager, # noqa: ARG001 (unused: we instantiate the env directly) - action_fields, # noqa: ARG001 - metadata, # noqa: ARG001 - is_chat_env, # noqa: ARG001 + web_manager, # noqa: ARG001 (unused: we instantiate the env directly) + action_fields, # noqa: ARG001 + metadata, # noqa: ARG001 + is_chat_env, # noqa: ARG001 title, - quick_start_md, # noqa: ARG001 + quick_start_md, # noqa: ARG001 ) -> gr.Blocks: """Build the opencode_env console. 
@@ -355,7 +362,12 @@ def _worker(): # First yield: announce we've started. Empty result panels. yield ( f"### running…\n\n_endpoint=`{resolved.kind}` model=`{resolved.model}` mode=`{mode}`_", - [], [], "", "", "", {}, + [], + [], + "", + "", + "", + {}, ) status_lines: list[tuple[float, str]] = [] @@ -374,7 +386,9 @@ def _worker(): # Render the live status pane. elapsed = time.time() - t_start - md = _live_status_md(resolved.kind, resolved.model, mode, elapsed, status_lines) + md = _live_status_md( + resolved.kind, resolved.model, mode, elapsed, status_lines + ) yield (md, [], [], "", "", "", {}) # Drain any final messages still in the queue. @@ -390,9 +404,17 @@ def _worker(): err = result_holder.get("error", "unknown error") yield ( f"### error\n\n```\n{err}\n```", - [], [], "", "", - _live_status_md(resolved.kind, resolved.model, mode, - time.time() - t_start, status_lines), + [], + [], + "", + "", + _live_status_md( + resolved.kind, + resolved.model, + mode, + time.time() - t_start, + status_lines, + ), {"error": err}, ) return @@ -406,8 +428,13 @@ def _worker(): _logprobs_md(result.get("proxy_turns") or []), ( f"### live phase log\n\n" - + _live_status_md(resolved.kind, resolved.model, mode, - time.time() - t_start, status_lines) + + _live_status_md( + resolved.kind, + resolved.model, + mode, + time.time() - t_start, + status_lines, + ) + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n" f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```" ), @@ -436,17 +463,21 @@ def apply_preset(name: str) -> tuple[str, str, str]: scale=1, ) model = gr.Textbox( - label="Model (blank → catalog default)", placeholder="gpt-4o-mini", + label="Model (blank → catalog default)", + placeholder="gpt-4o-mini", scale=2, ) with gr.Row(): base_url = gr.Textbox( label="Base URL (blank → env / catalog default)", - placeholder="https://api.openai.com/v1", scale=2, + placeholder="https://api.openai.com/v1", + scale=2, ) api_key = 
gr.Textbox( label="API key (blank → server env var)", - placeholder="(server env)", type="password", scale=1, + placeholder="(server env)", + type="password", + scale=1, ) instruction = gr.Textbox( @@ -536,14 +567,28 @@ def apply_preset(name: str) -> tuple[str, str, str]: run_btn.click( fn=run, inputs=[ - endpoint, model, base_url, api_key, - instruction, setup_text, verify_text, - mode, disable_thinking, template, - max_tokens_cap, top_logprobs, agent_timeout_s, + endpoint, + model, + base_url, + api_key, + instruction, + setup_text, + verify_text, + mode, + disable_thinking, + template, + max_tokens_cap, + top_logprobs, + agent_timeout_s, ], outputs=[ - summary_md, setup_table, verify_table, - files_md, logprobs_md, logs_md, raw_json, + summary_md, + setup_table, + verify_table, + files_md, + logprobs_md, + logs_md, + raw_json, ], ) diff --git a/envs/opencode_env/server/opencode_environment.py b/envs/opencode_env/server/opencode_environment.py index 07f0d69ed..638dd5473 100644 --- a/envs/opencode_env/server/opencode_environment.py +++ b/envs/opencode_env/server/opencode_environment.py @@ -189,9 +189,7 @@ def reset( reward=None, metadata={ "status": "ready", - "message": ( - "opencode_env ready. Call run_rollout(...) with a task." - ), + "message": ("opencode_env ready. Call run_rollout(...) 
with a task."), }, ) @@ -399,8 +397,12 @@ def _emit(msg: str) -> None: result.error = f"{type(exc).__name__}: {exc}" _emit(f"ERROR: {result.error}") if session is not None: - result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:] - result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:] + result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[ + -2000: + ] + result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[ + -2000: + ] finally: if session is not None: try: @@ -450,9 +452,7 @@ def _read_reward(self, sandbox: Any) -> float | None: except ValueError: return None - def _collect_files( - self, sandbox: Any - ) -> tuple[dict[str, str], list[str]]: + def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]: listing = sandbox.exec( f"find {WORKDIR} -maxdepth 2 -type f -size -64k 2>/dev/null | head -32", timeout=10, @@ -491,7 +491,8 @@ def _collect_proxy_turns(self, session: Any) -> list[Any]: completion_tokens=list(rec.get("completion_tokens") or []), completion_token_ids=list(rec.get("completion_token_ids") or []), per_token_logps=[ - float(x) for x in (rec.get("per_token_logps") or []) + float(x) + for x in (rec.get("per_token_logps") or []) if x is not None ], latency_s=float(rec.get("latency_s") or 0.0), diff --git a/src/openenv/core/harness/agents/__init__.py b/src/openenv/core/harness/agents/__init__.py new file mode 100644 index 000000000..8ef31976b --- /dev/null +++ b/src/openenv/core/harness/agents/__init__.py @@ -0,0 +1,107 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Agent registry and public API for CLI-based agentic harnesses. + +The registry maps agent names (``"opencode"``, ``"claude-code"``, etc.) to +their :class:`CLIAgentSpec` declarations. 
Each agent module registers itself +via :func:`register_agent` at import time. + +Usage:: + + from openenv.core.harness.agents import get_agent_spec, list_agents + + spec = get_agent_spec("opencode") + print(list_agents()) # ["opencode"] +""" + +from __future__ import annotations + +from .base import ( + AgentConfig, + AgentEvent, + AgentTask, + ArtifactSpec, + CLIAgentSpec, + MCPConfigSpec, +) + +# Registry + +_REGISTRY: dict[str, CLIAgentSpec] = {} + + +def register_agent(spec: CLIAgentSpec) -> None: + """Register a :class:`CLIAgentSpec` under ``spec.name``. + + Raises :class:`ValueError` if the name is already registered with a + *different* spec object; re-registering the same object is a no-op. + + NOTE(review): the check is by identity, so a module that constructs its + spec at import time yields a new object when re-executed by + ``importlib.reload`` and would hit the ``ValueError`` — confirm whether + reload is meant to be supported. + """ + existing = _REGISTRY.get(spec.name) + if existing is not None and existing is not spec: + raise ValueError( + f"Agent {spec.name!r} is already registered. " + "Use a unique name or call unregister_agent() first." + ) + _REGISTRY[spec.name] = spec + + +def unregister_agent(name: str) -> CLIAgentSpec | None: + """Remove a registered agent spec, returning it (or ``None``).""" + return _REGISTRY.pop(name, None) + + +def get_agent_spec(name: str) -> CLIAgentSpec: + """Look up a registered agent spec by name. + + If ``name`` is not registered yet, the built-in module of the same name + (hyphens mapped to underscores, e.g. + ``openenv.core.harness.agents.opencode``) is imported first so it can + register itself; no manual import is needed for built-in agents. + + Raises :class:`KeyError` if the name is still unknown afterwards. + """ + if name not in _REGISTRY: + # Auto-import built-in agent modules to trigger registration. + _auto_import(name) + try: + return _REGISTRY[name] + except KeyError: + available = ", ".join(sorted(_REGISTRY)) or "(none)" + raise KeyError( + f"Unknown agent {name!r}. 
Registered agents: {available}" + ) from None + + +def list_agents() -> list[str]: + """Return sorted names of all registered agents.""" + return sorted(_REGISTRY) + + +def _auto_import(name: str) -> None: + """Try to import the built-in module for ``name`` to trigger registration.""" + # Map agent names to module names (handles hyphens). + module_name = name.replace("-", "_") + try: + __import__(f"openenv.core.harness.agents.{module_name}", fromlist=["_"]) + except ImportError: + # Best effort: a name without a built-in module is simply left + # unregistered. Note this also hides an ImportError raised from + # inside an existing agent module (e.g. a missing dependency). + pass + + +# Convenience re-exports + +__all__ = [ + # Registry + "get_agent_spec", + "list_agents", + "register_agent", + "unregister_agent", + # Base types + "AgentConfig", + "AgentEvent", + "AgentTask", + "ArtifactSpec", + "CLIAgentSpec", + "MCPConfigSpec", +] diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py new file mode 100644 index 000000000..145d3001e --- /dev/null +++ b/src/openenv/core/harness/agents/base.py @@ -0,0 +1,251 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Agent spec and event protocols for CLI-based agentic harnesses. + +Defines the declarative :class:`CLIAgentSpec` data model that captures +*everything* a CLI harness needs — install commands, file uploads, MCP +config format, environment variables, artifacts to collect, and three +small callables (command builder, MCP config builder, event parser). + +The :class:`CLIAgentDriver` reads these fields mechanically without knowing +anything about the specific agent. Adding a new agent is filling in a +dataclass, not writing driver code. + +Pattern borrowed from `verifiers <https://github.com/PrimeIntellect-ai/verifiers>`_ +(Prime Intellect), where OpenCode, MiniSWEAgent, Pi, and RLM all express +their differences through constructor data passed to ``CLIHarness.__init__()``. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Callable, Literal, Protocol + + +# MCP config injection + + +@dataclass(frozen=True) +class MCPConfigSpec: + """How a harness discovers MCP tools. + + ``method`` controls how the driver injects MCP server configuration: + + - ``"config_file"`` — write a JSON file at ``path_template`` (e.g. + ``"{workdir}/mcp.json"``). The template receives ``{workdir}`` + and ``{home}`` substitutions at runtime. + - ``"cli_flags"`` — the driver passes MCP configuration via CLI + flags built by :attr:`CLIAgentSpec.build_command`. + - ``"settings_file"`` — write into a global settings file (e.g. + Gemini's ``~/.gemini/settings.json``). + """ + + method: Literal["config_file", "cli_flags", "settings_file"] + path_template: str | None = None + + +# Artifacts + + +@dataclass(frozen=True) +class ArtifactSpec: + """Declares a file to collect from the sandbox after the agent exits. + + The driver iterates :attr:`CLIAgentSpec.artifacts` and calls + ``sandbox.read_text(spec.path)`` for each entry. No per-agent collection + methods needed — the spec declares *what* to collect, the driver collects + it. + """ + + path: str + format: Literal["text", "json", "jsonl"] = "text" + optional: bool = True + + +# Agent events (normalized across harnesses) + + +@dataclass +class AgentEvent: + """Normalized event from any CLI harness's stdout. + + The :attr:`CLIAgentSpec.parse_events` callable converts raw JSONL lines + into these events so the driver can log and observe the agent's progress + without knowing which agent is running. + """ + + type: Literal[ + "assistant", + "tool_call", + "tool_result", + "reasoning", + "error", + "done", + ] + data: dict[str, Any] = field(default_factory=dict) + raw: str = "" + + +# Task protocol + + +class AgentTask(Protocol): + """Minimal interface a task must satisfy for the CLI agent driver.""" + + @property + def instruction(self) -> str: ... 
+ + @property + def setup_shell(self) -> str | None: ... + + @property + def upload_files(self) -> dict[str, str]: ... + + @property + def metadata(self) -> dict[str, Any]: ... + + +# Agent config protocol + + +class AgentConfig(Protocol): + """Minimal interface a config must satisfy for the CLI agent driver. + + This is intentionally thin — concrete configs like :class:`OpenCodeConfig` + carry much more, but the generic driver only accesses these. + """ + + @property + def base_url(self) -> str: ... + + @property + def api_key(self) -> str: ... + + @property + def model(self) -> str: ... + + @property + def agent_timeout_s(self) -> float: ... + + +# CLIAgentSpec — the core declarative data model + + +@dataclass +class CLIAgentSpec: + """Declarative specification for a CLI-based agentic harness. + + Following the pattern established by verifiers' ``CLIHarness`` (Prime + Intellect), as much per-agent knowledge as possible is expressed as + *data* rather than imperative code. The :class:`CLIAgentDriver` + iterates these fields mechanically — it never needs to know what + ``"pi"`` or ``"claude-code"`` means. + + Three callables cover the remaining agent-specific logic that can't + be expressed as pure data: + + - :attr:`build_command` — constructs the CLI argv + - :attr:`build_mcp_config` — serializes MCP server configuration + - :attr:`parse_events` — converts raw stdout lines to :class:`AgentEvent` + + Everything else — file uploads, env vars, install scripts, artifact + collection — is pure data. + """ + + name: str + """Unique identifier: ``"opencode"``, ``"claude-code"``, ``"codex"``, etc.""" + + install_check_cmd: list[str] + """Command to probe whether the agent is already installed. + + Example: ``["claude", "--version"]`` + """ + + base_command: list[str] + """Base CLI invocation (before task-specific flags). 
+ + Example: ``["claude", "--print", "--output-format", "stream-json"]`` + """ + + mcp_config: MCPConfigSpec + """How MCP tool configuration is injected.""" + + supports_logprob_proxy: bool = True + """Whether this agent can be routed through the interception proxy.""" + + default_timeout_s: float = 600.0 + """Default per-rollout timeout in seconds.""" + + setup: str | list[str] | None = None + """Shell command(s) to install the agent CLI inside the sandbox. + + Run once after the sandbox is created, before any files are written. + Skipped when ``install_check_cmd`` succeeds (pre-baked template). + Can be a single string or a list of strings executed in order. + """ + + files: dict[str, str | Callable] | None = None + """Files to upload into the sandbox before the agent starts. + + Keys are absolute sandbox paths. Values are either literal strings or + callables ``(task, config) -> str`` resolved at rollout time. + """ + + artifacts: dict[str, ArtifactSpec] | None = None + """Files to collect from the sandbox after the agent exits. + + The driver iterates this dict and calls ``sandbox.read_text(spec.path)`` + for each entry. + """ + + env: dict[str, str] | None = None + """Environment variables for the agent process. + + Values can contain ``{model}``, ``{base_url}``, ``{api_key}`` placeholders + resolved from the rollout config at runtime. + """ + + build_command: Callable[..., str] | None = None + """``(spec, config, task, mcp_config_path) -> str`` + + Build the full shell command line for launching the agent. Returns a + string (not a list) because sandbox ``start_bg`` / ``exec`` take shell + strings. + """ + + build_mcp_config: Callable[..., str] | None = None + """``(spec, tools, workdir) -> str`` + + Serialize MCP server configuration in the format the agent expects. + Returns the file content (for ``config_file``/``settings_file`` methods) + or empty string (for ``cli_flags``, where the command builder handles it). 
+ """ + + parse_events: Callable[[str], AgentEvent | None] | None = None + """``(line: str) -> AgentEvent | None`` + + Parse one line of the agent's stdout into a normalized event. + Return ``None`` for lines that are not parseable events. + """ + + build_env_vars: Callable[..., dict[str, str]] | None = None + """``(spec, config) -> dict[str, str]`` + + Optional override for env var construction. When provided, this is + called *instead of* resolving placeholders in :attr:`env`. Prefer + the declarative :attr:`env` dict for new agents. + """ + + +__all__ = [ + "AgentConfig", + "AgentEvent", + "AgentTask", + "ArtifactSpec", + "CLIAgentSpec", + "MCPConfigSpec", +] diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py new file mode 100644 index 000000000..8e8179889 --- /dev/null +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -0,0 +1,716 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Shared CLI agent driver, session, and session factory. + +The :class:`CLIAgentDriver` factors out the common 70% of CLI harness +lifecycle — sandbox creation, MCP config injection, interception proxy +setup, subprocess management, and result collection. + +It is **fully generic**: it reads the :class:`CLIAgentSpec`'s declarative +data fields and executes them mechanically. No per-agent code lives here. + +The :class:`CLIAgentSession` implements :class:`ResourceSession` and +the :class:`CLIAgentSessionFactory` implements :class:`ResourceSessionFactory`, +so the CLI agent driver integrates seamlessly with the existing harness +runtime from PR #603. 
+""" + +from __future__ import annotations + +import json +import logging +import shlex +import time +from pathlib import Path +from typing import Any, Callable, Literal + +from openenv.core.env_server.mcp_types import Tool +from openenv.core.harness import ( + Message, + ResourceSession, + ResourceSessionFactory, + ToolResult, + VerifyResult, +) +from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle + +from .base import CLIAgentSpec + + +_log = logging.getLogger(__name__) + +# Interception proxy defaults +_PROXY_PORT = 7000 +_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl" +_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log" + +# Where the proxy source lives on disk. Uploaded into sandboxes that don't +# already have it baked in. +_PROXY_SOURCE_PATH = Path(__file__).resolve().parents[1] / "sandbox" / "interception.py" + +# Verifier type — same as opencode_env's Verifier alias +Verifier = Callable[..., VerifyResult] + + +# CLIAgentSession + + +class CLIAgentSession(ResourceSession): + """Per-rollout session wrapping one sandbox with one running agent CLI. + + The session is created already-running: :meth:`CLIAgentSessionFactory.create` + launches the agent before returning. 
Typical usage:: + + session = factory.create(task) + session.wait_for_completion() + result = session.verify([]) + session.close() + """ + + def __init__( + self, + *, + spec: CLIAgentSpec, + sandbox: SandboxHandle, + task: Any, + config: Any, + verifier: Verifier | None = None, + base_url_override: str | None = None, + proxy_trace_path: str | None = None, + proxy_bg_job: BgJob | None = None, + agent_bg_job: BgJob | None = None, + ) -> None: + self.spec = spec + self.sandbox = sandbox + self.task = task + self.config = config + self._verifier = verifier + self._base_url_override = base_url_override + self._proxy_trace_path = proxy_trace_path + self._proxy_bg_job = proxy_bg_job + self._agent_bg_job = agent_bg_job + + # ResourceSession contract + + def initial_messages(self) -> list[Message]: + instruction = ( + self.task.instruction + if hasattr(self.task, "instruction") + else str(self.task) + ) + return [{"role": "user", "content": instruction}] + + def list_tools(self) -> list[Tool]: + # CLI agents own their own tool loop — none are exposed to the harness. + return [] + + def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult: + return ToolResult( + error=( + f"{self.spec.name} session does not expose external tool calls; " + "the CLI agent owns its own tool loop." 
+ ) + ) + + def verify( + self, + transcript: list[Message], + final_state: Any | None = None, + ) -> VerifyResult: + if self._verifier is None: + return VerifyResult(env_reward=None, done=True) + return self._verifier(self.sandbox, self.task) + + def close(self) -> None: + if self._agent_bg_job is not None: + try: + self._agent_bg_job.kill() + except Exception: + pass + self._agent_bg_job = None + if self._proxy_bg_job is not None: + try: + self._proxy_bg_job.kill() + except Exception: + pass + self._proxy_bg_job = None + self.sandbox.kill() + + # CLI-agent-specific API + + def wait_for_completion(self, timeout_s: float | None = None) -> int: + """Block until the agent exits, returning its exit code.""" + budget = timeout_s if timeout_s is not None else self.spec.default_timeout_s + if hasattr(self.config, "agent_timeout_s"): + budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s + if self._agent_bg_job is None: + raise RuntimeError("Agent not started.") + return self._agent_bg_job.wait(timeout=budget) + + def collect_artifacts(self) -> dict[str, Any]: + """Collect all artifacts declared in ``spec.artifacts`` from the sandbox. + + Returns a dict keyed by artifact name. Missing optional artifacts are + silently skipped. 
+ """ + result: dict[str, Any] = {} + if not self.spec.artifacts: + return result + for name, artifact_spec in self.spec.artifacts.items(): + try: + content = self.sandbox.read_text(artifact_spec.path) + if artifact_spec.format == "json": + result[name] = json.loads(content) + elif artifact_spec.format == "jsonl": + result[name] = [ + json.loads(line) + for line in content.splitlines() + if line.strip() + ] + else: + result[name] = content + except Exception: + if not artifact_spec.optional: + raise + _log.debug( + "Optional artifact %r (%s) not found, skipping", + name, + artifact_spec.path, + ) + return result + + def fetch_proxy_trace(self) -> list[dict[str, Any]]: + """Return per-turn proxy-captured records (transparent_proxy mode only). + + Each entry has ``request``, ``response``, ``completion_tokens``, + ``completion_token_ids``, ``per_token_logps``, ``finish_reason``, + and ``latency_s``. Returns ``[]`` in black_box mode. + """ + if self._proxy_trace_path is None: + return [] + try: + content = self.sandbox.read_text(self._proxy_trace_path) + except Exception: + return [] + records: list[dict[str, Any]] = [] + for line in content.splitlines(): + line = line.strip() + if not line: + continue + records.append(json.loads(line)) + return records + + +# CLIAgentDriver — shared lifecycle + + +class CLIAgentDriver: + """Shared driver for all CLI-based agentic harnesses. + + Implements the common lifecycle: + + 1. Create sandbox (via :class:`SandboxBackend`) + 2. Wait for sandbox ready (``echo ok`` probe) + 3. Install agent CLI — run ``spec.setup`` commands (skipped if + ``spec.install_check_cmd`` succeeds, i.e. pre-baked template) + 4. Upload ``spec.files`` into the sandbox + 5. Write MCP config (via ``spec.build_mcp_config``) + 6. Set environment variables from ``spec.env`` (with placeholder + resolution) + 7. Optionally start interception proxy (transparent_proxy mode) + 8. Build CLI command (via ``spec.build_command``) + 9. Launch agent as bg process + 10. 
Return a :class:`CLIAgentSession` + """ + + def __init__( + self, + spec: CLIAgentSpec, + sandbox_backend: SandboxBackend, + mode: Literal["black_box", "transparent_proxy"] = "black_box", + *, + install_timeout_s: int = 240, + setup_timeout_s: int = 300, + proxy_top_logprobs: int = 5, + proxy_max_tokens_cap: int | None = 16384, + proxy_disable_thinking: bool = False, + ) -> None: + if mode not in {"black_box", "transparent_proxy"}: + raise ValueError(f"Unknown mode: {mode!r}") + self.spec = spec + self.sandbox_backend = sandbox_backend + self.mode = mode + self._install_timeout_s = install_timeout_s + self._setup_timeout_s = setup_timeout_s + self._proxy_top_logprobs = proxy_top_logprobs + self._proxy_max_tokens_cap = proxy_max_tokens_cap + self._proxy_disable_thinking = proxy_disable_thinking + + def create_session( + self, + task: Any, + config: Any, + *, + verifier: Verifier | None = None, + seed: int | None = None, + episode_id: str | None = None, + ) -> CLIAgentSession: + """Create a fully bootstrapped session with a running agent. + + This is the main entry point. It: + 1. Creates a sandbox + 2. Bootstraps it (install agent, upload files, write MCP config) + 3. Optionally starts the interception proxy + 4. Launches the agent subprocess + 5. 
Returns a ready-to-use :class:`CLIAgentSession` + + The sandbox is killed before re-raising if bootstrap, proxy start, or + agent launch fails, so a failed rollout does not leak a sandbox. + """ + timeout_s = ( + config.agent_timeout_s + if hasattr(config, "agent_timeout_s") + else self.spec.default_timeout_s + ) + sandbox_timeout = int(timeout_s) + 300 + + _log.info( + "%s driver: creating sandbox timeout=%ds mode=%s", + self.spec.name, + sandbox_timeout, + self.mode, + ) + sandbox = self.sandbox_backend.create( + timeout_s=sandbox_timeout, + metadata={"episode_id": episode_id} if episode_id else None, + ) + sid = getattr(sandbox, "sandbox_id", "?") + _log.info("%s driver: sandbox=%s — bootstrapping…", self.spec.name, sid) + + try: + self._bootstrap_sandbox(sandbox, task, config) + except Exception as exc: + _log.error("%s driver: bootstrap failed: %r", self.spec.name, exc) + sandbox.kill() + raise + + base_url_override: str | None = None + proxy_trace_path: str | None = None + proxy_bg_job: BgJob | None = None + + # Same cleanup as the bootstrap stage: until the session object exists + # nobody else owns the sandbox, so a launch failure must kill it here. + try: + if self.mode == "transparent_proxy": + base_url = config.base_url if hasattr(config, "base_url") else "" + api_key = config.api_key if hasattr(config, "api_key") else "intercepted" + model = config.model if hasattr(config, "model") else "" + + _log.info( + "%s driver: starting interception proxy on :%d → %s", + self.spec.name, + _PROXY_PORT, + base_url, + ) + proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy( + sandbox, + base_url=base_url, + api_key=api_key, + model=model, + ) + _log.info("%s driver: proxy up at %s", self.spec.name, base_url_override) + + agent_bg_job = self._start_agent( + sandbox, + task, + config, + base_url_override=base_url_override, + ) + except Exception as exc: + _log.error("%s driver: launch failed: %r", self.spec.name, exc) + sandbox.kill() + raise + + return CLIAgentSession( + spec=self.spec, + sandbox=sandbox, + task=task, + config=config, + verifier=verifier, + base_url_override=base_url_override, + proxy_trace_path=proxy_trace_path, + proxy_bg_job=proxy_bg_job, + agent_bg_job=agent_bg_job, + ) + + # Bootstrap stages + + def _bootstrap_sandbox( + self, + sandbox: SandboxHandle, + task: Any, + config: Any, + ) -> None: + """Install agent, upload files, write 
def _wait_for_sandbox_ready(
    self,
    sandbox: SandboxHandle,
    *,
    attempts: int = 15,
    delay_s: float = 1.0,
) -> None:
    """Probe the sandbox until ``echo ok`` succeeds.

    Args:
        sandbox: Handle whose ``exec`` is used for the probe.
        attempts: Maximum number of probes.
        delay_s: Pause between two consecutive probes, in seconds.

    Raises:
        RuntimeError: If no probe succeeded; the message carries the last
            observed error (stderr/stdout, exit code, or exception text).
    """
    last_err = ""
    for attempt in range(attempts):
        try:
            r = sandbox.exec("echo ok", timeout=5)
            if r.exit_code == 0 and "ok" in (r.stdout or ""):
                return
            last_err = (r.stderr or r.stdout or "").strip() or f"exit={r.exit_code}"
        except Exception as exc:
            # A raising probe is treated like a failed probe and retried.
            last_err = f"{type(exc).__name__}: {exc}"
        # Only pause when another probe follows; sleeping after the final
        # failure would just delay the error (same rule as _exec_with_retry).
        if attempt + 1 < attempts:
            time.sleep(delay_s)
    raise RuntimeError(
        f"sandbox did not become ready within {attempts * delay_s:.0f}s "
        f"(last error: {last_err})"
    )


def _agent_already_installed(self, sandbox: SandboxHandle) -> bool:
    """Return True when ``spec.install_check_cmd`` exits with status 0.

    Any exception raised by ``sandbox.exec`` is deliberately treated as
    "not installed" so that the caller falls through to the install step.
    """
    cmd = " ".join(shlex.quote(c) for c in self.spec.install_check_cmd)
    try:
        r = sandbox.exec(cmd, timeout=10)
        return r.exit_code == 0
    except Exception:
        return False


def _install_agent(self, sandbox: SandboxHandle) -> None:
    """Run the ``spec.setup`` command(s) to install the agent CLI.

    ``spec.setup`` may be a single shell string or a sequence of them; each
    command is run through ``_exec_with_retry``.

    Raises:
        RuntimeError: If the spec provides no setup commands.
    """
    if self.spec.setup is None:
        raise RuntimeError(
            f"Agent {self.spec.name!r} is not installed in the sandbox "
            "and no setup commands are provided in the spec."
        )
    commands = (
        [self.spec.setup] if isinstance(self.spec.setup, str) else self.spec.setup
    )
    for cmd in commands:
        self._exec_with_retry(
            sandbox,
            cmd,
            timeout=self._install_timeout_s,
            attempts=3,
            backoff_s=3.0,
            label=f"{self.spec.name} install",
        )


def _upload_files(
    self,
    sandbox: SandboxHandle,
    task: Any,
    config: Any,
) -> None:
    """Write ``spec.files`` and ``task.upload_files`` into the sandbox.

    Each ``spec.files`` value is either literal content or a callable taking
    ``(task, config)``; a resolved value of ``None`` means "skip this file".
    ``task.upload_files`` is uploaded afterwards, whether or not the spec
    declares any files of its own.
    """
    # ``or {}`` instead of an early return: a spec without files must not
    # prevent the task's own files from being uploaded.
    for path, content_or_fn in (self.spec.files or {}).items():
        if callable(content_or_fn):
            content = content_or_fn(task, config)
        else:
            content = content_or_fn
        if content is not None:
            sandbox.write_text(path, content)

    # A missing attribute and an explicit ``None`` both mean "nothing to upload".
    upload_files = getattr(task, "upload_files", None) or {}
    for path, content in upload_files.items():
        sandbox.write_text(path, content)


def _write_mcp_config(
    self,
    sandbox: SandboxHandle,
    config: Any,
) -> None:
    """Write the MCP configuration file produced by the spec's builder.

    Nothing is written unless the spec has a ``build_mcp_config`` hook, its
    ``mcp_config.method`` is ``"config_file"`` and a ``path_template`` is set.
    The template may reference ``{workdir}`` and ``{home}``; both derive from
    ``config.sandbox_home`` (default ``/home/user``). The builder is called
    with an empty tool list.
    """
    if self.spec.build_mcp_config is None:
        return
    if (
        self.spec.mcp_config.method == "config_file"
        and self.spec.mcp_config.path_template
    ):
        home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
        workdir = home + "/workdir"
        mcp_path = self.spec.mcp_config.path_template.format(
            workdir=workdir,
            home=home,
        )
        mcp_content = self.spec.build_mcp_config(self.spec, [], workdir)
        sandbox.write_text(mcp_path, mcp_content)
def _resolve_env_vars(
    self,
    config: Any,
    *,
    base_url_override: str | None = None,
) -> dict[str, str]:
    """Build the env var dict for the agent process.

    If ``spec.build_env_vars`` is provided, its result is returned as-is.
    Otherwise every value in ``spec.env`` is passed through ``str.format``
    with the ``base_url``, ``api_key`` and ``model`` substitutions.

    NOTE(review): ``base_url_override`` is not forwarded to
    ``spec.build_env_vars``, so a custom builder never sees the proxy URL.
    """
    if self.spec.build_env_vars is not None:
        return self.spec.build_env_vars(self.spec, config)

    if not self.spec.env:
        return {}

    base_url = base_url_override or (
        config.base_url if hasattr(config, "base_url") else ""
    )
    api_key = config.api_key if hasattr(config, "api_key") else "intercepted"
    model = config.model if hasattr(config, "model") else ""

    substitutions = {
        "base_url": base_url,
        "api_key": api_key,
        "model": model,
    }

    resolved: dict[str, str] = {}
    for key, value in self.spec.env.items():
        try:
            resolved[key] = value.format(**substitutions)
        except (KeyError, IndexError, ValueError):
            # Keep the raw value when it is not a template we can fill:
            # KeyError  -> unknown named placeholder,
            # IndexError -> positional placeholder such as "{}" / "{0}",
            # ValueError -> unbalanced braces (e.g. JSON or shell snippets).
            resolved[key] = value
    return resolved


def _start_proxy(
    self,
    sandbox: SandboxHandle,
    *,
    base_url: str,
    api_key: str,
    model: str,
) -> tuple[BgJob, str, str]:
    """Install deps, start the proxy as a bg job, and wait for ``/healthz``.

    When ``/home/user/proxy/interception.py`` is absent from the sandbox, the
    Python dependencies are installed and the proxy source is uploaded first.

    Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``.

    Raises:
        RuntimeError: If the health endpoint never answers; the proxy job is
            killed and the tail of its log is included in the message.
    """
    proxy_already_present = sandbox.exists("/home/user/proxy/interception.py")

    if not proxy_already_present:
        # NOTE(review): the exit status seen here is that of ``tail``, so a
        # failing ``pip install`` may not trigger a retry — confirm intent.
        self._exec_with_retry(
            sandbox,
            "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
            "'httpx>=0.27' 2>&1 | tail -20",
            timeout=180,
            attempts=3,
            backoff_s=2.0,
            label="proxy deps install",
        )
        sandbox.write_text(
            "/home/user/proxy/interception.py",
            _PROXY_SOURCE_PATH.read_text(),
        )
        sandbox.write_text("/home/user/proxy/__init__.py", "")

    proxy_args = [
        "python",
        "interception.py",
        "--upstream-url",
        base_url,
        "--trace",
        _PROXY_TRACE_PATH,
        "--port",
        str(_PROXY_PORT),
        "--top-logprobs",
        str(self._proxy_top_logprobs),
    ]
    if self._proxy_max_tokens_cap is not None:
        proxy_args.extend(["--max-tokens-cap", str(self._proxy_max_tokens_cap)])
    if self._proxy_disable_thinking:
        proxy_args.append("--disable-thinking")
    if model:
        proxy_args.extend(["--model-override", model])

    quoted = " ".join(shlex.quote(a) for a in proxy_args)
    proxy_cmd = (
        f"cd /home/user/proxy && {quoted} > {shlex.quote(_PROXY_LOG_PATH)} 2>&1"
    )
    # The upstream key travels via the environment, not the command line.
    proxy_env = {"OPENCODE_UPSTREAM_API_KEY": api_key}
    proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env)

    # Wait for proxy healthz
    attempts = 120
    interval_s = 0.5
    for _ in range(attempts):
        try:
            r = sandbox.exec(
                f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz",
                timeout=5,
            )
            healthy = r.exit_code == 0
        except Exception:
            # A raising probe counts as "not up yet"; letting it escape
            # would leave the proxy job running with nobody to kill it.
            healthy = False
        if healthy:
            break
        time.sleep(interval_s)
    else:
        log_content = ""
        try:
            log_content = sandbox.read_text(_PROXY_LOG_PATH)
        except Exception:
            # Best effort: the log is only used to enrich the error below.
            pass
        proxy_job.kill()
        raise RuntimeError(
            f"proxy did not start within {attempts * interval_s:.0f}s. "
            f"log:\n{log_content[-2000:]}"
        )

    override_url = f"http://127.0.0.1:{_PROXY_PORT}/v1"
    return proxy_job, override_url, _PROXY_TRACE_PATH


def _exec_with_retry(
    self,
    sandbox: SandboxHandle,
    cmd: str,
    *,
    timeout: float,
    attempts: int = 3,
    backoff_s: float = 3.0,
    label: str = "cmd",
) -> Any:
    """Run ``sandbox.exec`` with exponential backoff on transient failure.

    A non-zero exit with empty stderr, or an exception from ``exec``, is
    retried after ``backoff_s * 2**i`` seconds. A non-zero exit with
    non-empty stderr stops retrying immediately.

    Returns:
        The first result whose ``exit_code`` is 0.

    Raises:
        RuntimeError: If no attempt succeeded. The attempt count in the
            message is the configured ``attempts``, not the number of
            executions actually performed.
    """
    last_stdout = ""
    last_stderr = ""
    last_exit = 0
    for i in range(attempts):
        try:
            r = sandbox.exec(cmd, timeout=timeout)
            if r.exit_code == 0:
                return r
            last_stdout = r.stdout or ""
            last_stderr = r.stderr or ""
            last_exit = r.exit_code
            if last_stderr.strip():
                # The command reported an error of its own: stop retrying.
                break
        except Exception as exc:
            last_stderr = f"{type(exc).__name__}: {exc}"
            last_exit = -1
        if i + 1 < attempts:
            time.sleep(backoff_s * (2**i))
    raise RuntimeError(
        f"{label} failed after {attempts} attempts "
        f"(exit={last_exit}, stderr={last_stderr!r}, "
        f"stdout_tail={last_stdout[-400:]!r})"
    )
+ """ + + def __init__( + self, + *, + spec: CLIAgentSpec, + config: Any, + sandbox_backend: SandboxBackend, + mode: Literal["black_box", "transparent_proxy"] = "black_box", + verifier: Verifier | None = None, + install_timeout_s: int = 240, + setup_timeout_s: int = 300, + proxy_top_logprobs: int = 5, + proxy_max_tokens_cap: int | None = 16384, + proxy_disable_thinking: bool = False, + ) -> None: + self._spec = spec + self._config = config + self._verifier = verifier + self._driver = CLIAgentDriver( + spec=spec, + sandbox_backend=sandbox_backend, + mode=mode, + install_timeout_s=install_timeout_s, + setup_timeout_s=setup_timeout_s, + proxy_top_logprobs=proxy_top_logprobs, + proxy_max_tokens_cap=proxy_max_tokens_cap, + proxy_disable_thinking=proxy_disable_thinking, + ) + + def create( + self, + task: Any, + seed: int | None = None, + episode_id: str | None = None, + ) -> CLIAgentSession: + """Create one isolated session for a rollout.""" + return self._driver.create_session( + task=task, + config=self._config, + verifier=self._verifier, + seed=seed, + episode_id=episode_id, + ) + + +__all__ = [ + "CLIAgentDriver", + "CLIAgentSession", + "CLIAgentSessionFactory", + "Verifier", +] diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py new file mode 100644 index 000000000..b179e9c9f --- /dev/null +++ b/src/openenv/core/harness/agents/opencode.py @@ -0,0 +1,191 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""OpenCode agent adapter. + +Expresses the OpenCode harness as a purely declarative :class:`CLIAgentSpec`. +All builders (command construction, config generation, env var resolution) +are self-contained with no imports from ``envs/opencode_env/``. 
def _build_opencode_command(
    spec: CLIAgentSpec,
    config: Any,
    task: Any,
    mcp_config_path: str | None,
) -> str:
    """Build the ``opencode run`` shell command.

    The command changes into ``<home>/workdir``, passes the contents of
    ``<home>/task/instruction.md`` as the prompt and tees combined
    stdout/stderr into ``<home>/logs/agent/opencode.jsonl``. ``<home>`` is
    ``config.sandbox_home`` (default ``/home/user``); ``--format json`` is
    added when ``config.run_format`` is ``"json"`` (the default).

    ``spec``, ``task`` and ``mcp_config_path`` are accepted for the builder
    signature but not used.
    """
    import shlex  # local import: this module does not import shlex at top level

    home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
    run_format = config.run_format if hasattr(config, "run_format") else "json"

    # Quote every path that is derived from configuration so that a home
    # directory containing spaces or shell metacharacters cannot break (or
    # inject into) the command line. Safe paths are left unchanged.
    workdir = shlex.quote(f"{home}/workdir")
    instruction_file = shlex.quote(f"{home}/task/instruction.md")
    log_file = shlex.quote(f"{home}/logs/agent/opencode.jsonl")

    run_parts = ["opencode run"]
    if run_format == "json":
        run_parts.append("--format json")
    run_parts.append(f'"$(cat {instruction_file})"')
    run_cmd = " ".join(run_parts)

    # NOTE(review): because of the pipe, the exit status of this command is
    # presumably that of ``tee`` rather than ``opencode`` — confirm callers
    # do not rely on it.
    return (
        'export PATH="$HOME/.opencode/bin:$PATH" && '
        f"cd {workdir} && "
        f"{run_cmd} "
        f"2>&1 | tee {log_file}"
    )


def _build_opencode_mcp_config(
    spec: CLIAgentSpec,
    tools: list[Any],
    workdir: str,
) -> str:
    """Build the ``opencode.json`` content for the MCP config file.

    The document declares a single OpenAI-compatible provider named
    ``intercepted`` whose ``baseURL`` is ``http://127.0.0.1:7000/v1``. None of
    the arguments influence the output.
    """
    return json.dumps(
        {
            "$schema": "https://opencode.ai/config.json",
            "model": "intercepted/model",
            "provider": {
                "intercepted": {
                    "npm": "@ai-sdk/openai-compatible",
                    "name": "Intercepted",
                    "options": {
                        "baseURL": "http://127.0.0.1:7000/v1",
                        "apiKey": "intercepted",
                        "timeout": 600000,
                    },
                    "models": {
                        "model": {"name": "Intercepted Model"},
                    },
                }
            },
        },
        indent=2,
    )


def _build_opencode_env_vars(
    spec: CLIAgentSpec,
    config: Any,
) -> dict[str, str]:
    """Build env vars for the OpenCode process.

    Starts from a copy of ``config.extra_env`` (if any) and then sets
    ``OPENAI_BASE_URL``, ``OPENAI_API_KEY`` and ``OPENCODE_CONFIG``, which
    therefore win over same-named entries in ``extra_env``.
    """
    home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
    base_url = config.base_url if hasattr(config, "base_url") else ""
    api_key = config.api_key if hasattr(config, "api_key") else "intercepted"
    # Treat an explicit ``extra_env=None`` like a missing attribute.
    extra_env = getattr(config, "extra_env", None) or {}

    env = dict(extra_env)
    env["OPENAI_BASE_URL"] = base_url
    env["OPENAI_API_KEY"] = api_key
    env["OPENCODE_CONFIG"] = f"{home}/.config/opencode/opencode.json"
    return env


def _parse_opencode_event(line: str) -> AgentEvent | None:
    """Parse one line of OpenCode's JSONL stdout.

    Returns ``None`` for blank lines, lines that are not valid JSON, and JSON
    values that are not objects. Otherwise the object's ``"type"`` field is
    mapped onto one of the event kinds ``assistant``, ``tool_call``,
    ``tool_result``, ``error`` or ``done``; unrecognised types are reported as
    ``assistant``.
    """
    line = line.strip()
    if not line:
        return None
    try:
        data = json.loads(line)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        # Valid JSON that is not an object (a bare number, string, list or
        # null) carries no "type" field and would crash on ``.get`` below.
        return None

    event_type = data.get("type", "")
    if event_type in ("assistant", "message"):
        kind = "assistant"
    elif event_type in ("tool_call", "tool_use"):
        kind = "tool_call"
    elif event_type in ("tool_result", "tool_response"):
        kind = "tool_result"
    elif event_type == "error":
        kind = "error"
    elif event_type in ("done", "complete", "end"):
        kind = "done"
    else:
        kind = "assistant"
    return AgentEvent(type=kind, data=data, raw=line)


# File resolvers


def _instruction_file_content(task: Any, config: Any) -> str:
    """Return ``task.instruction``, or ``str(task)`` when it has none."""
    return task.instruction if hasattr(task, "instruction") else str(task)


def _system_prompt_content(task: Any, config: Any) -> str | None:
    """Return ``config.system_prompt`` when set and non-empty, else ``None``."""
    if hasattr(config, "system_prompt") and config.system_prompt:
        return config.system_prompt
    return None
"/home/user/logs/verifier /home/user/task /home/user/workdir && " + "curl -fsSL https://opencode.ai/install | bash && " + 'export PATH="$HOME/.opencode/bin:$PATH" && ' + "opencode --version" + ), + files={ + "/home/user/task/instruction.md": _instruction_file_content, + "/home/user/task/system.md": _system_prompt_content, + }, + artifacts={ + "agent_log": ArtifactSpec( + path="/home/user/logs/agent/opencode.jsonl", + format="jsonl", + ), + }, + env={ + "PATH": "/home/user/.opencode/bin:$PATH", + "OPENAI_BASE_URL": "{base_url}", + "OPENAI_API_KEY": "{api_key}", + }, + build_command=_build_opencode_command, + build_mcp_config=_build_opencode_mcp_config, + parse_events=_parse_opencode_event, + build_env_vars=_build_opencode_env_vars, +) + + +# Auto-register on import +register_agent(OPENCODE_SPEC) + + +__all__ = [ + "OPENCODE_SPEC", +] From 455b0e9e46b266655ec05558a321814593ee5cfd Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Tue, 12 May 2026 23:02:10 +0530 Subject: [PATCH 03/35] feat: add tests --- tests/core/test_cli_agent_driver.py | 1064 +++++++++++++++++++++++++++ tests/envs/test_opencode_env.py | 8 +- 2 files changed, 1067 insertions(+), 5 deletions(-) create mode 100644 tests/core/test_cli_agent_driver.py diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py new file mode 100644 index 000000000..b26f01d67 --- /dev/null +++ b/tests/core/test_cli_agent_driver.py @@ -0,0 +1,1064 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Tests for the CLI agent driver abstraction (Phase 2). 
@dataclass
class FakeExecResult:
    """Result record returned by :meth:`FakeSandbox.exec`."""

    exit_code: int = 0
    stdout: str = "ok"
    stderr: str = ""


@dataclass
class FakeBgJob:
    """Background-job stand-in that reports a fixed pid and exit code."""

    cmd: str = ""
    envs: dict[str, str] | None = None
    _exit_code: int = 0

    @property
    def pid(self) -> int:
        return 12345

    def wait(self, timeout: float | None = None) -> int:
        return self._exit_code

    def kill(self) -> None:
        return None


class FakeSandbox:
    """In-memory sandbox double that records every interaction."""

    def __init__(
        self,
        *,
        install_check_succeeds: bool = False,
        healthz_succeeds: bool = True,
    ) -> None:
        self._install_check_succeeds = install_check_succeeds
        self._healthz_succeeds = healthz_succeeds
        self._killed = False
        self.sandbox_id = "fake-sandbox-001"
        self.written: dict[str, str] = {}
        self.executed: list[str] = []
        self.bg_commands: list[tuple[str, dict[str, str] | None]] = []

    def exec(
        self,
        cmd: str,
        *,
        envs: dict[str, str] | None = None,
        cwd: str | None = None,
        timeout: float | None = 60,
    ) -> FakeExecResult:
        """Record *cmd* and answer according to what kind of command it is."""
        self.executed.append(cmd)

        # Readiness probe.
        if cmd == "echo ok":
            return FakeExecResult(exit_code=0, stdout="ok")

        # Install probe: a short, standalone ``--version`` call. Longer or
        # chained setup scripts that merely end in ``--version`` fall through
        # to the generic success case.
        is_install_probe = "--version" in cmd and len(cmd) < 80 and "&&" not in cmd
        if is_install_probe:
            if not self._install_check_succeeds:
                return FakeExecResult(exit_code=127, stderr="not found")
            return FakeExecResult(exit_code=0, stdout="1.0.0")

        # Proxy health probe.
        if "healthz" in cmd:
            if not self._healthz_succeeds:
                return FakeExecResult(exit_code=7, stderr="connection refused")
            return FakeExecResult(exit_code=0, stdout='{"status":"ok"}')

        # Everything else succeeds silently.
        return FakeExecResult(exit_code=0, stdout="")

    def start_bg(
        self,
        cmd: str,
        *,
        envs: dict[str, str] | None = None,
        cwd: str | None = None,
    ) -> FakeBgJob:
        """Record the launch and hand back a job that exits with 0."""
        self.bg_commands.append((cmd, envs))
        return FakeBgJob(cmd=cmd, envs=envs)

    def write_text(self, path: str, content: str) -> None:
        self.written[path] = content

    def read_text(self, path: str) -> str:
        try:
            return self.written[path]
        except KeyError:
            raise FileNotFoundError(f"No such file: {path}") from None

    def exists(self, path: str) -> bool:
        return path in self.written

    def kill(self) -> None:
        self._killed = True


class FakeSandboxBackend:
    """Backend double that creates :class:`FakeSandbox` instances and keeps them."""

    def __init__(
        self,
        *,
        install_check_succeeds: bool = False,
        healthz_succeeds: bool = True,
    ) -> None:
        self.created: list[FakeSandbox] = []
        self._install_check_succeeds = install_check_succeeds
        self._healthz_succeeds = healthz_succeeds

    def create(
        self,
        *,
        timeout_s: int = 900,
        envs: dict[str, str] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> FakeSandbox:
        """Create a sandbox with this backend's flags and remember it."""
        sandbox = FakeSandbox(
            install_check_succeeds=self._install_check_succeeds,
            healthz_succeeds=self._healthz_succeeds,
        )
        self.created.append(sandbox)
        return sandbox


@dataclass
class FakeTask:
    """Minimal task carrying the attributes the driver looks up."""

    instruction: str = "Write hello.py"
    setup_shell: str | None = None
    upload_files: dict[str, str] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)
"https://api.example.com/v1" + api_key: str = "sk-test-key" + model: str = "test-model" + agent_timeout_s: float = 300.0 + sandbox_home: str = "/home/user" + extra_env: dict[str, str] = field(default_factory=dict) + + +# PR 2.1: Agent Spec and Event Parser Protocols + + +class TestAgentSpecProtocols: + """Tests for base.py data models.""" + + def test_mcp_config_spec_frozen(self): + from openenv.core.harness.agents.base import MCPConfigSpec + + spec = MCPConfigSpec(method="config_file", path_template="{workdir}/mcp.json") + assert spec.method == "config_file" + assert spec.path_template == "{workdir}/mcp.json" + with pytest.raises(AttributeError): + spec.method = "cli_flags" # type: ignore[misc] + + def test_artifact_spec_defaults(self): + from openenv.core.harness.agents.base import ArtifactSpec + + a = ArtifactSpec(path="/logs/agent/out.log") + assert a.format == "text" + assert a.optional is True + + def test_artifact_spec_json(self): + from openenv.core.harness.agents.base import ArtifactSpec + + a = ArtifactSpec(path="/data/traj.json", format="json", optional=False) + assert a.format == "json" + assert a.optional is False + + def test_agent_event_creation(self): + from openenv.core.harness.agents.base import AgentEvent + + e = AgentEvent( + type="tool_call", data={"name": "bash"}, raw='{"type":"tool_call"}' + ) + assert e.type == "tool_call" + assert e.data["name"] == "bash" + + def test_cli_agent_spec_minimal(self): + from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec + + spec = CLIAgentSpec( + name="test-agent", + install_check_cmd=["test-agent", "--version"], + base_command=["test-agent", "run"], + mcp_config=MCPConfigSpec(method="cli_flags"), + ) + assert spec.name == "test-agent" + assert spec.supports_logprob_proxy is True + assert spec.default_timeout_s == 600.0 + assert spec.setup is None + assert spec.files is None + assert spec.artifacts is None + assert spec.env is None + assert spec.build_command is None + + def 
test_cli_agent_spec_full(self): + from openenv.core.harness.agents.base import ( + ArtifactSpec, + CLIAgentSpec, + MCPConfigSpec, + ) + + spec = CLIAgentSpec( + name="full-agent", + install_check_cmd=["full-agent", "--version"], + base_command=["full-agent", "exec"], + mcp_config=MCPConfigSpec( + method="config_file", path_template="{workdir}/mcp.json" + ), + supports_logprob_proxy=True, + default_timeout_s=900.0, + setup="npm install -g full-agent", + files={ + "/task.txt": "hello", + "/dynamic.txt": lambda task, config: task.instruction, + }, + artifacts={ + "log": ArtifactSpec(path="/logs/out.log"), + "traj": ArtifactSpec(path="/logs/traj.json", format="json"), + }, + env={"API_KEY": "{api_key}", "MODEL": "{model}"}, + build_command=lambda spec, config, task, mcp: "full-agent exec", + build_mcp_config=lambda spec, tools, workdir: "{}", + parse_events=lambda line: None, + ) + assert spec.name == "full-agent" + assert len(spec.artifacts) == 2 + assert callable(spec.files["/dynamic.txt"]) + + +# PR 2.2: Agent Registry + + +class TestAgentRegistry: + """Tests for the agent registry.""" + + def test_register_and_lookup(self): + from openenv.core.harness.agents import ( + get_agent_spec, + list_agents, + register_agent, + unregister_agent, + ) + from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec + + spec = CLIAgentSpec( + name="test-registry-agent", + install_check_cmd=["tra", "--version"], + base_command=["tra", "run"], + mcp_config=MCPConfigSpec(method="cli_flags"), + ) + try: + register_agent(spec) + assert "test-registry-agent" in list_agents() + assert get_agent_spec("test-registry-agent") is spec + finally: + unregister_agent("test-registry-agent") + + def test_duplicate_registration_same_object_ok(self): + from openenv.core.harness.agents import register_agent, unregister_agent + from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec + + spec = CLIAgentSpec( + name="test-dup-ok", + install_check_cmd=["x"], + 
base_command=["x"], + mcp_config=MCPConfigSpec(method="cli_flags"), + ) + try: + register_agent(spec) + register_agent(spec) # same object — should be fine + finally: + unregister_agent("test-dup-ok") + + def test_duplicate_registration_different_object_raises(self): + from openenv.core.harness.agents import register_agent, unregister_agent + from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec + + spec1 = CLIAgentSpec( + name="test-dup-fail", + install_check_cmd=["x"], + base_command=["x"], + mcp_config=MCPConfigSpec(method="cli_flags"), + ) + spec2 = CLIAgentSpec( + name="test-dup-fail", + install_check_cmd=["y"], + base_command=["y"], + mcp_config=MCPConfigSpec(method="cli_flags"), + ) + try: + register_agent(spec1) + with pytest.raises(ValueError, match="already registered"): + register_agent(spec2) + finally: + unregister_agent("test-dup-fail") + + def test_unknown_agent_raises_keyerror(self): + from openenv.core.harness.agents import get_agent_spec + + with pytest.raises(KeyError, match="Unknown agent"): + get_agent_spec("nonexistent-agent-xyz") + + def test_unregister_returns_spec(self): + from openenv.core.harness.agents import register_agent, unregister_agent + from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec + + spec = CLIAgentSpec( + name="test-unreg", + install_check_cmd=["x"], + base_command=["x"], + mcp_config=MCPConfigSpec(method="cli_flags"), + ) + register_agent(spec) + removed = unregister_agent("test-unreg") + assert removed is spec + assert unregister_agent("test-unreg") is None + + def test_auto_import_opencode(self): + """Auto-import triggers registration of built-in agents.""" + from openenv.core.harness.agents import get_agent_spec + + spec = get_agent_spec("opencode") + assert spec.name == "opencode" + + +# PR 2.3: CLIAgentDriver / CLIAgentSession / CLIAgentSessionFactory + + +def _make_test_spec(**overrides: Any) -> Any: + from openenv.core.harness.agents.base import ( + ArtifactSpec, + 
CLIAgentSpec, + MCPConfigSpec, + ) + + defaults = dict( + name="test-agent", + install_check_cmd=["test-agent", "--version"], + base_command=["test-agent", "run", "--json"], + mcp_config=MCPConfigSpec( + method="config_file", path_template="{workdir}/mcp.json" + ), + setup="apt-get install -y test-agent", + files={ + "/home/user/task/instruction.txt": lambda task, config: task.instruction, + }, + artifacts={ + "agent_log": ArtifactSpec(path="/home/user/logs/agent.log"), + }, + env={ + "API_KEY": "{api_key}", + "BASE_URL": "{base_url}", + "MODEL": "{model}", + }, + build_command=lambda spec, config, task, mcp: ( + f"test-agent run --json '{task.instruction}' 2>&1 | tee /home/user/logs/agent.log" + ), + build_mcp_config=lambda spec, tools, workdir: json.dumps({"tools": []}), + parse_events=lambda line: None, + ) + defaults.update(overrides) + return CLIAgentSpec(**defaults) + + +class TestCLIAgentDriver: + """Tests for the shared CLI agent driver.""" + + def test_create_session_full_lifecycle(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend() + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + task = FakeTask(instruction="Write hello.py") + config = FakeConfig() + session = driver.create_session(task=task, config=config) + + # Verify sandbox was created + assert len(backend.created) == 1 + sbx = backend.created[0] + + # Verify sandbox readiness was probed + assert "echo ok" in sbx.executed + + # Verify install was attempted (agent not pre-installed) + assert any("apt-get install" in cmd for cmd in sbx.executed) + + # Verify files were uploaded + assert "/home/user/task/instruction.txt" in sbx.written + assert sbx.written["/home/user/task/instruction.txt"] == "Write hello.py" + + # Verify MCP config was written + assert "/home/user/workdir/mcp.json" in sbx.written + + # Verify agent was launched as bg process + assert len(sbx.bg_commands) == 1 + 
bg_cmd, bg_envs = sbx.bg_commands[0] + assert "test-agent run" in bg_cmd + + # Verify env vars were resolved + assert bg_envs["API_KEY"] == "sk-test-key" + assert bg_envs["BASE_URL"] == "https://api.example.com/v1" + assert bg_envs["MODEL"] == "test-model" + + # Session API + assert session.initial_messages() == [ + {"role": "user", "content": "Write hello.py"} + ] + assert session.list_tools() == [] + assert session.call_tool("x", {}).error is not None + assert session.wait_for_completion() == 0 + + session.close() + assert sbx._killed + + def test_create_session_skips_install_when_prebaked(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend(install_check_succeeds=True) + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + session = driver.create_session( + task=FakeTask(), + config=FakeConfig(), + ) + + sbx = backend.created[0] + # install should have been skipped + assert not any("apt-get install" in cmd for cmd in sbx.executed) + session.close() + + def test_create_session_with_proxy(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend() + driver = CLIAgentDriver( + spec=spec, + sandbox_backend=backend, + mode="transparent_proxy", + ) + + session = driver.create_session( + task=FakeTask(), + config=FakeConfig(), + ) + + sbx = backend.created[0] + + # Proxy source should have been uploaded + assert "/home/user/proxy/interception.py" in sbx.written + assert "/home/user/proxy/__init__.py" in sbx.written + + # Proxy should have been started as bg (before agent) + # and agent as second bg + assert len(sbx.bg_commands) == 2 + proxy_cmd, proxy_envs = sbx.bg_commands[0] + assert "interception.py" in proxy_cmd + assert proxy_envs == {"OPENCODE_UPSTREAM_API_KEY": "sk-test-key"} + + # Agent env should point at proxy + agent_cmd, agent_envs = sbx.bg_commands[1] + assert 
agent_envs["BASE_URL"] == "http://127.0.0.1:7000/v1" + + session.close() + + def test_create_session_uploads_task_files(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend() + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + task = FakeTask( + instruction="Write code", + upload_files={"/extra/data.json": '{"key": "value"}'}, + ) + session = driver.create_session(task=task, config=FakeConfig()) + + sbx = backend.created[0] + assert sbx.written["/extra/data.json"] == '{"key": "value"}' + session.close() + + def test_create_session_runs_task_setup_shell(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend() + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + task = FakeTask( + instruction="Write code", + setup_shell="pip install pandas", + ) + session = driver.create_session(task=task, config=FakeConfig()) + + sbx = backend.created[0] + assert "pip install pandas" in sbx.executed + session.close() + + def test_create_session_with_verifier(self): + from openenv.core.harness import VerifyResult + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend() + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + def verifier(sandbox, task): + return VerifyResult(env_reward=1.0, done=True, metrics={"correct": True}) + + session = driver.create_session( + task=FakeTask(), + config=FakeConfig(), + verifier=verifier, + ) + + result = session.verify([]) + assert result.env_reward == 1.0 + assert result.metrics["correct"] is True + session.close() + + def test_session_verify_without_verifier(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend() + driver = 
CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + session = driver.create_session(task=FakeTask(), config=FakeConfig()) + + result = session.verify([]) + assert result.env_reward is None + assert result.done is True + session.close() + + def test_invalid_mode_raises(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + with pytest.raises(ValueError, match="Unknown mode"): + CLIAgentDriver( + spec=spec, + sandbox_backend=FakeSandboxBackend(), + mode="invalid", # type: ignore[arg-type] + ) + + +class TestCLIAgentSession: + """Tests for CLIAgentSession.""" + + def test_collect_artifacts_text(self): + from openenv.core.harness.agents.base import ArtifactSpec + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec( + artifacts={ + "log": ArtifactSpec(path="/logs/out.log"), + }, + ) + sbx = FakeSandbox() + sbx.written["/logs/out.log"] = "line1\nline2\n" + + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + ) + arts = session.collect_artifacts() + assert arts["log"] == "line1\nline2\n" + + def test_collect_artifacts_json(self): + from openenv.core.harness.agents.base import ArtifactSpec + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec( + artifacts={ + "traj": ArtifactSpec(path="/logs/traj.json", format="json"), + }, + ) + sbx = FakeSandbox() + sbx.written["/logs/traj.json"] = json.dumps({"steps": [1, 2, 3]}) + + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + ) + arts = session.collect_artifacts() + assert arts["traj"] == {"steps": [1, 2, 3]} + + def test_collect_artifacts_jsonl(self): + from openenv.core.harness.agents.base import ArtifactSpec + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec( + artifacts={ + "events": ArtifactSpec(path="/logs/events.jsonl", 
format="jsonl"), + }, + ) + sbx = FakeSandbox() + sbx.written["/logs/events.jsonl"] = '{"a":1}\n{"b":2}\n' + + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + ) + arts = session.collect_artifacts() + assert arts["events"] == [{"a": 1}, {"b": 2}] + + def test_collect_artifacts_missing_optional(self): + from openenv.core.harness.agents.base import ArtifactSpec + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec( + artifacts={ + "log": ArtifactSpec(path="/missing/file.log", optional=True), + }, + ) + sbx = FakeSandbox() + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + ) + arts = session.collect_artifacts() + assert "log" not in arts + + def test_collect_artifacts_missing_required_raises(self): + from openenv.core.harness.agents.base import ArtifactSpec + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec( + artifacts={ + "log": ArtifactSpec(path="/missing/file.log", optional=False), + }, + ) + sbx = FakeSandbox() + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + ) + with pytest.raises(FileNotFoundError): + session.collect_artifacts() + + def test_fetch_proxy_trace_black_box(self): + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec() + session = CLIAgentSession( + spec=spec, + sandbox=FakeSandbox(), + task=FakeTask(), + config=FakeConfig(), + proxy_trace_path=None, + ) + assert session.fetch_proxy_trace() == [] + + def test_fetch_proxy_trace_with_data(self): + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec() + sbx = FakeSandbox() + trace_path = "/logs/proxy_trace.jsonl" + sbx.written[trace_path] = ( + json.dumps({"turn": 1, "latency_s": 0.5}) + + "\n" + + json.dumps({"turn": 2, "latency_s": 0.3}) + + "\n" + ) + session = CLIAgentSession( + 
spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + proxy_trace_path=trace_path, + ) + trace = session.fetch_proxy_trace() + assert len(trace) == 2 + assert trace[0]["turn"] == 1 + + def test_close_kills_sandbox_and_jobs(self): + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + spec = _make_test_spec() + sbx = FakeSandbox() + agent_job = FakeBgJob() + proxy_job = FakeBgJob() + + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + agent_bg_job=agent_job, + proxy_bg_job=proxy_job, + ) + session.close() + assert sbx._killed + assert session._agent_bg_job is None + assert session._proxy_bg_job is None + + +class TestCLIAgentSessionFactory: + """Tests for the ResourceSessionFactory wrapper.""" + + def test_factory_creates_sessions(self): + from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory + + spec = _make_test_spec() + backend = FakeSandboxBackend() + + factory = CLIAgentSessionFactory( + spec=spec, + config=FakeConfig(), + sandbox_backend=backend, + mode="black_box", + ) + + session = factory.create(task=FakeTask()) + assert len(backend.created) == 1 + assert session.initial_messages()[0]["content"] == "Write hello.py" + session.close() + + def test_factory_with_verifier(self): + from openenv.core.harness import VerifyResult + from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory + + spec = _make_test_spec() + backend = FakeSandboxBackend() + + def verifier(sandbox, task): + return VerifyResult(env_reward=0.5, done=True) + + factory = CLIAgentSessionFactory( + spec=spec, + config=FakeConfig(), + sandbox_backend=backend, + mode="black_box", + verifier=verifier, + ) + + session = factory.create(task=FakeTask()) + result = session.verify([]) + assert result.env_reward == 0.5 + session.close() + + def test_factory_implements_resource_session_factory(self): + from openenv.core.harness import ResourceSessionFactory + from 
openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory + + assert issubclass(CLIAgentSessionFactory, ResourceSessionFactory) + + def test_session_implements_resource_session(self): + from openenv.core.harness import ResourceSession + from openenv.core.harness.agents.cli_driver import CLIAgentSession + + assert issubclass(CLIAgentSession, ResourceSession) + + +# PR 2.4: OpenCode Adapter Spec + + +class TestOpenCodeSpec: + """Tests for the OpenCode declarative spec.""" + + def test_spec_is_registered(self): + from openenv.core.harness.agents import get_agent_spec + + spec = get_agent_spec("opencode") + assert spec.name == "opencode" + + def test_spec_fields(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + assert OPENCODE_SPEC.name == "opencode" + assert OPENCODE_SPEC.install_check_cmd == [ + "/home/user/.opencode/bin/opencode", + "--version", + ] + assert OPENCODE_SPEC.supports_logprob_proxy is True + assert OPENCODE_SPEC.default_timeout_s == 900.0 + assert OPENCODE_SPEC.mcp_config.method == "config_file" + assert "{home}" in OPENCODE_SPEC.mcp_config.path_template + assert OPENCODE_SPEC.artifacts is not None + assert "agent_log" in OPENCODE_SPEC.artifacts + assert OPENCODE_SPEC.artifacts["agent_log"].format == "jsonl" + + def test_build_command(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + @dataclass + class OcConfig: + sandbox_home: str = "/home/user" + run_format: str = "json" + + cmd = OPENCODE_SPEC.build_command( + OPENCODE_SPEC, + OcConfig(), + FakeTask(instruction="Write hello.py"), + None, + ) + assert "opencode run" in cmd + assert "--format json" in cmd + assert "/home/user/task/instruction.md" in cmd + + def test_build_mcp_config(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + config_str = OPENCODE_SPEC.build_mcp_config( + OPENCODE_SPEC, + [], + "/home/user/workdir", + ) + config = json.loads(config_str) + assert "$schema" in config + assert "provider" in 
config + + def test_parse_events_assistant(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + line = json.dumps({"type": "assistant", "content": "hello"}) + event = OPENCODE_SPEC.parse_events(line) + assert event is not None + assert event.type == "assistant" + + def test_parse_events_tool_call(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + line = json.dumps({"type": "tool_call", "name": "bash", "args": {}}) + event = OPENCODE_SPEC.parse_events(line) + assert event is not None + assert event.type == "tool_call" + + def test_parse_events_error(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + line = json.dumps({"type": "error", "message": "boom"}) + event = OPENCODE_SPEC.parse_events(line) + assert event is not None + assert event.type == "error" + + def test_parse_events_done(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + line = json.dumps({"type": "done"}) + event = OPENCODE_SPEC.parse_events(line) + assert event is not None + assert event.type == "done" + + def test_parse_events_invalid_json(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + assert OPENCODE_SPEC.parse_events("not json") is None + assert OPENCODE_SPEC.parse_events("") is None + + def test_build_env_vars(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + config = FakeConfig() + config.extra_env = {"EXTRA": "val"} + envs = OPENCODE_SPEC.build_env_vars(OPENCODE_SPEC, config) + assert envs["OPENAI_BASE_URL"] == "https://api.example.com/v1" + assert envs["OPENAI_API_KEY"] == "sk-test-key" + assert envs["OPENCODE_CONFIG"] == "/home/user/.config/opencode/opencode.json" + assert envs["EXTRA"] == "val" + + def test_files_instruction_resolver(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + task = FakeTask(instruction="Build a REST API") + config = FakeConfig() + instruction_fn = 
OPENCODE_SPEC.files["/home/user/task/instruction.md"] + assert callable(instruction_fn) + assert instruction_fn(task, config) == "Build a REST API" + + def test_files_system_prompt_resolver(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + task = FakeTask() + config = FakeConfig() + system_fn = OPENCODE_SPEC.files["/home/user/task/system.md"] + assert callable(system_fn) + # No system prompt on FakeConfig → returns None + assert system_fn(task, config) is None + + def test_opencode_driver_integration(self): + """End-to-end: create a session using the OpenCode spec via the driver.""" + from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + backend = FakeSandboxBackend() + factory = CLIAgentSessionFactory( + spec=OPENCODE_SPEC, + config=FakeConfig(), + sandbox_backend=backend, + mode="black_box", + ) + + session = factory.create(task=FakeTask(instruction="Hello")) + assert session.spec.name == "opencode" + assert session.initial_messages()[0]["content"] == "Hello" + + sbx = backend.created[0] + # Instruction file should have been written + assert sbx.written.get("/home/user/task/instruction.md") == "Hello" + + session.close() + + +# Env var resolution + + +class TestEnvVarResolution: + """Tests for environment variable placeholder resolution.""" + + def test_resolve_placeholders(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec( + env={ + "KEY": "{api_key}", + "URL": "{base_url}", + "MDL": "{model}", + "STATIC": "fixed_value", + }, + build_env_vars=None, # use placeholder resolution + ) + driver = CLIAgentDriver( + spec=spec, + sandbox_backend=FakeSandboxBackend(), + mode="black_box", + ) + envs = driver._resolve_env_vars(FakeConfig()) + assert envs["KEY"] == "sk-test-key" + assert envs["URL"] == "https://api.example.com/v1" + assert envs["MDL"] == "test-model" + assert envs["STATIC"] == "fixed_value" + 
+ def test_resolve_with_proxy_override(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec( + env={"URL": "{base_url}"}, + build_env_vars=None, + ) + driver = CLIAgentDriver( + spec=spec, + sandbox_backend=FakeSandboxBackend(), + mode="black_box", + ) + envs = driver._resolve_env_vars( + FakeConfig(), + base_url_override="http://127.0.0.1:7000/v1", + ) + assert envs["URL"] == "http://127.0.0.1:7000/v1" + + def test_build_env_vars_hook_takes_precedence(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + def custom_env(spec, config): + return {"CUSTOM": "yes", "MODEL": config.model} + + spec = _make_test_spec( + env={"SHOULD_NOT": "appear"}, + build_env_vars=custom_env, + ) + driver = CLIAgentDriver( + spec=spec, + sandbox_backend=FakeSandboxBackend(), + mode="black_box", + ) + envs = driver._resolve_env_vars(FakeConfig()) + assert envs == {"CUSTOM": "yes", "MODEL": "test-model"} + assert "SHOULD_NOT" not in envs + + def test_empty_env_dict(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec(env=None, build_env_vars=None) + driver = CLIAgentDriver( + spec=spec, + sandbox_backend=FakeSandboxBackend(), + mode="black_box", + ) + envs = driver._resolve_env_vars(FakeConfig()) + assert envs == {} + + +# Multiple setup commands + + +class TestMultiStepSetup: + """Tests for specs with multi-step setup commands.""" + + def test_list_of_setup_commands(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec( + setup=[ + "apt-get update", + "apt-get install -y nodejs", + "npm install -g test-agent", + ], + ) + backend = FakeSandboxBackend() + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + session = driver.create_session(task=FakeTask(), config=FakeConfig()) + sbx = backend.created[0] + + # All three setup commands should have been executed + assert any("apt-get update" in 
cmd for cmd in sbx.executed) + assert any("apt-get install" in cmd for cmd in sbx.executed) + assert any("npm install" in cmd for cmd in sbx.executed) + session.close() + + def test_no_setup_and_not_installed_raises(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec(setup=None) + backend = FakeSandboxBackend(install_check_succeeds=False) + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + with pytest.raises(RuntimeError, match="not installed"): + driver.create_session(task=FakeTask(), config=FakeConfig()) diff --git a/tests/envs/test_opencode_env.py b/tests/envs/test_opencode_env.py index 6014c9199..5e930b8bc 100644 --- a/tests/envs/test_opencode_env.py +++ b/tests/envs/test_opencode_env.py @@ -276,10 +276,6 @@ def exists(self, path: str) -> bool: def kill(self) -> None: pass - class NoopInstallFactory(OpenCodeSessionFactory): - def _exec_with_retry(self, *args, **kwargs): - return FakeExecResult() - secret = "sk-test '$(leak)" model = "provider/model'; touch /tmp/pwn #" config = OpenCodeConfig( @@ -288,12 +284,14 @@ def _exec_with_retry(self, *args, **kwargs): model=model, ) sandbox = FakeSandbox() - factory = NoopInstallFactory( + factory = OpenCodeSessionFactory( config=config, sandbox_backend=object(), # unused by this protected-method test mode="transparent_proxy", ) + # _start_proxy delegates to CLIAgentDriver._start_proxy which runs the + # proxy inside the sandbox. The driver handles dep install + source upload. 
factory._start_proxy(sandbox) assert sandbox.started_cmd is not None From e97fda0e0144cc566b17f262dc5c304749567b11 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Wed, 13 May 2026 11:22:09 +0530 Subject: [PATCH 04/35] feat: impl Docker sandbox backend - related tests --- src/openenv/core/harness/sandbox/__init__.py | 37 +- src/openenv/core/harness/sandbox/base.py | 2 +- .../core/harness/sandbox/docker_backend.py | 328 +++++++++++++++++ .../core/harness/sandbox/e2b_backend.py | 7 +- .../core/harness/sandbox/interception.py | 19 +- tests/core/test_docker_sandbox_backend.py | 335 ++++++++++++++++++ 6 files changed, 707 insertions(+), 21 deletions(-) create mode 100644 src/openenv/core/harness/sandbox/docker_backend.py create mode 100644 tests/core/test_docker_sandbox_backend.py diff --git a/src/openenv/core/harness/sandbox/__init__.py b/src/openenv/core/harness/sandbox/__init__.py index d0324a7d7..83d37fb48 100644 --- a/src/openenv/core/harness/sandbox/__init__.py +++ b/src/openenv/core/harness/sandbox/__init__.py @@ -7,25 +7,52 @@ """Sandbox backends for harness-driven rollouts. Provides the :class:`SandboxBackend` / :class:`SandboxHandle` protocols and -concrete implementations. Any harness adapter can use any backend — the +concrete implementations. Any harness adapter can use any backend -- the sandbox layer is orthogonal to the agent CLI choice. -The ``e2b`` import is wrapped in ``try/except`` so this package loads cleanly -in environments where ``e2b`` isn't installed (CI smoke tests, lint runs). +Optional backend imports are wrapped in ``try/except`` so this package +loads cleanly when dependencies aren't installed (CI smoke tests, lint). 
""" +from typing import Any, Literal + from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle +from .docker_backend import DockerBgJob, DockerSandboxBackend, DockerSandboxHandle __all__ = [ "BgJob", + "DockerBgJob", + "DockerSandboxBackend", + "DockerSandboxHandle", "ExecResult", "SandboxBackend", "SandboxHandle", + "create_sandbox_backend", ] try: - from .e2b_backend import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle + from .e2b_backend import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle # noqa: F401 __all__.extend(["E2BBgJob", "E2BSandboxBackend", "E2BSandboxHandle"]) except ImportError: - pass # e2b not installed — stubs live in envs/opencode_env/sandbox/__init__.py + pass # e2b not installed + + +def create_sandbox_backend( + backend: Literal["e2b", "docker"] = "e2b", + **kwargs: Any, +) -> SandboxBackend: + """Create a sandbox backend by name. + + For ``"e2b"``: works with both E2B cloud and CubeSandbox + (set ``E2B_API_URL``). + + For ``"docker"``: local Docker, no external dependencies. + """ + if backend == "e2b": + from .e2b_backend import E2BSandboxBackend + + return E2BSandboxBackend(**kwargs) + elif backend == "docker": + return DockerSandboxBackend(**kwargs) + raise ValueError(f"Unknown sandbox backend: {backend!r}. 
Use 'e2b' or 'docker'.") diff --git a/src/openenv/core/harness/sandbox/base.py b/src/openenv/core/harness/sandbox/base.py index 4b2620799..d84e267e1 100644 --- a/src/openenv/core/harness/sandbox/base.py +++ b/src/openenv/core/harness/sandbox/base.py @@ -17,7 +17,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, Protocol, runtime_checkable +from typing import Protocol, runtime_checkable @dataclass diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py new file mode 100644 index 000000000..aeaacad7c --- /dev/null +++ b/src/openenv/core/harness/sandbox/docker_backend.py @@ -0,0 +1,328 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Docker implementation of :class:`SandboxBackend`. + +Runs each sandbox as a ``docker run -d`` container on the local machine. +Commands execute via ``docker exec``, files transfer via ``docker exec`` +with stdin piping. Suitable for CI, local dev, and environments without +KVM or cloud sandbox credentials. + +Usage:: + + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create() + result = sandbox.exec("echo hello") + print(result.stdout) # "hello" + sandbox.kill() +""" + +from __future__ import annotations + +import logging +import subprocess +import threading +import time +import uuid +from pathlib import PurePosixPath + +from openenv.core.harness.sandbox.base import BgJob, ExecResult + +_log = logging.getLogger(__name__) + + +class DockerBgJob: + """Handle to a background process running inside a Docker container. + + Launches the command via ``docker exec -d`` and tracks the wrapper + shell PID. 
Completion is detected by polling whether the PID is still + alive inside the container. + """ + + def __init__( + self, container_id: str, pid: int, poll_thread: threading.Thread + ) -> None: + self._container_id = container_id + self._pid = pid + self._exit_code: int | None = None + self._error: BaseException | None = None + self._done = threading.Event() + self._poll_thread = poll_thread + + @property + def pid(self) -> int: + return self._pid + + def wait(self, timeout: float | None = None) -> int: + if not self._done.wait(timeout=timeout): + raise TimeoutError( + f"Background command (pid={self._pid}) did not exit within {timeout}s" + ) + if self._error is not None: + raise self._error + return self._exit_code if self._exit_code is not None else 0 + + def kill(self) -> None: + try: + subprocess.run( + ["docker", "exec", self._container_id, "kill", "-9", str(self._pid)], + capture_output=True, + timeout=5, + ) + except Exception: + pass + self._done.set() + + +class DockerSandboxHandle: + """Wraps a running Docker container to satisfy :class:`SandboxHandle`.""" + + def __init__(self, container_id: str, *, user: str | None = None) -> None: + self._container_id = container_id + self._user = user + self._bg_jobs: list[DockerBgJob] = [] + + @property + def sandbox_id(self) -> str: + return self._container_id[:12] + + def exec( + self, + cmd: str, + *, + envs: dict[str, str] | None = None, + cwd: str | None = None, + timeout: float | None = 60, + ) -> ExecResult: + docker_cmd = self._build_exec_cmd(envs=envs, cwd=cwd) + docker_cmd.extend(["bash", "-c", cmd]) + try: + result = subprocess.run( + docker_cmd, + capture_output=True, + text=True, + timeout=timeout, + ) + return ExecResult( + exit_code=result.returncode, + stdout=result.stdout, + stderr=result.stderr, + ) + except subprocess.TimeoutExpired: + return ExecResult( + exit_code=-1, stdout="", stderr=f"Command timed out after {timeout}s" + ) + except Exception as exc: + return ExecResult(exit_code=-1, stdout="", 
stderr=str(exc)) + + def start_bg( + self, + cmd: str, + *, + envs: dict[str, str] | None = None, + cwd: str | None = None, + ) -> BgJob: + marker = f"/tmp/.bg_{uuid.uuid4().hex[:8]}" + wrapped = f"bash -c {_shell_quote(cmd + f'; echo $? > {marker}')} &\necho $!" + docker_cmd = self._build_exec_cmd(envs=envs, cwd=cwd) + docker_cmd.extend(["bash", "-c", wrapped]) + result = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=10) + if result.returncode != 0: + raise RuntimeError(f"Failed to start background command: {result.stderr}") + pid = int(result.stdout.strip().splitlines()[-1]) + + job = DockerBgJob(self._container_id, pid, poll_thread=None) # type: ignore[arg-type] + poll_thread = threading.Thread( + target=self._poll_bg_job, + args=(job, marker), + daemon=True, + ) + job._poll_thread = poll_thread + self._bg_jobs.append(job) + poll_thread.start() + return job + + def write_text(self, path: str, content: str) -> None: + parent = str(PurePosixPath(path).parent) + if parent not in ("", "/"): + subprocess.run( + ["docker", "exec", self._container_id, "mkdir", "-p", parent], + capture_output=True, + timeout=10, + ) + subprocess.run( + [ + "docker", + "exec", + "-i", + self._container_id, + "bash", + "-c", + f"cat > {_shell_quote(path)}", + ], + input=content.encode(), + capture_output=True, + timeout=30, + ) + + def read_text(self, path: str) -> str: + result = subprocess.run( + ["docker", "exec", self._container_id, "cat", path], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + raise FileNotFoundError(f"No such file in container: {path}") + return result.stdout + + def exists(self, path: str) -> bool: + result = subprocess.run( + ["docker", "exec", self._container_id, "test", "-e", path], + capture_output=True, + timeout=10, + ) + return result.returncode == 0 + + def kill(self) -> None: + for job in self._bg_jobs: + try: + job.kill() + except Exception: + pass + self._bg_jobs.clear() + try: + subprocess.run( + 
["docker", "rm", "-f", self._container_id], + capture_output=True, + timeout=15, + ) + except Exception: + pass + + def _build_exec_cmd( + self, + *, + envs: dict[str, str] | None = None, + cwd: str | None = None, + ) -> list[str]: + cmd = ["docker", "exec"] + if self._user: + cmd.extend(["-u", self._user]) + if cwd: + cmd.extend(["-w", cwd]) + for k, v in (envs or {}).items(): + cmd.extend(["-e", f"{k}={v}"]) + cmd.append(self._container_id) + return cmd + + def _poll_bg_job(self, job: DockerBgJob, marker: str) -> None: + while not job._done.is_set(): + try: + result = subprocess.run( + ["docker", "exec", self._container_id, "cat", marker], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout.strip(): + job._exit_code = int(result.stdout.strip()) + job._done.set() + return + except Exception: + pass + + # Also check if PID is gone (crash without writing marker). + try: + check = subprocess.run( + ["docker", "exec", self._container_id, "kill", "-0", str(job._pid)], + capture_output=True, + timeout=5, + ) + if check.returncode != 0: + job._exit_code = 1 + job._done.set() + return + except Exception: + pass + + time.sleep(0.5) + + +class DockerSandboxBackend: + """Creates Docker container sandboxes. + + Each :meth:`create` call spawns a fresh ``docker run -d`` container + that stays alive until :meth:`SandboxHandle.kill` is called or the + container's ``timeout_s`` sleep expires. + """ + + def __init__( + self, + *, + image: str = "ubuntu:22.04", + docker_args: list[str] | None = None, + user: str | None = None, + ) -> None: + self._image = image + self._docker_args = docker_args or [] + self._user = user + + try: + subprocess.run( + ["docker", "version"], + capture_output=True, + check=True, + timeout=5, + ) + except ( + subprocess.CalledProcessError, + FileNotFoundError, + subprocess.TimeoutExpired, + ) as exc: + raise RuntimeError( + "DockerSandboxBackend requires a running Docker daemon." 
+ ) from exc + + def create( + self, + *, + timeout_s: int = 900, + envs: dict[str, str] | None = None, + metadata: dict[str, str] | None = None, + ) -> DockerSandboxHandle: + cmd = [ + "docker", + "run", + "-d", + "--label", + "openenv.sandbox=true", + ] + if metadata: + for k, v in metadata.items(): + cmd.extend(["--label", f"openenv.{k}={v}"]) + for k, v in (envs or {}).items(): + cmd.extend(["-e", f"{k}={v}"]) + cmd.extend(self._docker_args) + cmd.extend([self._image, "sleep", str(timeout_s)]) + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if result.returncode != 0: + raise RuntimeError( + f"Failed to create Docker sandbox: {result.stderr.strip()}" + ) + container_id = result.stdout.strip() + _log.info( + "Docker sandbox created: %s (image=%s)", container_id[:12], self._image + ) + return DockerSandboxHandle(container_id, user=self._user) + + +def _shell_quote(s: str) -> str: + """Single-quote a string for shell, escaping embedded single quotes.""" + return "'" + s.replace("'", "'\\''") + "'" diff --git a/src/openenv/core/harness/sandbox/e2b_backend.py b/src/openenv/core/harness/sandbox/e2b_backend.py index f344346ba..29c9d952d 100644 --- a/src/openenv/core/harness/sandbox/e2b_backend.py +++ b/src/openenv/core/harness/sandbox/e2b_backend.py @@ -21,8 +21,7 @@ from e2b import Sandbox from e2b.sandbox_sync.commands.command_handle import CommandHandle - -from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxBackend, SandboxHandle +from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxHandle class E2BBgJob: @@ -53,9 +52,7 @@ def pid(self) -> int: def wait(self, timeout: float | None = None) -> int: self._thread.join(timeout) if self._thread.is_alive(): - raise TimeoutError( - f"Background command did not exit within {timeout}s" - ) + raise TimeoutError(f"Background command did not exit within {timeout}s") if self._error is not None: # E2B raises CommandExitException on non-zero; treat as exit code. 
code = getattr(self._error, "exit_code", None) diff --git a/src/openenv/core/harness/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py index dc3dbe5be..d943fd755 100644 --- a/src/openenv/core/harness/sandbox/interception.py +++ b/src/openenv/core/harness/sandbox/interception.py @@ -130,9 +130,7 @@ async def chat_completions(request: Request) -> Response: try: body = json.loads(raw_body) except json.JSONDecodeError: - return JSONResponse( - status_code=400, content={"error": "invalid json body"} - ) + return JSONResponse(status_code=400, content={"error": "invalid json body"}) forwarded_body = _prepare_forwarded_body(body, cfg) headers = { @@ -338,7 +336,7 @@ async def _stream() -> Any: yield line + "\n" if not line.startswith("data:"): continue - data = line[len("data:"):].strip() + data = line[len("data:") :].strip() if data == "[DONE]": continue try: @@ -381,7 +379,11 @@ def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None tc_idx = tc.get("index", 0) bucket = acc["tool_calls_by_idx"].setdefault( (idx, tc_idx), - {"id": None, "type": "function", "function": {"name": "", "arguments": ""}}, + { + "id": None, + "type": "function", + "function": {"name": "", "arguments": ""}, + }, ) if tc.get("id"): bucket["id"] = tc["id"] @@ -487,8 +489,7 @@ def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]: choices = out.get("choices") if isinstance(choices, list): out["choices"] = [ - {k: v for k, v in (ch or {}).items() if k != "logprobs"} - for ch in choices + {k: v for k, v in (ch or {}).items() if k != "logprobs"} for ch in choices ] return out @@ -537,9 +538,7 @@ def start(self) -> None: lifespan="on", ) self._server = uvicorn.Server(config) - self._thread = threading.Thread( - target=self._run_server, daemon=True - ) + self._thread = threading.Thread(target=self._run_server, daemon=True) self._thread.start() # Wait for the server to accept connections. 
deadline = time.time() + 10 diff --git a/tests/core/test_docker_sandbox_backend.py b/tests/core/test_docker_sandbox_backend.py new file mode 100644 index 000000000..b47f6bd4e --- /dev/null +++ b/tests/core/test_docker_sandbox_backend.py @@ -0,0 +1,335 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Tests for the Docker sandbox backend. + +Tests marked ``@pytest.mark.docker`` require a running Docker daemon and +are skipped in CI when Docker is unavailable. They exercise the real +``docker run`` / ``docker exec`` / ``docker rm`` lifecycle. +""" + +from __future__ import annotations + +import subprocess +import time + +import pytest + +_DOCKER_AVAILABLE = False +try: + subprocess.run( + ["docker", "version"], + capture_output=True, + check=True, + timeout=5, + ) + _DOCKER_AVAILABLE = True +except Exception: + pass + +docker = pytest.mark.skipif(not _DOCKER_AVAILABLE, reason="Docker not available") + + +class TestDockerSandboxBackendUnit: + """Unit tests that don't require Docker.""" + + def test_import(self): + from openenv.core.harness.sandbox.docker_backend import ( + DockerBgJob, + DockerSandboxBackend, + DockerSandboxHandle, + ) + + assert DockerSandboxBackend is not None + assert DockerSandboxHandle is not None + assert DockerBgJob is not None + + def test_exported_from_package(self): + from openenv.core.harness.sandbox import ( + DockerBgJob, + DockerSandboxBackend, + DockerSandboxHandle, + ) + + assert DockerSandboxBackend is not None + assert DockerSandboxHandle is not None + assert DockerBgJob is not None + + def test_create_sandbox_backend_factory(self): + from openenv.core.harness.sandbox import create_sandbox_backend + + assert callable(create_sandbox_backend) + + def test_create_sandbox_backend_unknown_raises(self): + from openenv.core.harness.sandbox import create_sandbox_backend + 
+ with pytest.raises(ValueError, match="Unknown sandbox backend"): + create_sandbox_backend("bogus") # type: ignore[arg-type] + + @pytest.mark.skipif(_DOCKER_AVAILABLE, reason="Only test error when Docker missing") + def test_backend_raises_without_docker(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + with pytest.raises(RuntimeError, match="Docker daemon"): + DockerSandboxBackend() + + +@docker +class TestDockerSandboxBackendIntegration: + """Integration tests against a real Docker daemon.""" + + def test_create_and_kill(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + assert sandbox.sandbox_id + assert len(sandbox.sandbox_id) == 12 + finally: + sandbox.kill() + + def test_exec_echo(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + result = sandbox.exec("echo hello world") + assert result.exit_code == 0 + assert "hello world" in result.stdout + finally: + sandbox.kill() + + def test_exec_nonzero_exit(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + result = sandbox.exec("exit 42") + assert result.exit_code == 42 + finally: + sandbox.kill() + + def test_exec_with_env(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + result = sandbox.exec("echo $MY_VAR", envs={"MY_VAR": "test123"}) + assert result.exit_code == 0 + assert "test123" in result.stdout + finally: + sandbox.kill() + + def test_exec_with_cwd(self): + from openenv.core.harness.sandbox.docker_backend 
import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + result = sandbox.exec("pwd", cwd="/tmp") + assert result.exit_code == 0 + assert "/tmp" in result.stdout + finally: + sandbox.kill() + + def test_write_and_read_text(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + sandbox.write_text("/tmp/test.txt", "hello from test") + content = sandbox.read_text("/tmp/test.txt") + assert content == "hello from test" + finally: + sandbox.kill() + + def test_write_creates_parent_dirs(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + sandbox.write_text("/home/user/deep/nested/file.txt", "nested content") + content = sandbox.read_text("/home/user/deep/nested/file.txt") + assert content == "nested content" + finally: + sandbox.kill() + + def test_write_special_chars(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + text = "line1\nline2\n'quotes' and \"doubles\" and $vars" + sandbox.write_text("/tmp/special.txt", text) + content = sandbox.read_text("/tmp/special.txt") + assert content == text + finally: + sandbox.kill() + + def test_read_missing_file_raises(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + with pytest.raises(FileNotFoundError): + sandbox.read_text("/nonexistent/path.txt") + finally: + sandbox.kill() + + def test_exists(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = 
DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + assert not sandbox.exists("/tmp/check_me.txt") + sandbox.write_text("/tmp/check_me.txt", "exists") + assert sandbox.exists("/tmp/check_me.txt") + finally: + sandbox.kill() + + def test_start_bg_and_wait(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + job = sandbox.start_bg("sleep 1 && echo done > /tmp/bg_out.txt") + exit_code = job.wait(timeout=10) + assert exit_code == 0 + content = sandbox.read_text("/tmp/bg_out.txt") + assert "done" in content + finally: + sandbox.kill() + + def test_start_bg_kill(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + job = sandbox.start_bg("sleep 300") + time.sleep(0.5) + job.kill() + # Should be able to wait without hanging + exit_code = job.wait(timeout=5) + # Exit code after kill is implementation-defined + assert isinstance(exit_code, int) + finally: + sandbox.kill() + + def test_start_bg_timeout(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + job = sandbox.start_bg("sleep 300") + with pytest.raises(TimeoutError): + job.wait(timeout=1) + job.kill() + finally: + sandbox.kill() + + def test_create_with_envs(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60, envs={"INIT_VAR": "from_create"}) + try: + result = sandbox.exec("echo $INIT_VAR") + assert "from_create" in result.stdout + finally: + sandbox.kill() + + def test_create_with_metadata(self): + from 
openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create( + timeout_s=60, + metadata={"episode_id": "ep-123"}, + ) + try: + result = subprocess.run( + [ + "docker", + "inspect", + "--format", + '{{index .Config.Labels "openenv.episode_id"}}', + sandbox._container_id, + ], + capture_output=True, + text=True, + ) + assert "ep-123" in result.stdout + finally: + sandbox.kill() + + def test_factory_creates_docker_backend(self): + from openenv.core.harness.sandbox import create_sandbox_backend + + backend = create_sandbox_backend("docker", image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + result = sandbox.exec("echo ok") + assert result.exit_code == 0 + finally: + sandbox.kill() + + def test_satisfies_sandbox_handle_protocol(self): + from openenv.core.harness.sandbox import SandboxBackend + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + assert isinstance(sandbox, SandboxBackend) or hasattr(sandbox, "exec") + assert hasattr(sandbox, "sandbox_id") + assert hasattr(sandbox, "exec") + assert hasattr(sandbox, "start_bg") + assert hasattr(sandbox, "write_text") + assert hasattr(sandbox, "read_text") + assert hasattr(sandbox, "exists") + assert hasattr(sandbox, "kill") + finally: + sandbox.kill() + + def test_satisfies_sandbox_backend_protocol(self): + from openenv.core.harness.sandbox import SandboxBackend + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + assert issubclass(DockerSandboxBackend, SandboxBackend) + + def test_satisfies_bg_job_protocol(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + try: + job = sandbox.start_bg("sleep 1") + assert 
hasattr(job, "pid") + assert hasattr(job, "wait") + assert hasattr(job, "kill") + job.kill() + finally: + sandbox.kill() + + def test_kill_is_idempotent(self): + from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend + + backend = DockerSandboxBackend(image="ubuntu:22.04") + sandbox = backend.create(timeout_s=60) + sandbox.kill() + sandbox.kill() # should not raise From 9a350062eabbcf903e1a938939ee9d1cefb79cc8 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Wed, 13 May 2026 23:38:58 +0530 Subject: [PATCH 05/35] feat: pi agent adapter - fix agent handling --- src/openenv/core/harness/agents/base.py | 2 +- src/openenv/core/harness/agents/cli_driver.py | 25 ++- src/openenv/core/harness/agents/opencode.py | 71 ++++++--- src/openenv/core/harness/agents/pi.py | 145 ++++++++++++++++++ .../core/harness/sandbox/docker_backend.py | 13 +- 5 files changed, 225 insertions(+), 31 deletions(-) create mode 100644 src/openenv/core/harness/agents/pi.py diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py index 145d3001e..72cc9a6cf 100644 --- a/src/openenv/core/harness/agents/base.py +++ b/src/openenv/core/harness/agents/base.py @@ -41,7 +41,7 @@ class MCPConfigSpec: - ``"cli_flags"`` — the driver passes MCP configuration via CLI flags built by :attr:`CLIAgentSpec.build_command`. - ``"settings_file"`` — write into a global settings file (e.g. - Gemini's ``~/.gemini/settings.json``). + ``~/.config/agent/settings.json``).
""" method: Literal["config_file", "cli_flags", "settings_file"] diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 8e8179889..760218687 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -166,11 +166,23 @@ def collect_artifacts(self) -> dict[str, Any]: if artifact_spec.format == "json": result[name] = json.loads(content) elif artifact_spec.format == "jsonl": - result[name] = [ - json.loads(line) - for line in content.splitlines() - if line.strip() - ] + # Parse valid JSON lines, skip non-JSON preamble + # (e.g. opencode emits database migration messages + # before the first JSON event). + records = [] + for line in content.splitlines(): + line = line.strip() + if not line: + continue + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + _log.debug( + "Skipping non-JSON line in %s: %s", + artifact_spec.path, + line[:120], + ) + result[name] = records else: result[name] = content except Exception: @@ -468,7 +480,8 @@ def _write_mcp_config( home=home, ) mcp_content = self.spec.build_mcp_config(self.spec, [], workdir) - sandbox.write_text(mcp_path, mcp_content) + if mcp_content: + sandbox.write_text(mcp_path, mcp_content) # Agent launch diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py index b179e9c9f..875882f27 100644 --- a/src/openenv/core/harness/agents/opencode.py +++ b/src/openenv/core/harness/agents/opencode.py @@ -25,9 +25,6 @@ from .base import AgentEvent, ArtifactSpec, CLIAgentSpec, MCPConfigSpec -# Command / config / env builders - - def _build_opencode_command( spec: CLIAgentSpec, config: Any, @@ -44,7 +41,7 @@ def _build_opencode_command( return ( f'export PATH="$HOME/.opencode/bin:$PATH" && ' - f"cd {workdir} && " + f"cd {workdir} && git init -q 2>/dev/null; " f'opencode run {format_flag} "$(cat {instruction_file})" ' f"2>&1 | tee {log_file}" ).strip() @@ 
-55,22 +52,54 @@ def _build_opencode_mcp_config( tools: list[Any], workdir: str, ) -> str: - """Build the ``opencode.json`` content for the MCP config file.""" + """Build ``opencode.json`` content. + + Returns an empty string so the driver skips writing this file. + The actual config is written via ``spec.files`` using + ``_build_opencode_config_file`` which has access to the rollout + config (base_url, api_key, model). + """ + return "" + + +def _build_opencode_config_file(task: Any, config: Any) -> str: + """Build the full ``opencode.json`` dynamically from config fields.""" + base_url = ( + config.base_url if hasattr(config, "base_url") else "http://127.0.0.1:7000/v1" + ) + api_key = config.api_key if hasattr(config, "api_key") else "intercepted" + model = config.model if hasattr(config, "model") else "model" + timeout = ( + int(config.agent_timeout_s * 1000) + if hasattr(config, "agent_timeout_s") + else 600000 + ) + + # opencode addresses a model as "provider_name/model_id". The model string is + # used as-is for model_id; provider_name is "default" unless config sets one.
+ provider_name = "default" + model_id = model + if hasattr(config, "provider_name") and config.provider_name: + provider_name = config.provider_name + return json.dumps( { "$schema": "https://opencode.ai/config.json", - "model": "intercepted/model", + "model": f"{provider_name}/{model_id}", "provider": { - "intercepted": { + provider_name: { "npm": "@ai-sdk/openai-compatible", - "name": "Intercepted", + "name": provider_name.title(), "options": { - "baseURL": "http://127.0.0.1:7000/v1", - "apiKey": "intercepted", - "timeout": 600000, + "baseURL": base_url, + "apiKey": api_key, + "timeout": timeout, }, "models": { - "model": {"name": "Intercepted Model"}, + model_id: { + "name": model_id, + "id": model_id, + }, }, } }, @@ -107,12 +136,16 @@ def _parse_opencode_event(line: str) -> AgentEvent | None: return None event_type = data.get("type", "") - if event_type in ("assistant", "message"): + if event_type in ("assistant", "message", "text"): return AgentEvent(type="assistant", data=data, raw=line) elif event_type in ("tool_call", "tool_use"): return AgentEvent(type="tool_call", data=data, raw=line) elif event_type in ("tool_result", "tool_response"): return AgentEvent(type="tool_result", data=data, raw=line) + elif event_type in ("step_start",): + return AgentEvent(type="assistant", data=data, raw=line) + elif event_type in ("step_finish",): + return AgentEvent(type="done", data=data, raw=line) elif event_type == "error": return AgentEvent(type="error", data=data, raw=line) elif event_type in ("done", "complete", "end"): @@ -120,9 +153,6 @@ def _parse_opencode_event(line: str) -> AgentEvent | None: return AgentEvent(type="assistant", data=data, raw=line) -# File resolvers - - def _instruction_file_content(task: Any, config: Any) -> str: return task.instruction if hasattr(task, "instruction") else str(task) @@ -133,9 +163,6 @@ def _system_prompt_content(task: Any, config: Any) -> str | None: return None -# Spec definition - - OPENCODE_SPEC = CLIAgentSpec( 
name="opencode", install_check_cmd=["/home/user/.opencode/bin/opencode", "--version"], @@ -154,15 +181,16 @@ def _system_prompt_content(task: Any, config: Any) -> str | None: default_timeout_s=900.0, setup=( "set -e && " + "curl -fsSL https://opencode.ai/install | bash && " "mkdir -p /home/user/.config/opencode /home/user/logs/agent " "/home/user/logs/verifier /home/user/task /home/user/workdir && " - "curl -fsSL https://opencode.ai/install | bash && " 'export PATH="$HOME/.opencode/bin:$PATH" && ' "opencode --version" ), files={ "/home/user/task/instruction.md": _instruction_file_content, "/home/user/task/system.md": _system_prompt_content, + "/home/user/.config/opencode/opencode.json": _build_opencode_config_file, }, artifacts={ "agent_log": ArtifactSpec( @@ -181,11 +209,8 @@ def _system_prompt_content(task: Any, config: Any) -> str | None: build_env_vars=_build_opencode_env_vars, ) - -# Auto-register on import register_agent(OPENCODE_SPEC) - __all__ = [ "OPENCODE_SPEC", ] diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py new file mode 100644 index 000000000..7e0fa29c1 --- /dev/null +++ b/src/openenv/core/harness/agents/pi.py @@ -0,0 +1,145 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Pi coding agent adapter. + +Pi runs in print mode for non-interactive harness usage:: + + pi --no-session --no-context-files --provider

--model --thinking off \\ + -p @/home/user/task/instruction.txt 2>&1 | tee /home/user/logs/agent/pi.txt + +The provider and model are passed as CLI flags so the spec's ``env`` dict +only needs auth credentials (``HF_TOKEN``, ``OPENAI_API_KEY``, etc.). + +Registered on import:: + + import openenv.core.harness.agents.pi + # PI_SPEC is now in the registry +""" + +from __future__ import annotations + +import json +import shlex +from typing import Any + +from . import register_agent +from .base import AgentEvent, ArtifactSpec, CLIAgentSpec, MCPConfigSpec + + +def _instruction(task: Any, config: Any) -> str: + return task.instruction if hasattr(task, "instruction") else str(task) + + +def _system_prompt(task: Any, config: Any) -> str | None: + if hasattr(config, "system_prompt") and config.system_prompt: + return config.system_prompt + return None + + +def _build_command( + spec: CLIAgentSpec, + config: Any, + task: Any, + mcp_config_path: str | None, +) -> str: + home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user" + instruction_file = f"{home}/task/instruction.txt" + log_file = f"{home}/logs/agent/pi.txt" + workdir = f"{home}/workdir" + + provider = "" + if hasattr(config, "provider") and config.provider: + provider = f" --provider {shlex.quote(config.provider)}" + model = "" + if hasattr(config, "model") and config.model: + model = f" --model {shlex.quote(config.model)}" + thinking = " --thinking off" + if hasattr(config, "thinking") and config.thinking: + thinking = f" --thinking {shlex.quote(config.thinking)}" + + return ( + f"cd {workdir} && git init -q 2>/dev/null; " + f"pi --no-session --no-context-files" + f"{provider}{model}{thinking}" + f" -p @{instruction_file}" + f" 2>&1 | tee {log_file}" + ) + + +def _build_mcp_config( + spec: CLIAgentSpec, + tools: list[Any], + workdir: str, +) -> str: + return json.dumps({"mcpServers": {}}, indent=2) + + +def _parse_events(line: str) -> AgentEvent | None: + line = line.strip() + if not line: + 
return None + try: + data = json.loads(line) + except json.JSONDecodeError: + return AgentEvent(type="assistant", data={"text": line}, raw=line) + + event_type = data.get("type", "") + if event_type in ("assistant", "message", "response"): + return AgentEvent(type="assistant", data=data, raw=line) + if event_type in ("tool_call", "tool_use", "function_call"): + return AgentEvent(type="tool_call", data=data, raw=line) + if event_type in ("tool_result", "tool_response"): + return AgentEvent(type="tool_result", data=data, raw=line) + if event_type in ("thinking", "reasoning"): + return AgentEvent(type="reasoning", data=data, raw=line) + if event_type == "error": + return AgentEvent(type="error", data=data, raw=line) + if event_type in ("done", "complete", "end"): + return AgentEvent(type="done", data=data, raw=line) + return AgentEvent(type="assistant", data=data, raw=line) + + +PI_SPEC = CLIAgentSpec( + name="pi", + install_check_cmd=["pi", "--version"], + base_command=["pi", "--no-session", "--no-context-files"], + mcp_config=MCPConfigSpec( + method="config_file", + path_template="{workdir}/.mcp.json", + ), + supports_logprob_proxy=True, + default_timeout_s=600.0, + setup=( + "set -e && " + "apt-get update -qq && apt-get install -y -qq curl ca-certificates gnupg && " + "curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && " + "apt-get install -y -qq nodejs && " + "curl -fsSL https://pi.dev/install.sh | sh && " + "mkdir -p /home/user/logs/agent /home/user/task /home/user/workdir && " + 'export PATH="$HOME/.local/bin:$HOME/.pi/bin:$PATH" && ' + "pi --version" + ), + files={ + "/home/user/task/instruction.txt": _instruction, + "/home/user/task/system.txt": _system_prompt, + }, + artifacts={ + "agent_log": ArtifactSpec(path="/home/user/logs/agent/pi.txt"), + }, + env={ + "HF_TOKEN": "{api_key}", + "PI_SKIP_VERSION_CHECK": "1", + "PI_TELEMETRY": "0", + }, + build_command=_build_command, + build_mcp_config=_build_mcp_config, + parse_events=_parse_events, +) + 
+register_agent(PI_SPEC) + +__all__ = ["PI_SPEC"] diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py index aeaacad7c..559817d1b 100644 --- a/src/openenv/core/harness/sandbox/docker_backend.py +++ b/src/openenv/core/harness/sandbox/docker_backend.py @@ -134,7 +134,18 @@ def start_bg( result = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=10) if result.returncode != 0: raise RuntimeError(f"Failed to start background command: {result.stderr}") - pid = int(result.stdout.strip().splitlines()[-1]) + # Extract PID from the last numeric-only line (commands may print + # banners like "Database migration complete." before the PID). + pid_line = None + for line in reversed(result.stdout.strip().splitlines()): + if line.strip().isdigit(): + pid_line = line.strip() + break + if pid_line is None: + raise RuntimeError( + f"Could not extract PID from start_bg output: {result.stdout!r}" + ) + pid = int(pid_line) job = DockerBgJob(self._container_id, pid, poll_thread=None) # type: ignore[arg-type] poll_thread = threading.Thread( From 06df791958922f7d08e35a36a2bab873e7c6f258 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Thu, 14 May 2026 10:44:34 +0530 Subject: [PATCH 06/35] chore: agent specifications and improve interception module ref --- src/openenv/core/harness/agents/cli_driver.py | 2 +- src/openenv/core/harness/agents/opencode.py | 2 +- src/openenv/core/harness/agents/pi.py | 2 ++ src/openenv/core/harness/sandbox/interception.py | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 760218687..42ac460f1 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -52,7 +52,7 @@ # already have it baked in. 
_PROXY_SOURCE_PATH = Path(__file__).resolve().parents[1] / "sandbox" / "interception.py" -# Verifier type — same as opencode_env's Verifier alias +# Verifier type — callable that checks the agent's work and returns a result Verifier = Callable[..., VerifyResult] diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py index 875882f27..d0146b008 100644 --- a/src/openenv/core/harness/agents/opencode.py +++ b/src/openenv/core/harness/agents/opencode.py @@ -8,7 +8,7 @@ Expresses the OpenCode harness as a purely declarative :class:`CLIAgentSpec`. All builders (command construction, config generation, env var resolution) -are self-contained with no imports from ``envs/opencode_env/``. +are self-contained with no imports from any environment package. Registered on import:: diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py index 7e0fa29c1..63e2eb0c3 100644 --- a/src/openenv/core/harness/agents/pi.py +++ b/src/openenv/core/harness/agents/pi.py @@ -132,6 +132,8 @@ def _parse_events(line: str) -> AgentEvent | None: }, env={ "HF_TOKEN": "{api_key}", + "OPENAI_API_KEY": "{api_key}", + "OPENAI_BASE_URL": "{base_url}", "PI_SKIP_VERSION_CHECK": "1", "PI_TELEMETRY": "0", }, diff --git a/src/openenv/core/harness/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py index d943fd755..4e7c857ac 100644 --- a/src/openenv/core/harness/sandbox/interception.py +++ b/src/openenv/core/harness/sandbox/interception.py @@ -511,7 +511,7 @@ class InterceptionProxy: Used by unit tests and by any in-process driver that wants a short-lived proxy on the local machine. Inside a sandbox we invoke :func:`serve` - directly via ``python -m opencode_env.interception``. + directly via ``python -m openenv.core.harness.sandbox.interception``. 
""" def __init__(self, cfg: ProxyConfig) -> None: @@ -602,7 +602,7 @@ def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]: def main() -> None: - parser = argparse.ArgumentParser(prog="opencode_env.interception") + parser = argparse.ArgumentParser(prog="openenv.core.harness.sandbox.interception") parser.add_argument("--upstream-url", required=True) parser.add_argument( "--upstream-api-key", From a3c4a3d487dfd4bd9047e6ddadf5bdce8c0e4af3 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Thu, 14 May 2026 11:02:13 +0530 Subject: [PATCH 07/35] feat: add tests for opencode + pi harness adapters --- tests/core/test_cli_agent_driver.py | 51 +++++++++------- tests/core/test_harness_adapters.py | 93 +++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 21 deletions(-) create mode 100644 tests/core/test_harness_adapters.py diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index b26f01d67..29bf06caa 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -22,16 +22,10 @@ from typing import Any import pytest +from openenv.core.harness.sandbox.base import ExecResult, SandboxHandle -# Fake sandbox infrastructure (mirrors test_opencode_env.py pattern) - - -@dataclass -class FakeExecResult: - exit_code: int = 0 - stdout: str = "ok" - stderr: str = "" +# Fake sandbox infrastructure (mirrors test_coding_agent_env.py pattern) @dataclass @@ -75,24 +69,24 @@ def exec( envs: dict[str, str] | None = None, cwd: str | None = None, timeout: float | None = 60, - ) -> FakeExecResult: + ) -> ExecResult: self.executed.append(cmd) if cmd == "echo ok": - return FakeExecResult(exit_code=0, stdout="ok") + return ExecResult(exit_code=0, stdout="ok", stderr="") # install check — only standalone version-check commands (short, just # binary + --version) should be treated as install probes. Multi-part # setup scripts that happen to end with --version should succeed. 
if "--version" in cmd and len(cmd) < 80 and "&&" not in cmd: if self._install_check_succeeds: - return FakeExecResult(exit_code=0, stdout="1.0.0") - return FakeExecResult(exit_code=127, stderr="not found") + return ExecResult(exit_code=0, stdout="1.0.0", stderr="") + return ExecResult(exit_code=127, stdout="", stderr="not found") # healthz check if "healthz" in cmd: if self._healthz_succeeds: - return FakeExecResult(exit_code=0, stdout='{"status":"ok"}') - return FakeExecResult(exit_code=7, stderr="connection refused") + return ExecResult(exit_code=0, stdout='{"status":"ok"}', stderr="") + return ExecResult(exit_code=7, stdout="", stderr="connection refused") # All other commands succeed - return FakeExecResult(exit_code=0, stdout="") + return ExecResult(exit_code=0, stdout="", stderr="") def start_bg( self, @@ -138,7 +132,7 @@ def create( timeout_s: int = 900, envs: dict[str, str] | None = None, metadata: dict[str, str] | None = None, - ) -> FakeSandbox: + ) -> SandboxHandle: sbx = FakeSandbox( install_check_succeeds=self._install_check_succeeds, healthz_succeeds=self._healthz_succeeds, @@ -252,7 +246,9 @@ def test_cli_agent_spec_full(self): parse_events=lambda line: None, ) assert spec.name == "full-agent" + assert spec.artifacts is not None assert len(spec.artifacts) == 2 + assert spec.files is not None assert callable(spec.files["/dynamic.txt"]) @@ -355,14 +351,14 @@ def test_auto_import_opencode(self): # PR 2.3: CLIAgentDriver / CLIAgentSession / CLIAgentSessionFactory -def _make_test_spec(**overrides: Any) -> Any: +def _make_test_spec(**overrides: Any): from openenv.core.harness.agents.base import ( ArtifactSpec, CLIAgentSpec, MCPConfigSpec, ) - defaults = dict( + defaults: dict[str, Any] = dict( name="test-agent", install_check_cmd=["test-agent", "--version"], base_command=["test-agent", "run", "--json"], @@ -428,6 +424,7 @@ def test_create_session_full_lifecycle(self): assert "test-agent run" in bg_cmd # Verify env vars were resolved + assert bg_envs is not 
None assert bg_envs["API_KEY"] == "sk-test-key" assert bg_envs["BASE_URL"] == "https://api.example.com/v1" assert bg_envs["MODEL"] == "test-model" @@ -491,6 +488,7 @@ def test_create_session_with_proxy(self): # Agent env should point at proxy agent_cmd, agent_envs = sbx.bg_commands[1] + assert agent_envs is not None assert agent_envs["BASE_URL"] == "http://127.0.0.1:7000/v1" session.close() @@ -819,6 +817,7 @@ def test_spec_fields(self): assert OPENCODE_SPEC.supports_logprob_proxy is True assert OPENCODE_SPEC.default_timeout_s == 900.0 assert OPENCODE_SPEC.mcp_config.method == "config_file" + assert OPENCODE_SPEC.mcp_config.path_template is not None assert "{home}" in OPENCODE_SPEC.mcp_config.path_template assert OPENCODE_SPEC.artifacts is not None assert "agent_log" in OPENCODE_SPEC.artifacts @@ -832,6 +831,7 @@ class OcConfig: sandbox_home: str = "/home/user" run_format: str = "json" + assert OPENCODE_SPEC.build_command is not None cmd = OPENCODE_SPEC.build_command( OPENCODE_SPEC, OcConfig(), @@ -845,18 +845,20 @@ class OcConfig: def test_build_mcp_config(self): from openenv.core.harness.agents.opencode import OPENCODE_SPEC + assert OPENCODE_SPEC.build_mcp_config is not None config_str = OPENCODE_SPEC.build_mcp_config( OPENCODE_SPEC, [], "/home/user/workdir", ) - config = json.loads(config_str) - assert "$schema" in config - assert "provider" in config + # OpenCode returns empty string because the config is written + # via spec.files using _build_opencode_config_file instead. 
+ assert config_str == "" def test_parse_events_assistant(self): from openenv.core.harness.agents.opencode import OPENCODE_SPEC + assert OPENCODE_SPEC.parse_events is not None line = json.dumps({"type": "assistant", "content": "hello"}) event = OPENCODE_SPEC.parse_events(line) assert event is not None @@ -865,6 +867,7 @@ def test_parse_events_assistant(self): def test_parse_events_tool_call(self): from openenv.core.harness.agents.opencode import OPENCODE_SPEC + assert OPENCODE_SPEC.parse_events is not None line = json.dumps({"type": "tool_call", "name": "bash", "args": {}}) event = OPENCODE_SPEC.parse_events(line) assert event is not None @@ -873,6 +876,7 @@ def test_parse_events_tool_call(self): def test_parse_events_error(self): from openenv.core.harness.agents.opencode import OPENCODE_SPEC + assert OPENCODE_SPEC.parse_events is not None line = json.dumps({"type": "error", "message": "boom"}) event = OPENCODE_SPEC.parse_events(line) assert event is not None @@ -881,6 +885,7 @@ def test_parse_events_error(self): def test_parse_events_done(self): from openenv.core.harness.agents.opencode import OPENCODE_SPEC + assert OPENCODE_SPEC.parse_events is not None line = json.dumps({"type": "done"}) event = OPENCODE_SPEC.parse_events(line) assert event is not None @@ -889,6 +894,7 @@ def test_parse_events_done(self): def test_parse_events_invalid_json(self): from openenv.core.harness.agents.opencode import OPENCODE_SPEC + assert OPENCODE_SPEC.parse_events is not None assert OPENCODE_SPEC.parse_events("not json") is None assert OPENCODE_SPEC.parse_events("") is None @@ -897,6 +903,7 @@ def test_build_env_vars(self): config = FakeConfig() config.extra_env = {"EXTRA": "val"} + assert OPENCODE_SPEC.build_env_vars is not None envs = OPENCODE_SPEC.build_env_vars(OPENCODE_SPEC, config) assert envs["OPENAI_BASE_URL"] == "https://api.example.com/v1" assert envs["OPENAI_API_KEY"] == "sk-test-key" @@ -908,6 +915,7 @@ def test_files_instruction_resolver(self): task = 
FakeTask(instruction="Build a REST API") config = FakeConfig() + assert OPENCODE_SPEC.files is not None instruction_fn = OPENCODE_SPEC.files["/home/user/task/instruction.md"] assert callable(instruction_fn) assert instruction_fn(task, config) == "Build a REST API" @@ -917,6 +925,7 @@ def test_files_system_prompt_resolver(self): task = FakeTask() config = FakeConfig() + assert OPENCODE_SPEC.files is not None system_fn = OPENCODE_SPEC.files["/home/user/task/system.md"] assert callable(system_fn) # No system prompt on FakeConfig → returns None diff --git a/tests/core/test_harness_adapters.py b/tests/core/test_harness_adapters.py new file mode 100644 index 000000000..f5e1dc260 --- /dev/null +++ b/tests/core/test_harness_adapters.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Tests for currently implemented harness adapters (OpenCode + Pi).""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import Any + +import pytest + + +@dataclass +class FakeTask: + instruction: str = "Write hello.py" + setup_shell: str | None = None + upload_files: dict[str, str] = field(default_factory=dict) + metadata: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class FakeConfig: + base_url: str = "https://api.example.com/v1" + api_key: str = "sk-test" + model: str = "test-model" + agent_timeout_s: float = 300.0 + sandbox_home: str = "/home/user" + system_prompt: str | None = None + + +class TestPiSpec: + def test_registered(self): + from openenv.core.harness.agents import get_agent_spec + + spec = get_agent_spec("pi") + assert spec.name == "pi" + + def test_fields(self): + from openenv.core.harness.agents.pi import PI_SPEC + + assert PI_SPEC.install_check_cmd == ["pi", "--version"] + assert PI_SPEC.mcp_config.method == 
"config_file" + assert PI_SPEC.mcp_config.path_template is not None + assert ".mcp.json" in PI_SPEC.mcp_config.path_template + assert PI_SPEC.env is not None + assert "HF_TOKEN" in PI_SPEC.env + assert "PI_SKIP_VERSION_CHECK" in PI_SPEC.env + + def test_build_command(self): + from openenv.core.harness.agents.pi import PI_SPEC + + assert PI_SPEC.build_command is not None + cmd = PI_SPEC.build_command(PI_SPEC, FakeConfig(), FakeTask(), None) + assert "pi --no-session" in cmd + assert "--no-context-files" in cmd + + def test_build_mcp_config(self): + from openenv.core.harness.agents.pi import PI_SPEC + + assert PI_SPEC.build_mcp_config is not None + content = PI_SPEC.build_mcp_config(PI_SPEC, [], "/workdir") + assert "mcpServers" in json.loads(content) + + +class TestOpenCodeSpec: + def test_registered(self): + from openenv.core.harness.agents import get_agent_spec + + spec = get_agent_spec("opencode") + assert spec.name == "opencode" + + +class TestRegistryAutoImport: + @pytest.mark.parametrize("name", ["pi", "opencode"]) + def test_auto_import(self, name): + from openenv.core.harness.agents import get_agent_spec + + spec = get_agent_spec(name) + assert spec.name == name + + def test_list_agents_includes_current(self): + import openenv.core.harness.agents.opencode # noqa: F401 + import openenv.core.harness.agents.pi # noqa: F401 + from openenv.core.harness.agents import list_agents + + agents = list_agents() + for name in ["opencode", "pi"]: + assert name in agents, f"{name} not in {agents}" From 81e37a2c4bced06024bc1bf434b33d36c73f99a8 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Thu, 14 May 2026 13:48:48 +0530 Subject: [PATCH 08/35] chore: migrate opencode_env to coding_agent_env --- docs/source/environments.md | 8 +- docs/source/environments/coding_agent.md | 2 + docs/source/environments/opencode.md | 2 - .../.dockerignore | 0 .../.gitignore | 0 .../README.md | 104 +++---- .../__init__.py | 30 +- .../client.py | 32 ++- 
.../config.py | 6 +- .../harness.py | 55 ++-- .../models.py | 8 +- .../opencode_runtime.py | 24 +- .../openenv.yaml | 2 +- .../pyproject.toml | 17 +- .../sandbox/__init__.py | 0 .../sandbox/build_template.py | 12 +- .../server/Dockerfile | 6 +- .../server/__init__.py | 2 +- .../server/app.py | 22 +- .../server/catalog.py | 0 .../server/coding_environment.py} | 262 ++++++++++++++---- .../server/gradio_ui.py | 59 ++-- .../task.py | 14 +- .../uv.lock | 70 ++--- ...v_simple.py => coding_agent_env_simple.py} | 24 +- ...encode_env.py => test_coding_agent_env.py} | 140 +++++++--- 26 files changed, 566 insertions(+), 335 deletions(-) create mode 100644 docs/source/environments/coding_agent.md delete mode 100644 docs/source/environments/opencode.md rename envs/{opencode_env => coding_agent_env}/.dockerignore (100%) rename envs/{opencode_env => coding_agent_env}/.gitignore (100%) rename envs/{opencode_env => coding_agent_env}/README.md (67%) rename envs/{opencode_env => coding_agent_env}/__init__.py (63%) rename envs/{opencode_env => coding_agent_env}/client.py (85%) rename envs/{opencode_env => coding_agent_env}/config.py (94%) rename envs/{opencode_env => coding_agent_env}/harness.py (87%) rename envs/{opencode_env => coding_agent_env}/models.py (90%) rename envs/{opencode_env => coding_agent_env}/opencode_runtime.py (86%) rename envs/{opencode_env => coding_agent_env}/openenv.yaml (76%) rename envs/{opencode_env => coding_agent_env}/pyproject.toml (68%) rename envs/{opencode_env => coding_agent_env}/sandbox/__init__.py (100%) rename envs/{opencode_env => coding_agent_env}/sandbox/build_template.py (91%) rename envs/{opencode_env => coding_agent_env}/server/Dockerfile (91%) rename envs/{opencode_env => coding_agent_env}/server/__init__.py (79%) rename envs/{opencode_env => coding_agent_env}/server/app.py (81%) rename envs/{opencode_env => coding_agent_env}/server/catalog.py (100%) rename envs/{opencode_env/server/opencode_environment.py => 
coding_agent_env/server/coding_environment.py} (70%) rename envs/{opencode_env => coding_agent_env}/server/gradio_ui.py (92%) rename envs/{opencode_env => coding_agent_env}/task.py (73%) rename envs/{opencode_env => coding_agent_env}/uv.lock (99%) rename examples/{opencode_env_simple.py => coding_agent_env_simple.py} (83%) rename tests/envs/{test_opencode_env.py => test_coding_agent_env.py} (73%) diff --git a/docs/source/environments.md b/docs/source/environments.md index a14564a1a..58f36c155 100644 --- a/docs/source/environments.md +++ b/docs/source/environments.md @@ -549,13 +549,13 @@ AgentWorldModel-1K — 1,000 synthetic MCP tool-use environments with 10,000 tas ``` ```` -````{grid-item-card} Opencode +````{grid-item-card} Coding Agent :class-card: sd-border-1 -`opencode_env` runs the OpenCode coding agent inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, optionally capturing per-token logpr... +`coding_agent_env` runs coding-agent harnesses (currently OpenCode + Pi) inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, optionally capturing per-token logpr... 
+++ -```{button-link} environments/opencode.html +```{button-link} environments/coding_agent.html :color: primary :outline: @@ -633,5 +633,5 @@ environments/tbench2 environments/unity environments/wildfire environments/agent_world_model -environments/opencode +environments/coding_agent ``` diff --git a/docs/source/environments/coding_agent.md b/docs/source/environments/coding_agent.md new file mode 100644 index 000000000..2903e2322 --- /dev/null +++ b/docs/source/environments/coding_agent.md @@ -0,0 +1,2 @@ +```{include} ../../../envs/coding_agent_env/README.md +``` diff --git a/docs/source/environments/opencode.md b/docs/source/environments/opencode.md deleted file mode 100644 index 9a93ebe33..000000000 --- a/docs/source/environments/opencode.md +++ /dev/null @@ -1,2 +0,0 @@ -```{include} ../../../envs/opencode_env/README.md -``` diff --git a/envs/opencode_env/.dockerignore b/envs/coding_agent_env/.dockerignore similarity index 100% rename from envs/opencode_env/.dockerignore rename to envs/coding_agent_env/.dockerignore diff --git a/envs/opencode_env/.gitignore b/envs/coding_agent_env/.gitignore similarity index 100% rename from envs/opencode_env/.gitignore rename to envs/coding_agent_env/.gitignore diff --git a/envs/opencode_env/README.md b/envs/coding_agent_env/README.md similarity index 67% rename from envs/opencode_env/README.md rename to envs/coding_agent_env/README.md index 79ebc6ed3..11fb88188 100644 --- a/envs/opencode_env/README.md +++ b/envs/coding_agent_env/README.md @@ -1,5 +1,5 @@ --- -title: OpenCode Environment Server +title: Coding Agent Environment Server emoji: 🛠️ colorFrom: indigo colorTo: purple @@ -9,23 +9,24 @@ app_port: 8000 base_path: /web tags: - openenv -short_description: OpenCode coding agent in an E2B sandbox with logprob capture +short_description: Coding agents (OpenCode + Pi) in E2B with logprob capture --- -# OpenCode Environment for OpenEnv +# Coding Agent Environment for OpenEnv -`opencode_env` runs the 
[OpenCode](https://opencode.ai) coding agent inside -an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible +`coding_agent_env` runs coding-agent harnesses (currently +[OpenCode](https://opencode.ai) and [Pi](https://github.com/badlogic/pi-mono)) +inside an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible LLM endpoint, optionally capturing per-token logprobs for GRPO training. -**🚀 Try it live**: [`AdithyaSK/opencode-env`](https://huggingface.co/spaces/AdithyaSK/opencode-env) +**🚀 Try it live**: [`AdithyaSK/coding-agent-env`](https://huggingface.co/spaces/AdithyaSK/coding-agent-env) The deployed Space exposes: -- **Web UI** at [`/web`](https://adithyask-opencode-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward + logprobs. -- **MCP tool API** at [`/mcp`](https://adithyask-opencode-env.hf.space/mcp) — programmatic `run_rollout` calls. -- **OpenAPI docs** at [`/docs`](https://adithyask-opencode-env.hf.space/docs). -- **Health** at [`/health`](https://adithyask-opencode-env.hf.space/health). +- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward + logprobs. +- **MCP tool API** at [`/mcp`](https://adithyask-coding-agent-env.hf.space/mcp) — programmatic `run_rollout` calls. +- **OpenAPI docs** at [`/docs`](https://adithyask-coding-agent-env.hf.space/docs). +- **Health** at [`/health`](https://adithyask-coding-agent-env.hf.space/health). The env is **task-agnostic** — every rollout is configured at call-time with a uniform Task shape: @@ -47,20 +48,21 @@ a float to `/home/user/logs/verifier/reward.txt` (override). 
```python import asyncio import os -from opencode_env import OpenCodeEnv -from opencode_env.client import _extract_text -from opencode_env.models import RolloutResult +from coding_agent_env import CodingAgentEnv +from coding_agent_env.client import _extract_text +from coding_agent_env.models import RolloutResult async def main(): - SPACE = "https://adithyask-opencode-env.hf.space" + SPACE = "https://adithyask-coding-agent-env.hf.space" - async with OpenCodeEnv(base_url=SPACE) as env: + async with CodingAgentEnv(base_url=SPACE) as env: await env.reset() # The MCP tool returns JSON; deserialize via the typed model. raw = await env.call_tool( "run_rollout", + agent="opencode", # opencode | pi endpoint="openai", # vllm | openai | hf_router api_key=os.environ["OPENAI_API_KEY"], # or set as a Space secret instruction=( @@ -75,7 +77,7 @@ async def main(): "import binary_search; " "assert binary_search.binary_search([1,2,3], 2) == 1; print('OK')\"", ], - template="opencode-rl", # prebaked E2B template + template="coding-agent-rl", # prebaked E2B template task_id="binary_search_v1", ) result = RolloutResult.model_validate_json(_extract_text(raw)) @@ -102,10 +104,10 @@ wall: 19.8 s ```python import os -from opencode_env import OpenCodeEnv +from coding_agent_env import CodingAgentEnv # .sync() returns a synchronous wrapper around the async client. -with OpenCodeEnv(base_url="https://adithyask-opencode-env.hf.space").sync() as env: +with CodingAgentEnv(base_url="https://adithyask-coding-agent-env.hf.space").sync() as env: env.reset() # MCP tools are reachable via env.call_tool(...) / env.step(...) sync-wrapped. # See the async example above for the full run_rollout signature. 
@@ -120,12 +122,12 @@ For trainers that want to drive a sandbox directly without an HTTP boundary: ```python import os -from opencode_env import ( - OpenCodeConfig, OpenCodeSessionFactory, OpenCodeTask, E2BSandboxBackend, +from coding_agent_env import ( + CodingAgentConfig, CodingAgentSessionFactory, CodingAgentTask, E2BSandboxBackend, ) -factory = OpenCodeSessionFactory( - config=OpenCodeConfig( +factory = CodingAgentSessionFactory( + config=CodingAgentConfig( provider="openai_compatible", base_url="https://api.openai.com/v1", api_key=os.environ["OPENAI_API_KEY"], @@ -134,7 +136,7 @@ factory = OpenCodeSessionFactory( sandbox_backend=E2BSandboxBackend(), mode="transparent_proxy", # captures per-token logprobs ) -session = factory.create(task=OpenCodeTask(instruction="...")) +session = factory.create(task=CodingAgentTask(instruction="...")) session.wait_for_completion() turns = session.fetch_proxy_trace() # per-turn (tokens, logprobs) session.close() @@ -146,22 +148,22 @@ The Dockerfile lives at `server/Dockerfile`. Use the `openenv` CLI from the env root: ```bash -cd envs/opencode_env +cd envs/coding_agent_env openenv validate # check pyproject.toml + openenv.yaml + server/app.py + uv.lock -openenv build -t opencode-env # builds the image (uses server/Dockerfile) +openenv build -t coding-agent-env # builds the image (uses server/Dockerfile) # run locally with E2B credentials -docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env +docker run -p 8000:8000 -e E2B_API_KEY=e2b_... 
coding-agent-env # push to HF Spaces (Docker variant) -openenv push --repo-id /opencode-env +openenv push --repo-id /coding-agent-env ``` Or build directly without the CLI: ```bash -docker build -t opencode-env -f envs/opencode_env/server/Dockerfile envs/opencode_env +docker build -t coding-agent-env -f envs/coding_agent_env/server/Dockerfile envs/coding_agent_env ``` The image: @@ -174,7 +176,7 @@ The image: ## The MCP Tool: `run_rollout` -Single tool, two ways to specify the LLM endpoint: +Single tool, with an ``agent`` selector plus two ways to specify the LLM endpoint: **Option A — endpoint shorthand (recommended)**: pass `endpoint="vllm"` (or `"openai"` / `"hf_router"`). The server resolves @@ -186,9 +188,10 @@ directly. | Arg | Type | Default | Notes | |---|---|---|---| +| `agent` | `str` | `"opencode"` | Harness to run: `"opencode"` or `"pi"`. | | `endpoint` | `str` | `""` | One of `"vllm"` / `"openai"` / `"hf_router"`. | | `base_url` / `api_key` / `model` | `str` | `""` | Override / supply explicitly. | -| `instruction` | `str` | required | Prompt passed to `opencode run`. | +| `instruction` | `str` | required | Prompt passed to the selected harness CLI. | | `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. | | `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. | | `task_id` | `str` | `""` | Echoed back in result. | @@ -196,8 +199,8 @@ directly. | `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. | | `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. | | `top_logprobs` | `int` | `5` | HF Router cap is 5; OpenAI 0–20; vLLM unbounded. | -| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for opencode. | -| `template` | `str` | `""` | E2B template name; `"opencode-rl"` skips ~2 min of install per rollout. | +| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for the selected harness. 
| +| `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. | Returns `RolloutResult` JSON with: `reward`, `setup_results[]`, `verify_results[]`, `proxy_turns[]`, `files{}`, `agent_log_tail`, @@ -207,8 +210,8 @@ Returns `RolloutResult` JSON with: `reward`, `setup_results[]`, | Mode | What it does | Best for | |---|---|---| -| **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards opencode's LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. | -| **`black_box`** | No proxy. opencode talks straight to `base_url`. | Smoke tests, eval, SFT data collection. | +| **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards harness LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. | +| **`black_box`** | No proxy. The selected harness talks straight to `base_url`. | Smoke tests, eval, SFT data collection. | ## Environment Variables @@ -240,21 +243,21 @@ Hyperbolic / Featherless (silent drop) and Groq (HTTP 400). ## Pre-baked E2B Template The first rollout in a fresh E2B sandbox spends ~2 min installing -opencode and the proxy's Python deps. Build a one-time template that +harness tooling and the proxy's Python deps. 
Build a one-time template that ships those pre-installed: ```bash -.venv/bin/python envs/opencode_env/sandbox/build_template.py -# → builds `opencode-rl` template in your E2B account (~1m20s, one-time) +.venv/bin/python envs/coding_agent_env/sandbox/build_template.py +# → builds `coding-agent-rl` template in your E2B account (~1m20s, one-time) ``` -After this, pass `template="opencode-rl"` on every `run_rollout` call — +After this, pass `template="coding-agent-rl"` on every `run_rollout` call — each rollout drops to ~20–30s end-to-end. ## Project Structure ``` -opencode_env/ +coding_agent_env/ ├── README.md # this file ├── openenv.yaml # OpenEnv space spec ├── pyproject.toml # deps + ``server`` entrypoint @@ -262,33 +265,38 @@ opencode_env/ ├── .gitignore / .dockerignore # excludes .env / __pycache__ ├── __init__.py # re-exports primitive + client + models │ -├── client.py # OpenCodeEnv(MCPToolClient) -├── models.py # RolloutResult / RolloutTurn / OpenCodeState +├── client.py # CodingAgentEnv(MCPToolClient) +├── models.py # RolloutResult / RolloutTurn / CodingAgentState │ -├── config.py # OpenCodeConfig (primitive) -├── harness.py # OpenCodeSession / OpenCodeSessionFactory (CLI-only) +├── config.py # CodingAgentConfig (primitive) +├── harness.py # CodingAgentSession / CodingAgentSessionFactory (CLI-only) ├── opencode_runtime.py # opencode.json builder + cmds -├── task.py # OpenCodeTask +├── task.py # CodingAgentTask │ ├── server/ │ ├── __init__.py │ ├── app.py # FastAPI factory; mounts Gradio at /web -│ ├── opencode_environment.py # MCPEnvironment with single ``run_rollout`` tool +│ ├── coding_environment.py # MCPEnvironment with single ``run_rollout`` tool │ ├── gradio_ui.py # the /web Gradio Blocks UI │ ├── catalog.py # endpoint shorthand resolver │ └── Dockerfile # multi-stage uv build (used by ``openenv build``) │ └── sandbox/ ├── __init__.py - ├── base.py # SandboxBackend / SandboxHandle Protocols - ├── e2b.py # E2B implementation - ├── interception.py # 
in-sandbox FastAPI proxy (logprob capture) └── build_template.py # one-time E2B template builder + +# Shared sandbox runtime (moved to core): +src/openenv/core/harness/sandbox/ +├── base.py # SandboxBackend / SandboxHandle protocols +├── e2b_backend.py # E2B implementation +├── docker_backend.py # local Docker backend +└── interception.py # in-sandbox FastAPI proxy (logprob capture) ``` ## References - [OpenEnv docs](https://meta-pytorch.org/OpenEnv/) - [OpenCode CLI](https://opencode.ai/docs/cli/) +- [Pi](https://github.com/badlogic/pi-mono) - [E2B Python SDK](https://e2b.dev/docs) - [HF Inference Providers logprob matrix](../../../DOCS/HF/hf_inference_providers_logprobs.md) diff --git a/envs/opencode_env/__init__.py b/envs/coding_agent_env/__init__.py similarity index 63% rename from envs/opencode_env/__init__.py rename to envs/coding_agent_env/__init__.py index dcd48a01c..6b839e7ea 100644 --- a/envs/opencode_env/__init__.py +++ b/envs/coding_agent_env/__init__.py @@ -4,16 +4,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""OpenCode environment for OpenEnv. +"""Coding-agent environment for OpenEnv. Two layers in this package: -1. **Harness primitive** -- :class:`OpenCodeSessionFactory` / - :class:`OpenCodeSession` / :class:`OpenCodeConfig` / +1. **Harness primitive** -- :class:`CodingAgentSessionFactory` / + :class:`CodingAgentSession` / :class:`CodingAgentConfig` / :class:`E2BSandboxBackend`. Built on the generic :class:`CLIAgentDriver` from ``openenv.core.harness.agents``. -2. **Deployable env** -- :class:`OpenCodeEnv` (MCP client) talks to the +2. **Deployable env** -- :class:`CodingAgentEnv` (MCP client) talks to the FastAPI server at ``server/app.py`` over HTTP. Use this when the sandbox + agent live behind an HTTP boundary (e.g. an HF Space). See ``client.py`` and ``server/``. 
@@ -22,11 +22,11 @@ from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle -from .client import OpenCodeEnv -from .config import OpenCodeConfig, Provider -from .harness import OpenCodeSession, OpenCodeSessionFactory -from .models import CommandResult, OpenCodeState, RolloutResult, RolloutTurn -from .task import OpenCodeTask +from .client import CodingAgentEnv +from .config import CodingAgentConfig, Provider +from .harness import CodingAgentSession, CodingAgentSessionFactory +from .models import CommandResult, CodingAgentState, RolloutResult, RolloutTurn +from .task import CodingAgentTask try: from openenv.core.harness.sandbox import E2BSandboxBackend @@ -35,19 +35,19 @@ __all__ = [ # Deployed-env client - "OpenCodeEnv", + "CodingAgentEnv", "CallToolAction", "ListToolsAction", # HTTP API models "CommandResult", - "OpenCodeState", + "CodingAgentState", "RolloutResult", "RolloutTurn", # Harness primitive - "OpenCodeConfig", - "OpenCodeSession", - "OpenCodeSessionFactory", - "OpenCodeTask", + "CodingAgentConfig", + "CodingAgentSession", + "CodingAgentSessionFactory", + "CodingAgentTask", "Provider", # Sandbox backend "E2BSandboxBackend", diff --git a/envs/opencode_env/client.py b/envs/coding_agent_env/client.py similarity index 85% rename from envs/opencode_env/client.py rename to envs/coding_agent_env/client.py index 52e76e2d5..8c512090d 100644 --- a/envs/opencode_env/client.py +++ b/envs/coding_agent_env/client.py @@ -4,16 +4,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Client for the deployed opencode_env server. +"""Client for the deployed coding_agent_env server. -The server exposes a single MCP tool ``run_rollout`` that runs one OpenCode -rollout in an E2B sandbox and returns a JSON-serialized :class:`RolloutResult`. 
+The server exposes a single MCP tool ``run_rollout`` that runs one coding-agent +rollout (OpenCode or Pi) in an E2B sandbox and returns a JSON-serialized +:class:`RolloutResult`. Example:: - from opencode_env import OpenCodeEnv + from coding_agent_env import CodingAgentEnv - with OpenCodeEnv(base_url="https://adithya-sk-opencode-env.hf.space") as env: + with CodingAgentEnv(base_url="https://your-space.hf.space") as env: env.reset() result = env.run_rollout( base_url="https://api.openai.com/v1", @@ -40,8 +41,8 @@ from models import RolloutResult # type: ignore -class OpenCodeEnv(MCPToolClient): - """Typed client for the opencode_env MCP server. +class CodingAgentEnv(MCPToolClient): + """Typed client for the coding_agent_env MCP server. Inherits ``reset`` / ``call_tool`` / ``list_tools`` / ``from_docker_image`` / context-manager semantics from :class:`MCPToolClient`. @@ -50,7 +51,8 @@ class OpenCodeEnv(MCPToolClient): def run_rollout( self, *, - # Endpoint — pass either the shorthand selector OR explicit fields. + # Agent + endpoint — pass either shorthand endpoint or explicit fields. + agent: str = "opencode", # "opencode" | "pi" endpoint: str = "", # "vllm" | "openai" | "hf_router" base_url: str = "", api_key: str = "", @@ -68,16 +70,17 @@ def run_rollout( agent_timeout_s: float = 600.0, template: str = "", ) -> RolloutResult: - """Run one OpenCode rollout and return the typed result. + """Run one coding-agent rollout and return the typed result. Args: + agent: Harness CLI to run in sandbox (``"opencode"`` or ``"pi"``). base_url: OpenAI-compatible LLM endpoint (with trailing /v1). api_key: Bearer token for the LLM. Use ``"intercepted"`` for vLLM if it doesn't enforce auth. model: Model id understood by the LLM endpoint (e.g. ``"gpt-4o-mini"``, ``"Qwen/Qwen3.5-4B"``, ``"Qwen/Qwen3-4B-Instruct-2507:nscale"``). - instruction: Prompt passed to ``opencode run``. + instruction: Prompt passed to the selected harness CLI. 
setup: Bash commands run sequentially **before** the agent starts. Each command runs in the sandbox; non-zero exit aborts setup. verify: Bash commands run sequentially **after** the agent exits. @@ -90,12 +93,11 @@ def run_rollout( ``chat_template_kwargs.enable_thinking=false`` on forwarded requests. Needed for Qwen3.5 vLLM; harmless on Instruct variants; rejected by OpenAI direct. - max_tokens_cap: Clamp on per-turn ``max_tokens``. OpenCode asks - for ~32k by default; gpt-4o-mini caps at 16k. + max_tokens_cap: Clamp on per-turn ``max_tokens``. top_logprobs: Top-k logprobs requested upstream. HF Router caps at 5; OpenAI accepts up to 20; vLLM is unbounded. - agent_timeout_s: Hard wall-clock budget for one ``opencode run``. - template: E2B template name (e.g. ``"opencode-rl"``). Empty + agent_timeout_s: Hard wall-clock budget for one agent run. + template: E2B template name (e.g. ``"coding-agent-rl"``). Empty string uses the default (slow) base image. Returns: @@ -104,6 +106,7 @@ def run_rollout( """ raw = self.call_tool( "run_rollout", + agent=agent, endpoint=endpoint, base_url=base_url, api_key=api_key, @@ -166,3 +169,4 @@ def _extract_text(result: Any) -> str: return text return str(result) + diff --git a/envs/opencode_env/config.py b/envs/coding_agent_env/config.py similarity index 94% rename from envs/opencode_env/config.py rename to envs/coding_agent_env/config.py index 2b6bae0a2..2eac8d16f 100644 --- a/envs/opencode_env/config.py +++ b/envs/coding_agent_env/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-"""Configuration model for the OpenCode harness primitive.""" +"""Configuration model for the coding-agent harness primitive.""" from __future__ import annotations @@ -16,8 +16,8 @@ Provider = Literal["openai_compatible", "openai", "anthropic"] -class OpenCodeConfig(BaseModel): - """All configuration required to launch one OpenCode rollout in a sandbox. +class CodingAgentConfig(BaseModel): + """All configuration required to launch one coding-agent rollout in a sandbox. Field names are provider-agnostic. The primitive maps ``provider`` onto the correct ``opencode.json`` provider block (``@ai-sdk/openai-compatible``, diff --git a/envs/opencode_env/harness.py b/envs/coding_agent_env/harness.py similarity index 87% rename from envs/opencode_env/harness.py rename to envs/coding_agent_env/harness.py index 600aafa82..ccbfa2cfc 100644 --- a/envs/opencode_env/harness.py +++ b/envs/coding_agent_env/harness.py @@ -4,16 +4,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""OpenCode session factory + session — backed by CLIAgentDriver. +"""Coding-agent session factory + session — backed by CLIAgentDriver. -This module exposes :class:`OpenCodeSession` and -:class:`OpenCodeSessionFactory` built on top of the generic +This module exposes :class:`CodingAgentSession` and +:class:`CodingAgentSessionFactory` built on top of the generic :class:`CLIAgentDriver` / :class:`CLIAgentSession` / :class:`CLIAgentSessionFactory` from ``openenv.core.harness.agents``. -OpenCode-specific configuration (``opencode.json`` generation, provider +Agent-specific (OpenCode spec) configuration (``opencode.json`` generation, provider mapping, tool enable/disable) is handled by -:mod:`opencode_env.opencode_runtime` builders wired into the +:mod:`coding_agent_env.opencode_runtime` builders wired into the :data:`OPENCODE_SPEC` via callable hooks. 
""" @@ -31,7 +31,7 @@ from openenv.core.harness.agents.opencode import OPENCODE_SPEC from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle -from .config import OpenCodeConfig +from .config import CodingAgentConfig from .opencode_runtime import ( agent_log_path, build_env_vars, @@ -42,7 +42,7 @@ opencode_config_path, system_prompt_path, ) -from .task import OpenCodeTask +from .task import CodingAgentTask # Inside-sandbox proxy paths (Mode B). @@ -61,21 +61,19 @@ ) -class OpenCodeSession(CLIAgentSession): - """One live OpenCode rollout inside a sandbox. +class CodingAgentSession(CLIAgentSession): + """One live coding-agent rollout inside a sandbox. - Extends :class:`CLIAgentSession` with OpenCode-specific convenience - methods (``fetch_trace``, ``wait_for_completion`` with config-aware - timeout). Fully backward-compatible with code that used the old - ``OpenCodeSession`` API. + Extends :class:`CLIAgentSession` with Agent-specific (OpenCode spec) convenience + methods (``fetch_trace``, ``wait_for_completion`` with config-aware timeout). """ def __init__( self, *, sandbox: SandboxHandle, - config: OpenCodeConfig, - task: OpenCodeTask, + config: CodingAgentConfig, + task: CodingAgentTask, verifier: Verifier | None = None, base_url_override: str | None = None, proxy_trace_path: str | None = None, @@ -108,8 +106,7 @@ def wait_for_completion(self, timeout_s: float | None = None) -> int: def start_agent(self) -> None: """Launch ``opencode run`` as a background subprocess in the sandbox. - Provided for backward compatibility — the factory now starts the - agent during ``create()``, so calling this manually is a no-op + The factory starts the agent during ``create()``; this method is a no-op if the agent is already running. 
""" if self._agent_bg_job is not None: @@ -119,8 +116,8 @@ def start_agent(self) -> None: self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs) -class OpenCodeSessionFactory(ResourceSessionFactory): - """Produce isolated per-rollout :class:`OpenCodeSession` instances. +class CodingAgentSessionFactory(ResourceSessionFactory): + """Produce isolated per-rollout :class:`CodingAgentSession` instances. The factory owns sandbox provisioning, opencode install, config injection, and (Mode B) proxy startup. Each :meth:`create` call returns a fresh @@ -128,13 +125,13 @@ class OpenCodeSessionFactory(ResourceSessionFactory): Internally delegates to :class:`CLIAgentDriver` for the generic sandbox lifecycle (readiness probing, install retry, proxy startup). - OpenCode-specific config generation uses ``opencode_runtime`` builders. + Agent-specific (OpenCode spec) config generation uses ``opencode_runtime`` builders. """ def __init__( self, *, - config: OpenCodeConfig, + config: CodingAgentConfig, sandbox_backend: SandboxBackend, mode: Literal["black_box", "transparent_proxy"] = "black_box", verifier: Verifier | None = None, @@ -167,12 +164,12 @@ def create( task: Any, seed: int | None = None, episode_id: str | None = None, - ) -> OpenCodeSession: + ) -> CodingAgentSession: import logging _log = logging.getLogger(__name__) - oc_task = OpenCodeTask.coerce(task) + oc_task = CodingAgentTask.coerce(task) sandbox_timeout = int(self._config.agent_timeout_s) + 300 _log.info( @@ -213,7 +210,7 @@ def create( ) _log.info("factory.create: proxy up at %s", base_url_override) # Rewrite opencode.json so opencode points at the proxy. 
- proxy_cfg = OpenCodeConfig( + proxy_cfg = CodingAgentConfig( **{ **self._config.model_dump(), "provider": "openai_compatible", @@ -225,7 +222,7 @@ def create( build_opencode_json(proxy_cfg), ) - session = OpenCodeSession( + session = CodingAgentSession( sandbox=sandbox, config=self._config, task=oc_task, @@ -244,7 +241,7 @@ def create( def _bootstrap_sandbox( self, sandbox: SandboxHandle, - task: OpenCodeTask, + task: CodingAgentTask, ) -> None: """Install opencode, write config + task files, run optional setup.""" @@ -310,8 +307,8 @@ def _start_proxy( __all__ = [ - "OpenCodeSession", - "OpenCodeSessionFactory", - "OpenCodeTask", + "CodingAgentSession", + "CodingAgentSessionFactory", + "CodingAgentTask", "Verifier", ] diff --git a/envs/opencode_env/models.py b/envs/coding_agent_env/models.py similarity index 90% rename from envs/opencode_env/models.py rename to envs/coding_agent_env/models.py index b218c5f78..3e31962fb 100644 --- a/envs/opencode_env/models.py +++ b/envs/coding_agent_env/models.py @@ -4,11 +4,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Pydantic models for the deployed opencode_env HTTP server. +"""Pydantic models for the deployed coding_agent_env HTTP server. The server exposes a single MCP tool ``run_rollout`` that takes a Task (instruction + setup commands + verify commands) plus an LLM endpoint -config, runs one OpenCode rollout end-to-end inside an E2B sandbox, and +config, runs one coding-agent rollout end-to-end inside an E2B sandbox, and returns a :class:`RolloutResult` JSON. """ @@ -80,8 +80,8 @@ class RolloutResult(BaseModel): error: str | None = None -class OpenCodeState(State): - """Per-session env state across calls to one OpenCodeEnvironment instance. +class CodingAgentState(State): + """Per-session env state across calls to one CodingAgentEnvironment instance. 
Each HTTP session gets its own env (``SUPPORTS_CONCURRENT_SESSIONS=True`` on the server class), so this state is per-session. diff --git a/envs/opencode_env/opencode_runtime.py b/envs/coding_agent_env/opencode_runtime.py similarity index 86% rename from envs/opencode_env/opencode_runtime.py rename to envs/coding_agent_env/opencode_runtime.py index 75fed41e3..49855528b 100644 --- a/envs/opencode_env/opencode_runtime.py +++ b/envs/coding_agent_env/opencode_runtime.py @@ -16,34 +16,34 @@ import json from typing import Any -from .config import OpenCodeConfig, provider_npm_package +from .config import CodingAgentConfig, provider_npm_package -def opencode_config_path(config: OpenCodeConfig) -> str: +def opencode_config_path(config: CodingAgentConfig) -> str: return f"{config.sandbox_home}/.config/opencode/opencode.json" -def instruction_path(config: OpenCodeConfig) -> str: +def instruction_path(config: CodingAgentConfig) -> str: return f"{config.sandbox_home}/task/instruction.md" -def agent_log_path(config: OpenCodeConfig) -> str: +def agent_log_path(config: CodingAgentConfig) -> str: return f"{config.sandbox_home}/logs/agent/opencode.jsonl" -def system_prompt_path(config: OpenCodeConfig) -> str: +def system_prompt_path(config: CodingAgentConfig) -> str: return f"{config.sandbox_home}/task/system.md" -def verifier_reward_path(config: OpenCodeConfig) -> str: +def verifier_reward_path(config: CodingAgentConfig) -> str: return f"{config.sandbox_home}/logs/verifier/reward.txt" -def workdir_path(config: OpenCodeConfig) -> str: +def workdir_path(config: CodingAgentConfig) -> str: return f"{config.sandbox_home}/workdir" -def build_opencode_json(config: OpenCodeConfig) -> str: +def build_opencode_json(config: CodingAgentConfig) -> str: """Return the serialized ``opencode.json`` the sandbox should install. 
Provider block is keyed by a stable internal name (``intercepted``) so the @@ -79,7 +79,7 @@ def build_opencode_json(config: OpenCodeConfig) -> str: return json.dumps(doc, indent=2) -def build_install_cmd(config: OpenCodeConfig) -> str: +def build_install_cmd(config: CodingAgentConfig) -> str: """Return the shell command that installs OpenCode + ensures PATH. The upstream installer honors ``OPENCODE_VERSION=x.y.z`` for pinning; @@ -99,7 +99,7 @@ def build_install_cmd(config: OpenCodeConfig) -> str: ) -def build_run_cmd(config: OpenCodeConfig) -> str: +def build_run_cmd(config: CodingAgentConfig) -> str: """Return the shell command that launches OpenCode against a task.""" format_flag = "--format json" if config.run_format == "json" else "" @@ -112,7 +112,7 @@ def build_run_cmd(config: OpenCodeConfig) -> str: def build_env_vars( - config: OpenCodeConfig, *, base_url_override: str | None = None + config: CodingAgentConfig, *, base_url_override: str | None = None ) -> dict[str, str]: """Return env vars to set on the OpenCode process. 
@@ -129,7 +129,7 @@ def build_env_vars( return env -def _build_tools_block(config: OpenCodeConfig) -> dict[str, bool]: +def _build_tools_block(config: CodingAgentConfig) -> dict[str, bool]: """Translate enabled/disabled lists into opencode's ``tools`` map.""" if config.enabled_tools is not None: diff --git a/envs/opencode_env/openenv.yaml b/envs/coding_agent_env/openenv.yaml similarity index 76% rename from envs/opencode_env/openenv.yaml rename to envs/coding_agent_env/openenv.yaml index 2a534a088..be34c3a51 100644 --- a/envs/opencode_env/openenv.yaml +++ b/envs/coding_agent_env/openenv.yaml @@ -1,5 +1,5 @@ spec_version: 1 -name: opencode_env +name: coding_agent_env type: space runtime: fastapi app: server.app:app diff --git a/envs/opencode_env/pyproject.toml b/envs/coding_agent_env/pyproject.toml similarity index 68% rename from envs/opencode_env/pyproject.toml rename to envs/coding_agent_env/pyproject.toml index 50337baa2..276d3e0be 100644 --- a/envs/opencode_env/pyproject.toml +++ b/envs/coding_agent_env/pyproject.toml @@ -9,9 +9,9 @@ requires = ["setuptools>=45", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "openenv-opencode-env" +name = "openenv-coding-agent-env" version = "0.1.0" -description = "OpenCode coding-agent environment for OpenEnv — runs the OpenCode CLI in an E2B sandbox against any OpenAI-compatible LLM, optionally capturing per-token logprobs." +description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints, optionally capturing per-token logprobs." requires-python = ">=3.10" dependencies = [ # Core OpenEnv (server + MCP). 0.3.0 ships the harness runtime. @@ -40,17 +40,16 @@ dev = [ [project.scripts] # Server entrypoint — enables ``uv run --project . server``. 
-server = "opencode_env.server.app:main" +server = "coding_agent_env.server.app:main" [tool.setuptools] include-package-data = true packages = [ - "opencode_env", - "opencode_env.sandbox", - "opencode_env.server", - "opencode_env.tests", + "coding_agent_env", + "coding_agent_env.sandbox", + "coding_agent_env.server", ] -package-dir = { "opencode_env" = ".", "opencode_env.sandbox" = "sandbox", "opencode_env.server" = "server", "opencode_env.tests" = "tests" } +package-dir = { "coding_agent_env" = ".", "coding_agent_env.sandbox" = "sandbox", "coding_agent_env.server" = "server" } [tool.setuptools.package-data] -opencode_env = ["**/*.md"] +coding_agent_env = ["**/*.md"] diff --git a/envs/opencode_env/sandbox/__init__.py b/envs/coding_agent_env/sandbox/__init__.py similarity index 100% rename from envs/opencode_env/sandbox/__init__.py rename to envs/coding_agent_env/sandbox/__init__.py diff --git a/envs/opencode_env/sandbox/build_template.py b/envs/coding_agent_env/sandbox/build_template.py similarity index 91% rename from envs/opencode_env/sandbox/build_template.py rename to envs/coding_agent_env/sandbox/build_template.py index 6e0ba4f75..e22b30185 100644 --- a/envs/opencode_env/sandbox/build_template.py +++ b/envs/coding_agent_env/sandbox/build_template.py @@ -25,10 +25,10 @@ Usage:: - .venv/bin/python envs/opencode_env/tests/build_e2b_template.py - # → builds (or rebuilds) ``opencode-rl`` template, prints template id + .venv/bin/python envs/coding_agent_env/sandbox/build_template.py + # → builds (or rebuilds) ``coding-agent-rl`` template, prints template id -Then ``test_five_sorts_e2e.py`` will use it via ``--template opencode-rl``. +Then rollout tests can use it via ``--template coding-agent-rl``. Requires ``E2B_API_KEY`` in the environment. First build is ~3-8 min; subsequent builds reuse the cache and can finish in <60s. 
@@ -113,8 +113,8 @@ def main(argv: list[str] | None = None) -> int: p = argparse.ArgumentParser(prog="build_e2b_template") p.add_argument( "--name", - default="opencode-rl", - help="Template name (default: opencode-rl).", + default="coding-agent-rl", + help="Template name (default: coding-agent-rl)." ) p.add_argument( "--skip-cache", @@ -123,7 +123,7 @@ def main(argv: list[str] | None = None) -> int: ) args = p.parse_args(argv) - _load_env(_REPO_ROOT / "envs" / "opencode_env" / "sandbox" / ".env") + _load_env(_REPO_ROOT / "envs" / "coding_agent_env" / "sandbox" / ".env") if not os.environ.get("E2B_API_KEY"): print("ERROR: E2B_API_KEY required.", file=sys.stderr) return 2 diff --git a/envs/opencode_env/server/Dockerfile b/envs/coding_agent_env/server/Dockerfile similarity index 91% rename from envs/opencode_env/server/Dockerfile rename to envs/coding_agent_env/server/Dockerfile index ad8319423..97e880343 100644 --- a/envs/opencode_env/server/Dockerfile +++ b/envs/coding_agent_env/server/Dockerfile @@ -4,14 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # -# opencode_env Dockerfile — mirrors the standard OpenEnv multi-stage uv +# coding_agent_env Dockerfile — mirrors the standard OpenEnv multi-stage uv # build used by echo_env / repl_env / jupyter_agent. # # Build: -# docker build -t opencode-env . +# docker build -t coding-agent-env . # # Run: -# docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env +# docker run -p 8000:8000 -e E2B_API_KEY=e2b_... 
coding-agent-env ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest FROM ${BASE_IMAGE} AS builder diff --git a/envs/opencode_env/server/__init__.py b/envs/coding_agent_env/server/__init__.py similarity index 79% rename from envs/opencode_env/server/__init__.py rename to envs/coding_agent_env/server/__init__.py index 56363edaa..2eac4fb05 100644 --- a/envs/opencode_env/server/__init__.py +++ b/envs/coding_agent_env/server/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Server-side for the deployed opencode_env.""" +"""Server-side for the deployed coding_agent_env.""" diff --git a/envs/opencode_env/server/app.py b/envs/coding_agent_env/server/app.py similarity index 81% rename from envs/opencode_env/server/app.py rename to envs/coding_agent_env/server/app.py index 0757ef229..df40b507f 100644 --- a/envs/opencode_env/server/app.py +++ b/envs/coding_agent_env/server/app.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""FastAPI app for the opencode_env MCP server. +"""FastAPI app for the coding_agent_env MCP server. Mirrors the standard OpenEnv pattern (echo_env / repl_env / jupyter_agent) plus the custom Gradio UI mounted at ``/web`` per the @@ -16,7 +16,7 @@ E2B_API_KEY=... uvicorn server.app:app --host 0.0.0.0 --port 8000 # Docker: - docker run -p 8000:8000 -e E2B_API_KEY=... opencode-env + docker run -p 8000:8000 -e E2B_API_KEY=... coding-agent-env # HF Space: deploys via the root ``Dockerfile``. 
@@ -58,13 +58,13 @@ def _load_env_file() -> None: from openenv.core.env_server.http_server import create_app from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation - from .gradio_ui import opencode_gradio_builder - from .opencode_environment import OpenCodeEnvironment + from .gradio_ui import coding_agent_gradio_builder + from .coding_environment import CodingAgentEnvironment except ImportError: # pragma: no cover from openenv.core.env_server.http_server import create_app from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation - from server.gradio_ui import opencode_gradio_builder # type: ignore - from server.opencode_environment import OpenCodeEnvironment # type: ignore + from server.gradio_ui import coding_agent_gradio_builder # type: ignore + from server.coding_environment import CodingAgentEnvironment # type: ignore # Always expose the Gradio UI at /web. Set ENABLE_WEB_INTERFACE=false to @@ -80,22 +80,22 @@ def _custom_gradio_builder( title, quick_start_md, ): - """Hand off to ``server.gradio_ui.opencode_gradio_builder``.""" - return opencode_gradio_builder( + """Hand off to ``server.gradio_ui.coding_agent_gradio_builder``.""" + return coding_agent_gradio_builder( web_manager, action_fields, metadata, is_chat_env, - title or "opencode_env", + title or "coding_agent_env", quick_start_md, ) app = create_app( - OpenCodeEnvironment, + CodingAgentEnvironment, CallToolAction, CallToolObservation, - env_name="opencode_env", + env_name="coding_agent_env", max_concurrent_envs=int(os.getenv("MAX_CONCURRENT_ENVS", "4")), gradio_builder=_custom_gradio_builder, ) diff --git a/envs/opencode_env/server/catalog.py b/envs/coding_agent_env/server/catalog.py similarity index 100% rename from envs/opencode_env/server/catalog.py rename to envs/coding_agent_env/server/catalog.py diff --git a/envs/opencode_env/server/opencode_environment.py b/envs/coding_agent_env/server/coding_environment.py similarity index 70% rename from 
envs/opencode_env/server/opencode_environment.py rename to envs/coding_agent_env/server/coding_environment.py index 638dd5473..3f8eabd13 100644 --- a/envs/opencode_env/server/opencode_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -4,11 +4,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""OpenCode MCP environment. +"""Coding-agent MCP environment. -Single MCP tool ``run_rollout`` that takes a uniform Task shape: +Single MCP tool ``run_rollout`` with a uniform task shape: - - ``instruction`` — prompt for the agent + - ``instruction`` — prompt for the selected agent - ``setup`` — bash commands run BEFORE the agent (in the sandbox) - ``verify`` — bash commands run AFTER the agent @@ -28,6 +28,7 @@ from uuid import uuid4 from fastmcp import FastMCP +from pydantic import BaseModel, Field try: from openenv.core.env_server.mcp_environment import MCPEnvironment @@ -40,7 +41,7 @@ from server.catalog import ENDPOINT_KINDS, resolve_endpoint # type: ignore -# One rollout (sandbox cold start + opencode install + opencode run + +# One rollout (sandbox cold start + harness install + agent run + # verifier) typically takes 30-180s; can spike to ~600s under load. Override # OpenEnv's 30s MCP-tool default so the server doesn't cut us off. 
_RUN_ROLLOUT_TIMEOUT_S = 900.0 @@ -53,9 +54,27 @@ PROXY_LOG = f"{HOME}/logs/agent/proxy.log" AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl" VERIFY_TIMEOUT_S = 120 +_SUPPORTED_AGENTS = ("opencode", "pi") +_AGENT_LOG_BY_AGENT: dict[str, str] = { + "opencode": f"{HOME}/logs/agent/opencode.jsonl", + "pi": f"{HOME}/logs/agent/pi.txt", +} -class OpenCodeEnvironment(MCPEnvironment): +class _GenericAgentConfig(BaseModel): + """Minimal config shape for CLIAgentSessionFactory-backed agents.""" + + base_url: str + api_key: str + model: str + agent_timeout_s: float = 600.0 + sandbox_home: str = HOME + provider: str | None = None + thinking: str | None = "off" + extra_env: dict[str, str] = Field(default_factory=dict) + + +class CodingAgentEnvironment(MCPEnvironment): """Per-session environment exposing a single ``run_rollout`` MCP tool.""" SUPPORTS_CONCURRENT_SESSIONS = True @@ -65,33 +84,37 @@ def __init__(self) -> None: try: from ..models import ( CommandResult, - OpenCodeState, + CodingAgentState, RolloutResult, RolloutTurn, ) except ImportError: # pragma: no cover from models import ( # type: ignore CommandResult, - OpenCodeState, + CodingAgentState, RolloutResult, RolloutTurn, ) - from opencode_env import ( + from openenv.core.harness.agents import get_agent_spec + from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory + from coding_agent_env import ( E2BSandboxBackend, - OpenCodeConfig, - OpenCodeSessionFactory, - OpenCodeTask, + CodingAgentConfig, + CodingAgentSessionFactory, + CodingAgentTask, ) self._CommandResult = CommandResult self._RolloutResult = RolloutResult self._RolloutTurn = RolloutTurn - self._OpenCodeState = OpenCodeState - self._OpenCodeConfig = OpenCodeConfig - self._OpenCodeSessionFactory = OpenCodeSessionFactory - self._OpenCodeTask = OpenCodeTask + self._CodingAgentState = CodingAgentState + self._CodingAgentConfig = CodingAgentConfig + self._CodingAgentSessionFactory = CodingAgentSessionFactory + self._CodingAgentTask = 
CodingAgentTask self._E2BSandboxBackend = E2BSandboxBackend + self._CLIAgentSessionFactory = CLIAgentSessionFactory + self._get_agent_spec = get_agent_spec # Don't raise on missing E2B_API_KEY here — OpenEnv's web-interface # layer instantiates the env at import time for schema introspection, @@ -99,12 +122,14 @@ def __init__(self) -> None: # just exploring. The real check happens lazily in # ``_run_rollout_impl`` (any rollout without creds fails fast there # with a clear error in the result payload). - self._state = self._OpenCodeState(episode_id=str(uuid4())) + self._state = self._CodingAgentState(episode_id=str(uuid4())) - mcp = FastMCP("opencode_env") + mcp = FastMCP("coding_agent_env") @mcp.tool def run_rollout( + # Agent + endpoint. + agent: str = "opencode", # Endpoint — either a shorthand (resolved from env vars + catalog # defaults) OR explicit base_url+api_key+model. Explicit fields # always win over the catalog. @@ -125,14 +150,17 @@ def run_rollout( agent_timeout_s: float = 600.0, template: str = "", ) -> str: - """Run one OpenCode rollout end-to-end. + """Run one coding-agent rollout end-to-end. + + ``agent`` selects the harness CLI to run inside the sandbox. + Currently supported: ``"opencode"``, ``"pi"``. ``endpoint`` is the shorthand selector (one of ``"vllm"`` / ``"openai"`` / ``"hf_router"``) — the server resolves base_url / api_key / model from env vars + catalog defaults. Pass any of those explicitly to override. - See ``opencode_env.client.OpenCodeEnv.run_rollout`` for full + See ``coding_agent_env.client.CodingAgentEnv.run_rollout`` for full arg docs. Returns a JSON-serialized ``RolloutResult``. """ # Resolve via catalog when shorthand is provided. 
@@ -149,6 +177,11 @@ def run_rollout( if disable_thinking_resolved is None: disable_thinking_resolved = False + agent = (agent or "opencode").strip() + if agent not in _SUPPORTED_AGENTS: + raise ValueError( + f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}" + ) if not (base_url and api_key and model): raise ValueError( "must provide either ``endpoint`` (one of " @@ -158,6 +191,7 @@ def run_rollout( raise ValueError("instruction is required") return self._run_rollout_impl( + agent=agent, base_url=base_url, api_key=api_key, model=model, @@ -183,13 +217,15 @@ def reset( episode_id: Optional[str] = None, **_: Any, ) -> Observation: - self._state = self._OpenCodeState(episode_id=episode_id or str(uuid4())) + self._state = self._CodingAgentState(episode_id=episode_id or str(uuid4())) return Observation( done=False, reward=None, metadata={ "status": "ready", - "message": ("opencode_env ready. Call run_rollout(...) with a task."), + "message": ( + "coding_agent_env ready. Call run_rollout(agent=..., ...) with a task." + ), }, ) @@ -239,6 +275,7 @@ def state(self) -> Any: def _run_rollout_impl( self, *, + agent: str, base_url: str, api_key: str, model: str, @@ -279,19 +316,18 @@ def _emit(msg: str) -> None: _emit("error: E2B_API_KEY missing on server") return result.model_dump_json() - _emit(f"resolving config (model={model}, mode={mode})") + _emit(f"resolving config (agent={agent}, model={model}, mode={mode})") - # Build OpenCodeConfig + factory. We keep the proxy in charge of - # ``model_override`` / ``logprobs`` / ``max_tokens``-cap injection. 
- config = self._OpenCodeConfig( - provider="openai_compatible", - base_url=base_url.rstrip("/"), + config = self._build_agent_config( + agent=agent, + mode=mode, + base_url=base_url, api_key=api_key, model=model, agent_timeout_s=agent_timeout_s, - proxy_disable_thinking=disable_thinking, - proxy_top_logprobs=top_logprobs, - proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None, + disable_thinking=disable_thinking, + top_logprobs=top_logprobs, + max_tokens_cap=max_tokens_cap, ) # Concatenate setup commands into a single ``set -e`` script and let @@ -300,21 +336,19 @@ def _emit(msg: str) -> None: # each command in a wrapper that captures exit/stdout/stderr. # That way the primitive still aborts on setup failure AND we get # observability in the response. - instruction_payload = instruction - opencode_task = self._OpenCodeTask( - instruction=instruction_payload, - metadata={"task_id": task_id}, + rollout_task = self._CodingAgentTask( + instruction=instruction, + metadata={"task_id": task_id, "agent": agent}, ) - backend_kwargs: dict[str, Any] = {} - if template: - backend_kwargs["template"] = template - - factory = self._OpenCodeSessionFactory( + factory = self._build_session_factory( + agent=agent, config=config, - sandbox_backend=self._E2BSandboxBackend(**backend_kwargs), mode=mode, - verifier=None, + template=template, + disable_thinking=disable_thinking, + top_logprobs=top_logprobs, + max_tokens_cap=max_tokens_cap, ) session = None @@ -323,7 +357,7 @@ def _emit(msg: str) -> None: f"creating E2B sandbox (template={template or 'default'}) — " "this is the slow phase (~5–60s cold, ~5s with template)" ) - session = factory.create(task=opencode_task) + session = factory.create(task=rollout_task) result.sandbox_id = session.sandbox.sandbox_id _emit( f"sandbox ready: {result.sandbox_id} — agent started " @@ -336,7 +370,7 @@ def _emit(msg: str) -> None: # we'd need to restructure. 
As a pragmatic compromise we run # setup IMMEDIATELY after create(), which races with the agent # for ~1-2s but is fine for typical pip/git/download work - # because opencode itself takes >=20s to make its first model + # because most agent CLIs take a while before their first model # call. for i, cmd in enumerate(setup, 1): _emit(f"setup [{i}/{len(setup)}]: {cmd[:80]}") @@ -352,7 +386,7 @@ def _emit(msg: str) -> None: # Block until the agent is done (or setup already failed). if result.error is None: _emit( - f"agent running — opencode CLI in sandbox " + f"agent running — {agent} CLI in sandbox " f"(timeout {int(agent_timeout_s)}s)" ) try: @@ -387,7 +421,7 @@ def _emit(msg: str) -> None: result.files, result.files_extra = self._collect_files(session.sandbox) result.proxy_turns = self._collect_proxy_turns(session) result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:] - result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:] + result.agent_log_tail = self._collect_agent_log_tail(session, agent) _emit( f"collected: {len(result.files)} file(s), " f"{len(result.proxy_turns)} proxy turn(s), " @@ -400,9 +434,7 @@ def _emit(msg: str) -> None: result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[ -2000: ] - result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[ - -2000: - ] + result.agent_log_tail = self._collect_agent_log_tail(session, agent) finally: if session is not None: try: @@ -422,6 +454,104 @@ def _emit(msg: str) -> None: return result.model_dump_json() + def _build_agent_config( + self, + *, + agent: str, + mode: str, + base_url: str, + api_key: str, + model: str, + agent_timeout_s: float, + disable_thinking: bool, + top_logprobs: int, + max_tokens_cap: int, + ) -> Any: + if agent == "opencode": + return self._CodingAgentConfig( + provider="openai_compatible", + base_url=base_url.rstrip("/"), + api_key=api_key, + model=model, + agent_timeout_s=agent_timeout_s, + 
proxy_disable_thinking=disable_thinking, + proxy_top_logprobs=top_logprobs, + proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None, + ) + + provider = ( + "openai" if mode == "transparent_proxy" else self._infer_pi_provider(base_url) + ) + return _GenericAgentConfig( + base_url=base_url.rstrip("/"), + api_key=api_key, + model=model, + agent_timeout_s=agent_timeout_s, + provider=provider, + thinking="off" if disable_thinking else None, + ) + + def _build_session_factory( + self, + *, + agent: str, + config: Any, + mode: str, + template: str, + disable_thinking: bool, + top_logprobs: int, + max_tokens_cap: int, + ) -> Any: + backend_kwargs: dict[str, Any] = {} + if template: + backend_kwargs["template"] = template + backend = self._E2BSandboxBackend(**backend_kwargs) + + if agent == "opencode": + return self._CodingAgentSessionFactory( + config=config, + sandbox_backend=backend, + mode=mode, + verifier=None, + ) + + spec = self._get_agent_spec(agent) + return self._CLIAgentSessionFactory( + spec=spec, + config=config, + sandbox_backend=backend, + mode=mode, + verifier=None, + proxy_disable_thinking=disable_thinking, + proxy_top_logprobs=top_logprobs, + proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None, + ) + + @staticmethod + def _infer_pi_provider(base_url: str) -> str: + url = (base_url or "").lower() + if "router.huggingface.co" in url: + return "huggingface" + if "anthropic" in url: + return "anthropic" + if "googleapis.com" in url or "generativelanguage" in url: + return "gemini" + return "openai" + + def _collect_agent_log_tail(self, session: Any, agent: str) -> str: + if hasattr(session, "collect_artifacts"): + try: + artifacts = session.collect_artifacts() + if isinstance(artifacts, dict) and "agent_log" in artifacts: + val = artifacts["agent_log"] + if isinstance(val, str): + return val[-2000:] + return json.dumps(val, default=str)[-2000:] + except Exception: + pass + path = _AGENT_LOG_BY_AGENT.get(agent, AGENT_LOG) + return 
self._safe_read(session.sandbox, path)[-2000:] + # ── Helpers ──────────────────────────────────────────────────────────── def _exec_command(self, sandbox: Any, cmd: str) -> Any: @@ -471,18 +601,33 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]: def _collect_proxy_turns(self, session: Any) -> list[Any]: turns: list[Any] = [] - proxy_trace_path = getattr(session, "_proxy_trace_path", None) - if not proxy_trace_path: - return turns - raw = self._safe_read(session.sandbox, proxy_trace_path) - for line in raw.splitlines(): - line = line.strip() - if not line: - continue + + records: list[dict[str, Any]] = [] + if hasattr(session, "fetch_proxy_trace"): try: - rec = json.loads(line) + fetched = session.fetch_proxy_trace() + if isinstance(fetched, list): + records = [r for r in fetched if isinstance(r, dict)] except Exception: - continue + records = [] + + if not records: + proxy_trace_path = getattr(session, "_proxy_trace_path", None) + if not proxy_trace_path: + return turns + raw = self._safe_read(session.sandbox, proxy_trace_path) + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + try: + rec = json.loads(line) + except Exception: + continue + if isinstance(rec, dict): + records.append(rec) + + for rec in records: response = rec.get("response") or {} turns.append( self._RolloutTurn( @@ -509,3 +654,4 @@ def _safe_read(sandbox: Any, path: str) -> str: return sandbox.read_text(path) or "" except Exception: return "" + diff --git a/envs/opencode_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py similarity index 92% rename from envs/opencode_env/server/gradio_ui.py rename to envs/coding_agent_env/server/gradio_ui.py index d1ee6e403..5497ef0f2 100644 --- a/envs/opencode_env/server/gradio_ui.py +++ b/envs/coding_agent_env/server/gradio_ui.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-"""Minimal Gradio UI for opencode_env. +"""Minimal Gradio UI for coding_agent_env. Mounts under the standard OpenEnv ``/web`` path via the ``gradio_builder=`` callback documented at @@ -32,14 +32,14 @@ try: from .catalog import catalog_summary, ENDPOINT_KINDS, resolve_endpoint - from .opencode_environment import OpenCodeEnvironment + from .coding_environment import CodingAgentEnvironment except ImportError: # pragma: no cover from server.catalog import ( # type: ignore catalog_summary, ENDPOINT_KINDS, resolve_endpoint, ) - from server.opencode_environment import OpenCodeEnvironment # type: ignore + from server.coding_environment import CodingAgentEnvironment # type: ignore # ──────────────────────────────────────────────────────────────────────────── @@ -190,8 +190,8 @@ def _logprobs_md(turns: list[dict[str, Any]]) -> str: toks = first["completion_tokens"][:10] lps = first.get("per_token_logps") or [] lines.append( - f"\n**first productive turn (first 10 tokens)**\n\n" - f"```\n" + "\n**first productive turn (first 10 tokens)**\n\n" + "```\n" + "\n".join( f" {tok!r:<14} {lp:+.3f}" if i < len(lps) else f" {tok!r:<14} -" for i, (tok, lp) in enumerate(zip(toks, lps + [None] * len(toks))) @@ -202,6 +202,7 @@ def _logprobs_md(turns: list[dict[str, Any]]) -> str: def _live_status_md( + agent: str, endpoint_kind: str, model: str, mode: str, @@ -211,7 +212,7 @@ def _live_status_md( """Render a live phase log (latest at the bottom) with elapsed timestamps.""" head = ( f"### running… `elapsed={elapsed_s:.1f}s`\n\n" - f"_endpoint=`{endpoint_kind}` model=`{model}` mode=`{mode}`_\n\n" + f"_agent=`{agent}` endpoint=`{endpoint_kind}` model=`{model}` mode=`{mode}`_\n\n" ) if not lines: body = "_(waiting for first phase update…)_" @@ -255,7 +256,7 @@ def _catalog_banner() -> str: # ──────────────────────────────────────────────────────────────────────────── -def opencode_gradio_builder( +def coding_agent_gradio_builder( web_manager, # noqa: ARG001 (unused: we instantiate the env 
directly) action_fields, # noqa: ARG001 metadata, # noqa: ARG001 @@ -263,16 +264,17 @@ def opencode_gradio_builder( title, quick_start_md, # noqa: ARG001 ) -> gr.Blocks: - """Build the opencode_env console. + """Build the coding_agent_env console. Compatible with ``create_app(..., gradio_builder=...)``. We ignore - ``web_manager`` and instantiate :class:`OpenCodeEnvironment` ourselves - inside the run handler — opencode_env's run_rollout doesn't need any + ``web_manager`` and instantiate :class:`CodingAgentEnvironment` ourselves + inside the run handler — coding_agent_env's run_rollout doesn't need any per-session state beyond the env's own bookkeeping, and instantiating is cheap (no sandbox is created until the tool fires). """ def run( + agent: str, endpoint: str, model: str, base_url: str, @@ -317,7 +319,7 @@ def run( else: dt = None - env = OpenCodeEnvironment() + env = CodingAgentEnvironment() # The worker fires _run_rollout_impl in a background thread and # streams progress messages into a queue; this generator polls the @@ -331,6 +333,7 @@ def _cb(msg: str) -> None: def _worker(): try: payload = env._run_rollout_impl( + agent=agent, base_url=resolved.base_url, api_key=resolved.api_key, model=resolved.model, @@ -361,7 +364,7 @@ def _worker(): # First yield: announce we've started. Empty result panels. yield ( - f"### running…\n\n_endpoint=`{resolved.kind}` model=`{resolved.model}` mode=`{mode}`_", + f"### running…\n\n_agent=`{agent}` endpoint=`{resolved.kind}` model=`{resolved.model}` mode=`{mode}`_", [], [], "", @@ -387,7 +390,12 @@ def _worker(): # Render the live status pane. 
elapsed = time.time() - t_start md = _live_status_md( - resolved.kind, resolved.model, mode, elapsed, status_lines + agent, + resolved.kind, + resolved.model, + mode, + elapsed, + status_lines, ) yield (md, [], [], "", "", "", {}) @@ -409,6 +417,7 @@ def _worker(): "", "", _live_status_md( + agent, resolved.kind, resolved.model, mode, @@ -427,8 +436,9 @@ def _worker(): _files_md(result.get("files") or {}), _logprobs_md(result.get("proxy_turns") or []), ( - f"### live phase log\n\n" + "### live phase log\n\n" + _live_status_md( + agent, resolved.kind, resolved.model, mode, @@ -445,17 +455,24 @@ def apply_preset(name: str) -> tuple[str, str, str]: p = PRESETS.get(name) or {"instruction": "", "setup": "", "verify": ""} return p["instruction"], p["setup"], p["verify"] - with gr.Blocks(title=title or "opencode_env") as app: - gr.Markdown(f"# {title or 'opencode_env'}") + with gr.Blocks(title=title or "coding_agent_env") as app: + gr.Markdown(f"# {title or 'coding_agent_env'}") gr.Markdown( - "Run one OpenCode rollout in an E2B sandbox against your chosen " - "LLM endpoint. Pick an endpoint, write the task as `(instruction, " - "setup, verify)`, and inspect the reward + per-token logprobs." + "Run one coding-agent rollout in an E2B sandbox against your chosen " + "LLM endpoint. Pick an agent + endpoint, write the task as " + "`(instruction, setup, verify)`, and inspect reward + per-token " + "logprobs." 
) gr.Markdown(_catalog_banner()) with gr.Row(): + agent = gr.Dropdown( + choices=["opencode", "pi"], + value="opencode", + label="Agent", + scale=1, + ) endpoint = gr.Dropdown( choices=list(ENDPOINT_KINDS), value="openai", @@ -481,7 +498,7 @@ def apply_preset(name: str) -> tuple[str, str, str]: ) instruction = gr.Textbox( - label="Instruction (the prompt opencode runs)", + label="Instruction (the prompt the selected agent runs)", lines=4, value=PRESETS["binary_search"]["instruction"], ) @@ -567,6 +584,7 @@ def apply_preset(name: str) -> tuple[str, str, str]: run_btn.click( fn=run, inputs=[ + agent, endpoint, model, base_url, @@ -593,3 +611,4 @@ def apply_preset(name: str) -> tuple[str, str, str]: ) return app + diff --git a/envs/opencode_env/task.py b/envs/coding_agent_env/task.py similarity index 73% rename from envs/opencode_env/task.py rename to envs/coding_agent_env/task.py index f9d208d84..8633eb7aa 100644 --- a/envs/opencode_env/task.py +++ b/envs/coding_agent_env/task.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Task payload accepted by :class:`OpenCodeSessionFactory`.""" +"""Task payload accepted by :class:`CodingAgentSessionFactory`.""" from __future__ import annotations @@ -13,8 +13,8 @@ from pydantic import BaseModel, Field -class OpenCodeTask(BaseModel): - """One task for an OpenCode rollout. +class CodingAgentTask(BaseModel): + """One task for a coding-agent rollout. The primitive only needs ``instruction`` (the prompt handed to ``opencode run``). 
Callers may attach ``setup_shell`` (run once inside the sandbox @@ -29,8 +29,8 @@ class OpenCodeTask(BaseModel): metadata: dict[str, Any] = Field(default_factory=dict) @classmethod - def coerce(cls, value: Any) -> "OpenCodeTask": - """Accept a bare string, a dict, or an existing ``OpenCodeTask``.""" + def coerce(cls, value: Any) -> "CodingAgentTask": + """Accept a bare string, a dict, or an existing ``CodingAgentTask``.""" if isinstance(value, cls): return value if isinstance(value, str): @@ -38,6 +38,6 @@ def coerce(cls, value: Any) -> "OpenCodeTask": if isinstance(value, dict): return cls(**value) raise TypeError( - f"Cannot coerce {type(value).__name__} to OpenCodeTask; " - "pass a str, dict, or OpenCodeTask." + f"Cannot coerce {type(value).__name__} to CodingAgentTask; " + "pass a str, dict, or CodingAgentTask." ) diff --git a/envs/opencode_env/uv.lock b/envs/coding_agent_env/uv.lock similarity index 99% rename from envs/opencode_env/uv.lock rename to envs/coding_agent_env/uv.lock index 80dd00ba0..aa35531cc 100644 --- a/envs/opencode_env/uv.lock +++ b/envs/coding_agent_env/uv.lock @@ -1664,38 +1664,7 @@ wheels = [ ] [[package]] -name = "openenv-core" -version = "0.2.3" -source = { git = "https://github.com/adithya-s-k/OpenEnv.git?rev=opencode-harness#aabcdbb9d52aa62a842ec69472b2a1106acb831a" } -dependencies = [ - { name = "fastapi" }, - { name = "fastmcp" }, - { name = "gradio" }, - { name = "httpx" }, - { name = "huggingface-hub" }, - { name = "openai" }, - { name = "pydantic" }, - { name = "pyyaml" }, - { name = "requests" }, - { name = "rich" }, - { name = "tomli" }, - { name = "tomli-w" }, - { name = "typer" }, - { name = "uvicorn" }, - { name = "websockets" }, -] - -[package.optional-dependencies] -core = [ - { name = "fastapi" }, - { name = "pydantic" }, - { name = "requests" }, - { name = "uvicorn" }, - { name = "websockets" }, -] - -[[package]] -name = "openenv-opencode-env" +name = "openenv-coding-agent-env" version = "0.1.0" source = { editable = "." 
} dependencies = [ @@ -1724,7 +1693,7 @@ requires-dist = [ { name = "fastmcp", specifier = ">=2.0.0" }, { name = "gradio", specifier = ">=6.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, - { name = "openenv-core", extras = ["core"], git = "https://github.com/adithya-s-k/OpenEnv.git?rev=opencode-harness" }, + { name = "openenv-core", extras = ["core"], specifier = ">=0.3.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" }, @@ -1734,6 +1703,41 @@ requires-dist = [ ] provides-extras = ["dev"] +[[package]] +name = "openenv-core" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastapi" }, + { name = "fastmcp" }, + { name = "gradio" }, + { name = "httpx" }, + { name = "huggingface-hub" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "rich" }, + { name = "tomli" }, + { name = "tomli-w" }, + { name = "typer" }, + { name = "uvicorn" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ce/d6/3bebe8afb55fcc3ea9251c4c2dfbab2879e31089bc91a8fe9696e5ce019b/openenv_core-0.3.0.tar.gz", hash = "sha256:c7fee2035badab5be497eb6f4afb2cb417de000f82cc19afd72fb5ec332c431d", size = 164720, upload-time = "2026-05-11T11:37:57.274Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/f5/aafa43138589bfd5d369a8d02ea365aae9d6fe55ac0b3894368d6d69bd03/openenv_core-0.3.0-py3-none-any.whl", hash = "sha256:859e875c9d5211b157c30fb9abc681606fcf0bf1b6ffcdf404678992823a1df0", size = 194313, upload-time = "2026-05-11T11:37:55.537Z" }, +] + +[package.optional-dependencies] +core = [ + { name = "fastapi" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "uvicorn" }, + { name = "websockets" }, +] + [[package]] name = "opentelemetry-api" version = "1.41.1" diff --git 
a/examples/opencode_env_simple.py b/examples/coding_agent_env_simple.py similarity index 83% rename from examples/opencode_env_simple.py rename to examples/coding_agent_env_simple.py index 1713880fb..f8996e586 100644 --- a/examples/opencode_env_simple.py +++ b/examples/coding_agent_env_simple.py @@ -5,18 +5,18 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""End-to-end opencode_env example: write binary_search.py and verify it. +"""End-to-end coding_agent_env example: write binary_search.py and verify it. -Hits the deployed HF Space ``AdithyaSK/opencode-env`` (override via -``OPENCODE_ENV_SPACE`` env var to point at your own Space or a local +Hits the deployed HF Space ``AdithyaSK/coding-agent-env`` (override via +``CODING_AGENT_ENV_SPACE`` env var to point at your own Space or a local container). The single MCP tool ``run_rollout`` does: - 1. Spawns a fresh E2B sandbox (using the prebaked ``opencode-rl`` + 1. Spawns a fresh E2B sandbox (using the prebaked ``coding-agent-rl`` template — falls back to a cold install if the template isn't present in your E2B account). 2. Bootstraps an in-sandbox FastAPI proxy that captures per-token logprobs (``mode="transparent_proxy"``). - 3. Runs ``opencode run`` with the instruction. + 3. Runs the selected harness CLI with the instruction. 4. Executes the verify bash commands; reward = passed / total. 5. Returns a ``RolloutResult`` with reward + per-turn logprobs + the file contents the agent produced. 
@@ -29,7 +29,7 @@ Usage:: - PYTHONPATH=src:envs uv run python examples/opencode_env_simple.py + PYTHONPATH=src:envs uv run python examples/coding_agent_env_simple.py Expected output (~20s with the prebaked template):: @@ -49,12 +49,12 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "envs")) -from opencode_env import OpenCodeEnv # noqa: E402 -from opencode_env.client import _extract_text # noqa: E402 -from opencode_env.models import RolloutResult # noqa: E402 +from coding_agent_env import CodingAgentEnv # noqa: E402 +from coding_agent_env.client import _extract_text # noqa: E402 +from coding_agent_env.models import RolloutResult # noqa: E402 -SPACE = os.environ.get("OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space") +SPACE = os.environ.get("CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space") INSTRUCTION = ( "Create a single Python file named `binary_search.py` in the current " @@ -91,7 +91,7 @@ async def main() -> int: print(f"Instruction: {INSTRUCTION.splitlines()[0]} ...") print() - async with OpenCodeEnv(base_url=SPACE) as env: + async with CodingAgentEnv(base_url=SPACE) as env: await env.reset() raw = await env.call_tool( "run_rollout", @@ -101,7 +101,7 @@ async def main() -> int: instruction=INSTRUCTION, setup=[], # no setup commands verify=VERIFY, - template="opencode-rl", # prebaked E2B template + template="coding-agent-rl", # prebaked E2B template task_id="binary_search_simple", agent_timeout_s=600, ) diff --git a/tests/envs/test_opencode_env.py b/tests/envs/test_coding_agent_env.py similarity index 73% rename from tests/envs/test_opencode_env.py rename to tests/envs/test_coding_agent_env.py index 5e930b8bc..ec1f66fa5 100644 --- a/tests/envs/test_opencode_env.py +++ b/tests/envs/test_coding_agent_env.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this 
source tree. -"""Smoke tests for ``opencode_env``. +"""Smoke tests for ``coding_agent_env``. The default suite runs in CI without any external dependencies (no E2B, no LLM, no network). It covers: @@ -13,7 +13,7 @@ - The endpoint catalog (`vllm` / `openai` / `hf_router`) resolves explicit + env-var + default-value precedence correctly. - Pydantic models accept their expected shapes. - - The `OpenCodeTask` coercion helper handles str / dict / `OpenCodeTask`. + - The `CodingAgentTask` coercion helper handles str / dict / `CodingAgentTask`. A second class is marked ``@pytest.mark.integration`` and exercises the deployed Space end-to-end. It only runs when ``E2B_API_KEY`` and at least @@ -45,15 +45,15 @@ def test_public_api_imports() -> None: """Top-level package re-exports the documented surface.""" - from opencode_env import ( # noqa: F401 + from coding_agent_env import ( # noqa: F401 CommandResult, E2BSandboxBackend, - OpenCodeConfig, - OpenCodeEnv, - OpenCodeSession, - OpenCodeSessionFactory, - OpenCodeState, - OpenCodeTask, + CodingAgentConfig, + CodingAgentEnv, + CodingAgentSession, + CodingAgentSessionFactory, + CodingAgentState, + CodingAgentTask, Provider, RolloutResult, RolloutTurn, @@ -64,14 +64,14 @@ def test_public_api_imports() -> None: def test_server_modules_import() -> None: """Server-side modules (FastAPI app, MCP env, catalog) import cleanly.""" - from opencode_env.server.app import app # noqa: F401 - from opencode_env.server.catalog import ( # noqa: F401 + from coding_agent_env.server.app import app # noqa: F401 + from coding_agent_env.server.catalog import ( # noqa: F401 catalog_summary, ENDPOINT_KINDS, resolve_endpoint, ) - from opencode_env.server.opencode_environment import ( # noqa: F401 - OpenCodeEnvironment, + from coding_agent_env.server.coding_environment import ( # noqa: F401 + CodingAgentEnvironment, ) @@ -81,14 +81,14 @@ def test_server_modules_import() -> None: def test_catalog_kinds() -> None: - from opencode_env.server.catalog import 
ENDPOINT_KINDS + from coding_agent_env.server.catalog import ENDPOINT_KINDS assert ENDPOINT_KINDS == ("vllm", "openai", "hf_router") def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) -> None: """Explicit args beat env vars beat catalog defaults.""" - from opencode_env.server.catalog import resolve_endpoint + from coding_agent_env.server.catalog import resolve_endpoint monkeypatch.setenv("OPENAI_API_KEY", "from-env") r = resolve_endpoint( @@ -107,7 +107,7 @@ def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) -> def test_resolve_endpoint_env_var_used_when_arg_missing( monkeypatch: pytest.MonkeyPatch, ) -> None: - from opencode_env.server.catalog import resolve_endpoint + from coding_agent_env.server.catalog import resolve_endpoint monkeypatch.setenv("OPENAI_API_KEY", "key-from-env") monkeypatch.setenv("OPENAI_MODEL", "gpt-4o") @@ -121,7 +121,7 @@ def test_resolve_endpoint_normalizes_v1_suffix( monkeypatch: pytest.MonkeyPatch, ) -> None: """Base URL gets ``/v1`` appended if missing, otherwise left alone.""" - from opencode_env.server.catalog import resolve_endpoint + from coding_agent_env.server.catalog import resolve_endpoint monkeypatch.setenv("VLLM_URL", "https://my-vllm.example/") monkeypatch.setenv("VLLM_API_KEY", "x") @@ -134,7 +134,7 @@ def test_resolve_endpoint_normalizes_v1_suffix( def test_resolve_endpoint_unknown_kind_raises() -> None: - from opencode_env.server.catalog import resolve_endpoint + from coding_agent_env.server.catalog import resolve_endpoint with pytest.raises(ValueError, match="unknown endpoint kind"): resolve_endpoint("bogus", base_url="x", api_key="y", model="z") @@ -143,7 +143,7 @@ def test_resolve_endpoint_unknown_kind_raises() -> None: def test_resolve_endpoint_missing_creds_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: - from opencode_env.server.catalog import resolve_endpoint + from coding_agent_env.server.catalog import resolve_endpoint # Strip any inherited env vars. 
for k in ("OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_MODEL"): @@ -153,7 +153,7 @@ def test_resolve_endpoint_missing_creds_raises( def test_catalog_summary_shape() -> None: - from opencode_env.server.catalog import catalog_summary + from coding_agent_env.server.catalog import catalog_summary summary = catalog_summary() assert {entry["kind"] for entry in summary} == {"vllm", "openai", "hf_router"} @@ -166,13 +166,67 @@ def test_catalog_summary_shape() -> None: } <= entry.keys() +def test_build_agent_config_opencode() -> None: + from coding_agent_env.server.coding_environment import CodingAgentEnvironment + + env = CodingAgentEnvironment() + cfg = env._build_agent_config( + agent="opencode", + mode="transparent_proxy", + base_url="https://api.openai.com/v1", + api_key="sk-test", + model="gpt-4o-mini", + agent_timeout_s=123.0, + disable_thinking=True, + top_logprobs=7, + max_tokens_cap=2048, + ) + assert isinstance(cfg, env._CodingAgentConfig) + assert cfg.proxy_disable_thinking is True + assert cfg.proxy_top_logprobs == 7 + assert cfg.proxy_max_tokens_cap == 2048 + + +def test_build_agent_config_pi() -> None: + from coding_agent_env.server.coding_environment import CodingAgentEnvironment + + env = CodingAgentEnvironment() + cfg = env._build_agent_config( + agent="pi", + mode="black_box", + base_url="https://router.huggingface.co/v1", + api_key="hf_xxx", + model="zai-org/GLM-5.1", + agent_timeout_s=180.0, + disable_thinking=True, + top_logprobs=5, + max_tokens_cap=4096, + ) + assert cfg.provider == "huggingface" + assert cfg.thinking == "off" + assert cfg.model == "zai-org/GLM-5.1" + + cfg_proxy = env._build_agent_config( + agent="pi", + mode="transparent_proxy", + base_url="https://router.huggingface.co/v1", + api_key="hf_xxx", + model="zai-org/GLM-5.1", + agent_timeout_s=180.0, + disable_thinking=False, + top_logprobs=5, + max_tokens_cap=4096, + ) + assert cfg_proxy.provider == "openai" + + # 
--------------------------------------------------------------------------- # Models + task coercion # --------------------------------------------------------------------------- def test_rollout_result_serializes_round_trip() -> None: - from opencode_env import CommandResult, RolloutResult, RolloutTurn + from coding_agent_env import CommandResult, RolloutResult, RolloutTurn r = RolloutResult( task_id="t1", @@ -201,40 +255,40 @@ def test_rollout_result_serializes_round_trip() -> None: assert rebuilt.proxy_turns[0].completion_tokens == ["hi"] -def test_opencode_task_coerce_str() -> None: - from opencode_env import OpenCodeTask +def test_coding_agent_task_coerce_str() -> None: + from coding_agent_env import CodingAgentTask - t = OpenCodeTask.coerce("write fizzbuzz.py") + t = CodingAgentTask.coerce("write fizzbuzz.py") assert t.instruction == "write fizzbuzz.py" assert t.setup_shell is None assert t.upload_files == {} -def test_opencode_task_coerce_dict() -> None: - from opencode_env import OpenCodeTask +def test_coding_agent_task_coerce_dict() -> None: + from coding_agent_env import CodingAgentTask - t = OpenCodeTask.coerce({"instruction": "x", "setup_shell": "pip install pandas"}) + t = CodingAgentTask.coerce({"instruction": "x", "setup_shell": "pip install pandas"}) assert t.instruction == "x" assert t.setup_shell == "pip install pandas" -def test_opencode_task_coerce_existing_passthrough() -> None: - from opencode_env import OpenCodeTask +def test_coding_agent_task_coerce_existing_passthrough() -> None: + from coding_agent_env import CodingAgentTask - src = OpenCodeTask(instruction="y") - assert OpenCodeTask.coerce(src) is src + src = CodingAgentTask(instruction="y") + assert CodingAgentTask.coerce(src) is src -def test_opencode_task_coerce_rejects_unknown_type() -> None: - from opencode_env import OpenCodeTask +def test_coding_agent_task_coerce_rejects_unknown_type() -> None: + from coding_agent_env import CodingAgentTask with pytest.raises(TypeError, 
match="Cannot coerce"): - OpenCodeTask.coerce(42) # type: ignore[arg-type] + CodingAgentTask.coerce(42) # type: ignore[arg-type] def test_start_proxy_keeps_upstream_key_out_of_command() -> None: """The proxy API key must be passed via env, not shell argv.""" - from opencode_env import OpenCodeConfig, OpenCodeSessionFactory + from coding_agent_env import CodingAgentConfig, CodingAgentSessionFactory class FakeExecResult: exit_code = 0 @@ -278,13 +332,13 @@ def kill(self) -> None: secret = "sk-test '$(leak)" model = "provider/model'; touch /tmp/pwn #" - config = OpenCodeConfig( + config = CodingAgentConfig( base_url="https://example.test/v1?x='y", api_key=secret, model=model, ) sandbox = FakeSandbox() - factory = OpenCodeSessionFactory( + factory = CodingAgentSessionFactory( config=config, sandbox_backend=object(), # unused by this protected-method test mode="transparent_proxy", @@ -354,16 +408,16 @@ def test_run_rollout_e2e_via_deployed_space() -> None: import asyncio - from opencode_env import OpenCodeEnv - from opencode_env.client import _extract_text - from opencode_env.models import RolloutResult + from coding_agent_env import CodingAgentEnv + from coding_agent_env.client import _extract_text + from coding_agent_env.models import RolloutResult SPACE = os.environ.get( - "OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space" + "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space" ) async def _go() -> RolloutResult: - async with OpenCodeEnv(base_url=SPACE) as env: + async with CodingAgentEnv(base_url=SPACE) as env: await env.reset() raw = await env.call_tool( "run_rollout", @@ -382,7 +436,7 @@ async def _go() -> RolloutResult: "import binary_search; " "assert binary_search.binary_search([1,2,3,4,5], 3) == 2; print('OK')\"", ], - template="opencode-rl", + template="coding-agent-rl", agent_timeout_s=600, ) return RolloutResult.model_validate_json(_extract_text(raw)) From ddf1313bd7e9f3695d08a8d079fabbeb8b8d9608 Mon Sep 17 00:00:00 2001 From: 
swappy <59965507+rycerzes@users.noreply.github.com> Date: Thu, 14 May 2026 15:06:07 +0530 Subject: [PATCH 09/35] feat: hf sandbox backend - tests --- src/openenv/core/harness/sandbox/__init__.py | 19 +- .../core/harness/sandbox/hf_backend.py | 306 ++++++++++++++++++ tests/core/test_hf_sandbox_backend.py | 221 +++++++++++++ 3 files changed, 544 insertions(+), 2 deletions(-) create mode 100644 src/openenv/core/harness/sandbox/hf_backend.py create mode 100644 tests/core/test_hf_sandbox_backend.py diff --git a/src/openenv/core/harness/sandbox/__init__.py b/src/openenv/core/harness/sandbox/__init__.py index 83d37fb48..208fe54d5 100644 --- a/src/openenv/core/harness/sandbox/__init__.py +++ b/src/openenv/core/harness/sandbox/__init__.py @@ -37,9 +37,16 @@ except ImportError: pass # e2b not installed +try: + from .hf_backend import HFBgJob, HFSandboxBackend, HFSandboxHandle # noqa: F401 + + __all__.extend(["HFBgJob", "HFSandboxBackend", "HFSandboxHandle"]) +except ImportError: + pass # hf-sandbox not installed + def create_sandbox_backend( - backend: Literal["e2b", "docker"] = "e2b", + backend: Literal["e2b", "docker", "hf"] = "e2b", **kwargs: Any, ) -> SandboxBackend: """Create a sandbox backend by name. @@ -48,6 +55,8 @@ def create_sandbox_backend( (set ``E2B_API_URL``). For ``"docker"``: local Docker, no external dependencies. + + For ``"hf"``: Hugging Face Jobs via ``hf-sandbox``. """ if backend == "e2b": from .e2b_backend import E2BSandboxBackend @@ -55,4 +64,10 @@ def create_sandbox_backend( return E2BSandboxBackend(**kwargs) elif backend == "docker": return DockerSandboxBackend(**kwargs) - raise ValueError(f"Unknown sandbox backend: {backend!r}. Use 'e2b' or 'docker'.") + elif backend == "hf": + from .hf_backend import HFSandboxBackend + + return HFSandboxBackend(**kwargs) + raise ValueError( + f"Unknown sandbox backend: {backend!r}. Use 'e2b', 'docker', or 'hf'." 
+ ) diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py new file mode 100644 index 000000000..410a8daea --- /dev/null +++ b/src/openenv/core/harness/sandbox/hf_backend.py @@ -0,0 +1,306 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Hugging Face Sandbox implementation of :class:`SandboxBackend`. + +Wraps `hf-sandbox` (https://github.com/huggingface/hf-sandbox) so OpenEnv +harnesses can use it through the same protocol. +""" + +from __future__ import annotations + +import re +import time +import uuid +from pathlib import PurePosixPath +from threading import Event +from typing import Any + +from hf_sandbox import Sandbox +from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxHandle + +_ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") + + +class HFSandboxError(RuntimeError): + """Base class for HF sandbox backend errors.""" + + +class HFSandboxCreateError(HFSandboxError): + """Raised when backend cannot create a sandbox.""" + + +class HFBgJob: + """Background process handle for :class:`HFSandboxHandle`.""" + + def __init__( + self, + sandbox: "HFSandboxHandle", + *, + pid: int, + marker_path: str, + poll_interval_s: float = 0.5, + ) -> None: + self._sandbox = sandbox + self._pid = pid + self._marker_path = marker_path + self._poll_interval_s = poll_interval_s + self._done = Event() + self._exit_code: int | None = None + + @property + def pid(self) -> int: + return self._pid + + def wait(self, timeout: float | None = None) -> int: + deadline = None if timeout is None else (time.monotonic() + timeout) + while True: + if self._done.is_set(): + return self._exit_code if self._exit_code is not None else 0 + if deadline is not None and time.monotonic() > deadline: + raise TimeoutError( + f"Background command 
(pid={self._pid}) " + f"did not exit within {timeout}s" + ) + + marker = self._sandbox.exec( + f"cat {_shell_quote(self._marker_path)}", + timeout=10, + ) + if marker.exit_code == 0 and marker.stdout.strip(): + self._exit_code = _parse_exit_code(marker.stdout.strip(), default=0) + self._done.set() + return self._exit_code + + alive = self._sandbox.exec(f"kill -0 {self._pid}", timeout=10) + if alive.exit_code != 0: + self._exit_code = 1 + self._done.set() + return self._exit_code + + time.sleep(self._poll_interval_s) + + def kill(self) -> None: + if self._done.is_set(): + return + try: + self._sandbox.exec(f"kill -9 {self._pid}", timeout=5) + except Exception: + pass + self._exit_code = 137 + self._done.set() + + +class HFSandboxHandle: + """Wraps a live ``hf_sandbox.Sandbox`` to satisfy :class:`SandboxHandle`.""" + + def __init__( + self, + sandbox: Any, + *, + default_envs: dict[str, str] | None = None, + ) -> None: + self._sbx = sandbox + self._default_envs = dict(default_envs or {}) + self._bg_jobs: list[HFBgJob] = [] + + @property + def sandbox_id(self) -> str: + return str(getattr(self._sbx, "job_id", "hf-sandbox")) + + @property + def raw(self) -> Any: + """Escape hatch for callers that need the underlying SDK object.""" + return self._sbx + + def exec( + self, + cmd: str, + *, + envs: dict[str, str] | None = None, + cwd: str | None = None, + timeout: float | None = 60, + ) -> ExecResult: + merged_envs = dict(self._default_envs) + merged_envs.update(envs or {}) + shell_cmd = _with_env_prefix(cmd, merged_envs) + timeout_s = _normalize_exec_timeout(timeout) + try: + result = self._sbx.exec( + "bash", + "-lc", + shell_cmd, + workdir=cwd, + timeout=timeout_s, + ) + return ExecResult( + exit_code=int(getattr(result, "returncode", 1)), + stdout=str(getattr(result, "stdout", "") or ""), + stderr=str(getattr(result, "stderr", "") or ""), + ) + except Exception as exc: + return ExecResult(exit_code=-1, stdout="", stderr=str(exc)) + + def start_bg( + self, + cmd: str, 
+ *, + envs: dict[str, str] | None = None, + cwd: str | None = None, + ) -> BgJob: + marker_path = f"/tmp/.openenv_bg_{uuid.uuid4().hex[:12]}.exit" + wrapped = f"{cmd}; rc=$?; echo $rc > {_shell_quote(marker_path)}" + launch_cmd = f"nohup bash -lc {_shell_quote(wrapped)} >/dev/null 2>&1 & echo $!" + + result = self.exec(launch_cmd, envs=envs, cwd=cwd, timeout=30) + if result.exit_code != 0: + raise RuntimeError( + f"Failed to start background command: {result.stderr or result.stdout}" + ) + + pid = _parse_pid(result.stdout) + if pid is None: + raise RuntimeError(f"Could not extract PID from start_bg output: {result.stdout!r}") + + job = HFBgJob(self, pid=pid, marker_path=marker_path) + self._bg_jobs.append(job) + return job + + def write_text(self, path: str, content: str) -> None: + parent = str(PurePosixPath(path).parent) + if parent not in ("", "/"): + r = self.exec(f"mkdir -p {_shell_quote(parent)}", timeout=10) + if r.exit_code != 0: + raise RuntimeError(f"Failed to create parent directory {parent!r}: {r.stderr}") + self._sbx.write_file(path, content) + + def read_text(self, path: str) -> str: + return str(self._sbx.read_file(path, text=True)) + + def exists(self, path: str) -> bool: + r = self.exec(f"test -e {_shell_quote(path)}", timeout=10) + return r.exit_code == 0 + + def kill(self) -> None: + for job in self._bg_jobs: + try: + job.kill() + except Exception: + pass + self._bg_jobs.clear() + try: + self._sbx.terminate() + except Exception: + pass + + +class HFSandboxBackend: + """Creates HF sandboxes for harness rollouts via ``hf-sandbox``.""" + + def __init__( + self, + *, + image: str = "python:3.12", + flavor: str = "cpu-basic", + timeout: str | None = None, + forward_hf_token: bool = False, + create_retries: int = 3, + create_backoff_s: float = 2.0, + ) -> None: + self._image = image + self._flavor = flavor + self._timeout = timeout + self._forward_hf_token = forward_hf_token + self._create_retries = max(1, int(create_retries)) + self._create_backoff_s 
= max(0.0, float(create_backoff_s)) + + def create( + self, + *, + timeout_s: int = 900, + envs: dict[str, str] | None = None, + metadata: dict[str, str] | None = None, + ) -> SandboxHandle: + # `hf-sandbox` does not support metadata at create-time yet. + del metadata + + timeout = self._timeout or _format_timeout(timeout_s) + last_error: Exception | None = None + + for attempt in range(self._create_retries): + try: + sbx = Sandbox.create( + image=self._image, + flavor=self._flavor, + timeout=timeout, + forward_hf_token=self._forward_hf_token, + ) + return HFSandboxHandle(sbx, default_envs=envs) + except Exception as exc: # noqa: BLE001 + last_error = exc + if attempt + 1 < self._create_retries: + time.sleep(self._create_backoff_s * (2**attempt)) + + assert last_error is not None + raise HFSandboxCreateError( + f"Failed to create HF sandbox after {self._create_retries} attempts: " + f"{last_error}" + ) from last_error + + +def _with_env_prefix(cmd: str, envs: dict[str, str]) -> str: + if not envs: + return cmd + parts: list[str] = [] + for key, value in envs.items(): + if not _ENV_KEY_RE.match(key): + raise ValueError(f"Invalid environment variable name: {key!r}") + parts.append(f"export {key}={_shell_quote(str(value))};") + return " ".join(parts) + f" {cmd}" + + +def _normalize_exec_timeout(timeout: float | None) -> int: + if timeout is None: + return 24 * 60 * 60 + return max(1, int(timeout)) + + +def _format_timeout(timeout_s: int) -> str: + timeout_s = max(1, int(timeout_s)) + if timeout_s % 3600 == 0: + return f"{timeout_s // 3600}h" + if timeout_s % 60 == 0: + return f"{timeout_s // 60}m" + return f"{timeout_s}s" + + +def _parse_pid(stdout: str) -> int | None: + for line in reversed(stdout.strip().splitlines()): + raw = line.strip() + if raw.isdigit(): + return int(raw) + return None + + +def _parse_exit_code(raw: str, *, default: int) -> int: + try: + return int(raw.splitlines()[-1].strip()) + except Exception: + return default + + +def _shell_quote(s: str) 
-> str: + """Single-quote a string for shell, escaping embedded single quotes.""" + return "'" + s.replace("'", "'\\''") + "'" + + +__all__ = [ + "HFBgJob", + "HFSandboxBackend", + "HFSandboxCreateError", + "HFSandboxError", + "HFSandboxHandle", +] diff --git a/tests/core/test_hf_sandbox_backend.py b/tests/core/test_hf_sandbox_backend.py new file mode 100644 index 000000000..d301b2b2b --- /dev/null +++ b/tests/core/test_hf_sandbox_backend.py @@ -0,0 +1,221 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Unit tests for the HF sandbox backend. + +These tests mock ``hf-sandbox`` so they run without network or HF credentials. +""" + +from __future__ import annotations + +import importlib +import re +import subprocess +import sys +import types +from dataclasses import dataclass, field + +import pytest + + +@dataclass +class _FakeSandbox: + job_id: str + files: dict[str, str] = field(default_factory=dict) + marker_files: dict[str, str] = field(default_factory=dict) + bg_jobs: dict[int, dict] = field(default_factory=dict) + next_pid: int = 1000 + terminated: bool = False + + def exec( + self, + *cmd: str, + workdir: str | None = None, + stdin: str | None = None, + timeout: int = 600, + ) -> subprocess.CompletedProcess: + del workdir, stdin, timeout + if len(cmd) < 3: + return subprocess.CompletedProcess(cmd, 1, "", "invalid command") + script = cmd[2] + + if "ok_cmd" in script: + return subprocess.CompletedProcess(cmd, 0, "ok\n", "") + if "fail_cmd" in script: + return subprocess.CompletedProcess(cmd, 42, "", "failed") + if "timeout_cmd" in script: + return subprocess.CompletedProcess(cmd, -1, "", "timeout") + + if "mkdir -p" in script: + return subprocess.CompletedProcess(cmd, 0, "", "") + + if "test -e " in script: + match = re.search(r"test -e '([^']+)'", script) + assert match is not None + path = 
match.group(1) + exists = path in self.files or path in self.marker_files + return subprocess.CompletedProcess(cmd, 0 if exists else 1, "", "") + + if "cat '/tmp/.openenv_bg_" in script: + match = re.search(r"cat '([^']+)'", script) + assert match is not None + marker = match.group(1) + if marker in self.marker_files: + return subprocess.CompletedProcess( + cmd, + 0, + f"{self.marker_files[marker]}\n", + "", + ) + return subprocess.CompletedProcess(cmd, 1, "", "missing") + + if script.strip().startswith("kill -0 "): + pid = int(script.strip().split()[2]) + alive = self.bg_jobs.get(pid, {}).get("alive", False) + return subprocess.CompletedProcess(cmd, 0 if alive else 1, "", "") + + if script.strip().startswith("kill -9 "): + pid = int(script.strip().split()[2]) + if pid in self.bg_jobs: + self.bg_jobs[pid]["alive"] = False + marker = self.bg_jobs[pid]["marker"] + self.marker_files[marker] = "137" + return subprocess.CompletedProcess(cmd, 0, "", "") + + if "echo $!" in script: + marker_match = re.search(r"(/tmp/\.openenv_bg_[A-Za-z0-9]+\.exit)", script) + assert marker_match is not None + marker = marker_match.group(1) + pid = self.next_pid + self.next_pid += 1 + long_running = "sleep 300" in script + self.bg_jobs[pid] = { + "marker": marker, + "alive": long_running, + } + if not long_running: + self.marker_files[marker] = "0" + return subprocess.CompletedProcess(cmd, 0, f"{pid}\n", "") + + return subprocess.CompletedProcess(cmd, 0, "", "") + + def write_file( + self, + path: str, + content: str | bytes | bytearray | memoryview, + ) -> None: + if isinstance(content, str): + normalized = content + else: + normalized = bytes(content).decode("utf-8", "replace") + self.files[path] = normalized + + def read_file(self, path: str, text: bool = True) -> str | bytes: + if path not in self.files: + raise FileNotFoundError(path) + return self.files[path] if text else self.files[path].encode() + + def terminate(self) -> None: + self.terminated = True + + +class _FakeSandboxAPI: 
+ calls: list[dict] = [] + + @classmethod + def create( + cls, + image: str, + flavor: str, + timeout: str, + forward_hf_token: bool, + ) -> _FakeSandbox: + cls.calls.append( + { + "image": image, + "flavor": flavor, + "timeout": timeout, + "forward_hf_token": forward_hf_token, + } + ) + return _FakeSandbox(job_id="job-123") + + +def _install_fake_hf_sandbox(monkeypatch) -> None: + fake_module = types.ModuleType("hf_sandbox") + setattr(fake_module, "Sandbox", _FakeSandboxAPI) + monkeypatch.setitem(sys.modules, "hf_sandbox", fake_module) + + +class TestHFSandboxBackend: + def test_exported_from_package(self, monkeypatch): + _install_fake_hf_sandbox(monkeypatch) + + import openenv.core.harness.sandbox as sandbox_pkg + + importlib.reload(sandbox_pkg) + assert hasattr(sandbox_pkg, "HFSandboxBackend") + assert hasattr(sandbox_pkg, "HFSandboxHandle") + assert hasattr(sandbox_pkg, "HFBgJob") + + def test_create_exec_write_read_exists_bg_and_kill(self, monkeypatch): + import openenv.core.harness.sandbox.hf_backend as hf_backend + + _install_fake_hf_sandbox(monkeypatch) + importlib.reload(hf_backend) + + _FakeSandboxAPI.calls.clear() + monkeypatch.setattr(hf_backend, "Sandbox", _FakeSandboxAPI) + + backend = hf_backend.HFSandboxBackend( + image="python:3.12", + flavor="cpu-basic", + forward_hf_token=True, + ) + sandbox = backend.create(timeout_s=120, envs={"GLOBAL_ENV": "on"}) + + assert sandbox.sandbox_id == "job-123" + assert _FakeSandboxAPI.calls[-1]["timeout"] == "2m" + + ok = sandbox.exec("ok_cmd") + assert ok.exit_code == 0 + + failed = sandbox.exec("fail_cmd") + assert failed.exit_code == 42 + + timed = sandbox.exec("timeout_cmd") + assert timed.exit_code == -1 + + sandbox.write_text("/tmp/hello.txt", "hello") + assert sandbox.exists("/tmp/hello.txt") + assert sandbox.read_text("/tmp/hello.txt") == "hello" + + short_job = sandbox.start_bg("echo done > /tmp/bg.txt") + assert short_job.wait(timeout=2) == 0 + + long_job = sandbox.start_bg("sleep 300") + with 
pytest.raises(TimeoutError): + long_job.wait(timeout=0.1) + long_job.kill() + assert isinstance(long_job.wait(timeout=2), int) + + sandbox.kill() + raw = getattr(sandbox, "raw", None) + assert raw is not None + assert raw.terminated is True + + def test_factory_creates_hf_backend(self, monkeypatch): + _install_fake_hf_sandbox(monkeypatch) + + import openenv.core.harness.sandbox.hf_backend as hf_backend + import openenv.core.harness.sandbox as sandbox_pkg + + importlib.reload(hf_backend) + importlib.reload(sandbox_pkg) + + monkeypatch.setattr(hf_backend, "Sandbox", _FakeSandboxAPI) + backend = sandbox_pkg.create_sandbox_backend("hf", image="python:3.12") + assert isinstance(backend, hf_backend.HFSandboxBackend) From 9d856401edd6d2e23bebec130716d9674a5eba45 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Fri, 15 May 2026 14:23:56 +0530 Subject: [PATCH 10/35] chore: ruff + usort format pass --- envs/agent_world_model_env/server/web_ui.py | 4 +- envs/chat_env/models.py | 4 +- envs/chat_env/server/chat_environment.py | 4 +- envs/coding_agent_env/client.py | 1 - .../sandbox/build_template.py | 2 +- .../server/coding_environment.py | 5 +- envs/coding_agent_env/server/gradio_ui.py | 1 - .../server/coding_tools_env_environment.py | 65 ++++++++++--- envs/coding_tools_env/server/e2b_sandbox.py | 23 ++++- envs/coding_tools_env/server/gradio_ui.py | 94 +++++++++++++------ .../jupyter_env/server/jupyter_environment.py | 10 +- envs/repl_env/server/repl_environment.py | 4 +- .../server/terminus_env_environment.py | 10 +- envs/textarena_env/server/gradio_ui.py | 8 +- .../core/harness/sandbox/hf_backend.py | 8 +- tests/core/test_hf_sandbox_backend.py | 2 +- tests/envs/test_coding_agent_env.py | 8 +- 17 files changed, 178 insertions(+), 75 deletions(-) diff --git a/envs/agent_world_model_env/server/web_ui.py b/envs/agent_world_model_env/server/web_ui.py index 84b10c6b2..09b445d3f 100644 --- a/envs/agent_world_model_env/server/web_ui.py +++ 
b/envs/agent_world_model_env/server/web_ui.py @@ -21,9 +21,7 @@ # Keep in sync with DEFAULT_REWARD_CONFIG in config.py. -_DEFAULT_REWARD_JSON = json.dumps( - DEFAULT_REWARD_CONFIG, indent=2 -) +_DEFAULT_REWARD_JSON = json.dumps(DEFAULT_REWARD_CONFIG, indent=2) def _format_obs_md(payload: dict | None) -> str: diff --git a/envs/chat_env/models.py b/envs/chat_env/models.py index 8bc10f09e..da994cbe3 100644 --- a/envs/chat_env/models.py +++ b/envs/chat_env/models.py @@ -55,7 +55,9 @@ class ChatState(State): """State of the ChatEnvironment containing message history.""" history_messages: list[Message] = Field(default_factory=list) - history_tokens: list[list[int]] = Field(default_factory=list) # Same len as messages + history_tokens: list[list[int]] = Field( + default_factory=list + ) # Same len as messages class ChatObservation(Observation): diff --git a/envs/chat_env/server/chat_environment.py b/envs/chat_env/server/chat_environment.py index 90b2d01f0..f66f3e790 100644 --- a/envs/chat_env/server/chat_environment.py +++ b/envs/chat_env/server/chat_environment.py @@ -90,7 +90,9 @@ def _coerce_tokens(self, tokens) -> list[int]: def _tokenize_conversation(self, conversation: list[Message]) -> list[int]: """Tokenize a conversation with a chat-template fallback for base tokenizers.""" try: - tokens = self.tokenizer.apply_chat_template(conversation=conversation, tokenize=True) + tokens = self.tokenizer.apply_chat_template( + conversation=conversation, tokenize=True + ) except Exception: # Some tokenizers (e.g. gpt2) do not define `chat_template`. 
fallback_text = "".join( diff --git a/envs/coding_agent_env/client.py b/envs/coding_agent_env/client.py index 8c512090d..7e2a21696 100644 --- a/envs/coding_agent_env/client.py +++ b/envs/coding_agent_env/client.py @@ -169,4 +169,3 @@ def _extract_text(result: Any) -> str: return text return str(result) - diff --git a/envs/coding_agent_env/sandbox/build_template.py b/envs/coding_agent_env/sandbox/build_template.py index e22b30185..e1fdac50a 100644 --- a/envs/coding_agent_env/sandbox/build_template.py +++ b/envs/coding_agent_env/sandbox/build_template.py @@ -114,7 +114,7 @@ def main(argv: list[str] | None = None) -> int: p.add_argument( "--name", default="coding-agent-rl", - help="Template name (default: coding-agent-rl)." + help="Template name (default: coding-agent-rl).", ) p.add_argument( "--skip-cache", diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index 3f8eabd13..e389eb759 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -480,7 +480,9 @@ def _build_agent_config( ) provider = ( - "openai" if mode == "transparent_proxy" else self._infer_pi_provider(base_url) + "openai" + if mode == "transparent_proxy" + else self._infer_pi_provider(base_url) ) return _GenericAgentConfig( base_url=base_url.rstrip("/"), @@ -654,4 +656,3 @@ def _safe_read(sandbox: Any, path: str) -> str: return sandbox.read_text(path) or "" except Exception: return "" - diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py index 5497ef0f2..ef3f94aeb 100644 --- a/envs/coding_agent_env/server/gradio_ui.py +++ b/envs/coding_agent_env/server/gradio_ui.py @@ -611,4 +611,3 @@ def apply_preset(name: str) -> tuple[str, str, str]: ) return app - diff --git a/envs/coding_tools_env/server/coding_tools_env_environment.py b/envs/coding_tools_env/server/coding_tools_env_environment.py index 615e7770f..d0ef86675 100644 --- 
a/envs/coding_tools_env/server/coding_tools_env_environment.py +++ b/envs/coding_tools_env/server/coding_tools_env_environment.py @@ -45,16 +45,28 @@ def bash(command: str, timeout: float | None = 30) -> str: return "Error: environment not reset. Call reset() first." timeout_value = 30 if timeout is None else float(timeout) result = self._sandbox.run_shell(command, timeout_s=timeout_value) - self._record("bash", result.ok, result.output, result.error, result.metadata) - return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip() + self._record( + "bash", result.ok, result.output, result.error, result.metadata + ) + return ( + result.output + if result.ok + else f"ERROR: {result.error}\n{result.output}".strip() + ) @mcp.tool - def read(file_path: str, offset: int | None = None, limit: int | None = None) -> str: + def read( + file_path: str, offset: int | None = None, limit: int | None = None + ) -> str: """Read file contents using computer instance.""" if not self._sandbox: return "Error: environment not reset. Call reset() first." - result = self._sandbox.read_file(file_path=file_path, offset=offset, limit=limit) - self._record("read", result.ok, result.output, result.error, result.metadata) + result = self._sandbox.read_file( + file_path=file_path, offset=offset, limit=limit + ) + self._record( + "read", result.ok, result.output, result.error, result.metadata + ) return result.output if result.ok else f"ERROR: {result.error}" @mcp.tool @@ -63,7 +75,9 @@ def write(file_path: str, content: str) -> str: if not self._sandbox: return "Error: environment not reset. Call reset() first." 
result = self._sandbox.write_file(file_path=file_path, content=content) - self._record("write", result.ok, result.output, result.error, result.metadata) + self._record( + "write", result.ok, result.output, result.error, result.metadata + ) return result.output if result.ok else f"ERROR: {result.error}" @mcp.tool @@ -88,10 +102,14 @@ def edit( updated = original.replace(old_string, new_string) else: updated = original.replace(old_string, new_string, 1) - write_result = self._sandbox.write_file(file_path=file_path, content=updated) + write_result = self._sandbox.write_file( + file_path=file_path, content=updated + ) ok = write_result.ok msg = "edit ok" if ok else "" - self._record("edit", ok, msg, write_result.error, {"replace_all": replace_all}) + self._record( + "edit", ok, msg, write_result.error, {"replace_all": replace_all} + ) return msg if ok else f"ERROR: {write_result.error}" @mcp.tool @@ -129,7 +147,11 @@ def multi_edit(file_path: str, edits: list[dict[str, Any]]) -> str: write_result.error, {"applied": applied}, ) - return f"applied {applied} edits" if write_result.ok else f"ERROR: {write_result.error}" + return ( + f"applied {applied} edits" + if write_result.ok + else f"ERROR: {write_result.error}" + ) @mcp.tool def glob(pattern: str, path: str | None = None) -> str: @@ -137,17 +159,27 @@ def glob(pattern: str, path: str | None = None) -> str: if not self._sandbox: return "Error: environment not reset. Call reset() first." 
result = self._sandbox.glob_files(pattern=pattern, path=path) - self._record("glob", result.ok, result.output, result.error, result.metadata) + self._record( + "glob", result.ok, result.output, result.error, result.metadata + ) return result.output if result.ok else f"ERROR: {result.error}" @mcp.tool - def grep(pattern: str, path: str | None = None, include: str | None = None) -> str: + def grep( + pattern: str, path: str | None = None, include: str | None = None + ) -> str: """Search for patterns in files.""" if not self._sandbox: return "Error: environment not reset. Call reset() first." result = self._sandbox.grep(pattern=pattern, path=path, include=include) - self._record("grep", result.ok, result.output, result.error, result.metadata) - return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip() + self._record( + "grep", result.ok, result.output, result.error, result.metadata + ) + return ( + result.output + if result.ok + else f"ERROR: {result.error}\n{result.output}".strip() + ) @mcp.tool def ls(path: str = ".", ignore: list[str] | None = None) -> str: @@ -177,7 +209,9 @@ def todo_write(todos: list[dict[str, Any]]) -> str: self._record("todo_write", False, "", msg, None) return msg self._state.todos = validated - self._record("todo_write", True, f"stored {len(validated)} todos", None, None) + self._record( + "todo_write", True, f"stored {len(validated)} todos", None, None + ) return f"stored {len(validated)} todos" @mcp.tool @@ -281,7 +315,8 @@ def reset( "sandbox_id": self._state.sandbox_id, "message": "Setup command failed.", "setup_results": [ - entry.model_dump() for entry in self._state.setup_results + entry.model_dump() + for entry in self._state.setup_results ], }, ) diff --git a/envs/coding_tools_env/server/e2b_sandbox.py b/envs/coding_tools_env/server/e2b_sandbox.py index 5833c7ecb..d6f77373b 100644 --- a/envs/coding_tools_env/server/e2b_sandbox.py +++ b/envs/coding_tools_env/server/e2b_sandbox.py @@ -94,7 +94,11 @@ def 
read_file( def write_file(self, file_path: str, content: str) -> ToolResult: try: self._sbx.files.write(file_path, content.encode("utf-8")) - return ToolResult(ok=True, output="write ok", metadata={"bytes": len(content.encode("utf-8"))}) + return ToolResult( + ok=True, + output="write ok", + metadata={"bytes": len(content.encode("utf-8"))}, + ) except Exception as exc: return ToolResult(ok=False, error=f"write failed: {exc}") @@ -111,7 +115,9 @@ def glob_files(self, pattern: str, path: str | None = None) -> ToolResult: if result is None: return ToolResult(ok=False, error=_format_error(execution)) matches = result.get("matches", []) - return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches}) + return ToolResult( + ok=True, output="\n".join(matches), metadata={"matches": matches} + ) def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResult: ignore = ignore or [] @@ -137,10 +143,15 @@ def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResu if not result.get("ok", False): return ToolResult(ok=False, error=str(result.get("error", "ls failed"))) items = result.get("items", []) - lines = [f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}" for item in items] + lines = [ + f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}" + for item in items + ] return ToolResult(ok=True, output="\n".join(lines), metadata={"items": items}) - def grep(self, pattern: str, path: str | None = None, include: str | None = None) -> ToolResult: + def grep( + self, pattern: str, path: str | None = None, include: str | None = None + ) -> ToolResult: root = path or "." 
code = ( "from pathlib import Path\n" @@ -173,7 +184,9 @@ def grep(self, pattern: str, path: str | None = None, include: str | None = None if not result.get("ok", False): return ToolResult(ok=False, error=str(result.get("error", "grep failed"))) matches = result.get("matches", []) - return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches}) + return ToolResult( + ok=True, output="\n".join(matches), metadata={"matches": matches} + ) def kill(self) -> None: try: diff --git a/envs/coding_tools_env/server/gradio_ui.py b/envs/coding_tools_env/server/gradio_ui.py index c0d670a99..1f3845141 100644 --- a/envs/coding_tools_env/server/gradio_ui.py +++ b/envs/coding_tools_env/server/gradio_ui.py @@ -105,7 +105,9 @@ def _extract_tool_error(result: dict[str, Any]) -> bool: def _format_status(state: dict[str, Any]) -> str: if not state: - return "**No active session.** Configure setup/verify and click *Reset sandbox*." + return ( + "**No active session.** Configure setup/verify and click *Reset sandbox*." 
+ ) sandbox_id = state.get("sandbox_id") or "—" step_count = state.get("step_count", 0) submitted = state.get("submitted", False) @@ -227,9 +229,9 @@ def state_payload() -> dict[str, Any]: label="edits (JSON array)", language="json", value=( - '[\n' + "[\n" ' {"old_string": "TODO", "new_string": "DONE", "replace_all": false}\n' - ']' + "]" ), lines=8, ) @@ -260,10 +262,10 @@ def state_payload() -> dict[str, Any]: label="todos (JSON array)", language="json", value=( - '[\n' + "[\n" ' {"id":"1","content":"Inspect files",' '"status":"in_progress","priority":"high"}\n' - ']' + "]" ), lines=8, ) @@ -337,23 +339,33 @@ def on_tool_change(tool: str): return [help_md, *updates] tool_dropdown.change( - on_tool_change, inputs=[tool_dropdown], outputs=[tool_help, *group_components] + on_tool_change, + inputs=[tool_dropdown], + outputs=[tool_help, *group_components], ) # ───────── Result rendering helper ───────── - def render_result(tool: str, raw: dict[str, Any]) -> tuple[str, str, str, str, str, list[list[str]]]: + def render_result( + tool: str, raw: dict[str, Any] + ) -> tuple[str, str, str, str, str, list[list[str]]]: text = _extract_tool_text(raw) - is_error = _extract_tool_error(raw) or text.startswith("ERROR:") or text.startswith("Error:") + is_error = ( + _extract_tool_error(raw) + or text.startswith("ERROR:") + or text.startswith("Error:") + ) badge = "❌ error" if is_error else "✅ ok" status_line = f"**{tool}** — {badge}" state = state_payload() return ( - status_line, # output_status - text, # output_view - json.dumps(raw, indent=2), # raw_response - _format_status(state), # state_summary (top + summary panel — same content) + status_line, # output_status + text, # output_view + json.dumps(raw, indent=2), # raw_response + _format_status( + state + ), # state_summary (top + summary panel — same content) json.dumps(state, indent=2, default=str), # state_json - _format_history(state), # history_table + _format_history(state), # history_table ) # ───────── Session 
handlers ───────── @@ -398,21 +410,33 @@ async def on_close(): async def on_run( tool: str, # bash - bash_command: str, bash_timeout: float, + bash_command: str, + bash_timeout: float, # read - read_path: str, read_offset: float | None, read_limit: float | None, + read_path: str, + read_offset: float | None, + read_limit: float | None, # write - write_path: str, write_content: str, + write_path: str, + write_content: str, # edit - edit_path: str, edit_old: str, edit_new: str, edit_replace_all: bool, + edit_path: str, + edit_old: str, + edit_new: str, + edit_replace_all: bool, # multi_edit - multi_edit_path: str, multi_edit_json: str, + multi_edit_path: str, + multi_edit_json: str, # glob - glob_pattern: str, glob_path: str, + glob_pattern: str, + glob_path: str, # grep - grep_pattern: str, grep_path: str, grep_include: str, + grep_pattern: str, + grep_path: str, + grep_include: str, # ls - ls_path: str, ls_ignore: str, + ls_path: str, + ls_ignore: str, # todo_write todo_json: str, ): @@ -493,14 +517,26 @@ async def on_run( # ───────── Wire up events ───────── all_inputs = [ tool_dropdown, - bash_command, bash_timeout, - read_path, read_offset, read_limit, - write_path, write_content, - edit_path, edit_old, edit_new, edit_replace_all, - multi_edit_path, multi_edit_json, - glob_pattern, glob_path, - grep_pattern, grep_path, grep_include, - ls_path, ls_ignore, + bash_command, + bash_timeout, + read_path, + read_offset, + read_limit, + write_path, + write_content, + edit_path, + edit_old, + edit_new, + edit_replace_all, + multi_edit_path, + multi_edit_json, + glob_pattern, + glob_path, + grep_pattern, + grep_path, + grep_include, + ls_path, + ls_ignore, todo_json, ] all_outputs = [ diff --git a/envs/jupyter_env/server/jupyter_environment.py b/envs/jupyter_env/server/jupyter_environment.py index bc622ae22..b7902e5d2 100644 --- a/envs/jupyter_env/server/jupyter_environment.py +++ b/envs/jupyter_env/server/jupyter_environment.py @@ -348,7 +348,10 @@ def step( ) -> 
Observation: self._state.step_count += 1 obs = super().step(action, timeout_s=timeout_s, **kwargs) - if self._state.submitted_answer is not None and self._state.last_reward is not None: + if ( + self._state.submitted_answer is not None + and self._state.last_reward is not None + ): obs.done = True obs.reward = self._state.last_reward return obs @@ -361,7 +364,10 @@ async def step_async( ) -> Observation: self._state.step_count += 1 obs = await super().step_async(action, timeout_s=timeout_s, **kwargs) - if self._state.submitted_answer is not None and self._state.last_reward is not None: + if ( + self._state.submitted_answer is not None + and self._state.last_reward is not None + ): obs.done = True obs.reward = self._state.last_reward return obs diff --git a/envs/repl_env/server/repl_environment.py b/envs/repl_env/server/repl_environment.py index f2e6f5d98..13a759c29 100644 --- a/envs/repl_env/server/repl_environment.py +++ b/envs/repl_env/server/repl_environment.py @@ -272,9 +272,7 @@ def reset( # reset() are treated as equal and don't trigger a redundant rebuild. 
resolved_model = self._resolve_model(llm_model) has_runtime_llm = self._runtime_controller is not None - model_changed = ( - has_runtime_llm and resolved_model != self._current_llm_model - ) + model_changed = has_runtime_llm and resolved_model != self._current_llm_model token_provided = hf_token is not None if not self.llm_query_fn or model_changed or token_provided: effective_token = ( diff --git a/envs/terminus_env/server/terminus_env_environment.py b/envs/terminus_env/server/terminus_env_environment.py index c6f9e1c02..03de18baa 100644 --- a/envs/terminus_env/server/terminus_env_environment.py +++ b/envs/terminus_env/server/terminus_env_environment.py @@ -183,7 +183,10 @@ def step( ) -> Observation: self._state.step_count += 1 obs = super().step(action, timeout_s=timeout_s, **kwargs) - if self._state.submitted_answer is not None and self._state.last_reward is not None: + if ( + self._state.submitted_answer is not None + and self._state.last_reward is not None + ): obs.done = True obs.reward = self._state.last_reward return obs @@ -196,7 +199,10 @@ async def step_async( ) -> Observation: self._state.step_count += 1 obs = await super().step_async(action, timeout_s=timeout_s, **kwargs) - if self._state.submitted_answer is not None and self._state.last_reward is not None: + if ( + self._state.submitted_answer is not None + and self._state.last_reward is not None + ): obs.done = True obs.reward = self._state.last_reward return obs diff --git a/envs/textarena_env/server/gradio_ui.py b/envs/textarena_env/server/gradio_ui.py index 45728fc00..c9bb88cae 100644 --- a/envs/textarena_env/server/gradio_ui.py +++ b/envs/textarena_env/server/gradio_ui.py @@ -71,7 +71,9 @@ def _sudoku_demo_html() -> str: for col in range(9): value = givens.get((row, col), "") border_right = "3px solid #0f172a" if col in {2, 5} else "1px solid #94a3b8" - border_bottom = "3px solid #0f172a" if row in {2, 5} else "1px solid #94a3b8" + border_bottom = ( + "3px solid #0f172a" if row in {2, 5} else 
"1px solid #94a3b8" + ) background = "#e2e8f0" if value else "#ffffff" cells.append( f""" @@ -82,7 +84,7 @@ def _sudoku_demo_html() -> str: align-items: center; justify-content: center; font-size: 1.1rem; - font-weight: {'700' if value else '400'}; + font-weight: {"700" if value else "400"}; color: #0f172a; background: {background}; border-right: {border_right}; @@ -105,7 +107,7 @@ def _sudoku_demo_html() -> str: border: 3px solid #0f172a; background: #ffffff; "> - {''.join(cells)} + {"".join(cells)}

Use the Playground tab to reset the game and submit moves in the diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py index 410a8daea..bb41356e2 100644 --- a/src/openenv/core/harness/sandbox/hf_backend.py +++ b/src/openenv/core/harness/sandbox/hf_backend.py @@ -163,7 +163,9 @@ def start_bg( pid = _parse_pid(result.stdout) if pid is None: - raise RuntimeError(f"Could not extract PID from start_bg output: {result.stdout!r}") + raise RuntimeError( + f"Could not extract PID from start_bg output: {result.stdout!r}" + ) job = HFBgJob(self, pid=pid, marker_path=marker_path) self._bg_jobs.append(job) @@ -174,7 +176,9 @@ def write_text(self, path: str, content: str) -> None: if parent not in ("", "/"): r = self.exec(f"mkdir -p {_shell_quote(parent)}", timeout=10) if r.exit_code != 0: - raise RuntimeError(f"Failed to create parent directory {parent!r}: {r.stderr}") + raise RuntimeError( + f"Failed to create parent directory {parent!r}: {r.stderr}" + ) self._sbx.write_file(path, content) def read_text(self, path: str) -> str: diff --git a/tests/core/test_hf_sandbox_backend.py b/tests/core/test_hf_sandbox_backend.py index d301b2b2b..cd235c748 100644 --- a/tests/core/test_hf_sandbox_backend.py +++ b/tests/core/test_hf_sandbox_backend.py @@ -210,8 +210,8 @@ def test_create_exec_write_read_exists_bg_and_kill(self, monkeypatch): def test_factory_creates_hf_backend(self, monkeypatch): _install_fake_hf_sandbox(monkeypatch) - import openenv.core.harness.sandbox.hf_backend as hf_backend import openenv.core.harness.sandbox as sandbox_pkg + import openenv.core.harness.sandbox.hf_backend as hf_backend importlib.reload(hf_backend) importlib.reload(sandbox_pkg) diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py index ec1f66fa5..3a89a3ce6 100644 --- a/tests/envs/test_coding_agent_env.py +++ b/tests/envs/test_coding_agent_env.py @@ -46,14 +46,14 @@ def test_public_api_imports() -> None: """Top-level 
package re-exports the documented surface.""" from coding_agent_env import ( # noqa: F401 - CommandResult, - E2BSandboxBackend, CodingAgentConfig, CodingAgentEnv, CodingAgentSession, CodingAgentSessionFactory, CodingAgentState, CodingAgentTask, + CommandResult, + E2BSandboxBackend, Provider, RolloutResult, RolloutTurn, @@ -267,7 +267,9 @@ def test_coding_agent_task_coerce_str() -> None: def test_coding_agent_task_coerce_dict() -> None: from coding_agent_env import CodingAgentTask - t = CodingAgentTask.coerce({"instruction": "x", "setup_shell": "pip install pandas"}) + t = CodingAgentTask.coerce( + {"instruction": "x", "setup_shell": "pip install pandas"} + ) assert t.instruction == "x" assert t.setup_shell == "pip install pandas" From 2f9435cb5d8936567cad5cdc272a12cb16668963 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Fri, 15 May 2026 23:35:31 +0530 Subject: [PATCH 11/35] refactor: remove transparent_proxy mode and in-sandbox interception proxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the transparent proxy was a passive forwarder that captured logprobs by injecting logprobs=true into upstream requests. It is replaced by the interception_gate mode where the trainer owns the forward pass entirely — no proxy needed inside the sandbox. 
--- envs/coding_agent_env/config.py | 13 - .../sandbox/build_template.py | 73 +- src/openenv/core/harness/agents/base.py | 3 - src/openenv/core/harness/agents/opencode.py | 1 - src/openenv/core/harness/agents/pi.py | 1 - .../core/harness/sandbox/interception.py | 660 ------------------ 6 files changed, 3 insertions(+), 748 deletions(-) delete mode 100644 src/openenv/core/harness/sandbox/interception.py diff --git a/envs/coding_agent_env/config.py b/envs/coding_agent_env/config.py index 2eac8d16f..b3243253e 100644 --- a/envs/coding_agent_env/config.py +++ b/envs/coding_agent_env/config.py @@ -51,19 +51,6 @@ class CodingAgentConfig(BaseModel): # ``/home/user``. Override when using a root-privileged backend (Docker). sandbox_home: str = "/home/user" - # --- Transparent-proxy tuning -------------------------------------------- - # Cap ``max_tokens`` / ``max_completion_tokens`` on forwarded requests. - # OpenCode defaults to a very large number (~32000) which exceeds some - # provider limits (e.g. gpt-4o-mini = 16384). Only used in - # ``mode="transparent_proxy"``. ``None`` disables the cap. - proxy_max_tokens_cap: int | None = 16384 - # Per-turn top-k logprobs the proxy requests from the upstream. - proxy_top_logprobs: int = 5 - # Disable reasoning/thinking mode for Qwen3 / Qwen3.5 models. Proxy sets - # ``extra_body.chat_template_kwargs.enable_thinking=false`` on forwarded - # requests. Ignored by providers that don't support the field. - proxy_disable_thinking: bool = False - _PROVIDER_NPM = { "openai_compatible": "@ai-sdk/openai-compatible", diff --git a/envs/coding_agent_env/sandbox/build_template.py b/envs/coding_agent_env/sandbox/build_template.py index e1fdac50a..01978b520 100644 --- a/envs/coding_agent_env/sandbox/build_template.py +++ b/envs/coding_agent_env/sandbox/build_template.py @@ -4,35 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-"""Build a pre-baked E2B template with opencode + proxy deps already installed. - -Run-time per rollout drops from ~3 min (cold install) to ~30s once the -template is built, because we skip: - - - ``curl https://opencode.ai/install | bash`` (~30-90s) - - ``pip install fastapi uvicorn httpx`` (~30-60s) - - directory layout setup - - copying the proxy source - -The template ships: - - - opencode CLI at ``/home/user/.opencode/bin/opencode`` - - Python deps for the in-sandbox proxy - - The proxy source at ``/home/user/proxy/interception.py`` - - Pre-created dirs: ``~/.config/opencode``, ``~/logs/{agent,verifier}``, - ``~/task``, ``~/workdir``, ``~/proxy`` - - Default workdir: ``/home/user/workdir`` - -Usage:: - - .venv/bin/python envs/coding_agent_env/sandbox/build_template.py - # → builds (or rebuilds) ``coding-agent-rl`` template, prints template id - -Then rollout tests can use it via ``--template coding-agent-rl``. - -Requires ``E2B_API_KEY`` in the environment. First build is ~3-8 min; -subsequent builds reuse the cache and can finish in <60s. -""" +"""Build a pre-baked E2B template with opencode already installed.""" from __future__ import annotations @@ -43,11 +15,7 @@ from e2b import default_build_logger, Template - _REPO_ROOT = Path(__file__).resolve().parents[3] -_PROXY_SOURCE = ( - _REPO_ROOT / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py" -) def _load_env(path: Path) -> None: @@ -65,25 +33,9 @@ def _load_env(path: Path) -> None: def build_template(name: str, *, skip_cache: bool = False) -> str: - if not _PROXY_SOURCE.exists(): - raise RuntimeError(f"proxy source missing at {_PROXY_SOURCE}") - - # Template.copy() resolves relative paths against the caller's source - # file directory. This script lives next to ``interception.py`` so the - # bare filename works. - - # Stage 1 (root): system-wide pip deps for the proxy. - # Stage 2 (user): opencode install + dir layout + proxy copy. 
template = ( Template() .from_python_image("3.12") - .pip_install( - [ - "fastapi>=0.104", - "uvicorn[standard]>=0.24", - "httpx>=0.27", - ] - ) .set_user("user") .run_cmd("curl -fsSL https://opencode.ai/install | bash") .run_cmd("/home/user/.opencode/bin/opencode --version") @@ -92,13 +44,10 @@ def build_template(name: str, *, skip_cache: bool = False) -> str: .make_dir("/home/user/logs/verifier") .make_dir("/home/user/task") .make_dir("/home/user/workdir") - .make_dir("/home/user/proxy") - .copy(str(_PROXY_SOURCE), "/home/user/proxy/interception.py") .set_workdir("/home/user/workdir") ) if skip_cache: template = template.skip_cache() - info = Template.build( template, name, @@ -111,31 +60,15 @@ def build_template(name: str, *, skip_cache: bool = False) -> str: def main(argv: list[str] | None = None) -> int: p = argparse.ArgumentParser(prog="build_e2b_template") - p.add_argument( - "--name", - default="coding-agent-rl", - help="Template name (default: coding-agent-rl).", - ) - p.add_argument( - "--skip-cache", - action="store_true", - help="Force a clean rebuild, ignoring cache.", - ) + p.add_argument("--name", default="coding-agent-rl") + p.add_argument("--skip-cache", action="store_true") args = p.parse_args(argv) - _load_env(_REPO_ROOT / "envs" / "coding_agent_env" / "sandbox" / ".env") if not os.environ.get("E2B_API_KEY"): print("ERROR: E2B_API_KEY required.", file=sys.stderr) return 2 - - print(f"Building template '{args.name}' (proxy source: {_PROXY_SOURCE})") - print(f"Skip cache: {args.skip_cache}") - print() - template_id = build_template(args.name, skip_cache=args.skip_cache) - print() print(f"Built. 
Template id/name: {template_id}") - print(f"Use in code: Sandbox.create(template='{args.name}')") return 0 diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py index 72cc9a6cf..ded9ba3b8 100644 --- a/src/openenv/core/harness/agents/base.py +++ b/src/openenv/core/harness/agents/base.py @@ -174,9 +174,6 @@ class CLIAgentSpec: mcp_config: MCPConfigSpec """How MCP tool configuration is injected.""" - supports_logprob_proxy: bool = True - """Whether this agent can be routed through the interception proxy.""" - default_timeout_s: float = 600.0 """Default per-rollout timeout in seconds.""" diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py index d0146b008..13c17fa04 100644 --- a/src/openenv/core/harness/agents/opencode.py +++ b/src/openenv/core/harness/agents/opencode.py @@ -177,7 +177,6 @@ def _system_prompt_content(task: Any, config: Any) -> str | None: method="config_file", path_template="{home}/.config/opencode/opencode.json", ), - supports_logprob_proxy=True, default_timeout_s=900.0, setup=( "set -e && " diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py index 63e2eb0c3..d7b60569f 100644 --- a/src/openenv/core/harness/agents/pi.py +++ b/src/openenv/core/harness/agents/pi.py @@ -111,7 +111,6 @@ def _parse_events(line: str) -> AgentEvent | None: method="config_file", path_template="{workdir}/.mcp.json", ), - supports_logprob_proxy=True, default_timeout_s=600.0, setup=( "set -e && " diff --git a/src/openenv/core/harness/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py deleted file mode 100644 index 4e7c857ac..000000000 --- a/src/openenv/core/harness/sandbox/interception.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -"""Transparent OpenAI-compatible forwarding proxy with logprob capture. - -The proxy is a small FastAPI app that agent CLIs (OpenCode, Claude Code, -Codex, Pi, etc.) talk to instead of the upstream LLM endpoint. It: - -1. Forwards every ``POST /v1/chat/completions`` request to the real upstream - URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream - returns per-token logprobs. -2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines - trace file. -3. Returns the upstream response to the agent verbatim (minus the ``logprobs`` - field, which we strip so the CLI never sees anything unexpected). - -The proxy is stateless beyond the trace file. One proxy instance runs per -session, normally inside the sandbox on ``localhost:7000``. - -Run standalone:: - - UPSTREAM_API_KEY=... python -m openenv.core.harness.sandbox.interception \\ - --upstream-url https://vllm.example/v1 \\ - --trace /tmp/trace.jsonl \\ - --port 7000 -""" - -from __future__ import annotations - -import argparse -import asyncio -import copy -import json -import logging -import os -import socket -import threading -import time -from contextlib import asynccontextmanager, closing -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any - -import httpx -import uvicorn -from fastapi import FastAPI, Request, Response -from fastapi.responses import JSONResponse, StreamingResponse - - -CHAT_COMPLETIONS_PATH = "/v1/chat/completions" -_LOG = logging.getLogger(__name__) - - -@dataclass -class ProxyConfig: - """Runtime configuration for one :class:`InterceptionProxy`.""" - - upstream_url: str - upstream_api_key: str = "intercepted" - trace_path: str = "/tmp/opencode-proxy-trace.jsonl" - host: str = "127.0.0.1" - port: int = 7000 - top_logprobs: int = 5 - request_timeout_s: float = 600.0 - # Cap ``max_tokens`` before forwarding. OpenCode historically asks for very - # large values (e.g. 
32000) that exceed gpt-4o-mini's 16384 cap; capping - # here avoids spurious upstream 400s without requiring the caller to know - # per-model limits. - max_tokens_cap: int | None = 16384 - # Disable Qwen-style reasoning/thinking by injecting - # ``chat_template_kwargs.enable_thinking=false`` into forwarded requests. - disable_thinking: bool = False - # Override the ``model`` field on every forwarded request. Some opencode - # builds emit a stripped model id (e.g. ``Qwen3.5-4B`` instead of the - # ``Qwen/Qwen3.5-4B`` the upstream serves) for their internal - # title-generation call. Setting this to the exact upstream model id - # bypasses that mismatch. - model_override: str | None = None - - -@dataclass -class TurnRecord: - """One intercepted turn, written to the trace file as JSON-lines.""" - - turn: int - request: dict[str, Any] - response: dict[str, Any] - logprobs: list[dict[str, Any]] | None - completion_tokens: list[str] - completion_token_ids: list[int] - per_token_logps: list[float] - finish_reason: str | None - latency_s: float - timestamp: float = field(default_factory=time.time) - - def to_json(self) -> str: - return json.dumps(self.__dict__, default=str) - - -def _build_app(cfg: ProxyConfig) -> FastAPI: - """Construct the FastAPI app that serves one proxy session.""" - - state: dict[str, Any] = {"turn": 0, "lock": asyncio.Lock()} - - # HTTP client reused across requests. ``None`` auth header — we let each - # request carry its own ``Authorization`` populated from ``upstream_api_key``. 
- client = httpx.AsyncClient(timeout=cfg.request_timeout_s) - trace_file = open(cfg.trace_path, "a", buffering=1) - - @asynccontextmanager - async def lifespan(_: FastAPI) -> Any: - try: - yield - finally: - await client.aclose() - trace_file.close() - - app = FastAPI(title="opencode-interception-proxy", lifespan=lifespan) - - @app.get("/healthz") - def healthz() -> dict[str, str]: - return {"status": "ok"} - - @app.post(CHAT_COMPLETIONS_PATH) - async def chat_completions(request: Request) -> Response: - raw_body = await request.body() - try: - body = json.loads(raw_body) - except json.JSONDecodeError: - return JSONResponse(status_code=400, content={"error": "invalid json body"}) - - forwarded_body = _prepare_forwarded_body(body, cfg) - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {cfg.upstream_api_key}", - } - upstream_url = _resolve_upstream_url(cfg.upstream_url) - - async with state["lock"]: - state["turn"] += 1 - turn_idx = state["turn"] - - if forwarded_body.get("stream"): - return await _proxy_streaming( - client=client, - upstream_url=upstream_url, - headers=headers, - forwarded_body=forwarded_body, - original_body=body, - trace_file=trace_file, - turn_idx=turn_idx, - ) - return await _proxy_unary( - client=client, - upstream_url=upstream_url, - headers=headers, - forwarded_body=forwarded_body, - original_body=body, - trace_file=trace_file, - turn_idx=turn_idx, - ) - - return app - - -def _prepare_forwarded_body(body: dict[str, Any], cfg: ProxyConfig) -> dict[str, Any]: - """Return the body we actually send upstream. - - - Injects ``logprobs=true`` + ``top_logprobs`` so the upstream emits - per-token logprobs. - - Caps ``max_tokens`` / ``max_completion_tokens`` to ``max_tokens_cap``. - - For models that reject ``max_tokens`` (e.g. gpt-5.x), translates to - ``max_completion_tokens``. 
- """ - forwarded = copy.deepcopy(body) - forwarded.setdefault("logprobs", True) - forwarded.setdefault("top_logprobs", cfg.top_logprobs) - - # GPT-5.x and newer: ``max_tokens`` is rejected; must use - # ``max_completion_tokens``. Detect via model string so we don't break - # gpt-4.x or vLLM-hosted models that accept ``max_tokens``. - model = str(forwarded.get("model", "")) - needs_translation = _model_uses_max_completion_tokens(model) - if needs_translation and "max_tokens" in forwarded: - value = forwarded.pop("max_tokens") - forwarded.setdefault("max_completion_tokens", value) - - if cfg.max_tokens_cap is not None: - for key in ("max_tokens", "max_completion_tokens"): - value = forwarded.get(key) - if isinstance(value, int) and value > cfg.max_tokens_cap: - forwarded[key] = cfg.max_tokens_cap - - if cfg.disable_thinking: - # vLLM applies chat_template_kwargs to the tokenizer's chat template - # for Qwen3/Qwen3.5 models, turning off ... generation. - extra = forwarded.setdefault("chat_template_kwargs", {}) - extra.setdefault("enable_thinking", False) - - if cfg.model_override: - forwarded["model"] = cfg.model_override - - return forwarded - - -def _model_uses_max_completion_tokens(model: str) -> bool: - """Heuristic: ``True`` for models that reject ``max_tokens``.""" - # Strip a provider prefix opencode may have prepended (e.g. "intercepted/"). 
- bare = model.split("/", 1)[-1].lower() - return bare.startswith(("gpt-5", "o1", "o3", "o4")) - - -def _resolve_upstream_url(upstream: str) -> str: - """Build the fully qualified chat-completions URL from a base URL.""" - base = upstream.rstrip("/") - if base.endswith("/v1"): - return f"{base}/chat/completions" - return f"{base}{CHAT_COMPLETIONS_PATH}" - - -async def _proxy_unary( - *, - client: httpx.AsyncClient, - upstream_url: str, - headers: dict[str, str], - forwarded_body: dict[str, Any], - original_body: dict[str, Any], - trace_file: Any, - turn_idx: int, -) -> Response: - start = time.time() - upstream_response = await client.post( - upstream_url, content=json.dumps(forwarded_body), headers=headers - ) - latency = time.time() - start - try: - response_json = upstream_response.json() - except Exception: - return Response( - content=upstream_response.content, - status_code=upstream_response.status_code, - media_type=upstream_response.headers.get( - "content-type", "application/json" - ), - ) - - record = _build_turn_record( - turn_idx=turn_idx, - request_body=forwarded_body, - response_json=response_json, - latency_s=latency, - ) - trace_file.write(record.to_json() + "\n") - sanitized = _strip_logprobs(response_json) - return JSONResponse(content=sanitized, status_code=upstream_response.status_code) - - -async def _proxy_streaming( - *, - client: httpx.AsyncClient, - upstream_url: str, - headers: dict[str, str], - forwarded_body: dict[str, Any], - original_body: dict[str, Any], - trace_file: Any, - turn_idx: int, -) -> Response: - """Forward an SSE stream while accumulating the full response. - - Opens the upstream stream and inspects the status. On non-2xx, reads the - full body (an error JSON, not SSE) and returns it to the caller as a - regular JSON response — previously we silently emitted an empty - ``text/event-stream`` which opencode interpreted as an empty assistant - turn. 
Both the error body and the latency are written to the trace file - so debugging a broken rollout doesn't require another round-trip. - """ - - start = time.time() - - # Open the stream outside the generator so we can branch on status before - # committing to a streaming response shape. - upstream_cm = client.stream( - "POST", - upstream_url, - content=json.dumps(forwarded_body), - headers=headers, - ) - upstream = await upstream_cm.__aenter__() - - if upstream.status_code >= 400: - # Upstream responded with an error body (not SSE). Read it fully and - # return as a non-streaming JSON payload. - error_bytes = await upstream.aread() - await upstream_cm.__aexit__(None, None, None) - latency = time.time() - start - try: - error_json = json.loads(error_bytes.decode() or "{}") - except Exception: - error_json = {"error": error_bytes.decode(errors="replace")[:4000]} - record = _build_turn_record( - turn_idx=turn_idx, - request_body=forwarded_body, - response_json={ - "choices": [], - "usage": None, - "upstream_status": upstream.status_code, - "upstream_error": error_json, - }, - latency_s=latency, - ) - trace_file.write(record.to_json() + "\n") - _LOG.warning( - "proxy turn %s: upstream %s: %s", - turn_idx, - upstream.status_code, - str(error_json)[:400], - ) - return JSONResponse(content=error_json, status_code=upstream.status_code) - - async def _stream() -> Any: - accumulated: dict[str, Any] = { - "content_by_idx": {}, - "tool_calls_by_idx": {}, - "finish_by_idx": {}, - "logprobs_by_idx": {}, - } - last_chunk: dict[str, Any] = {} - try: - async for line in upstream.aiter_lines(): - if not line: - yield "\n" - continue - yield line + "\n" - if not line.startswith("data:"): - continue - data = line[len("data:") :].strip() - if data == "[DONE]": - continue - try: - chunk = json.loads(data) - except json.JSONDecodeError: - continue - last_chunk = chunk - _accumulate_stream_chunk(chunk, accumulated) - finally: - await upstream_cm.__aexit__(None, None, None) - - latency = 
time.time() - start - response_json = _assemble_streamed_response(last_chunk, accumulated) - record = _build_turn_record( - turn_idx=turn_idx, - request_body=forwarded_body, - response_json=response_json, - latency_s=latency, - ) - trace_file.write(record.to_json() + "\n") - - return StreamingResponse(_stream(), media_type="text/event-stream") - - -def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None: - for choice in chunk.get("choices", []) or []: - idx = choice.get("index", 0) - delta = choice.get("delta") or {} - content = delta.get("content") - if content: - acc["content_by_idx"].setdefault(idx, []).append(content) - # HF-Router's Qwen thinking mode streams the chain-of-thought under a - # separate ``reasoning`` field (per Together/Scaleway). Accumulate it - # so the assembled response surfaces it — otherwise it's dropped and - # proxy_turn observability is lost for thinking-mode rollouts. - reasoning = delta.get("reasoning") - if reasoning: - acc.setdefault("reasoning_by_idx", {}).setdefault(idx, []).append(reasoning) - for tc in delta.get("tool_calls") or []: - tc_idx = tc.get("index", 0) - bucket = acc["tool_calls_by_idx"].setdefault( - (idx, tc_idx), - { - "id": None, - "type": "function", - "function": {"name": "", "arguments": ""}, - }, - ) - if tc.get("id"): - bucket["id"] = tc["id"] - fn = tc.get("function") or {} - if fn.get("name"): - bucket["function"]["name"] += fn["name"] - if fn.get("arguments"): - bucket["function"]["arguments"] += fn["arguments"] - if choice.get("finish_reason"): - acc["finish_by_idx"][idx] = choice["finish_reason"] - lp = choice.get("logprobs") or {} - content_lp = lp.get("content") - if content_lp: - acc["logprobs_by_idx"].setdefault(idx, []).extend(content_lp) - - -def _assemble_streamed_response( - last_chunk: dict[str, Any], acc: dict[str, Any] -) -> dict[str, Any]: - indices = sorted( - set(acc["content_by_idx"]) - | set(acc["finish_by_idx"]) - | {k[0] for k in acc["tool_calls_by_idx"]} - | 
set(acc["logprobs_by_idx"]) - | {0} - ) - choices: list[dict[str, Any]] = [] - for idx in indices: - tool_calls = [ - acc["tool_calls_by_idx"][k] - for k in sorted(acc["tool_calls_by_idx"]) - if k[0] == idx - ] - message: dict[str, Any] = {"role": "assistant"} - content = "".join(acc["content_by_idx"].get(idx, [])) - if content: - message["content"] = content - reasoning = "".join((acc.get("reasoning_by_idx") or {}).get(idx, [])) - if reasoning: - message["reasoning"] = reasoning - if tool_calls: - message["tool_calls"] = tool_calls - choice: dict[str, Any] = { - "index": idx, - "message": message, - "finish_reason": acc["finish_by_idx"].get(idx), - } - if acc["logprobs_by_idx"].get(idx): - choice["logprobs"] = {"content": acc["logprobs_by_idx"][idx]} - choices.append(choice) - return { - "id": last_chunk.get("id", ""), - "object": "chat.completion", - "model": last_chunk.get("model", ""), - "choices": choices, - "usage": last_chunk.get("usage"), - } - - -def _build_turn_record( - *, - turn_idx: int, - request_body: dict[str, Any], - response_json: dict[str, Any], - latency_s: float, -) -> TurnRecord: - """Extract per-token logprobs into a normalized :class:`TurnRecord`.""" - - choice = (response_json.get("choices") or [{}])[0] - logprobs_field = choice.get("logprobs") or {} - content_lp = logprobs_field.get("content") or [] - - tokens: list[str] = [] - token_ids: list[int] = [] - per_token_logps: list[float] = [] - for entry in content_lp: - tokens.append(entry.get("token", "")) - # OpenAI returns no raw token ids; vLLM returns them as ``token_id``. 
- token_id = entry.get("token_id") - if token_id is not None: - token_ids.append(int(token_id)) - lp = entry.get("logprob") - if lp is not None: - per_token_logps.append(float(lp)) - - return TurnRecord( - turn=turn_idx, - request=request_body, - response=response_json, - logprobs=content_lp, - completion_tokens=tokens, - completion_token_ids=token_ids, - per_token_logps=per_token_logps, - finish_reason=choice.get("finish_reason"), - latency_s=latency_s, - ) - - -def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]: - """Return a copy of the response with ``choices[*].logprobs`` removed.""" - - out = dict(response_json) - choices = out.get("choices") - if isinstance(choices, list): - out["choices"] = [ - {k: v for k, v in (ch or {}).items() if k != "logprobs"} for ch in choices - ] - return out - - -# --------------------------------------------------------------------------- -# Standalone runner (used inside the sandbox) -# --------------------------------------------------------------------------- - - -def serve(cfg: ProxyConfig) -> None: - """Start the proxy and block (for use as the sandbox-side entry point).""" - - app = _build_app(cfg) - uvicorn.run(app, host=cfg.host, port=cfg.port, log_level="warning") - - -class InterceptionProxy: - """Thread-backed controller for running the proxy locally. - - Used by unit tests and by any in-process driver that wants a short-lived - proxy on the local machine. Inside a sandbox we invoke :func:`serve` - directly via ``python -m openenv.core.harness.sandbox.interception``. 
- """ - - def __init__(self, cfg: ProxyConfig) -> None: - self._cfg = cfg - self._server: uvicorn.Server | None = None - self._thread: threading.Thread | None = None - self._ready = threading.Event() - - @property - def url(self) -> str: - return f"http://{self._cfg.host}:{self._cfg.port}/v1" - - @property - def config(self) -> ProxyConfig: - return self._cfg - - def start(self) -> None: - app = _build_app(self._cfg) - config = uvicorn.Config( - app, - host=self._cfg.host, - port=self._cfg.port, - log_level="warning", - lifespan="on", - ) - self._server = uvicorn.Server(config) - self._thread = threading.Thread(target=self._run_server, daemon=True) - self._thread.start() - # Wait for the server to accept connections. - deadline = time.time() + 10 - while time.time() < deadline: - if _port_open(self._cfg.host, self._cfg.port): - self._ready.set() - return - time.sleep(0.05) - raise RuntimeError("InterceptionProxy failed to start within 10s") - - def _run_server(self) -> None: - assert self._server is not None - self._server.run() - - def stop(self) -> None: - if self._server is None: - return - self._server.should_exit = True - if self._thread is not None: - self._thread.join(timeout=5) - self._server = None - self._thread = None - - def __enter__(self) -> "InterceptionProxy": - self.start() - return self - - def __exit__(self, *exc) -> None: - self.stop() - - -def _port_open(host: str, port: int) -> bool: - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.settimeout(0.2) - return s.connect_ex((host, port)) == 0 - - -# --------------------------------------------------------------------------- -# Trace reader (used by the session to pull captured turns back) -# --------------------------------------------------------------------------- - - -def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]: - """Read a proxy trace file into a list of dicts.""" - - trace: list[dict[str, Any]] = [] - p = Path(path) - if not p.exists(): - return 
trace - for line in p.read_text().splitlines(): - line = line.strip() - if not line: - continue - trace.append(json.loads(line)) - return trace - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - - -def main() -> None: - parser = argparse.ArgumentParser(prog="openenv.core.harness.sandbox.interception") - parser.add_argument("--upstream-url", required=True) - parser.add_argument( - "--upstream-api-key", - default=None, - help=( - "Upstream API key. Prefer OPENCODE_UPSTREAM_API_KEY so the key " - "does not appear in process argv." - ), - ) - parser.add_argument("--trace", default="/tmp/opencode-proxy-trace.jsonl") - parser.add_argument("--host", default="127.0.0.1") - parser.add_argument("--port", type=int, default=7000) - parser.add_argument("--top-logprobs", type=int, default=5) - parser.add_argument("--request-timeout", type=float, default=600.0) - parser.add_argument( - "--max-tokens-cap", - type=int, - default=None, - help="Clamp max_tokens/max_completion_tokens on forwarded requests.", - ) - parser.add_argument( - "--disable-thinking", - action="store_true", - help="Inject chat_template_kwargs.enable_thinking=false (Qwen3/Qwen3.5).", - ) - parser.add_argument( - "--model-override", - default=None, - help="Rewrite the `model` field on every forwarded request.", - ) - args = parser.parse_args() - upstream_api_key = ( - args.upstream_api_key - or os.environ.get("OPENCODE_UPSTREAM_API_KEY") - or os.environ.get("UPSTREAM_API_KEY") - or "intercepted" - ) - - cfg = ProxyConfig( - upstream_url=args.upstream_url, - upstream_api_key=upstream_api_key, - trace_path=args.trace, - host=args.host, - port=args.port, - top_logprobs=args.top_logprobs, - request_timeout_s=args.request_timeout, - max_tokens_cap=args.max_tokens_cap, - disable_thinking=args.disable_thinking, - model_override=args.model_override, - ) - serve(cfg) - - -if __name__ == 
"__main__": - main() From 71bd9e96326d2618ba3c28646d803faa4ec4f9b3 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Fri, 15 May 2026 23:35:44 +0530 Subject: [PATCH 12/35] feat: InterceptionServer + interception_gate mode for trainer-owned generation InterceptionServer (aiohttp) runs on the trainer host. Each rollout registers a queue. The agent's OPENAI_BASE_URL points at `{base_url}/rollout/{id}/v1`. When the agent makes an LLM call it blocks at the server. The training loop dequeues the request, calls vLLM with logprobs=True and return_token_ids=True, and delivers the response back via deliver_response(). --- pyproject.toml | 1 + src/openenv/core/harness/agents/__init__.py | 4 + src/openenv/core/harness/agents/cli_driver.py | 444 +++++------------- .../harness/agents/interception_server.py | 324 +++++++++++++ 4 files changed, 445 insertions(+), 328 deletions(-) create mode 100644 src/openenv/core/harness/agents/interception_server.py diff --git a/pyproject.toml b/pyproject.toml index 08f1bb6d3..e40b79c9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ # Web UI dependencies "gradio>=4.0.0", "httpx>=0.28.1", + "aiohttp>=3.13.5", ] [project.optional-dependencies] diff --git a/src/openenv/core/harness/agents/__init__.py b/src/openenv/core/harness/agents/__init__.py index 8ef31976b..b715582a4 100644 --- a/src/openenv/core/harness/agents/__init__.py +++ b/src/openenv/core/harness/agents/__init__.py @@ -28,6 +28,7 @@ CLIAgentSpec, MCPConfigSpec, ) +from .interception_server import deliver_response, InterceptionServer # Registry @@ -104,4 +105,7 @@ def _auto_import(name: str) -> None: "ArtifactSpec", "CLIAgentSpec", "MCPConfigSpec", + # Interception gate + "InterceptionServer", + "deliver_response", ] diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 42ac460f1..0e58af9e0 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ 
b/src/openenv/core/harness/agents/cli_driver.py @@ -6,26 +6,24 @@ """Shared CLI agent driver, session, and session factory. -The :class:`CLIAgentDriver` factors out the common 70% of CLI harness -lifecycle — sandbox creation, MCP config injection, interception proxy -setup, subprocess management, and result collection. +Two modes are supported: -It is **fully generic**: it reads the :class:`CLIAgentSpec`'s declarative -data fields and executes them mechanically. No per-agent code lives here. - -The :class:`CLIAgentSession` implements :class:`ResourceSession` and -the :class:`CLIAgentSessionFactory` implements :class:`ResourceSessionFactory`, -so the CLI agent driver integrates seamlessly with the existing harness -runtime from PR #603. +- ``black_box`` — the agent talks directly to the upstream LLM. No logprob + capture. For eval and demos. +- ``interception_gate`` — the agent's LLM calls are routed to an + :class:`InterceptionServer` running on the trainer host. The training + loop owns the forward pass and delivers responses back. For RL training. """ from __future__ import annotations +import asyncio import json import logging import shlex +import threading import time -from pathlib import Path +import uuid from typing import Any, Callable, Literal from openenv.core.env_server.mcp_types import Tool @@ -39,37 +37,16 @@ from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle from .base import CLIAgentSpec +from .interception_server import deliver_response, InterceptionServer _log = logging.getLogger(__name__) -# Interception proxy defaults -_PROXY_PORT = 7000 -_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl" -_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log" - -# Where the proxy source lives on disk. Uploaded into sandboxes that don't -# already have it baked in. 
-_PROXY_SOURCE_PATH = Path(__file__).resolve().parents[1] / "sandbox" / "interception.py" - -# Verifier type — callable that checks the agent's work and returns a result Verifier = Callable[..., VerifyResult] -# CLIAgentSession - - class CLIAgentSession(ResourceSession): - """Per-rollout session wrapping one sandbox with one running agent CLI. - - The session is created already-running: :meth:`CLIAgentSessionFactory.create` - launches the agent before returning. Typical usage:: - - session = factory.create(task) - session.wait_for_completion() - result = session.verify([]) - session.close() - """ + """Per-rollout session wrapping one sandbox with one running agent CLI.""" def __init__( self, @@ -80,9 +57,10 @@ def __init__( config: Any, verifier: Verifier | None = None, base_url_override: str | None = None, - proxy_trace_path: str | None = None, - proxy_bg_job: BgJob | None = None, agent_bg_job: BgJob | None = None, + interception_server: InterceptionServer | None = None, + interception_rollout_id: str | None = None, + interception_queue: asyncio.Queue | None = None, ) -> None: self.spec = spec self.sandbox = sandbox @@ -90,11 +68,10 @@ def __init__( self.config = config self._verifier = verifier self._base_url_override = base_url_override - self._proxy_trace_path = proxy_trace_path - self._proxy_bg_job = proxy_bg_job self._agent_bg_job = agent_bg_job - - # ResourceSession contract + self._interception_server = interception_server + self._interception_rollout_id = interception_rollout_id + self._interception_queue = interception_queue def initial_messages(self) -> list[Message]: instruction = ( @@ -105,7 +82,6 @@ def initial_messages(self) -> list[Message]: return [{"role": "user", "content": instruction}] def list_tools(self) -> list[Tool]: - # CLI agents own their own tool loop — none are exposed to the harness. 
return [] def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult: @@ -132,16 +108,13 @@ def close(self) -> None: except Exception: pass self._agent_bg_job = None - if self._proxy_bg_job is not None: - try: - self._proxy_bg_job.kill() - except Exception: - pass - self._proxy_bg_job = None + if ( + self._interception_server is not None + and self._interception_rollout_id is not None + ): + self._interception_server.unregister_rollout(self._interception_rollout_id) self.sandbox.kill() - # CLI-agent-specific API - def wait_for_completion(self, timeout_s: float | None = None) -> int: """Block until the agent exits, returning its exit code.""" budget = timeout_s if timeout_s is not None else self.spec.default_timeout_s @@ -152,11 +125,7 @@ def wait_for_completion(self, timeout_s: float | None = None) -> int: return self._agent_bg_job.wait(timeout=budget) def collect_artifacts(self) -> dict[str, Any]: - """Collect all artifacts declared in ``spec.artifacts`` from the sandbox. - - Returns a dict keyed by artifact name. Missing optional artifacts are - silently skipped. - """ + """Collect all artifacts declared in ``spec.artifacts`` from the sandbox.""" result: dict[str, Any] = {} if not self.spec.artifacts: return result @@ -166,9 +135,6 @@ def collect_artifacts(self) -> dict[str, Any]: if artifact_spec.format == "json": result[name] = json.loads(content) elif artifact_spec.format == "jsonl": - # Parse valid JSON lines, skip non-JSON preamble - # (e.g. opencode emits database migration messages - # before the first JSON event). records = [] for line in content.splitlines(): line = line.strip() @@ -195,72 +161,85 @@ def collect_artifacts(self) -> dict[str, Any]: ) return result - def fetch_proxy_trace(self) -> list[dict[str, Any]]: - """Return per-turn proxy-captured records (transparent_proxy mode only). 
+ # interception_gate API - Each entry has ``request``, ``response``, ``completion_tokens``, - ``completion_token_ids``, ``per_token_logps``, ``finish_reason``, - and ``latency_s``. Returns ``[]`` in black_box mode. + async def next_request( + self, timeout_s: float | None = None + ) -> dict[str, Any] | None: + """Await the next LLM request from the agent (interception_gate only). + + Returns the intercept dict, or ``None`` when the agent has exited. """ - if self._proxy_trace_path is None: - return [] - try: - content = self.sandbox.read_text(self._proxy_trace_path) - except Exception: - return [] - records: list[dict[str, Any]] = [] - for line in content.splitlines(): - line = line.strip() - if not line: + if self._interception_queue is None: + raise RuntimeError( + "next_request() is only available in interception_gate mode." + ) + server = self._interception_server + assert server is not None + + deadline = time.time() + (timeout_s or self.spec.default_timeout_s) + while True: + remaining = deadline - time.time() + if remaining <= 0: + raise TimeoutError( + f"{self.spec.name} interception_gate: no request within timeout" + ) + try: + request_id = await asyncio.wait_for( + self._interception_queue.get(), + timeout=min(remaining, 1.0), + ) + return server.intercepts[request_id] + except asyncio.TimeoutError: + if self._agent_bg_job is not None: + done_event = getattr(self._agent_bg_job, "_done", None) + if ( + done_event is not None + and isinstance(done_event, threading.Event) + and done_event.is_set() + ): + return None continue - records.append(json.loads(line)) - return records - -# CLIAgentDriver — shared lifecycle + async def deliver( + self, intercept: dict[str, Any], response_dict: dict[str, Any] + ) -> None: + """Return a trainer-generated response to the waiting agent.""" + await deliver_response(intercept, response_dict) class CLIAgentDriver: - """Shared driver for all CLI-based agentic harnesses. - - Implements the common lifecycle: - - 1. 
Create sandbox (via :class:`SandboxBackend`) - 2. Wait for sandbox ready (``echo ok`` probe) - 3. Install agent CLI — run ``spec.setup`` commands (skipped if - ``spec.install_check_cmd`` succeeds, i.e. pre-baked template) - 4. Upload ``spec.files`` into the sandbox - 5. Write MCP config (via ``spec.build_mcp_config``) - 6. Set environment variables from ``spec.env`` (with placeholder - resolution) - 7. Optionally start interception proxy (transparent_proxy mode) - 8. Build CLI command (via ``spec.build_command``) - 9. Launch agent as bg process - 10. Return a :class:`CLIAgentSession` - """ + """Shared driver for all CLI-based agentic harnesses.""" def __init__( self, spec: CLIAgentSpec, sandbox_backend: SandboxBackend, - mode: Literal["black_box", "transparent_proxy"] = "black_box", + mode: Literal["black_box", "interception_gate"] = "black_box", *, install_timeout_s: int = 240, setup_timeout_s: int = 300, - proxy_top_logprobs: int = 5, - proxy_max_tokens_cap: int | None = 16384, - proxy_disable_thinking: bool = False, + interception_server: InterceptionServer | None = None, + interception_base_url: str | None = None, ) -> None: - if mode not in {"black_box", "transparent_proxy"}: + if mode not in {"black_box", "interception_gate"}: raise ValueError(f"Unknown mode: {mode!r}") + if mode == "interception_gate": + if interception_server is None: + raise ValueError( + "interception_gate mode requires an InterceptionServer instance." + ) + if interception_base_url is None: + raise ValueError( + "interception_gate mode requires interception_base_url." 
+ ) self.spec = spec self.sandbox_backend = sandbox_backend self.mode = mode self._install_timeout_s = install_timeout_s self._setup_timeout_s = setup_timeout_s - self._proxy_top_logprobs = proxy_top_logprobs - self._proxy_max_tokens_cap = proxy_max_tokens_cap - self._proxy_disable_thinking = proxy_disable_thinking + self._interception_server = interception_server + self._interception_base_url = interception_base_url def create_session( self, @@ -271,35 +250,16 @@ def create_session( seed: int | None = None, episode_id: str | None = None, ) -> CLIAgentSession: - """Create a fully bootstrapped session with a running agent. - - This is the main entry point. It: - 1. Creates a sandbox - 2. Bootstraps it (install agent, upload files, write MCP config) - 3. Optionally starts the interception proxy - 4. Launches the agent subprocess - 5. Returns a ready-to-use :class:`CLIAgentSession` - """ timeout_s = ( config.agent_timeout_s if hasattr(config, "agent_timeout_s") else self.spec.default_timeout_s ) sandbox_timeout = int(timeout_s) + 300 - - _log.info( - "%s driver: creating sandbox timeout=%ds mode=%s", - self.spec.name, - sandbox_timeout, - self.mode, - ) sandbox = self.sandbox_backend.create( timeout_s=sandbox_timeout, metadata={"episode_id": episode_id} if episode_id else None, ) - sid = getattr(sandbox, "sandbox_id", "?") - _log.info("%s driver: sandbox=%s — bootstrapping…", self.spec.name, sid) - try: self._bootstrap_sandbox(sandbox, task, config) except Exception as exc: @@ -308,33 +268,21 @@ def create_session( raise base_url_override: str | None = None - proxy_trace_path: str | None = None - proxy_bg_job: BgJob | None = None - - if self.mode == "transparent_proxy": - base_url = config.base_url if hasattr(config, "base_url") else "" - api_key = config.api_key if hasattr(config, "api_key") else "intercepted" - model = config.model if hasattr(config, "model") else "" - - _log.info( - "%s driver: starting interception proxy on :%d → %s", - self.spec.name, - 
_PROXY_PORT, - base_url, - ) - proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy( - sandbox, - base_url=base_url, - api_key=api_key, - model=model, + interception_rollout_id: str | None = None + interception_queue: asyncio.Queue | None = None + + if self.mode == "interception_gate": + assert self._interception_server is not None + assert self._interception_base_url is not None + rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}" + interception_rollout_id = rollout_id + interception_queue = self._interception_server.register_rollout(rollout_id) + base_url_override = ( + f"{self._interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1" ) - _log.info("%s driver: proxy up at %s", self.spec.name, base_url_override) agent_bg_job = self._start_agent( - sandbox, - task, - config, - base_url_override=base_url_override, + sandbox, task, config, base_url_override=base_url_override ) return CLIAgentSession( @@ -344,35 +292,20 @@ def create_session( config=config, verifier=verifier, base_url_override=base_url_override, - proxy_trace_path=proxy_trace_path, - proxy_bg_job=proxy_bg_job, agent_bg_job=agent_bg_job, + interception_server=self._interception_server, + interception_rollout_id=interception_rollout_id, + interception_queue=interception_queue, ) - # Bootstrap stages - def _bootstrap_sandbox( - self, - sandbox: SandboxHandle, - task: Any, - config: Any, + self, sandbox: SandboxHandle, task: Any, config: Any ) -> None: - """Install agent, upload files, write MCP config.""" - - # Stage 1: wait for sandbox readiness self._wait_for_sandbox_ready(sandbox) - - # Stage 2: install agent CLI (skip if pre-baked) if not self._agent_already_installed(sandbox): self._install_agent(sandbox) - - # Stage 3: upload spec.files self._upload_files(sandbox, task, config) - - # Stage 4: write MCP config (if the spec provides a builder) self._write_mcp_config(sandbox, config) - - # Stage 5: run task.setup_shell if present setup_shell = task.setup_shell if 
hasattr(task, "setup_shell") else None if setup_shell: r = sandbox.exec(setup_shell, timeout=self._setup_timeout_s) @@ -382,13 +315,8 @@ def _bootstrap_sandbox( ) def _wait_for_sandbox_ready( - self, - sandbox: SandboxHandle, - *, - attempts: int = 15, - delay_s: float = 1.0, + self, sandbox: SandboxHandle, *, attempts: int = 15, delay_s: float = 1.0 ) -> None: - """Probe sandbox until ``echo ok`` succeeds.""" last_err = "" for _ in range(attempts): try: @@ -405,7 +333,6 @@ def _wait_for_sandbox_ready( ) def _agent_already_installed(self, sandbox: SandboxHandle) -> bool: - """Check if the agent CLI is already available in the sandbox.""" cmd = " ".join(shlex.quote(c) for c in self.spec.install_check_cmd) try: r = sandbox.exec(cmd, timeout=10) @@ -414,11 +341,9 @@ def _agent_already_installed(self, sandbox: SandboxHandle) -> bool: return False def _install_agent(self, sandbox: SandboxHandle) -> None: - """Run ``spec.setup`` commands to install the agent CLI.""" if self.spec.setup is None: raise RuntimeError( - f"Agent {self.spec.name!r} is not installed in the sandbox " - "and no setup commands are provided in the spec." + f"Agent {self.spec.name!r} is not installed and no setup commands provided." 
) commands = ( [self.spec.setup] if isinstance(self.spec.setup, str) else self.spec.setup @@ -433,34 +358,22 @@ def _install_agent(self, sandbox: SandboxHandle) -> None: label=f"{self.spec.name} install", ) - def _upload_files( - self, - sandbox: SandboxHandle, - task: Any, - config: Any, - ) -> None: - """Upload ``spec.files`` into the sandbox, resolving callables.""" + def _upload_files(self, sandbox: SandboxHandle, task: Any, config: Any) -> None: if not self.spec.files: return for path, content_or_fn in self.spec.files.items(): - if callable(content_or_fn): - content = content_or_fn(task, config) - else: - content = content_or_fn + content = ( + content_or_fn(task, config) + if callable(content_or_fn) + else content_or_fn + ) if content is not None: sandbox.write_text(path, content) - - # Also upload task.upload_files if the task has them. upload_files = task.upload_files if hasattr(task, "upload_files") else {} for path, content in upload_files.items(): sandbox.write_text(path, content) - def _write_mcp_config( - self, - sandbox: SandboxHandle, - config: Any, - ) -> None: - """Write MCP configuration using the spec's builder.""" + def _write_mcp_config(self, sandbox: SandboxHandle, config: Any) -> None: if self.spec.build_mcp_config is None: return if ( @@ -476,15 +389,12 @@ def _write_mcp_config( config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user" ) mcp_path = self.spec.mcp_config.path_template.format( - workdir=workdir, - home=home, + workdir=workdir, home=home ) mcp_content = self.spec.build_mcp_config(self.spec, [], workdir) if mcp_content: sandbox.write_text(mcp_path, mcp_content) - # Agent launch - def _start_agent( self, sandbox: SandboxHandle, @@ -493,17 +403,14 @@ def _start_agent( *, base_url_override: str | None = None, ) -> BgJob: - """Build CLI command, resolve env vars, and launch as bg process.""" - # Build command via spec hook if self.spec.build_command is not None: cmd = self.spec.build_command(self.spec, config, task, 
None) else: cmd = " ".join(shlex.quote(c) for c in self.spec.base_command) - - # Resolve environment variables envs = self._resolve_env_vars(config, base_url_override=base_url_override) - - _log.info("%s driver: launching agent", self.spec.name) + if self.mode == "interception_gate" and self._interception_server is not None: + envs["OPENAI_API_KEY"] = self._interception_server.secret + envs["ANTHROPIC_API_KEY"] = self._interception_server.secret return sandbox.start_bg(cmd, envs=envs) def _resolve_env_vars( @@ -512,124 +419,24 @@ def _resolve_env_vars( *, base_url_override: str | None = None, ) -> dict[str, str]: - """Build the env var dict for the agent process. - - If ``spec.build_env_vars`` is provided, delegate to it. - Otherwise resolve ``{placeholder}`` substitutions in ``spec.env``. - """ if self.spec.build_env_vars is not None: return self.spec.build_env_vars(self.spec, config) - if not self.spec.env: return {} - base_url = base_url_override or ( config.base_url if hasattr(config, "base_url") else "" ) api_key = config.api_key if hasattr(config, "api_key") else "intercepted" model = config.model if hasattr(config, "model") else "" - - substitutions = { - "base_url": base_url, - "api_key": api_key, - "model": model, - } - + substitutions = {"base_url": base_url, "api_key": api_key, "model": model} resolved: dict[str, str] = {} for key, value in self.spec.env.items(): try: resolved[key] = value.format(**substitutions) except KeyError: - # If a placeholder isn't in our substitutions, keep it as-is. resolved[key] = value return resolved - # Interception proxy - - def _start_proxy( - self, - sandbox: SandboxHandle, - *, - base_url: str, - api_key: str, - model: str, - ) -> tuple[BgJob, str, str]: - """Install deps, start proxy as bg job, wait for healthz. - - Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``. 
- """ - proxy_already_present = sandbox.exists("/home/user/proxy/interception.py") - - if not proxy_already_present: - self._exec_with_retry( - sandbox, - "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' " - "'httpx>=0.27' 2>&1 | tail -20", - timeout=180, - attempts=3, - backoff_s=2.0, - label="proxy deps install", - ) - sandbox.write_text( - "/home/user/proxy/interception.py", - _PROXY_SOURCE_PATH.read_text(), - ) - sandbox.write_text("/home/user/proxy/__init__.py", "") - - proxy_args = [ - "python", - "interception.py", - "--upstream-url", - base_url, - "--trace", - _PROXY_TRACE_PATH, - "--port", - str(_PROXY_PORT), - "--top-logprobs", - str(self._proxy_top_logprobs), - ] - if self._proxy_max_tokens_cap is not None: - proxy_args.extend(["--max-tokens-cap", str(self._proxy_max_tokens_cap)]) - if self._proxy_disable_thinking: - proxy_args.append("--disable-thinking") - if model: - proxy_args.extend(["--model-override", model]) - - quoted = " ".join(shlex.quote(a) for a in proxy_args) - proxy_cmd = ( - f"cd /home/user/proxy && {quoted} > {shlex.quote(_PROXY_LOG_PATH)} 2>&1" - ) - proxy_env = {"OPENCODE_UPSTREAM_API_KEY": api_key} - proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env) - - # Wait for proxy healthz - attempts = 120 - interval_s = 0.5 - for _ in range(attempts): - r = sandbox.exec( - f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz", - timeout=5, - ) - if r.exit_code == 0: - break - time.sleep(interval_s) - else: - log_content = "" - try: - log_content = sandbox.read_text(_PROXY_LOG_PATH) - except Exception: - pass - proxy_job.kill() - raise RuntimeError( - f"proxy did not start within {attempts * interval_s:.0f}s. 
" - f"log:\n{log_content[-2000:]}" - ) - - override_url = f"http://127.0.0.1:{_PROXY_PORT}/v1" - return proxy_job, override_url, _PROXY_TRACE_PATH - - # Utilities - def _exec_with_retry( self, sandbox: SandboxHandle, @@ -640,7 +447,6 @@ def _exec_with_retry( backoff_s: float = 3.0, label: str = "cmd", ) -> Any: - """Run ``sandbox.exec`` with exponential backoff on transient failure.""" last_stdout = "" last_stderr = "" last_exit = 0 @@ -666,30 +472,19 @@ def _exec_with_retry( ) -# CLIAgentSessionFactory - - class CLIAgentSessionFactory(ResourceSessionFactory): - """Factory that produces :class:`CLIAgentSession` instances for any - registered agent. - - Wraps :class:`CLIAgentDriver` to satisfy the - :class:`ResourceSessionFactory` contract from PR #603. - """ - def __init__( self, *, spec: CLIAgentSpec, config: Any, sandbox_backend: SandboxBackend, - mode: Literal["black_box", "transparent_proxy"] = "black_box", + mode: Literal["black_box", "interception_gate"] = "black_box", verifier: Verifier | None = None, install_timeout_s: int = 240, setup_timeout_s: int = 300, - proxy_top_logprobs: int = 5, - proxy_max_tokens_cap: int | None = 16384, - proxy_disable_thinking: bool = False, + interception_server: InterceptionServer | None = None, + interception_base_url: str | None = None, ) -> None: self._spec = spec self._config = config @@ -700,9 +495,8 @@ def __init__( mode=mode, install_timeout_s=install_timeout_s, setup_timeout_s=setup_timeout_s, - proxy_top_logprobs=proxy_top_logprobs, - proxy_max_tokens_cap=proxy_max_tokens_cap, - proxy_disable_thinking=proxy_disable_thinking, + interception_server=interception_server, + interception_base_url=interception_base_url, ) def create( @@ -711,7 +505,6 @@ def create( seed: int | None = None, episode_id: str | None = None, ) -> CLIAgentSession: - """Create one isolated session for a rollout.""" return self._driver.create_session( task=task, config=self._config, @@ -721,9 +514,4 @@ def create( ) -__all__ = [ - "CLIAgentDriver", 
- "CLIAgentSession", - "CLIAgentSessionFactory", - "Verifier", -] +__all__ = ["CLIAgentDriver", "CLIAgentSession", "CLIAgentSessionFactory", "Verifier"] diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py new file mode 100644 index 000000000..a075ec8b4 --- /dev/null +++ b/src/openenv/core/harness/agents/interception_server.py @@ -0,0 +1,324 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Host-side interception server for trainer-owned generation. + +The :class:`InterceptionServer` runs on the trainer node, outside any +sandbox. Each sandbox's agent is pointed at:: + + http://:/rollout//v1 + +When the agent makes an LLM call it blocks at this server. The training +loop calls :meth:`~InterceptionServer.register_rollout` to get a queue, +``await queue.get()`` to dequeue the pending request, runs its own vLLM +forward pass, then calls :func:`deliver_response` to unblock the agent. + +The caller is responsible for making the server reachable from the sandbox. +For Docker sandboxes on the same machine, ``host.docker.internal:`` +works. For remote sandboxes (E2B, HF Sandbox), set up your own tunnel +(ngrok, frp, public IP, VPN) and pass the URL as +``interception_base_url``. + +Usage — training loop:: + + server = InterceptionServer(port=8765) + await server.start() + + # Make the server reachable — your responsibility. + # Docker: base_url = f"http://host.docker.internal:{server.port}" + # Remote: base_url = your_tunnel_or_public_url + + queue = server.register_rollout(rollout_id) + # Agent runs with OPENAI_BASE_URL = f"{base_url}/rollout/{rollout_id}/v1" + + while True: + request_id = await asyncio.wait_for(queue.get(), timeout=...) 
+ intercept = server.intercepts[request_id] + response = await vllm.generate(intercept["messages"], ...) + await deliver_response(intercept, response) + + server.unregister_rollout(rollout_id) + await server.stop() +""" + +from __future__ import annotations + +import asyncio +import hmac +import json +import logging +import secrets +import time +import uuid +from typing import Any + +from aiohttp import web + + +_log = logging.getLogger(__name__) + +_KEEPALIVE_INTERVAL_S = 3.0 +_MAX_REQUEST_BODY = 16 * 1024 * 1024 + + +class InterceptionServer: + """Async HTTP server that gates every LLM call from sandboxed agents. + + One shared instance handles all concurrent rollouts. Each rollout is + identified by a ``rollout_id`` in the URL path. + """ + + def __init__(self, port: int = 0, secret: str | None = None) -> None: + self.port = port + self.secret = secret or secrets.token_urlsafe(32) + self._app: web.Application | None = None + self._runner: web.AppRunner | None = None + self._site: web.TCPSite | None = None + self._lock = asyncio.Lock() + self.active_rollouts: dict[str, dict[str, Any]] = {} + self.intercepts: dict[str, dict[str, Any]] = {} + + async def start(self) -> None: + async with self._lock: + if self._app is not None: + return + app = web.Application(client_max_size=_MAX_REQUEST_BODY) + app.router.add_post( + "/rollout/{rollout_id}/v1/chat/completions", + self._handle_chat_completions, + ) + app.router.add_get("/health", self._handle_health) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, "0.0.0.0", self.port) + await site.start() + if self.port == 0: + server = getattr(site, "_server", None) + sockets = getattr(server, "sockets", None) if server else None + if sockets: + self.port = sockets[0].getsockname()[1] + if self.port == 0: + raise RuntimeError("Failed to resolve OS-assigned port") + self._app = app + self._runner = runner + self._site = site + _log.info("InterceptionServer listening on :%d", self.port) + + async 
def stop(self) -> None: + async with self._lock: + if self._runner is None: + return + for intercept in list(self.intercepts.values()): + fut: asyncio.Future | None = intercept.get("response_future") + if fut and not fut.done(): + fut.cancel() + cq: asyncio.Queue | None = intercept.get("chunk_queue") + if cq is not None: + try: + cq.put_nowait(None) + except asyncio.QueueFull: + pass + self.intercepts.clear() + self.active_rollouts.clear() + try: + await self._runner.cleanup() + except RuntimeError: + pass + self._runner = None + self._site = None + self._app = None + + def register_rollout( + self, + rollout_id: str, + state: dict[str, Any] | None = None, + ) -> asyncio.Queue: + queue: asyncio.Queue = asyncio.Queue() + self.active_rollouts[rollout_id] = { + "request_id_queue": queue, + "state": state, + } + return queue + + def unregister_rollout(self, rollout_id: str) -> None: + for request_id in list(self.intercepts): + intercept = self.intercepts.get(request_id) + if intercept and intercept.get("rollout_id") == rollout_id: + fut: asyncio.Future | None = intercept.get("response_future") + if fut and not fut.done(): + fut.cancel() + cq: asyncio.Queue | None = intercept.get("chunk_queue") + if cq is not None: + try: + cq.put_nowait(None) + except asyncio.QueueFull: + pass + del self.intercepts[request_id] + self.active_rollouts.pop(rollout_id, None) + + def _authorized(self, request: web.Request) -> bool: + auth = request.headers.get("Authorization", "") + api_key = request.headers.get("x-api-key", "") + return hmac.compare_digest( + auth, f"Bearer {self.secret}" + ) or hmac.compare_digest(api_key, self.secret) + + async def _handle_health(self, request: web.Request) -> web.Response: + return web.json_response({"status": "ok"}) + + async def _handle_chat_completions( + self, request: web.Request + ) -> web.StreamResponse | web.Response: + if not self._authorized(request): + return web.json_response({"error": "Unauthorized"}, status=401) + + rollout_id = 
request.match_info["rollout_id"] + context = self.active_rollouts.get(rollout_id) + if not context: + return web.json_response({"error": "rollout not found"}, status=404) + + try: + body = await request.json() + except Exception as exc: + return web.json_response({"error": f"invalid JSON: {exc}"}, status=400) + + is_streaming = bool(body.get("stream")) + request_id = f"req_{uuid.uuid4().hex[:8]}" + chunk_queue: asyncio.Queue | None = asyncio.Queue() if is_streaming else None + + intercept: dict[str, Any] = { + "request_id": request_id, + "rollout_id": rollout_id, + "messages": body.get("messages"), + "model": body.get("model"), + "tools": body.get("tools"), + "stream": is_streaming, + "chunk_queue": chunk_queue, + "response_future": asyncio.get_event_loop().create_future(), + "body": body, + } + self.intercepts[request_id] = intercept + await context["request_id_queue"].put(request_id) + + if is_streaming: + return await self._stream_response(request, intercept) + + try: + response_dict = await intercept["response_future"] + except asyncio.CancelledError: + return web.json_response({"error": "rollout cancelled"}, status=499) + except Exception as exc: + return web.json_response({"error": str(exc)}, status=500) + + return web.json_response(response_dict) + + async def _stream_response( + self, request: web.Request, intercept: dict[str, Any] + ) -> web.StreamResponse: + chunk_queue: asyncio.Queue = intercept["chunk_queue"] + resp = web.StreamResponse( + status=200, + headers={ + "Content-Type": "text/event-stream", + "Cache-Control": "no-cache", + "Connection": "keep-alive", + }, + ) + await resp.prepare(request) + get_task: asyncio.Task | None = None + try: + while True: + if get_task is None: + get_task = asyncio.create_task(chunk_queue.get()) + done, _ = await asyncio.wait({get_task}, timeout=_KEEPALIVE_INTERVAL_S) + if get_task not in done: + await resp.write(b": keepalive\n\n") + continue + chunk = get_task.result() + get_task = None + if chunk is None: + await 
resp.write(b"data: [DONE]\n\n") + break + await resp.write(f"data: {json.dumps(chunk)}\n\n".encode()) + await asyncio.sleep(0) + except (asyncio.CancelledError, ConnectionResetError): + pass + finally: + if get_task and not get_task.done(): + get_task.cancel() + try: + await resp.write_eof() + except Exception: + pass + return resp + + +async def deliver_response( + intercept: dict[str, Any], response_dict: dict[str, Any] +) -> None: + """Unblock the agent's HTTP handler with ``response_dict``. + + For non-streaming requests, resolves the future directly. + For streaming requests, synthesizes SSE chunks from the complete + response and signals EOF. + """ + is_streaming = intercept.get("stream", False) + chunk_queue: asyncio.Queue | None = intercept.get("chunk_queue") + future: asyncio.Future | None = intercept.get("response_future") + + if not is_streaming: + if future and not future.done(): + future.set_result(response_dict) + return + + if chunk_queue is None: + raise RuntimeError("chunk_queue missing on streaming intercept") + + choices = response_dict.get("choices") or [] + for choice in choices: + msg = choice.get("message") or {} + content_chunk = { + "id": response_dict.get("id", ""), + "object": "chat.completion.chunk", + "created": response_dict.get("created", int(time.time())), + "model": response_dict.get("model", ""), + "choices": [ + { + "index": choice.get("index", 0), + "delta": { + "role": "assistant", + "content": msg.get("content"), + "tool_calls": msg.get("tool_calls"), + }, + "finish_reason": None, + } + ], + } + await chunk_queue.put(content_chunk) + finish_chunk = { + "id": response_dict.get("id", ""), + "object": "chat.completion.chunk", + "created": response_dict.get("created", int(time.time())), + "model": response_dict.get("model", ""), + "choices": [ + { + "index": choice.get("index", 0), + "delta": {}, + "finish_reason": choice.get("finish_reason"), + } + ], + } + await chunk_queue.put(finish_chunk) + + await chunk_queue.put(None) + if 
future and not future.done(): + future.set_result(response_dict) + + +__all__ = [ + "InterceptionServer", + "deliver_response", +] From 171a3eaf23da0b0f109bd40af635b398ac10482c Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Fri, 15 May 2026 23:35:54 +0530 Subject: [PATCH 13/35] refactor: wire coding_agent_env with interception_gate --- envs/coding_agent_env/client.py | 7 +- envs/coding_agent_env/harness.py | 163 ++---------------- envs/coding_agent_env/models.py | 2 +- .../server/coding_environment.py | 68 +------- envs/coding_agent_env/server/gradio_ui.py | 6 +- 5 files changed, 26 insertions(+), 220 deletions(-) diff --git a/envs/coding_agent_env/client.py b/envs/coding_agent_env/client.py index 7e2a21696..c1e0f6f92 100644 --- a/envs/coding_agent_env/client.py +++ b/envs/coding_agent_env/client.py @@ -63,7 +63,7 @@ def run_rollout( verify: list[str] | None = None, # Bookkeeping / tunables task_id: str = "", - mode: str = "transparent_proxy", + mode: str = "black_box", disable_thinking: bool | None = None, max_tokens_cap: int = 4096, top_logprobs: int = 5, @@ -87,8 +87,9 @@ def run_rollout( Reward = ``passed_count / total`` unless any command writes a float to ``/home/user/logs/verifier/reward.txt`` (override). task_id: Echoed back in the result for traceability. - mode: ``"transparent_proxy"`` (captures per-token logprobs via - an in-sandbox FastAPI proxy) or ``"black_box"`` (no proxy). + mode: ``"black_box"`` (agent talks directly to the LLM) or + ``"interception_gate"`` (LLM calls routed to trainer-side + InterceptionServer for trainer-owned generation). disable_thinking: Inject ``chat_template_kwargs.enable_thinking=false`` on forwarded requests. 
Needed for Qwen3.5 vLLM; harmless on Instruct diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py index ccbfa2cfc..295b07ac3 100644 --- a/envs/coding_agent_env/harness.py +++ b/envs/coding_agent_env/harness.py @@ -4,22 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Coding-agent session factory + session — backed by CLIAgentDriver. - -This module exposes :class:`CodingAgentSession` and -:class:`CodingAgentSessionFactory` built on top of the generic -:class:`CLIAgentDriver` / :class:`CLIAgentSession` / -:class:`CLIAgentSessionFactory` from ``openenv.core.harness.agents``. - -Agent-specific (OpenCode spec) configuration (``opencode.json`` generation, provider -mapping, tool enable/disable) is handled by -:mod:`coding_agent_env.opencode_runtime` builders wired into the -:data:`OPENCODE_SPEC` via callable hooks. -""" +"""Coding-agent session factory + session — backed by CLIAgentDriver.""" from __future__ import annotations -from pathlib import Path from typing import Any, Literal from openenv.core.harness import ResourceSessionFactory @@ -28,8 +16,9 @@ CLIAgentSession, Verifier, ) +from openenv.core.harness.agents.interception_server import InterceptionServer from openenv.core.harness.agents.opencode import OPENCODE_SPEC -from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle +from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle from .config import CodingAgentConfig from .opencode_runtime import ( @@ -45,29 +34,7 @@ from .task import CodingAgentTask -# Inside-sandbox proxy paths (Mode B). 
-_PROXY_PORT = 7000 -_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl" -_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log" - -_PROXY_SOURCE_PATH = ( - Path(__file__).resolve().parents[2] - / "src" - / "openenv" - / "core" - / "harness" - / "sandbox" - / "interception.py" -) - - class CodingAgentSession(CLIAgentSession): - """One live coding-agent rollout inside a sandbox. - - Extends :class:`CLIAgentSession` with Agent-specific (OpenCode spec) convenience - methods (``fetch_trace``, ``wait_for_completion`` with config-aware timeout). - """ - def __init__( self, *, @@ -76,9 +43,6 @@ def __init__( task: CodingAgentTask, verifier: Verifier | None = None, base_url_override: str | None = None, - proxy_trace_path: str | None = None, - proxy_bg_job: BgJob | None = None, - agent_bg_job: BgJob | None = None, ) -> None: super().__init__( spec=OPENCODE_SPEC, @@ -87,28 +51,18 @@ def __init__( config=config, verifier=verifier, base_url_override=base_url_override, - proxy_trace_path=proxy_trace_path, - proxy_bg_job=proxy_bg_job, - agent_bg_job=agent_bg_job, ) def fetch_trace(self) -> str: - """Return the raw ``opencode run`` log (JSONL when ``run_format=json``).""" return self.sandbox.read_text(agent_log_path(self.config)) def wait_for_completion(self, timeout_s: float | None = None) -> int: - """Block until the agent exits, returning its exit code.""" budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s if self._agent_bg_job is None: - raise RuntimeError("Agent not started; call start_agent() first.") + raise RuntimeError("Agent not started.") return self._agent_bg_job.wait(timeout=budget) def start_agent(self) -> None: - """Launch ``opencode run`` as a background subprocess in the sandbox. - - The factory starts the agent during ``create()``; this method is a no-op - if the agent is already running. 
- """ if self._agent_bg_job is not None: return cmd = build_run_cmd(self.config) @@ -117,28 +71,19 @@ def start_agent(self) -> None: class CodingAgentSessionFactory(ResourceSessionFactory): - """Produce isolated per-rollout :class:`CodingAgentSession` instances. - - The factory owns sandbox provisioning, opencode install, config injection, - and (Mode B) proxy startup. Each :meth:`create` call returns a fresh - sandbox with a running agent. - - Internally delegates to :class:`CLIAgentDriver` for the generic - sandbox lifecycle (readiness probing, install retry, proxy startup). - Agent-specific (OpenCode spec) config generation uses ``opencode_runtime`` builders. - """ - def __init__( self, *, config: CodingAgentConfig, sandbox_backend: SandboxBackend, - mode: Literal["black_box", "transparent_proxy"] = "black_box", + mode: Literal["black_box", "interception_gate"] = "black_box", verifier: Verifier | None = None, install_timeout_s: int = 240, setup_timeout_s: int = 300, + interception_server: InterceptionServer | None = None, + interception_base_url: str | None = None, ) -> None: - if mode not in {"black_box", "transparent_proxy"}: + if mode not in {"black_box", "interception_gate"}: raise ValueError(f"Unknown mode: {mode!r}") self._config = config self._backend = sandbox_backend @@ -146,17 +91,14 @@ def __init__( self._verifier = verifier self._install_timeout_s = install_timeout_s self._setup_timeout_s = setup_timeout_s - - # Build a CLIAgentDriver for the shared lifecycle. 
self._driver = CLIAgentDriver( spec=OPENCODE_SPEC, sandbox_backend=sandbox_backend, mode=mode, install_timeout_s=install_timeout_s, setup_timeout_s=setup_timeout_s, - proxy_top_logprobs=config.proxy_top_logprobs, - proxy_max_tokens_cap=config.proxy_max_tokens_cap, - proxy_disable_thinking=config.proxy_disable_thinking, + interception_server=interception_server, + interception_base_url=interception_base_url, ) def create( @@ -168,87 +110,29 @@ def create( import logging _log = logging.getLogger(__name__) - oc_task = CodingAgentTask.coerce(task) sandbox_timeout = int(self._config.agent_timeout_s) + 300 - - _log.info( - "factory.create: creating sandbox timeout=%ds mode=%s", - sandbox_timeout, - self._mode, - ) sandbox = self._backend.create( timeout_s=sandbox_timeout, metadata={"episode_id": episode_id} if episode_id else None, ) - sid = getattr(sandbox, "sandbox_id", "?") - _log.info("factory.create: sandbox=%s — bootstrapping…", sid) - try: self._bootstrap_sandbox(sandbox, oc_task) except Exception as exc: _log.error("factory.create: bootstrap failed: %r", exc) sandbox.kill() raise - - base_url_override: str | None = None - proxy_trace_path: str | None = None - proxy_bg_job: BgJob | None = None - if self._mode == "transparent_proxy": - _log.info( - "factory.create: starting interception proxy on :%d → %s", - _PROXY_PORT, - self._config.base_url, - ) - proxy_bg_job, base_url_override, proxy_trace_path = ( - self._driver._start_proxy( - sandbox, - base_url=self._config.base_url, - api_key=self._config.api_key, - model=self._config.model, - ) - ) - _log.info("factory.create: proxy up at %s", base_url_override) - # Rewrite opencode.json so opencode points at the proxy. 
- proxy_cfg = CodingAgentConfig( - **{ - **self._config.model_dump(), - "provider": "openai_compatible", - "base_url": base_url_override, - } - ) - sandbox.write_text( - opencode_config_path(self._config), - build_opencode_json(proxy_cfg), - ) - session = CodingAgentSession( sandbox=sandbox, config=self._config, task=oc_task, verifier=self._verifier, - base_url_override=base_url_override, - proxy_trace_path=proxy_trace_path, - proxy_bg_job=proxy_bg_job, ) session.start_agent() return session - # ------------------------------------------------------------------ - # Bootstrap — delegates to CLIAgentDriver utilities - # ------------------------------------------------------------------ - - def _bootstrap_sandbox( - self, - sandbox: SandboxHandle, - task: CodingAgentTask, - ) -> None: - """Install opencode, write config + task files, run optional setup.""" - - # Stage 1: wait for the sandbox to be responsive. + def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: CodingAgentTask) -> None: self._driver._wait_for_sandbox_ready(sandbox) - - # Stage 2: install opencode (skipped if pre-baked). if not self._driver._agent_already_installed(sandbox): self._driver._exec_with_retry( sandbox, @@ -258,24 +142,16 @@ def _bootstrap_sandbox( backoff_s=3.0, label="opencode install", ) - - # Stage 3: write opencode.json + task files. 
sandbox.write_text( - opencode_config_path(self._config), - build_opencode_json(self._config), + opencode_config_path(self._config), build_opencode_json(self._config) ) sandbox.write_text(instruction_path(self._config), task.instruction) - if self._config.system_prompt: sandbox.write_text( - system_prompt_path(self._config), - self._config.system_prompt, + system_prompt_path(self._config), self._config.system_prompt ) - for remote_path, content in task.upload_files.items(): sandbox.write_text(remote_path, content) - - # Stage 4: extra setup if self._config.extra_setup_shell: self._driver._exec_with_retry( sandbox, @@ -285,7 +161,6 @@ def _bootstrap_sandbox( backoff_s=2.0, label="extra_setup_shell", ) - if task.setup_shell: r = sandbox.exec(task.setup_shell, timeout=self._setup_timeout_s) if r.exit_code != 0: @@ -293,18 +168,6 @@ def _bootstrap_sandbox( f"task.setup_shell failed ({r.exit_code}): {r.stderr}" ) - def _start_proxy( - self, - sandbox: SandboxHandle, - ) -> tuple[BgJob, str, str]: - """Start proxy — delegates to driver.""" - return self._driver._start_proxy( - sandbox, - base_url=self._config.base_url, - api_key=self._config.api_key, - model=self._config.model, - ) - __all__ = [ "CodingAgentSession", diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py index 3e31962fb..821b1bd57 100644 --- a/envs/coding_agent_env/models.py +++ b/envs/coding_agent_env/models.py @@ -59,7 +59,7 @@ class RolloutResult(BaseModel): reward: float | None = None agent_exit_code: int | None = None wall_s: float = 0.0 - mode: str = "transparent_proxy" + mode: str = "black_box" # Per-step results setup_results: list[CommandResult] = Field(default_factory=list) diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index e389eb759..0c8598cdd 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -143,7 +143,7 @@ def run_rollout( 
verify: Optional[list[str]] = None, # Bookkeeping / tunables task_id: str = "", - mode: str = "transparent_proxy", + mode: str = "black_box", disable_thinking: Optional[bool] = None, max_tokens_cap: int = 4096, top_logprobs: int = 5, @@ -359,10 +359,7 @@ def _emit(msg: str) -> None: ) session = factory.create(task=rollout_task) result.sandbox_id = session.sandbox.sandbox_id - _emit( - f"sandbox ready: {result.sandbox_id} — agent started " - f"({'proxy on :7000, logprobs capturing' if mode == 'transparent_proxy' else 'direct LLM, no logprobs'})" - ) + _emit(f"sandbox ready: {result.sandbox_id} — agent started (mode={mode})") # Run setup commands one at a time, *before* the agent starts. # The factory has already started the agent in start_agent() @@ -474,16 +471,9 @@ def _build_agent_config( api_key=api_key, model=model, agent_timeout_s=agent_timeout_s, - proxy_disable_thinking=disable_thinking, - proxy_top_logprobs=top_logprobs, - proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None, ) - provider = ( - "openai" - if mode == "transparent_proxy" - else self._infer_pi_provider(base_url) - ) + provider = self._infer_pi_provider(base_url) return _GenericAgentConfig( base_url=base_url.rstrip("/"), api_key=api_key, @@ -524,9 +514,6 @@ def _build_session_factory( sandbox_backend=backend, mode=mode, verifier=None, - proxy_disable_thinking=disable_thinking, - proxy_top_logprobs=top_logprobs, - proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None, ) @staticmethod @@ -602,53 +589,8 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]: return files, extras def _collect_proxy_turns(self, session: Any) -> list[Any]: - turns: list[Any] = [] - - records: list[dict[str, Any]] = [] - if hasattr(session, "fetch_proxy_trace"): - try: - fetched = session.fetch_proxy_trace() - if isinstance(fetched, list): - records = [r for r in fetched if isinstance(r, dict)] - except Exception: - records = [] - - if not records: - proxy_trace_path 
= getattr(session, "_proxy_trace_path", None) - if not proxy_trace_path: - return turns - raw = self._safe_read(session.sandbox, proxy_trace_path) - for line in raw.splitlines(): - line = line.strip() - if not line: - continue - try: - rec = json.loads(line) - except Exception: - continue - if isinstance(rec, dict): - records.append(rec) - - for rec in records: - response = rec.get("response") or {} - turns.append( - self._RolloutTurn( - turn=int(rec.get("turn") or 0), - finish_reason=rec.get("finish_reason"), - completion_tokens=list(rec.get("completion_tokens") or []), - completion_token_ids=list(rec.get("completion_token_ids") or []), - per_token_logps=[ - float(x) - for x in (rec.get("per_token_logps") or []) - if x is not None - ], - latency_s=float(rec.get("latency_s") or 0.0), - timestamp=float(rec.get("timestamp") or 0.0), - upstream_status=response.get("upstream_status"), - upstream_error=response.get("upstream_error"), - ) - ) - return turns + """Logprob capture is now owned by the training loop via interception_gate.""" + return [] @staticmethod def _safe_read(sandbox: Any, path: str) -> str: diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py index ef3f94aeb..82f130ce3 100644 --- a/envs/coding_agent_env/server/gradio_ui.py +++ b/envs/coding_agent_env/server/gradio_ui.py @@ -158,7 +158,7 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]: def _logprobs_md(turns: list[dict[str, Any]]) -> str: if not turns: - return "_No proxy turns captured._\n\nThis is normal in `black_box` mode. In `transparent_proxy` mode, an empty list usually means the agent never made an LLM call (check the agent log)." + return "_No proxy turns captured._\n\nLogprob capture is handled by the training loop via `interception_gate` mode." 
n = len(turns) productive = sum(1 for t in turns if t.get("completion_tokens")) total_toks = sum(len(t.get("completion_tokens") or []) for t in turns) @@ -523,8 +523,8 @@ def apply_preset(name: str) -> tuple[str, str, str]: with gr.Accordion("Tunables", open=False): with gr.Row(): mode = gr.Dropdown( - choices=["transparent_proxy", "black_box"], - value="transparent_proxy", + choices=["black_box", "interception_gate"], + value="black_box", label="mode", ) disable_thinking = gr.Dropdown( From 52a024e2c616d4cb46c8dc03fb99fcc990201d5d Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Fri, 15 May 2026 23:36:04 +0530 Subject: [PATCH 14/35] chore: update tests for interception_gate, remove proxy test cases --- tests/core/test_cli_agent_driver.py | 81 ++----------------- tests/envs/test_coding_agent_env.py | 118 +++------------------------- 2 files changed, 16 insertions(+), 183 deletions(-) diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index 29bf06caa..0b218f19a 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -207,7 +207,6 @@ def test_cli_agent_spec_minimal(self): mcp_config=MCPConfigSpec(method="cli_flags"), ) assert spec.name == "test-agent" - assert spec.supports_logprob_proxy is True assert spec.default_timeout_s == 600.0 assert spec.setup is None assert spec.files is None @@ -229,7 +228,6 @@ def test_cli_agent_spec_full(self): mcp_config=MCPConfigSpec( method="config_file", path_template="{workdir}/mcp.json" ), - supports_logprob_proxy=True, default_timeout_s=900.0, setup="npm install -g full-agent", files={ @@ -457,41 +455,16 @@ def test_create_session_skips_install_when_prebaked(self): assert not any("apt-get install" in cmd for cmd in sbx.executed) session.close() - def test_create_session_with_proxy(self): + def test_create_session_interception_gate_requires_server(self): from openenv.core.harness.agents.cli_driver import CLIAgentDriver 
spec = _make_test_spec() - backend = FakeSandboxBackend() - driver = CLIAgentDriver( - spec=spec, - sandbox_backend=backend, - mode="transparent_proxy", - ) - - session = driver.create_session( - task=FakeTask(), - config=FakeConfig(), - ) - - sbx = backend.created[0] - - # Proxy source should have been uploaded - assert "/home/user/proxy/interception.py" in sbx.written - assert "/home/user/proxy/__init__.py" in sbx.written - - # Proxy should have been started as bg (before agent) - # and agent as second bg - assert len(sbx.bg_commands) == 2 - proxy_cmd, proxy_envs = sbx.bg_commands[0] - assert "interception.py" in proxy_cmd - assert proxy_envs == {"OPENCODE_UPSTREAM_API_KEY": "sk-test-key"} - - # Agent env should point at proxy - agent_cmd, agent_envs = sbx.bg_commands[1] - assert agent_envs is not None - assert agent_envs["BASE_URL"] == "http://127.0.0.1:7000/v1" - - session.close() + with pytest.raises(ValueError, match="InterceptionServer"): + CLIAgentDriver( + spec=spec, + sandbox_backend=FakeSandboxBackend(), + mode="interception_gate", + ) def test_create_session_uploads_task_files(self): from openenv.core.harness.agents.cli_driver import CLIAgentDriver @@ -679,49 +652,12 @@ def test_collect_artifacts_missing_required_raises(self): with pytest.raises(FileNotFoundError): session.collect_artifacts() - def test_fetch_proxy_trace_black_box(self): - from openenv.core.harness.agents.cli_driver import CLIAgentSession - - spec = _make_test_spec() - session = CLIAgentSession( - spec=spec, - sandbox=FakeSandbox(), - task=FakeTask(), - config=FakeConfig(), - proxy_trace_path=None, - ) - assert session.fetch_proxy_trace() == [] - - def test_fetch_proxy_trace_with_data(self): - from openenv.core.harness.agents.cli_driver import CLIAgentSession - - spec = _make_test_spec() - sbx = FakeSandbox() - trace_path = "/logs/proxy_trace.jsonl" - sbx.written[trace_path] = ( - json.dumps({"turn": 1, "latency_s": 0.5}) - + "\n" - + json.dumps({"turn": 2, "latency_s": 0.3}) - + "\n" - 
) - session = CLIAgentSession( - spec=spec, - sandbox=sbx, - task=FakeTask(), - config=FakeConfig(), - proxy_trace_path=trace_path, - ) - trace = session.fetch_proxy_trace() - assert len(trace) == 2 - assert trace[0]["turn"] == 1 - def test_close_kills_sandbox_and_jobs(self): from openenv.core.harness.agents.cli_driver import CLIAgentSession spec = _make_test_spec() sbx = FakeSandbox() agent_job = FakeBgJob() - proxy_job = FakeBgJob() session = CLIAgentSession( spec=spec, @@ -729,12 +665,10 @@ def test_close_kills_sandbox_and_jobs(self): task=FakeTask(), config=FakeConfig(), agent_bg_job=agent_job, - proxy_bg_job=proxy_job, ) session.close() assert sbx._killed assert session._agent_bg_job is None - assert session._proxy_bg_job is None class TestCLIAgentSessionFactory: @@ -814,7 +748,6 @@ def test_spec_fields(self): "/home/user/.opencode/bin/opencode", "--version", ] - assert OPENCODE_SPEC.supports_logprob_proxy is True assert OPENCODE_SPEC.default_timeout_s == 900.0 assert OPENCODE_SPEC.mcp_config.method == "config_file" assert OPENCODE_SPEC.mcp_config.path_template is not None diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py index 3a89a3ce6..6626c1c59 100644 --- a/tests/envs/test_coding_agent_env.py +++ b/tests/envs/test_coding_agent_env.py @@ -24,7 +24,6 @@ from __future__ import annotations import os -import shlex import sys import pytest @@ -172,7 +171,7 @@ def test_build_agent_config_opencode() -> None: env = CodingAgentEnvironment() cfg = env._build_agent_config( agent="opencode", - mode="transparent_proxy", + mode="black_box", base_url="https://api.openai.com/v1", api_key="sk-test", model="gpt-4o-mini", @@ -182,9 +181,8 @@ def test_build_agent_config_opencode() -> None: max_tokens_cap=2048, ) assert isinstance(cfg, env._CodingAgentConfig) - assert cfg.proxy_disable_thinking is True - assert cfg.proxy_top_logprobs == 7 - assert cfg.proxy_max_tokens_cap == 2048 + assert cfg.model == "gpt-4o-mini" + assert 
cfg.agent_timeout_s == 123.0 def test_build_agent_config_pi() -> None: @@ -206,9 +204,9 @@ def test_build_agent_config_pi() -> None: assert cfg.thinking == "off" assert cfg.model == "zai-org/GLM-5.1" - cfg_proxy = env._build_agent_config( + cfg_gate = env._build_agent_config( agent="pi", - mode="transparent_proxy", + mode="interception_gate", base_url="https://router.huggingface.co/v1", api_key="hf_xxx", model="zai-org/GLM-5.1", @@ -217,7 +215,7 @@ def test_build_agent_config_pi() -> None: top_logprobs=5, max_tokens_cap=4096, ) - assert cfg_proxy.provider == "openai" + assert cfg_gate.provider == "huggingface" # --------------------------------------------------------------------------- @@ -234,7 +232,7 @@ def test_rollout_result_serializes_round_trip() -> None: reward=0.75, agent_exit_code=0, wall_s=12.5, - mode="transparent_proxy", + mode="black_box", setup_results=[CommandResult(cmd="pip install pandas", exit_code=0)], verify_results=[CommandResult(cmd="pytest", exit_code=1, stderr="boom")], proxy_turns=[ @@ -288,105 +286,6 @@ def test_coding_agent_task_coerce_rejects_unknown_type() -> None: CodingAgentTask.coerce(42) # type: ignore[arg-type] -def test_start_proxy_keeps_upstream_key_out_of_command() -> None: - """The proxy API key must be passed via env, not shell argv.""" - from coding_agent_env import CodingAgentConfig, CodingAgentSessionFactory - - class FakeExecResult: - exit_code = 0 - stdout = "ok" - stderr = "" - - class FakeBgJob: - def wait(self, timeout: float | None = None) -> int: - return 0 - - def kill(self) -> None: - pass - - class FakeSandbox: - sandbox_id = "fake-sandbox" - - def __init__(self) -> None: - self.started_cmd: str | None = None - self.started_envs: dict[str, str] | None = None - self.written: dict[str, str] = {} - - def exec(self, *args, **kwargs) -> FakeExecResult: - return FakeExecResult() - - def start_bg(self, cmd: str, *, envs=None, cwd=None) -> FakeBgJob: - self.started_cmd = cmd - self.started_envs = envs - return 
FakeBgJob() - - def write_text(self, path: str, content: str) -> None: - self.written[path] = content - - def read_text(self, path: str) -> str: - return "" - - def exists(self, path: str) -> bool: - return path in self.written - - def kill(self) -> None: - pass - - secret = "sk-test '$(leak)" - model = "provider/model'; touch /tmp/pwn #" - config = CodingAgentConfig( - base_url="https://example.test/v1?x='y", - api_key=secret, - model=model, - ) - sandbox = FakeSandbox() - factory = CodingAgentSessionFactory( - config=config, - sandbox_backend=object(), # unused by this protected-method test - mode="transparent_proxy", - ) - - # _start_proxy delegates to CLIAgentDriver._start_proxy which runs the - # proxy inside the sandbox. The driver handles dep install + source upload. - factory._start_proxy(sandbox) - - assert sandbox.started_cmd is not None - assert sandbox.started_envs == {"OPENCODE_UPSTREAM_API_KEY": secret} - assert secret not in sandbox.started_cmd - assert "--upstream-api-key" not in sandbox.started_cmd - - argv = shlex.split(sandbox.started_cmd.split("&&", 1)[1].split(">", 1)[0].strip()) - assert argv[argv.index("--upstream-url") + 1] == config.base_url - assert argv[argv.index("--model-override") + 1] == model - - -def test_interception_cli_reads_upstream_key_from_env( - monkeypatch: pytest.MonkeyPatch, -) -> None: - from openenv.core.harness.sandbox import interception - - captured = {} - - def fake_serve(cfg) -> None: - captured["cfg"] = cfg - - monkeypatch.setattr(interception, "serve", fake_serve) - monkeypatch.setenv("OPENCODE_UPSTREAM_API_KEY", "sk-from-env") - monkeypatch.setattr( - sys, - "argv", - [ - "interception.py", - "--upstream-url", - "https://example.test/v1", - ], - ) - - interception.main() - - assert captured["cfg"].upstream_api_key == "sk-from-env" - - # --------------------------------------------------------------------------- # Integration — only runs when E2B + endpoint creds are present and the # user explicitly opts in via 
``pytest -m integration``. @@ -447,7 +346,8 @@ async def _go() -> RolloutResult: assert result.reward == 1.0, ( f"expected reward=1.0 got {result.reward}: {result.error}" ) - assert result.proxy_turns, "expected at least one captured LLM turn" + # proxy_turns is now always empty — logprob capture is trainer-owned + # via interception_gate mode, not captured by the environment. assert any(f.endswith("/binary_search.py") for f in result.files), ( f"expected binary_search.py in workdir, got {list(result.files)}" ) From 4b1b707e749f900706b1d5677335030c15fb19c6 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 02:24:35 +0530 Subject: [PATCH 15/35] chore: address greptile review comments --- .../server/coding_environment.py | 50 ++++++++++--------- src/openenv/core/harness/agents/cli_driver.py | 2 - 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index 0c8598cdd..ceee49002 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -98,12 +98,14 @@ def __init__(self) -> None: from openenv.core.harness.agents import get_agent_spec from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory - from coding_agent_env import ( - E2BSandboxBackend, - CodingAgentConfig, - CodingAgentSessionFactory, - CodingAgentTask, - ) + from coding_agent_env.config import CodingAgentConfig + from coding_agent_env.harness import CodingAgentSessionFactory + from coding_agent_env.task import CodingAgentTask + + try: + from openenv.core.harness.sandbox import E2BSandboxBackend + except ImportError: + E2BSandboxBackend = None # type: ignore[assignment,misc] self._CommandResult = CommandResult self._RolloutResult = RolloutResult @@ -330,14 +332,18 @@ def _emit(msg: str) -> None: max_tokens_cap=max_tokens_cap, ) - # Concatenate setup commands into a 
single ``set -e`` script and let - # the primitive run it as ``task.setup_shell`` before the agent - # starts. The per-command tracking happens here too — we re-run - # each command in a wrapper that captures exit/stdout/stderr. - # That way the primitive still aborts on setup failure AND we get - # observability in the response. + # Concatenate setup commands into a single ``set -e`` script so the + # primitive runs them inside _bootstrap_sandbox BEFORE the agent + # starts. This avoids the race where the agent's first tool call + # depends on files or packages that setup is still installing. + setup_shell: str | None = None + if setup: + # ``set -e`` makes the script abort on the first failing command. + setup_shell = "set -e\n" + "\n".join(setup) + rollout_task = self._CodingAgentTask( instruction=instruction, + setup_shell=setup_shell, metadata={"task_id": task_id, "agent": agent}, ) @@ -361,23 +367,21 @@ def _emit(msg: str) -> None: result.sandbox_id = session.sandbox.sandbox_id _emit(f"sandbox ready: {result.sandbox_id} — agent started (mode={mode})") - # Run setup commands one at a time, *before* the agent starts. - # The factory has already started the agent in start_agent() - # during create(); to keep the order "setup → agent → verify" - # we'd need to restructure. As a pragmatic compromise we run - # setup IMMEDIATELY after create(), which races with the agent - # for ~1-2s but is fine for typical pip/git/download work - # because most agent CLIs take a while before their first model - # call. + # Re-run setup commands individually for per-command + # observability in the response. The commands already ran + # atomically via setup_shell above, so these re-runs are + # idempotent — they exist only to populate + # result.setup_results with per-command exit/stdout/stderr. 
for i, cmd in enumerate(setup, 1): - _emit(f"setup [{i}/{len(setup)}]: {cmd[:80]}") cr = self._exec_command(session.sandbox, cmd) result.setup_results.append(cr) if cr.exit_code != 0: + # Should not happen — setup_shell already succeeded + # during bootstrap, but record it for diagnostics. result.error = ( - f"setup command failed (exit {cr.exit_code}): {cmd[:120]}" + f"setup replay failed (exit {cr.exit_code}): {cmd[:120]}" ) - _emit(f"setup FAILED at [{i}]: exit={cr.exit_code}") + _emit(f"setup replay FAILED at [{i}]: exit={cr.exit_code}") break # Block until the agent is done (or setup already failed). diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 0e58af9e0..1d934777d 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -458,8 +458,6 @@ def _exec_with_retry( last_stdout = r.stdout or "" last_stderr = r.stderr or "" last_exit = r.exit_code - if last_stderr.strip(): - break except Exception as exc: last_stderr = f"{type(exc).__name__}: {exc}" last_exit = -1 From a478fa8bf0c19384c8c7bf3eae74d2206d8ca9ae Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 20:35:47 +0530 Subject: [PATCH 16/35] refactor: extract sandbox bootstrap to driver and fix interception races --- envs/coding_agent_env/README.md | 34 ++--- envs/coding_agent_env/client.py | 9 +- envs/coding_agent_env/harness.py | 54 ++------ envs/coding_agent_env/models.py | 11 +- envs/coding_agent_env/pyproject.toml | 4 +- .../server/coding_environment.py | 69 ++++++---- examples/coding_agent_env_simple.py | 26 +--- src/openenv/core/harness/agents/cli_driver.py | 40 ++++-- .../harness/agents/interception_server.py | 101 ++++++++++---- src/openenv/core/harness/agents/opencode.py | 11 +- src/openenv/core/harness/agents/pi.py | 10 +- src/openenv/core/harness/sandbox/_util.py | 12 ++ .../core/harness/sandbox/docker_backend.py | 43 +++--- 
.../core/harness/sandbox/hf_backend.py | 22 ++-- tests/core/test_cli_agent_driver.py | 114 ++++++++++++++++ tests/core/test_docker_sandbox_backend.py | 4 +- tests/core/test_hf_sandbox_backend.py | 6 +- tests/core/test_interception_server.py | 124 ++++++++++++++++++ tests/envs/test_coding_agent_env.py | 52 ++++++++ 19 files changed, 541 insertions(+), 205 deletions(-) create mode 100644 src/openenv/core/harness/sandbox/_util.py create mode 100644 tests/core/test_interception_server.py diff --git a/envs/coding_agent_env/README.md b/envs/coding_agent_env/README.md index 11fb88188..7825e5c25 100644 --- a/envs/coding_agent_env/README.md +++ b/envs/coding_agent_env/README.md @@ -9,7 +9,7 @@ app_port: 8000 base_path: /web tags: - openenv -short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B with logprob capture +short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B --- # Coding Agent Environment for OpenEnv @@ -17,13 +17,13 @@ short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B with lo `coding_agent_env` runs coding-agent harnesses (currently [OpenCode](https://opencode.ai) and [Pi](https://github.com/badlogic/pi-mono)) inside an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible -LLM endpoint, optionally capturing per-token logprobs for GRPO training. +LLM endpoint with optional trainer-owned interception for RL training. **🚀 Try it live**: [`AdithyaSK/coding-agent-env`](https://huggingface.co/spaces/AdithyaSK/coding-agent-env) The deployed Space exposes: -- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward + logprobs. +- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward. - **MCP tool API** at [`/mcp`](https://adithyask-coding-agent-env.hf.space/mcp) — programmatic `run_rollout` calls. 
- **OpenAPI docs** at [`/docs`](https://adithyask-coding-agent-env.hf.space/docs). - **Health** at [`/health`](https://adithyask-coding-agent-env.hf.space/health). @@ -83,7 +83,6 @@ async def main(): result = RolloutResult.model_validate_json(_extract_text(raw)) print("reward:", result.reward) - print("turns:", len(result.proxy_turns)) print("files:", list(result.files.keys())) print("wall:", result.wall_s, "s") @@ -95,7 +94,6 @@ Expected output (~20s with the prebaked template): ``` reward: 1.0 -turns: 3 files: ['/home/user/workdir/binary_search.py', ...] wall: 19.8 s ``` @@ -134,11 +132,10 @@ factory = CodingAgentSessionFactory( model="gpt-4o-mini", ), sandbox_backend=E2BSandboxBackend(), - mode="transparent_proxy", # captures per-token logprobs + mode="interception_gate", # trainer-owned interception mode ) session = factory.create(task=CodingAgentTask(instruction="...")) session.wait_for_completion() -turns = session.fetch_proxy_trace() # per-turn (tokens, logprobs) session.close() ``` @@ -195,23 +192,23 @@ directly. | `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. | | `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. | | `task_id` | `str` | `""` | Echoed back in result. | -| `mode` | `str` | `"transparent_proxy"` | Or `"black_box"` (no logprobs). | +| `mode` | `str` | `"black_box"` | Or `"interception_gate"` for trainer-owned generation. | | `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. | | `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. | -| `top_logprobs` | `int` | `5` | HF Router cap is 5; OpenAI 0–20; vLLM unbounded. | +| `top_logprobs` | `int` | `5` | Reserved for trainer-owned interception workflows. | | `agent_timeout_s` | `float` | `600.0` | Hard wall budget for the selected harness. | | `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. 
| Returns `RolloutResult` JSON with: `reward`, `setup_results[]`, -`verify_results[]`, `proxy_turns[]`, `files{}`, `agent_log_tail`, +`verify_results[]`, `files{}`, `agent_log_tail`, `proxy_log_tail`, `wall_s`, `agent_exit_code`, `sandbox_id`, `error`. ## Two Operating Modes | Mode | What it does | Best for | |---|---|---| -| **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards harness LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. | -| **`black_box`** | No proxy. The selected harness talks straight to `base_url`. | Smoke tests, eval, SFT data collection. | +| **`black_box`** (default) | The selected harness talks directly to `base_url`. | Smoke tests, eval, SFT data collection. | +| **`interception_gate`** | Agent calls are routed through trainer-host interception endpoints. Trainer owns forward pass + trajectory capture. | RL training with trainer-owned generation. | ## Environment Variables @@ -230,21 +227,17 @@ sibling `.env` file; on HF Spaces, set them as **Space secrets**. | **OpenAI endpoint** | | | | `OPENAI_API_KEY` | required for `endpoint="openai"` | Standard OpenAI key. | | `OPENAI_BASE_URL` | no | Defaults to `https://api.openai.com/v1`. | -| `OPENAI_MODEL` | no | Defaults to `gpt-4o-mini` (gpt-5.x and o-series refuse logprobs). | +| `OPENAI_MODEL` | no | Defaults to `gpt-4o-mini`. | | **HF Router endpoint** | | | | `HF_ROUTER_API_KEY` | required for `endpoint="hf_router"` | HF user token. | | `HF_ROUTER_BASE_URL` | no | Defaults to `https://router.huggingface.co/v1`. | | `HF_ROUTER_MODEL` | no | Defaults to `Qwen/Qwen3-4B-Instruct-2507:nscale`. | -Pick `provider:` suffixes that actually return logprobs: -**Together / Nscale / Scaleway / SambaNova / Cerebras**. Avoid Novita / -Hyperbolic / Featherless (silent drop) and Groq (HTTP 400). 
## Pre-baked E2B Template The first rollout in a fresh E2B sandbox spends ~2 min installing -harness tooling and the proxy's Python deps. Build a one-time template that -ships those pre-installed: +harness tooling. Build a one-time template that ships those pre-installed: ```bash .venv/bin/python envs/coding_agent_env/sandbox/build_template.py @@ -290,7 +283,8 @@ src/openenv/core/harness/sandbox/ ├── base.py # SandboxBackend / SandboxHandle protocols ├── e2b_backend.py # E2B implementation ├── docker_backend.py # local Docker backend -└── interception.py # in-sandbox FastAPI proxy (logprob capture) +├── hf_backend.py # HF sandbox backend +└── _util.py # shared sandbox shell utilities ``` ## References @@ -299,4 +293,4 @@ src/openenv/core/harness/sandbox/ - [OpenCode CLI](https://opencode.ai/docs/cli/) - [Pi](https://github.com/badlogic/pi-mono) - [E2B Python SDK](https://e2b.dev/docs) -- [HF Inference Providers logprob matrix](../../../DOCS/HF/hf_inference_providers_logprobs.md) + diff --git a/envs/coding_agent_env/client.py b/envs/coding_agent_env/client.py index c1e0f6f92..492060a25 100644 --- a/envs/coding_agent_env/client.py +++ b/envs/coding_agent_env/client.py @@ -25,7 +25,7 @@ verify=["python /home/user/test.py"], task_id="binary_search_v1", ) - print(result.reward, len(result.proxy_turns)) + print(result.reward) """ from __future__ import annotations @@ -95,15 +95,14 @@ def run_rollout( requests. Needed for Qwen3.5 vLLM; harmless on Instruct variants; rejected by OpenAI direct. max_tokens_cap: Clamp on per-turn ``max_tokens``. - top_logprobs: Top-k logprobs requested upstream. HF Router caps - at 5; OpenAI accepts up to 20; vLLM is unbounded. + top_logprobs: Reserved for trainer-owned interception workflows. agent_timeout_s: Hard wall-clock budget for one agent run. template: E2B template name (e.g. ``"coding-agent-rl"``). Empty string uses the default (slow) base image. 
Returns: - A :class:`RolloutResult` with reward, per-turn logprobs, file - outputs, setup/verify results, and diagnostic tails. + A :class:`RolloutResult` with reward, file outputs, + setup/verify results, and diagnostic tails. """ raw = self.call_tool( "run_rollout", diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py index 295b07ac3..2355260f5 100644 --- a/envs/coding_agent_env/harness.py +++ b/envs/coding_agent_env/harness.py @@ -24,12 +24,7 @@ from .opencode_runtime import ( agent_log_path, build_env_vars, - build_install_cmd, - build_opencode_json, build_run_cmd, - instruction_path, - opencode_config_path, - system_prompt_path, ) from .task import CodingAgentTask @@ -87,10 +82,7 @@ def __init__( raise ValueError(f"Unknown mode: {mode!r}") self._config = config self._backend = sandbox_backend - self._mode = mode self._verifier = verifier - self._install_timeout_s = install_timeout_s - self._setup_timeout_s = setup_timeout_s self._driver = CLIAgentDriver( spec=OPENCODE_SPEC, sandbox_backend=sandbox_backend, @@ -111,6 +103,16 @@ def create( _log = logging.getLogger(__name__) oc_task = CodingAgentTask.coerce(task) + setup_parts: list[str] = [] + if self._config.extra_setup_shell: + setup_parts.append(self._config.extra_setup_shell) + if oc_task.setup_shell: + setup_parts.append(oc_task.setup_shell) + if setup_parts: + oc_task = oc_task.model_copy( + update={"setup_shell": "set -e\n" + "\n".join(setup_parts)} + ) + sandbox_timeout = int(self._config.agent_timeout_s) + 300 sandbox = self._backend.create( timeout_s=sandbox_timeout, @@ -132,41 +134,7 @@ def create( return session def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: CodingAgentTask) -> None: - self._driver._wait_for_sandbox_ready(sandbox) - if not self._driver._agent_already_installed(sandbox): - self._driver._exec_with_retry( - sandbox, - build_install_cmd(self._config), - timeout=self._install_timeout_s, - attempts=3, - backoff_s=3.0, - label="opencode install", - ) 
- sandbox.write_text( - opencode_config_path(self._config), build_opencode_json(self._config) - ) - sandbox.write_text(instruction_path(self._config), task.instruction) - if self._config.system_prompt: - sandbox.write_text( - system_prompt_path(self._config), self._config.system_prompt - ) - for remote_path, content in task.upload_files.items(): - sandbox.write_text(remote_path, content) - if self._config.extra_setup_shell: - self._driver._exec_with_retry( - sandbox, - self._config.extra_setup_shell, - timeout=self._setup_timeout_s, - attempts=2, - backoff_s=2.0, - label="extra_setup_shell", - ) - if task.setup_shell: - r = sandbox.exec(task.setup_shell, timeout=self._setup_timeout_s) - if r.exit_code != 0: - raise RuntimeError( - f"task.setup_shell failed ({r.exit_code}): {r.stderr}" - ) + self._driver.bootstrap_sandbox(sandbox, task, self._config) __all__ = [ diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py index 821b1bd57..2111d84d5 100644 --- a/envs/coding_agent_env/models.py +++ b/envs/coding_agent_env/models.py @@ -21,7 +21,7 @@ class RolloutTurn(BaseModel): - """One intercepted LLM turn captured by the in-sandbox proxy (Mode B).""" + """One intercepted LLM turn shape (trainer-owned in interception_gate mode).""" turn: int finish_reason: str | None = None @@ -45,11 +45,7 @@ class CommandResult(BaseModel): class RolloutResult(BaseModel): - """Full payload returned from one ``run_rollout`` invocation. - - The trainer (or any client) decodes this from the MCP tool result JSON - and feeds ``proxy_turns`` + ``reward`` into GRPO. - """ + """Full payload returned from one ``run_rollout`` invocation.""" # Identifiers task_id: str = "" @@ -65,7 +61,8 @@ class RolloutResult(BaseModel): setup_results: list[CommandResult] = Field(default_factory=list) verify_results: list[CommandResult] = Field(default_factory=list) - # Per-turn LLM trajectory (empty in black_box mode) + # Per-turn LLM trajectory placeholder. 
Capture is trainer-owned in + # interception_gate mode; environment currently leaves this empty. proxy_turns: list[RolloutTurn] = Field(default_factory=list) # Filesystem the agent produced (path -> contents, truncated) diff --git a/envs/coding_agent_env/pyproject.toml b/envs/coding_agent_env/pyproject.toml index 276d3e0be..d935a0bf5 100644 --- a/envs/coding_agent_env/pyproject.toml +++ b/envs/coding_agent_env/pyproject.toml @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta" [project] name = "openenv-coding-agent-env" version = "0.1.0" -description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints, optionally capturing per-token logprobs." +description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints." requires-python = ">=3.10" dependencies = [ # Core OpenEnv (server + MCP). 0.3.0 ships the harness runtime. @@ -26,7 +26,7 @@ dependencies = [ # behavior drift on Space rebuilds. "gradio>=6.0.0", - # OpenCode harness primitive — sandbox + proxy + agent driver + # OpenCode harness primitive — sandbox + agent driver "httpx>=0.27.0", "e2b>=1.0.0", ] diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index ceee49002..af70b292e 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -15,8 +15,8 @@ Reward = ``passed_verify_commands / total`` unless a verify command writes a float to ``/home/user/logs/verifier/reward.txt`` (override). -Returns a JSON-serialized :class:`RolloutResult` with reward + per-turn -logprobs (Mode B) + setup/verify command results + file outputs. +Returns a JSON-serialized :class:`RolloutResult` with reward, +setup/verify command results, and file outputs. 
""" from __future__ import annotations @@ -184,6 +184,11 @@ def run_rollout( raise ValueError( f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}" ) + if mode not in {"black_box", "interception_gate"}: + raise ValueError( + "unsupported mode {!r}; supported modes: ('black_box', " + "'interception_gate')".format(mode) + ) if not (base_url and api_key and model): raise ValueError( "must provide either ``endpoint`` (one of " @@ -303,6 +308,12 @@ def _emit(msg: str) -> None: except Exception: pass + if mode not in {"black_box", "interception_gate"}: + raise ValueError( + "unsupported mode {!r}; supported modes: ('black_box', " + "'interception_gate')".format(mode) + ) + result = self._RolloutResult(task_id=task_id, mode=mode) t0 = time.time() @@ -347,18 +358,17 @@ def _emit(msg: str) -> None: metadata={"task_id": task_id, "agent": agent}, ) - factory = self._build_session_factory( - agent=agent, - config=config, - mode=mode, - template=template, - disable_thinking=disable_thinking, - top_logprobs=top_logprobs, - max_tokens_cap=max_tokens_cap, - ) - session = None try: + factory = self._build_session_factory( + agent=agent, + config=config, + mode=mode, + template=template, + disable_thinking=disable_thinking, + top_logprobs=top_logprobs, + max_tokens_cap=max_tokens_cap, + ) _emit( f"creating E2B sandbox (template={template or 'default'}) — " "this is the slow phase (~5–60s cold, ~5s with template)" @@ -367,24 +377,22 @@ def _emit(msg: str) -> None: result.sandbox_id = session.sandbox.sandbox_id _emit(f"sandbox ready: {result.sandbox_id} — agent started (mode={mode})") - # Re-run setup commands individually for per-command - # observability in the response. The commands already ran - # atomically via setup_shell above, so these re-runs are - # idempotent — they exist only to populate - # result.setup_results with per-command exit/stdout/stderr. 
- for i, cmd in enumerate(setup, 1): - cr = self._exec_command(session.sandbox, cmd) - result.setup_results.append(cr) - if cr.exit_code != 0: - # Should not happen — setup_shell already succeeded - # during bootstrap, but record it for diagnostics. - result.error = ( - f"setup replay failed (exit {cr.exit_code}): {cmd[:120]}" + # setup commands already ran atomically during sandbox bootstrap. + # Avoid re-running them here because many setup scripts are not + # idempotent (e.g., migrations, one-shot installs, destructive prep). + # We still surface per-command bookkeeping for callers. + for cmd in setup: + result.setup_results.append( + self._CommandResult( + cmd=cmd, + exit_code=0, + stdout="executed during bootstrap", + stderr="", + duration_s=0.0, ) - _emit(f"setup replay FAILED at [{i}]: exit={cr.exit_code}") - break + ) - # Block until the agent is done (or setup already failed). + # Block until the agent is done. if result.error is None: _emit( f"agent running — {agent} CLI in sandbox " @@ -498,6 +506,11 @@ def _build_session_factory( top_logprobs: int, max_tokens_cap: int, ) -> Any: + if self._E2BSandboxBackend is None: + raise RuntimeError( + "E2BSandboxBackend unavailable: install optional dependency 'e2b'." + ) + backend_kwargs: dict[str, Any] = {} if template: backend_kwargs["template"] = template diff --git a/examples/coding_agent_env_simple.py b/examples/coding_agent_env_simple.py index f8996e586..caf81bad8 100644 --- a/examples/coding_agent_env_simple.py +++ b/examples/coding_agent_env_simple.py @@ -14,12 +14,9 @@ 1. Spawns a fresh E2B sandbox (using the prebaked ``coding-agent-rl`` template — falls back to a cold install if the template isn't present in your E2B account). - 2. Bootstraps an in-sandbox FastAPI proxy that captures per-token - logprobs (``mode="transparent_proxy"``). - 3. Runs the selected harness CLI with the instruction. - 4. Executes the verify bash commands; reward = passed / total. - 5. 
Returns a ``RolloutResult`` with reward + per-turn logprobs + - the file contents the agent produced. + 2. Runs the selected harness CLI with the instruction. + 3. Executes the verify bash commands; reward = passed / total. + 4. Returns a ``RolloutResult`` with reward + produced file contents. Prerequisites ------------- @@ -34,7 +31,6 @@ Expected output (~20s with the prebaked template):: reward: 1.0 - turns: 3 files: ['/home/user/workdir/binary_search.py', ...] wall: 19.8 s """ @@ -54,7 +50,9 @@ from coding_agent_env.models import RolloutResult # noqa: E402 -SPACE = os.environ.get("CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space") +SPACE = os.environ.get( + "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space" +) INSTRUCTION = ( "Create a single Python file named `binary_search.py` in the current " @@ -109,8 +107,6 @@ async def main() -> int: print("--- result ---") print(f"reward: {result.reward}") - print(f"turns: {len(result.proxy_turns)}") - print(f"tokens: {sum(len(t.completion_tokens) for t in result.proxy_turns)}") print(f"sandbox: {result.sandbox_id}") print(f"wall_s: {result.wall_s}") print(f"files: {sorted(result.files)}") @@ -118,16 +114,6 @@ async def main() -> int: if result.error: print(f"error: {result.error}") - if result.proxy_turns: - first = next((t for t in result.proxy_turns if t.completion_tokens), None) - if first: - print() - print("--- first productive turn (first 8 tokens with logprobs) ---") - toks = first.completion_tokens[:8] - lps = first.per_token_logps[:8] - for tok, lp in zip(toks, lps): - print(f" {tok!r:<14} {lp:+.3f}") - return 0 if result.reward == 1.0 else 1 diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 1d934777d..42161bfec 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -21,7 +21,6 @@ import json import logging import shlex -import threading import time 
import uuid from typing import Any, Callable, Literal @@ -117,11 +116,14 @@ def close(self) -> None: def wait_for_completion(self, timeout_s: float | None = None) -> int: """Block until the agent exits, returning its exit code.""" - budget = timeout_s if timeout_s is not None else self.spec.default_timeout_s - if hasattr(self.config, "agent_timeout_s"): - budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s if self._agent_bg_job is None: raise RuntimeError("Agent not started.") + default_timeout = ( + self.config.agent_timeout_s + if hasattr(self.config, "agent_timeout_s") + else self.spec.default_timeout_s + ) + budget = timeout_s if timeout_s is not None else default_timeout return self._agent_bg_job.wait(timeout=budget) def collect_artifacts(self) -> dict[str, Any]: @@ -189,17 +191,19 @@ async def next_request( self._interception_queue.get(), timeout=min(remaining, 1.0), ) - return server.intercepts[request_id] + intercept = server.get_intercept(request_id) + if intercept is not None: + return intercept except asyncio.TimeoutError: - if self._agent_bg_job is not None: - done_event = getattr(self._agent_bg_job, "_done", None) - if ( - done_event is not None - and isinstance(done_event, threading.Event) - and done_event.is_set() - ): - return None - continue + pass + + if self._agent_bg_job is not None: + try: + self._agent_bg_job.wait(timeout=0) + return None + except TimeoutError: + pass + continue async def deliver( self, intercept: dict[str, Any], response_dict: dict[str, Any] @@ -241,6 +245,14 @@ def __init__( self._interception_server = interception_server self._interception_base_url = interception_base_url + def bootstrap_sandbox(self, sandbox: SandboxHandle, task: Any, config: Any) -> None: + """Public bootstrap hook used by external wrappers. + + Runs readiness checks, optional install, file upload, MCP config write, + and task setup shell execution. 
+ """ + self._bootstrap_sandbox(sandbox, task, config) + def create_session( self, task: Any, diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index a075ec8b4..1aa4edf57 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -36,7 +36,9 @@ while True: request_id = await asyncio.wait_for(queue.get(), timeout=...) - intercept = server.intercepts[request_id] + intercept = server.get_intercept(request_id) + if intercept is None: + continue response = await vllm.generate(intercept["messages"], ...) await deliver_response(intercept, response) @@ -51,6 +53,7 @@ import json import logging import secrets +import threading import time import uuid from typing import Any @@ -71,13 +74,20 @@ class InterceptionServer: identified by a ``rollout_id`` in the URL path. """ - def __init__(self, port: int = 0, secret: str | None = None) -> None: + def __init__( + self, + port: int = 0, + secret: str | None = None, + host: str = "127.0.0.1", + ) -> None: self.port = port + self.host = host self.secret = secret or secrets.token_urlsafe(32) self._app: web.Application | None = None self._runner: web.AppRunner | None = None self._site: web.TCPSite | None = None self._lock = asyncio.Lock() + self._state_lock = threading.RLock() self.active_rollouts: dict[str, dict[str, Any]] = {} self.intercepts: dict[str, dict[str, Any]] = {} @@ -93,7 +103,9 @@ async def start(self) -> None: app.router.add_get("/health", self._handle_health) runner = web.AppRunner(app) await runner.setup() - site = web.TCPSite(runner, "0.0.0.0", self.port) + if self.host == "0.0.0.0": + _log.warning("InterceptionServer exposed on all interfaces (0.0.0.0).") + site = web.TCPSite(runner, self.host, self.port) await site.start() if self.port == 0: server = getattr(site, "_server", None) @@ -111,7 +123,11 @@ async def stop(self) -> None: async with self._lock: if self._runner 
is None: return - for intercept in list(self.intercepts.values()): + with self._state_lock: + intercepts = list(self.intercepts.values()) + self.intercepts.clear() + self.active_rollouts.clear() + for intercept in intercepts: fut: asyncio.Future | None = intercept.get("response_future") if fut and not fut.done(): fut.cancel() @@ -121,8 +137,6 @@ async def stop(self) -> None: cq.put_nowait(None) except asyncio.QueueFull: pass - self.intercepts.clear() - self.active_rollouts.clear() try: await self._runner.cleanup() except RuntimeError: @@ -137,27 +151,39 @@ def register_rollout( state: dict[str, Any] | None = None, ) -> asyncio.Queue: queue: asyncio.Queue = asyncio.Queue() - self.active_rollouts[rollout_id] = { - "request_id_queue": queue, - "state": state, - } + with self._state_lock: + self.active_rollouts[rollout_id] = { + "request_id_queue": queue, + "state": state, + } return queue def unregister_rollout(self, rollout_id: str) -> None: - for request_id in list(self.intercepts): - intercept = self.intercepts.get(request_id) - if intercept and intercept.get("rollout_id") == rollout_id: - fut: asyncio.Future | None = intercept.get("response_future") - if fut and not fut.done(): - fut.cancel() - cq: asyncio.Queue | None = intercept.get("chunk_queue") - if cq is not None: - try: - cq.put_nowait(None) - except asyncio.QueueFull: - pass + with self._state_lock: + matching_ids = [ + request_id + for request_id, intercept in self.intercepts.items() + if intercept.get("rollout_id") == rollout_id + ] + matching_intercepts = [self.intercepts[i] for i in matching_ids] + for request_id in matching_ids: del self.intercepts[request_id] - self.active_rollouts.pop(rollout_id, None) + self.active_rollouts.pop(rollout_id, None) + + for intercept in matching_intercepts: + fut: asyncio.Future | None = intercept.get("response_future") + if fut and not fut.done(): + fut.cancel() + cq: asyncio.Queue | None = intercept.get("chunk_queue") + if cq is not None: + try: + cq.put_nowait(None) 
+ except asyncio.QueueFull: + pass + + def get_intercept(self, request_id: str) -> dict[str, Any] | None: + with self._state_lock: + return self.intercepts.get(request_id) def _authorized(self, request: web.Request) -> bool: auth = request.headers.get("Authorization", "") @@ -176,7 +202,8 @@ async def _handle_chat_completions( return web.json_response({"error": "Unauthorized"}, status=401) rollout_id = request.match_info["rollout_id"] - context = self.active_rollouts.get(rollout_id) + with self._state_lock: + context = self.active_rollouts.get(rollout_id) if not context: return web.json_response({"error": "rollout not found"}, status=404) @@ -197,11 +224,16 @@ async def _handle_chat_completions( "tools": body.get("tools"), "stream": is_streaming, "chunk_queue": chunk_queue, - "response_future": asyncio.get_event_loop().create_future(), + "response_future": asyncio.get_running_loop().create_future(), "body": body, } - self.intercepts[request_id] = intercept - await context["request_id_queue"].put(request_id) + with self._state_lock: + context = self.active_rollouts.get(rollout_id) + if context is None: + return web.json_response({"error": "rollout not found"}, status=404) + self.intercepts[request_id] = intercept + request_queue: asyncio.Queue = context["request_id_queue"] + await request_queue.put(request_id) if is_streaming: return await self._stream_response(request, intercept) @@ -210,8 +242,12 @@ async def _handle_chat_completions( response_dict = await intercept["response_future"] except asyncio.CancelledError: return web.json_response({"error": "rollout cancelled"}, status=499) - except Exception as exc: - return web.json_response({"error": str(exc)}, status=500) + except Exception: + _log.exception("interception request %s failed", request_id) + return web.json_response({"error": "internal error"}, status=500) + finally: + with self._state_lock: + self.intercepts.pop(request_id, None) return web.json_response(response_dict) @@ -249,6 +285,13 @@ async def 
_stream_response( finally: if get_task and not get_task.done(): get_task.cancel() + fut: asyncio.Future | None = intercept.get("response_future") + if fut and not fut.done(): + fut.cancel() + request_id = intercept.get("request_id") + if isinstance(request_id, str): + with self._state_lock: + self.intercepts.pop(request_id, None) try: await resp.write_eof() except Exception: diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py index 13c17fa04..9a829c3e2 100644 --- a/src/openenv/core/harness/agents/opencode.py +++ b/src/openenv/core/harness/agents/opencode.py @@ -19,6 +19,7 @@ from __future__ import annotations import json +import shlex from typing import Any from . import register_agent @@ -39,11 +40,15 @@ def _build_opencode_command( log_file = f"{home}/logs/agent/opencode.jsonl" workdir = f"{home}/workdir" + workdir_q = shlex.quote(workdir) + instruction_q = shlex.quote(instruction_file) + log_q = shlex.quote(log_file) + return ( f'export PATH="$HOME/.opencode/bin:$PATH" && ' - f"cd {workdir} && git init -q 2>/dev/null; " - f'opencode run {format_flag} "$(cat {instruction_file})" ' - f"2>&1 | tee {log_file}" + f"cd {workdir_q} && git init -q 2>/dev/null; " + f'opencode run {format_flag} "$(cat {instruction_q})" ' + f"2>&1 | tee {log_q}" ).strip() diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py index d7b60569f..dcc552842 100644 --- a/src/openenv/core/harness/agents/pi.py +++ b/src/openenv/core/harness/agents/pi.py @@ -61,12 +61,16 @@ def _build_command( if hasattr(config, "thinking") and config.thinking: thinking = f" --thinking {shlex.quote(config.thinking)}" + workdir_q = shlex.quote(workdir) + instruction_q = shlex.quote(instruction_file) + log_q = shlex.quote(log_file) + return ( - f"cd {workdir} && git init -q 2>/dev/null; " + f"cd {workdir_q} && git init -q 2>/dev/null; " f"pi --no-session --no-context-files" f"{provider}{model}{thinking}" - f" -p 
@{instruction_file}" - f" 2>&1 | tee {log_file}" + f" -p @{instruction_q}" + f" 2>&1 | tee {log_q}" ) diff --git a/src/openenv/core/harness/sandbox/_util.py b/src/openenv/core/harness/sandbox/_util.py new file mode 100644 index 000000000..6291b0fb3 --- /dev/null +++ b/src/openenv/core/harness/sandbox/_util.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from __future__ import annotations + + +def shell_quote(s: str) -> str: + """Single-quote a string for shell, escaping embedded single quotes.""" + return "'" + s.replace("'", "'\\''") + "'" diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py index 559817d1b..28447ce2e 100644 --- a/src/openenv/core/harness/sandbox/docker_backend.py +++ b/src/openenv/core/harness/sandbox/docker_backend.py @@ -31,6 +31,7 @@ import uuid from pathlib import PurePosixPath +from openenv.core.harness.sandbox._util import shell_quote from openenv.core.harness.sandbox.base import BgJob, ExecResult _log = logging.getLogger(__name__) @@ -45,12 +46,14 @@ class DockerBgJob: """ def __init__( - self, container_id: str, pid: int, poll_thread: threading.Thread + self, + container_id: str, + pid: int, + poll_thread: threading.Thread | None = None, ) -> None: self._container_id = container_id self._pid = pid self._exit_code: int | None = None - self._error: BaseException | None = None self._done = threading.Event() self._poll_thread = poll_thread @@ -63,8 +66,6 @@ def wait(self, timeout: float | None = None) -> int: raise TimeoutError( f"Background command (pid={self._pid}) did not exit within {timeout}s" ) - if self._error is not None: - raise self._error return self._exit_code if self._exit_code is not None else 0 def kill(self) -> None: @@ -127,8 +128,8 @@ def start_bg( envs: dict[str, str] | None = 
None, cwd: str | None = None, ) -> BgJob: - marker = f"/tmp/.bg_{uuid.uuid4().hex[:8]}" - wrapped = f"bash -c {_shell_quote(cmd + f'; echo $? > {marker}')} &\necho $!" + marker = f"/tmp/.bg_{uuid.uuid4().hex}" + wrapped = f"bash -c {shell_quote(cmd + f'; echo $? > {marker}')} &\necho $!" docker_cmd = self._build_exec_cmd(envs=envs, cwd=cwd) docker_cmd.extend(["bash", "-c", wrapped]) result = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=10) @@ -147,7 +148,7 @@ def start_bg( ) pid = int(pid_line) - job = DockerBgJob(self._container_id, pid, poll_thread=None) # type: ignore[arg-type] + job = DockerBgJob(self._container_id, pid) poll_thread = threading.Thread( target=self._poll_bg_job, args=(job, marker), @@ -174,7 +175,7 @@ def write_text(self, path: str, content: str) -> None: self._container_id, "bash", "-c", - f"cat > {_shell_quote(path)}", + f"cat > {shell_quote(path)}", ], input=content.encode(), capture_output=True, @@ -233,6 +234,7 @@ def _build_exec_cmd( return cmd def _poll_bg_job(self, job: DockerBgJob, marker: str) -> None: + consecutive_failures = 0 while not job._done.is_set(): try: result = subprocess.run( @@ -245,22 +247,38 @@ def _poll_bg_job(self, job: DockerBgJob, marker: str) -> None: job._exit_code = int(result.stdout.strip()) job._done.set() return + if "No such container" in (result.stderr or ""): + job._exit_code = 1 + job._done.set() + return except Exception: - pass + consecutive_failures += 1 + else: + consecutive_failures = 0 # Also check if PID is gone (crash without writing marker). 
try: check = subprocess.run( ["docker", "exec", self._container_id, "kill", "-0", str(job._pid)], capture_output=True, + text=True, timeout=5, ) if check.returncode != 0: job._exit_code = 1 job._done.set() return + if "No such container" in (check.stderr or ""): + job._exit_code = 1 + job._done.set() + return except Exception: - pass + consecutive_failures += 1 + + if consecutive_failures >= 10: + job._exit_code = 1 + job._done.set() + return time.sleep(0.5) @@ -332,8 +350,3 @@ def create( "Docker sandbox created: %s (image=%s)", container_id[:12], self._image ) return DockerSandboxHandle(container_id, user=self._user) - - -def _shell_quote(s: str) -> str: - """Single-quote a string for shell, escaping embedded single quotes.""" - return "'" + s.replace("'", "'\\''") + "'" diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py index bb41356e2..43ec5ad95 100644 --- a/src/openenv/core/harness/sandbox/hf_backend.py +++ b/src/openenv/core/harness/sandbox/hf_backend.py @@ -20,6 +20,7 @@ from typing import Any from hf_sandbox import Sandbox +from openenv.core.harness.sandbox._util import shell_quote from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxHandle _ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") @@ -67,7 +68,7 @@ def wait(self, timeout: float | None = None) -> int: ) marker = self._sandbox.exec( - f"cat {_shell_quote(self._marker_path)}", + f"cat {shell_quote(self._marker_path)}", timeout=10, ) if marker.exit_code == 0 and marker.stdout.strip(): @@ -152,8 +153,8 @@ def start_bg( cwd: str | None = None, ) -> BgJob: marker_path = f"/tmp/.openenv_bg_{uuid.uuid4().hex[:12]}.exit" - wrapped = f"{cmd}; rc=$?; echo $rc > {_shell_quote(marker_path)}" - launch_cmd = f"nohup bash -lc {_shell_quote(wrapped)} >/dev/null 2>&1 & echo $!" + wrapped = f"{cmd}; rc=$?; echo $rc > {shell_quote(marker_path)}" + launch_cmd = f"nohup bash -lc {shell_quote(wrapped)} >/dev/null 2>&1 & echo $!" 
result = self.exec(launch_cmd, envs=envs, cwd=cwd, timeout=30) if result.exit_code != 0: @@ -174,7 +175,7 @@ def start_bg( def write_text(self, path: str, content: str) -> None: parent = str(PurePosixPath(path).parent) if parent not in ("", "/"): - r = self.exec(f"mkdir -p {_shell_quote(parent)}", timeout=10) + r = self.exec(f"mkdir -p {shell_quote(parent)}", timeout=10) if r.exit_code != 0: raise RuntimeError( f"Failed to create parent directory {parent!r}: {r.stderr}" @@ -185,7 +186,7 @@ def read_text(self, path: str) -> str: return str(self._sbx.read_file(path, text=True)) def exists(self, path: str) -> bool: - r = self.exec(f"test -e {_shell_quote(path)}", timeout=10) + r = self.exec(f"test -e {shell_quote(path)}", timeout=10) return r.exit_code == 0 def kill(self) -> None: @@ -250,8 +251,8 @@ def create( assert last_error is not None raise HFSandboxCreateError( - f"Failed to create HF sandbox after {self._create_retries} attempts: " - f"{last_error}" + f"Failed to create HF sandbox after {self._create_retries} attempts " + f"({type(last_error).__name__})." 
) from last_error @@ -262,7 +263,7 @@ def _with_env_prefix(cmd: str, envs: dict[str, str]) -> str: for key, value in envs.items(): if not _ENV_KEY_RE.match(key): raise ValueError(f"Invalid environment variable name: {key!r}") - parts.append(f"export {key}={_shell_quote(str(value))};") + parts.append(f"export {key}={shell_quote(str(value))};") return " ".join(parts) + f" {cmd}" @@ -296,11 +297,6 @@ def _parse_exit_code(raw: str, *, default: int) -> int: return default -def _shell_quote(s: str) -> str: - """Single-quote a string for shell, escaping embedded single quotes.""" - return "'" + s.replace("'", "'\\''") + "'" - - __all__ = [ "HFBgJob", "HFSandboxBackend", diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index 0b218f19a..af9629970 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -483,6 +483,52 @@ def test_create_session_uploads_task_files(self): assert sbx.written["/extra/data.json"] == '{"key": "value"}' session.close() + def test_opencode_black_box_api_key_stays_out_of_command_argv(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + secret = "sk-test '$(leak)" + config = FakeConfig(api_key=secret) + backend = FakeSandboxBackend() + driver = CLIAgentDriver( + spec=OPENCODE_SPEC, + sandbox_backend=backend, + mode="black_box", + ) + + session = driver.create_session(task=FakeTask(), config=config) + sbx = backend.created[0] + cmd, envs = sbx.bg_commands[-1] + assert secret not in cmd + assert envs is not None + assert envs["OPENAI_API_KEY"] == secret + session.close() + + def test_opencode_interception_gate_uses_server_secret_not_user_key(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + from openenv.core.harness.agents.interception_server import InterceptionServer + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + secret = "sk-test '$(leak)" 
+ config = FakeConfig(api_key=secret) + backend = FakeSandboxBackend() + server = InterceptionServer(port=0, secret="gate-secret") + driver = CLIAgentDriver( + spec=OPENCODE_SPEC, + sandbox_backend=backend, + mode="interception_gate", + interception_server=server, + interception_base_url="http://127.0.0.1:8765", + ) + + session = driver.create_session(task=FakeTask(), config=config) + sbx = backend.created[0] + cmd, envs = sbx.bg_commands[-1] + assert secret not in cmd + assert envs is not None + assert envs["OPENAI_API_KEY"] == "gate-secret" + session.close() + def test_create_session_runs_task_setup_shell(self): from openenv.core.harness.agents.cli_driver import CLIAgentDriver @@ -670,6 +716,32 @@ def test_close_kills_sandbox_and_jobs(self): assert sbx._killed assert session._agent_bg_job is None + @pytest.mark.asyncio + async def test_next_request_handles_missing_intercept_without_keyerror(self): + import asyncio + + from openenv.core.harness.agents.cli_driver import CLIAgentSession + from openenv.core.harness.agents.interception_server import InterceptionServer + + spec = _make_test_spec() + sbx = FakeSandbox() + queue: asyncio.Queue[str] = asyncio.Queue() + await queue.put("req_missing") + + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + agent_bg_job=FakeBgJob(), + interception_server=InterceptionServer(secret="s"), + interception_rollout_id="rollout-1", + interception_queue=queue, + ) + + # Missing request IDs can happen if unregister_rollout races with queue.get(). 
+ assert await session.next_request(timeout_s=0.2) is None + class TestCLIAgentSessionFactory: """Tests for the ResourceSessionFactory wrapper.""" @@ -775,6 +847,25 @@ class OcConfig: assert "--format json" in cmd assert "/home/user/task/instruction.md" in cmd + def test_build_command_quotes_paths(self): + from openenv.core.harness.agents.opencode import OPENCODE_SPEC + + @dataclass + class OcConfig: + sandbox_home: str = "/home/user with space" + run_format: str = "json" + + assert OPENCODE_SPEC.build_command is not None + cmd = OPENCODE_SPEC.build_command( + OPENCODE_SPEC, + OcConfig(), + FakeTask(instruction="Write hello.py"), + None, + ) + assert "cd '/home/user with space/workdir'" in cmd + assert "cat '/home/user with space/task/instruction.md'" in cmd + assert "tee '/home/user with space/logs/agent/opencode.jsonl'" in cmd + def test_build_mcp_config(self): from openenv.core.harness.agents.opencode import OPENCODE_SPEC @@ -888,6 +979,29 @@ def test_opencode_driver_integration(self): session.close() +class TestPiSpec: + def test_build_command_quotes_paths(self): + from openenv.core.harness.agents.pi import PI_SPEC + + @dataclass + class PiConfig: + sandbox_home: str = "/home/user with space" + provider: str = "openai" + model: str = "model/name" + thinking: str = "off" + + assert PI_SPEC.build_command is not None + cmd = PI_SPEC.build_command( + PI_SPEC, + PiConfig(), + FakeTask(instruction="Write hello.py"), + None, + ) + assert "cd '/home/user with space/workdir'" in cmd + assert "-p @'/home/user with space/task/instruction.txt'" in cmd + assert "tee '/home/user with space/logs/agent/pi.txt'" in cmd + + # Env var resolution diff --git a/tests/core/test_docker_sandbox_backend.py b/tests/core/test_docker_sandbox_backend.py index b47f6bd4e..b2eebddd2 100644 --- a/tests/core/test_docker_sandbox_backend.py +++ b/tests/core/test_docker_sandbox_backend.py @@ -289,13 +289,13 @@ def test_factory_creates_docker_backend(self): sandbox.kill() def 
test_satisfies_sandbox_handle_protocol(self): - from openenv.core.harness.sandbox import SandboxBackend + from openenv.core.harness.sandbox import SandboxHandle from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend backend = DockerSandboxBackend(image="ubuntu:22.04") sandbox = backend.create(timeout_s=60) try: - assert isinstance(sandbox, SandboxBackend) or hasattr(sandbox, "exec") + assert isinstance(sandbox, SandboxHandle) assert hasattr(sandbox, "sandbox_id") assert hasattr(sandbox, "exec") assert hasattr(sandbox, "start_bg") diff --git a/tests/core/test_hf_sandbox_backend.py b/tests/core/test_hf_sandbox_backend.py index cd235c748..9cd94b5d8 100644 --- a/tests/core/test_hf_sandbox_backend.py +++ b/tests/core/test_hf_sandbox_backend.py @@ -150,6 +150,11 @@ def _install_fake_hf_sandbox(monkeypatch) -> None: monkeypatch.setitem(sys.modules, "hf_sandbox", fake_module) +@pytest.fixture(autouse=True) +def _reset_fake_hf_calls() -> None: + _FakeSandboxAPI.calls.clear() + + class TestHFSandboxBackend: def test_exported_from_package(self, monkeypatch): _install_fake_hf_sandbox(monkeypatch) @@ -167,7 +172,6 @@ def test_create_exec_write_read_exists_bg_and_kill(self, monkeypatch): _install_fake_hf_sandbox(monkeypatch) importlib.reload(hf_backend) - _FakeSandboxAPI.calls.clear() monkeypatch.setattr(hf_backend, "Sandbox", _FakeSandboxAPI) backend = hf_backend.HFSandboxBackend( diff --git a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py new file mode 100644 index 000000000..45922849c --- /dev/null +++ b/tests/core/test_interception_server.py @@ -0,0 +1,124 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from __future__ import annotations + +import asyncio + +import aiohttp +import pytest + +from openenv.core.harness.agents.interception_server import ( + InterceptionServer, + deliver_response, +) + + +@pytest.mark.asyncio +async def test_interception_server_rejects_unauthorized_requests() -> None: + server = InterceptionServer(port=0, secret="secret-token") + await server.start() + try: + async with aiohttp.ClientSession() as client: + resp = await client.post( + f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions", + json={"messages": []}, + ) + assert resp.status == 401 + finally: + await server.stop() + + +@pytest.mark.asyncio +async def test_interception_server_returns_404_for_unknown_rollout() -> None: + server = InterceptionServer(port=0, secret="secret-token") + await server.start() + try: + async with aiohttp.ClientSession() as client: + resp = await client.post( + f"http://127.0.0.1:{server.port}/rollout/missing/v1/chat/completions", + headers={"Authorization": "Bearer secret-token"}, + json={"messages": []}, + ) + assert resp.status == 404 + finally: + await server.stop() + + +@pytest.mark.asyncio +async def test_interception_server_non_stream_roundtrip_cleans_intercept() -> None: + server = InterceptionServer(port=0, secret="secret-token") + await server.start() + queue = server.register_rollout("r1") + try: + async with aiohttp.ClientSession() as client: + request_task = asyncio.create_task( + client.post( + f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions", + headers={"Authorization": "Bearer secret-token"}, + json={ + "messages": [{"role": "user", "content": "hi"}], + "stream": False, + }, + ) + ) + request_id = await asyncio.wait_for(queue.get(), timeout=1.0) + intercept = server.get_intercept(request_id) + assert intercept is not None + + await deliver_response( + intercept, + { + "id": "resp-1", + "model": "test-model", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "hello"}, + 
"finish_reason": "stop", + } + ], + }, + ) + + resp = await request_task + assert resp.status == 200 + payload = await resp.json() + assert payload["id"] == "resp-1" + + # Request entries should not leak after completion. + assert server.get_intercept(request_id) is None + finally: + server.unregister_rollout("r1") + await server.stop() + + +@pytest.mark.asyncio +async def test_interception_server_unregister_rollout_cancels_pending_request() -> None: + server = InterceptionServer(port=0, secret="secret-token") + await server.start() + queue = server.register_rollout("r1") + try: + async with aiohttp.ClientSession() as client: + request_task = asyncio.create_task( + client.post( + f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions", + headers={"Authorization": "Bearer secret-token"}, + json={ + "messages": [{"role": "user", "content": "hi"}], + "stream": False, + }, + ) + ) + _request_id = await asyncio.wait_for(queue.get(), timeout=1.0) + server.unregister_rollout("r1") + + resp = await request_task + assert resp.status == 499 + payload = await resp.json() + assert payload["error"] == "rollout cancelled" + finally: + await server.stop() diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py index 6626c1c59..bfd37758a 100644 --- a/tests/envs/test_coding_agent_env.py +++ b/tests/envs/test_coding_agent_env.py @@ -165,6 +165,29 @@ def test_catalog_summary_shape() -> None: } <= entry.keys() +def test_run_rollout_rejects_unknown_mode() -> None: + from coding_agent_env.server.coding_environment import CodingAgentEnvironment + + env = CodingAgentEnvironment() + with pytest.raises(ValueError, match="unsupported mode"): + env._run_rollout_impl( + agent="opencode", + base_url="https://api.openai.com/v1", + api_key="sk-test", + model="gpt-4o-mini", + instruction="hello", + setup=[], + verify=[], + task_id="", + mode="legacy_proxy", + disable_thinking=False, + max_tokens_cap=1024, + top_logprobs=5, + agent_timeout_s=30.0, + 
template="", + ) + + def test_build_agent_config_opencode() -> None: from coding_agent_env.server.coding_environment import CodingAgentEnvironment @@ -218,6 +241,35 @@ def test_build_agent_config_pi() -> None: assert cfg_gate.provider == "huggingface" +def test_build_session_factory_requires_e2b_dependency() -> None: + from coding_agent_env.server.coding_environment import CodingAgentEnvironment + + env = CodingAgentEnvironment() + env._E2BSandboxBackend = None + cfg = env._build_agent_config( + agent="pi", + mode="black_box", + base_url="https://router.huggingface.co/v1", + api_key="hf_xxx", + model="zai-org/GLM-5.1", + agent_timeout_s=180.0, + disable_thinking=False, + top_logprobs=5, + max_tokens_cap=4096, + ) + + with pytest.raises(RuntimeError, match="E2BSandboxBackend unavailable"): + env._build_session_factory( + agent="pi", + config=cfg, + mode="black_box", + template="", + disable_thinking=False, + top_logprobs=5, + max_tokens_cap=4096, + ) + + # --------------------------------------------------------------------------- # Models + task coercion # --------------------------------------------------------------------------- From b18caf229f8341021f9aacb23d4ebe83d2465e34 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 20:39:29 +0530 Subject: [PATCH 17/35] chore: remove unsupported mode checks from CodingAgentEnvironment --- .../server/coding_environment.py | 11 --------- tests/envs/test_coding_agent_env.py | 23 ------------------- 2 files changed, 34 deletions(-) diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index af70b292e..b1e7f47ef 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -184,11 +184,6 @@ def run_rollout( raise ValueError( f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}" ) - if mode not in {"black_box", "interception_gate"}: - 
raise ValueError( - "unsupported mode {!r}; supported modes: ('black_box', " - "'interception_gate')".format(mode) - ) if not (base_url and api_key and model): raise ValueError( "must provide either ``endpoint`` (one of " @@ -308,12 +303,6 @@ def _emit(msg: str) -> None: except Exception: pass - if mode not in {"black_box", "interception_gate"}: - raise ValueError( - "unsupported mode {!r}; supported modes: ('black_box', " - "'interception_gate')".format(mode) - ) - result = self._RolloutResult(task_id=task_id, mode=mode) t0 = time.time() diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py index bfd37758a..905713e7a 100644 --- a/tests/envs/test_coding_agent_env.py +++ b/tests/envs/test_coding_agent_env.py @@ -165,29 +165,6 @@ def test_catalog_summary_shape() -> None: } <= entry.keys() -def test_run_rollout_rejects_unknown_mode() -> None: - from coding_agent_env.server.coding_environment import CodingAgentEnvironment - - env = CodingAgentEnvironment() - with pytest.raises(ValueError, match="unsupported mode"): - env._run_rollout_impl( - agent="opencode", - base_url="https://api.openai.com/v1", - api_key="sk-test", - model="gpt-4o-mini", - instruction="hello", - setup=[], - verify=[], - task_id="", - mode="legacy_proxy", - disable_thinking=False, - max_tokens_cap=1024, - top_logprobs=5, - agent_timeout_s=30.0, - template="", - ) - - def test_build_agent_config_opencode() -> None: from coding_agent_env.server.coding_environment import CodingAgentEnvironment From bfc730542f899a231060082c3a76475915b01863 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 20:40:47 +0530 Subject: [PATCH 18/35] chore: revert linting for out of scope files --- envs/agent_world_model_env/server/web_ui.py | 4 +- envs/chat_env/models.py | 4 +- envs/chat_env/server/chat_environment.py | 4 +- .../server/coding_tools_env_environment.py | 65 +++---------- envs/coding_tools_env/server/e2b_sandbox.py | 23 +---- 
envs/coding_tools_env/server/gradio_ui.py | 94 ++++++------------- .../jupyter_env/server/jupyter_environment.py | 10 +- envs/repl_env/server/repl_environment.py | 4 +- .../server/terminus_env_environment.py | 10 +- envs/textarena_env/server/gradio_ui.py | 8 +- 10 files changed, 64 insertions(+), 162 deletions(-) diff --git a/envs/agent_world_model_env/server/web_ui.py b/envs/agent_world_model_env/server/web_ui.py index 09b445d3f..84b10c6b2 100644 --- a/envs/agent_world_model_env/server/web_ui.py +++ b/envs/agent_world_model_env/server/web_ui.py @@ -21,7 +21,9 @@ # Keep in sync with DEFAULT_REWARD_CONFIG in config.py. -_DEFAULT_REWARD_JSON = json.dumps(DEFAULT_REWARD_CONFIG, indent=2) +_DEFAULT_REWARD_JSON = json.dumps( + DEFAULT_REWARD_CONFIG, indent=2 +) def _format_obs_md(payload: dict | None) -> str: diff --git a/envs/chat_env/models.py b/envs/chat_env/models.py index da994cbe3..8bc10f09e 100644 --- a/envs/chat_env/models.py +++ b/envs/chat_env/models.py @@ -55,9 +55,7 @@ class ChatState(State): """State of the ChatEnvironment containing message history.""" history_messages: list[Message] = Field(default_factory=list) - history_tokens: list[list[int]] = Field( - default_factory=list - ) # Same len as messages + history_tokens: list[list[int]] = Field(default_factory=list) # Same len as messages class ChatObservation(Observation): diff --git a/envs/chat_env/server/chat_environment.py b/envs/chat_env/server/chat_environment.py index f66f3e790..90b2d01f0 100644 --- a/envs/chat_env/server/chat_environment.py +++ b/envs/chat_env/server/chat_environment.py @@ -90,9 +90,7 @@ def _coerce_tokens(self, tokens) -> list[int]: def _tokenize_conversation(self, conversation: list[Message]) -> list[int]: """Tokenize a conversation with a chat-template fallback for base tokenizers.""" try: - tokens = self.tokenizer.apply_chat_template( - conversation=conversation, tokenize=True - ) + tokens = self.tokenizer.apply_chat_template(conversation=conversation, tokenize=True) except 
Exception: # Some tokenizers (e.g. gpt2) do not define `chat_template`. fallback_text = "".join( diff --git a/envs/coding_tools_env/server/coding_tools_env_environment.py b/envs/coding_tools_env/server/coding_tools_env_environment.py index d0ef86675..615e7770f 100644 --- a/envs/coding_tools_env/server/coding_tools_env_environment.py +++ b/envs/coding_tools_env/server/coding_tools_env_environment.py @@ -45,28 +45,16 @@ def bash(command: str, timeout: float | None = 30) -> str: return "Error: environment not reset. Call reset() first." timeout_value = 30 if timeout is None else float(timeout) result = self._sandbox.run_shell(command, timeout_s=timeout_value) - self._record( - "bash", result.ok, result.output, result.error, result.metadata - ) - return ( - result.output - if result.ok - else f"ERROR: {result.error}\n{result.output}".strip() - ) + self._record("bash", result.ok, result.output, result.error, result.metadata) + return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip() @mcp.tool - def read( - file_path: str, offset: int | None = None, limit: int | None = None - ) -> str: + def read(file_path: str, offset: int | None = None, limit: int | None = None) -> str: """Read file contents using computer instance.""" if not self._sandbox: return "Error: environment not reset. Call reset() first." - result = self._sandbox.read_file( - file_path=file_path, offset=offset, limit=limit - ) - self._record( - "read", result.ok, result.output, result.error, result.metadata - ) + result = self._sandbox.read_file(file_path=file_path, offset=offset, limit=limit) + self._record("read", result.ok, result.output, result.error, result.metadata) return result.output if result.ok else f"ERROR: {result.error}" @mcp.tool @@ -75,9 +63,7 @@ def write(file_path: str, content: str) -> str: if not self._sandbox: return "Error: environment not reset. Call reset() first." 
result = self._sandbox.write_file(file_path=file_path, content=content) - self._record( - "write", result.ok, result.output, result.error, result.metadata - ) + self._record("write", result.ok, result.output, result.error, result.metadata) return result.output if result.ok else f"ERROR: {result.error}" @mcp.tool @@ -102,14 +88,10 @@ def edit( updated = original.replace(old_string, new_string) else: updated = original.replace(old_string, new_string, 1) - write_result = self._sandbox.write_file( - file_path=file_path, content=updated - ) + write_result = self._sandbox.write_file(file_path=file_path, content=updated) ok = write_result.ok msg = "edit ok" if ok else "" - self._record( - "edit", ok, msg, write_result.error, {"replace_all": replace_all} - ) + self._record("edit", ok, msg, write_result.error, {"replace_all": replace_all}) return msg if ok else f"ERROR: {write_result.error}" @mcp.tool @@ -147,11 +129,7 @@ def multi_edit(file_path: str, edits: list[dict[str, Any]]) -> str: write_result.error, {"applied": applied}, ) - return ( - f"applied {applied} edits" - if write_result.ok - else f"ERROR: {write_result.error}" - ) + return f"applied {applied} edits" if write_result.ok else f"ERROR: {write_result.error}" @mcp.tool def glob(pattern: str, path: str | None = None) -> str: @@ -159,27 +137,17 @@ def glob(pattern: str, path: str | None = None) -> str: if not self._sandbox: return "Error: environment not reset. Call reset() first." 
result = self._sandbox.glob_files(pattern=pattern, path=path) - self._record( - "glob", result.ok, result.output, result.error, result.metadata - ) + self._record("glob", result.ok, result.output, result.error, result.metadata) return result.output if result.ok else f"ERROR: {result.error}" @mcp.tool - def grep( - pattern: str, path: str | None = None, include: str | None = None - ) -> str: + def grep(pattern: str, path: str | None = None, include: str | None = None) -> str: """Search for patterns in files.""" if not self._sandbox: return "Error: environment not reset. Call reset() first." result = self._sandbox.grep(pattern=pattern, path=path, include=include) - self._record( - "grep", result.ok, result.output, result.error, result.metadata - ) - return ( - result.output - if result.ok - else f"ERROR: {result.error}\n{result.output}".strip() - ) + self._record("grep", result.ok, result.output, result.error, result.metadata) + return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip() @mcp.tool def ls(path: str = ".", ignore: list[str] | None = None) -> str: @@ -209,9 +177,7 @@ def todo_write(todos: list[dict[str, Any]]) -> str: self._record("todo_write", False, "", msg, None) return msg self._state.todos = validated - self._record( - "todo_write", True, f"stored {len(validated)} todos", None, None - ) + self._record("todo_write", True, f"stored {len(validated)} todos", None, None) return f"stored {len(validated)} todos" @mcp.tool @@ -315,8 +281,7 @@ def reset( "sandbox_id": self._state.sandbox_id, "message": "Setup command failed.", "setup_results": [ - entry.model_dump() - for entry in self._state.setup_results + entry.model_dump() for entry in self._state.setup_results ], }, ) diff --git a/envs/coding_tools_env/server/e2b_sandbox.py b/envs/coding_tools_env/server/e2b_sandbox.py index d6f77373b..5833c7ecb 100644 --- a/envs/coding_tools_env/server/e2b_sandbox.py +++ b/envs/coding_tools_env/server/e2b_sandbox.py @@ -94,11 +94,7 @@ def 
read_file( def write_file(self, file_path: str, content: str) -> ToolResult: try: self._sbx.files.write(file_path, content.encode("utf-8")) - return ToolResult( - ok=True, - output="write ok", - metadata={"bytes": len(content.encode("utf-8"))}, - ) + return ToolResult(ok=True, output="write ok", metadata={"bytes": len(content.encode("utf-8"))}) except Exception as exc: return ToolResult(ok=False, error=f"write failed: {exc}") @@ -115,9 +111,7 @@ def glob_files(self, pattern: str, path: str | None = None) -> ToolResult: if result is None: return ToolResult(ok=False, error=_format_error(execution)) matches = result.get("matches", []) - return ToolResult( - ok=True, output="\n".join(matches), metadata={"matches": matches} - ) + return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches}) def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResult: ignore = ignore or [] @@ -143,15 +137,10 @@ def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResu if not result.get("ok", False): return ToolResult(ok=False, error=str(result.get("error", "ls failed"))) items = result.get("items", []) - lines = [ - f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}" - for item in items - ] + lines = [f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}" for item in items] return ToolResult(ok=True, output="\n".join(lines), metadata={"items": items}) - def grep( - self, pattern: str, path: str | None = None, include: str | None = None - ) -> ToolResult: + def grep(self, pattern: str, path: str | None = None, include: str | None = None) -> ToolResult: root = path or "." 
code = ( "from pathlib import Path\n" @@ -184,9 +173,7 @@ def grep( if not result.get("ok", False): return ToolResult(ok=False, error=str(result.get("error", "grep failed"))) matches = result.get("matches", []) - return ToolResult( - ok=True, output="\n".join(matches), metadata={"matches": matches} - ) + return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches}) def kill(self) -> None: try: diff --git a/envs/coding_tools_env/server/gradio_ui.py b/envs/coding_tools_env/server/gradio_ui.py index 1f3845141..c0d670a99 100644 --- a/envs/coding_tools_env/server/gradio_ui.py +++ b/envs/coding_tools_env/server/gradio_ui.py @@ -105,9 +105,7 @@ def _extract_tool_error(result: dict[str, Any]) -> bool: def _format_status(state: dict[str, Any]) -> str: if not state: - return ( - "**No active session.** Configure setup/verify and click *Reset sandbox*." - ) + return "**No active session.** Configure setup/verify and click *Reset sandbox*." sandbox_id = state.get("sandbox_id") or "—" step_count = state.get("step_count", 0) submitted = state.get("submitted", False) @@ -229,9 +227,9 @@ def state_payload() -> dict[str, Any]: label="edits (JSON array)", language="json", value=( - "[\n" + '[\n' ' {"old_string": "TODO", "new_string": "DONE", "replace_all": false}\n' - "]" + ']' ), lines=8, ) @@ -262,10 +260,10 @@ def state_payload() -> dict[str, Any]: label="todos (JSON array)", language="json", value=( - "[\n" + '[\n' ' {"id":"1","content":"Inspect files",' '"status":"in_progress","priority":"high"}\n' - "]" + ']' ), lines=8, ) @@ -339,33 +337,23 @@ def on_tool_change(tool: str): return [help_md, *updates] tool_dropdown.change( - on_tool_change, - inputs=[tool_dropdown], - outputs=[tool_help, *group_components], + on_tool_change, inputs=[tool_dropdown], outputs=[tool_help, *group_components] ) # ───────── Result rendering helper ───────── - def render_result( - tool: str, raw: dict[str, Any] - ) -> tuple[str, str, str, str, str, list[list[str]]]: + def 
render_result(tool: str, raw: dict[str, Any]) -> tuple[str, str, str, str, str, list[list[str]]]: text = _extract_tool_text(raw) - is_error = ( - _extract_tool_error(raw) - or text.startswith("ERROR:") - or text.startswith("Error:") - ) + is_error = _extract_tool_error(raw) or text.startswith("ERROR:") or text.startswith("Error:") badge = "❌ error" if is_error else "✅ ok" status_line = f"**{tool}** — {badge}" state = state_payload() return ( - status_line, # output_status - text, # output_view - json.dumps(raw, indent=2), # raw_response - _format_status( - state - ), # state_summary (top + summary panel — same content) + status_line, # output_status + text, # output_view + json.dumps(raw, indent=2), # raw_response + _format_status(state), # state_summary (top + summary panel — same content) json.dumps(state, indent=2, default=str), # state_json - _format_history(state), # history_table + _format_history(state), # history_table ) # ───────── Session handlers ───────── @@ -410,33 +398,21 @@ async def on_close(): async def on_run( tool: str, # bash - bash_command: str, - bash_timeout: float, + bash_command: str, bash_timeout: float, # read - read_path: str, - read_offset: float | None, - read_limit: float | None, + read_path: str, read_offset: float | None, read_limit: float | None, # write - write_path: str, - write_content: str, + write_path: str, write_content: str, # edit - edit_path: str, - edit_old: str, - edit_new: str, - edit_replace_all: bool, + edit_path: str, edit_old: str, edit_new: str, edit_replace_all: bool, # multi_edit - multi_edit_path: str, - multi_edit_json: str, + multi_edit_path: str, multi_edit_json: str, # glob - glob_pattern: str, - glob_path: str, + glob_pattern: str, glob_path: str, # grep - grep_pattern: str, - grep_path: str, - grep_include: str, + grep_pattern: str, grep_path: str, grep_include: str, # ls - ls_path: str, - ls_ignore: str, + ls_path: str, ls_ignore: str, # todo_write todo_json: str, ): @@ -517,26 +493,14 @@ async def 
on_run( # ───────── Wire up events ───────── all_inputs = [ tool_dropdown, - bash_command, - bash_timeout, - read_path, - read_offset, - read_limit, - write_path, - write_content, - edit_path, - edit_old, - edit_new, - edit_replace_all, - multi_edit_path, - multi_edit_json, - glob_pattern, - glob_path, - grep_pattern, - grep_path, - grep_include, - ls_path, - ls_ignore, + bash_command, bash_timeout, + read_path, read_offset, read_limit, + write_path, write_content, + edit_path, edit_old, edit_new, edit_replace_all, + multi_edit_path, multi_edit_json, + glob_pattern, glob_path, + grep_pattern, grep_path, grep_include, + ls_path, ls_ignore, todo_json, ] all_outputs = [ diff --git a/envs/jupyter_env/server/jupyter_environment.py b/envs/jupyter_env/server/jupyter_environment.py index b7902e5d2..bc622ae22 100644 --- a/envs/jupyter_env/server/jupyter_environment.py +++ b/envs/jupyter_env/server/jupyter_environment.py @@ -348,10 +348,7 @@ def step( ) -> Observation: self._state.step_count += 1 obs = super().step(action, timeout_s=timeout_s, **kwargs) - if ( - self._state.submitted_answer is not None - and self._state.last_reward is not None - ): + if self._state.submitted_answer is not None and self._state.last_reward is not None: obs.done = True obs.reward = self._state.last_reward return obs @@ -364,10 +361,7 @@ async def step_async( ) -> Observation: self._state.step_count += 1 obs = await super().step_async(action, timeout_s=timeout_s, **kwargs) - if ( - self._state.submitted_answer is not None - and self._state.last_reward is not None - ): + if self._state.submitted_answer is not None and self._state.last_reward is not None: obs.done = True obs.reward = self._state.last_reward return obs diff --git a/envs/repl_env/server/repl_environment.py b/envs/repl_env/server/repl_environment.py index 13a759c29..f2e6f5d98 100644 --- a/envs/repl_env/server/repl_environment.py +++ b/envs/repl_env/server/repl_environment.py @@ -272,7 +272,9 @@ def reset( # reset() are treated as 
equal and don't trigger a redundant rebuild. resolved_model = self._resolve_model(llm_model) has_runtime_llm = self._runtime_controller is not None - model_changed = has_runtime_llm and resolved_model != self._current_llm_model + model_changed = ( + has_runtime_llm and resolved_model != self._current_llm_model + ) token_provided = hf_token is not None if not self.llm_query_fn or model_changed or token_provided: effective_token = ( diff --git a/envs/terminus_env/server/terminus_env_environment.py b/envs/terminus_env/server/terminus_env_environment.py index 03de18baa..c6f9e1c02 100644 --- a/envs/terminus_env/server/terminus_env_environment.py +++ b/envs/terminus_env/server/terminus_env_environment.py @@ -183,10 +183,7 @@ def step( ) -> Observation: self._state.step_count += 1 obs = super().step(action, timeout_s=timeout_s, **kwargs) - if ( - self._state.submitted_answer is not None - and self._state.last_reward is not None - ): + if self._state.submitted_answer is not None and self._state.last_reward is not None: obs.done = True obs.reward = self._state.last_reward return obs @@ -199,10 +196,7 @@ async def step_async( ) -> Observation: self._state.step_count += 1 obs = await super().step_async(action, timeout_s=timeout_s, **kwargs) - if ( - self._state.submitted_answer is not None - and self._state.last_reward is not None - ): + if self._state.submitted_answer is not None and self._state.last_reward is not None: obs.done = True obs.reward = self._state.last_reward return obs diff --git a/envs/textarena_env/server/gradio_ui.py b/envs/textarena_env/server/gradio_ui.py index c9bb88cae..45728fc00 100644 --- a/envs/textarena_env/server/gradio_ui.py +++ b/envs/textarena_env/server/gradio_ui.py @@ -71,9 +71,7 @@ def _sudoku_demo_html() -> str: for col in range(9): value = givens.get((row, col), "") border_right = "3px solid #0f172a" if col in {2, 5} else "1px solid #94a3b8" - border_bottom = ( - "3px solid #0f172a" if row in {2, 5} else "1px solid #94a3b8" - ) + 
border_bottom = "3px solid #0f172a" if row in {2, 5} else "1px solid #94a3b8" background = "#e2e8f0" if value else "#ffffff" cells.append( f""" @@ -84,7 +82,7 @@ def _sudoku_demo_html() -> str: align-items: center; justify-content: center; font-size: 1.1rem; - font-weight: {"700" if value else "400"}; + font-weight: {'700' if value else '400'}; color: #0f172a; background: {background}; border-right: {border_right}; @@ -107,7 +105,7 @@ def _sudoku_demo_html() -> str: border: 3px solid #0f172a; background: #ffffff; "> - {"".join(cells)} + {''.join(cells)}

Use the Playground tab to reset the game and submit moves in the From c2ca0c8734c510c175968b77ff5383bc56d3be8c Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 20:52:53 +0530 Subject: [PATCH 19/35] feat: host-side tool routing and Pi gate models bootstrap --- src/openenv/core/harness/agents/cli_driver.py | 85 ++++++++++- .../harness/agents/interception_server.py | 136 +++++++++++++++++- tests/core/test_cli_agent_driver.py | 37 +++++ tests/core/test_interception_server.py | 124 ++++++++++++++++ 4 files changed, 379 insertions(+), 3 deletions(-) diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 42161bfec..b499d3c5f 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -36,7 +36,11 @@ from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle from .base import CLIAgentSpec -from .interception_server import deliver_response, InterceptionServer +from .interception_server import ( + deliver_response, + InterceptionServer, + ToolHandler, +) _log = logging.getLogger(__name__) @@ -44,6 +48,19 @@ Verifier = Callable[..., VerifyResult] +class _ConfigOverrideView: + """Read-only attribute view with optional overrides.""" + + def __init__(self, base: Any, **overrides: Any) -> None: + self._base = base + self._overrides = overrides + + def __getattr__(self, name: str) -> Any: + if name in self._overrides: + return self._overrides[name] + return getattr(self._base, name) + + class CLIAgentSession(ResourceSession): """Per-rollout session wrapping one sandbox with one running agent CLI.""" @@ -211,6 +228,25 @@ async def deliver( """Return a trainer-generated response to the waiting agent.""" await deliver_response(intercept, response_dict) + def register_tool_handler( + self, + tool_name: str, + handler: ToolHandler, + *, + tool_definition: dict[str, Any] | None = None, + ) -> None: + 
"""Register a host-side interception tool for this rollout.""" + if self._interception_server is None or self._interception_rollout_id is None: + raise RuntimeError( + "register_tool_handler() is only available in interception_gate mode." + ) + self._interception_server.register_tool_handler( + self._interception_rollout_id, + tool_name, + handler, + tool_definition=tool_definition, + ) + class CLIAgentDriver: """Shared driver for all CLI-based agentic harnesses.""" @@ -415,8 +451,23 @@ def _start_agent( *, base_url_override: str | None = None, ) -> BgJob: + command_config = config + if ( + self.mode == "interception_gate" + and self._interception_server is not None + and self.spec.name == "pi" + and base_url_override + ): + self._write_pi_models_config( + sandbox, + config, + rollout_url=base_url_override, + api_key=self._interception_server.secret, + ) + command_config = _ConfigOverrideView(config, provider="openenv") + if self.spec.build_command is not None: - cmd = self.spec.build_command(self.spec, config, task, None) + cmd = self.spec.build_command(self.spec, command_config, task, None) else: cmd = " ".join(shlex.quote(c) for c in self.spec.base_command) envs = self._resolve_env_vars(config, base_url_override=base_url_override) @@ -425,6 +476,36 @@ def _start_agent( envs["ANTHROPIC_API_KEY"] = self._interception_server.secret return sandbox.start_bg(cmd, envs=envs) + def _write_pi_models_config( + self, + sandbox: SandboxHandle, + config: Any, + *, + rollout_url: str, + api_key: str, + ) -> None: + home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user" + model = config.model if hasattr(config, "model") else "model" + content = json.dumps( + { + "providers": { + "openenv": { + "baseUrl": rollout_url, + "api": "openai-completions", + "apiKey": api_key, + "compat": { + "supportsDeveloperRole": False, + "supportsReasoningEffort": False, + }, + "models": [{"id": model, "reasoning": False}], + } + } + }, + indent=2, + ) + for path in 
{f"{home}/.pi/agent/models.json", "/root/.pi/agent/models.json"}: + sandbox.write_text(path, content) + def _resolve_env_vars( self, config: Any, diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index 1aa4edf57..70f8c4247 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -56,7 +56,7 @@ import threading import time import uuid -from typing import Any +from typing import Any, Awaitable, Callable from aiohttp import web @@ -66,6 +66,8 @@ _KEEPALIVE_INTERVAL_S = 3.0 _MAX_REQUEST_BODY = 16 * 1024 * 1024 +ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]]] + class InterceptionServer: """Async HTTP server that gates every LLM call from sandboxed agents. @@ -100,6 +102,10 @@ async def start(self) -> None: "/rollout/{rollout_id}/v1/chat/completions", self._handle_chat_completions, ) + app.router.add_post( + "/rollout/{rollout_id}/v1/tools/{tool_name}", + self._handle_tool_call, + ) app.router.add_get("/health", self._handle_health) runner = web.AppRunner(app) await runner.setup() @@ -155,6 +161,8 @@ def register_rollout( self.active_rollouts[rollout_id] = { "request_id_queue": queue, "state": state, + "tool_handlers": {}, + "tool_defs": {}, } return queue @@ -185,6 +193,74 @@ def get_intercept(self, request_id: str) -> dict[str, Any] | None: with self._state_lock: return self.intercepts.get(request_id) + def register_tool_handler( + self, + rollout_id: str, + tool_name: str, + handler: ToolHandler, + *, + tool_definition: dict[str, Any] | None = None, + ) -> None: + """Register a host-side tool handler for a rollout. + + The handler is called by ``POST /rollout/{rollout_id}/v1/tools/{tool_name}`` + with a JSON payload containing ``arguments``. + + Optionally provide ``tool_definition`` (OpenAI tool schema). 
Registered + schemas are injected into intercepted chat-completion requests for the + rollout when the incoming request does not already include the tool. + """ + with self._state_lock: + context = self.active_rollouts.get(rollout_id) + if context is None: + raise KeyError(f"rollout not found: {rollout_id}") + handlers: dict[str, ToolHandler] = context["tool_handlers"] + handlers[tool_name] = handler + if tool_definition is not None: + tool_defs: dict[str, dict[str, Any]] = context["tool_defs"] + tool_defs[tool_name] = tool_definition + + def unregister_tool_handler(self, rollout_id: str, tool_name: str) -> None: + with self._state_lock: + context = self.active_rollouts.get(rollout_id) + if context is None: + return + handlers: dict[str, ToolHandler] = context.get("tool_handlers", {}) + handlers.pop(tool_name, None) + tool_defs: dict[str, dict[str, Any]] = context.get("tool_defs", {}) + tool_defs.pop(tool_name, None) + + @staticmethod + def _tool_name(tool: dict[str, Any]) -> str | None: + if not isinstance(tool, dict): + return None + function = tool.get("function") + if not isinstance(function, dict): + return None + name = function.get("name") + return name if isinstance(name, str) and name else None + + def _merge_rollout_tools( + self, + tools: Any, + tool_defs: dict[str, dict[str, Any]], + ) -> list[dict[str, Any]] | None: + merged: list[dict[str, Any]] = [] + if isinstance(tools, list): + for tool in tools: + if isinstance(tool, dict): + merged.append(tool) + + existing = { + name for item in merged if (name := self._tool_name(item)) is not None + } + for name, tool in tool_defs.items(): + if name in existing: + continue + merged.append(tool) + + return merged or None + def _authorized(self, request: web.Request) -> bool: auth = request.headers.get("Authorization", "") api_key = request.headers.get("x-api-key", "") @@ -195,6 +271,59 @@ def _authorized(self, request: web.Request) -> bool: async def _handle_health(self, request: web.Request) -> web.Response: 
return web.json_response({"status": "ok"}) + async def _handle_tool_call(self, request: web.Request) -> web.Response: + if not self._authorized(request): + return web.json_response({"error": "Unauthorized"}, status=401) + + rollout_id = request.match_info["rollout_id"] + tool_name = request.match_info["tool_name"] + with self._state_lock: + context = self.active_rollouts.get(rollout_id) + if context is None: + return web.json_response({"error": "rollout not found"}, status=404) + handlers: dict[str, ToolHandler] = context.get("tool_handlers", {}) + handler = handlers.get(tool_name) + if handler is None: + return web.json_response({"error": "tool not found"}, status=404) + + try: + body = await request.json() + except Exception as exc: + return web.json_response({"error": f"invalid JSON: {exc}"}, status=400) + + arguments_raw: Any + if isinstance(body, dict) and "arguments" in body: + arguments_raw = body.get("arguments") + else: + arguments_raw = body + + if arguments_raw is None: + arguments = {} + elif isinstance(arguments_raw, dict): + arguments = arguments_raw + else: + return web.json_response( + {"error": "tool arguments must be a JSON object"}, + status=400, + ) + + try: + response = await handler(arguments) + except Exception: + _log.exception( + "tool handler failed (rollout=%s, tool=%s)", + rollout_id, + tool_name, + ) + return web.json_response({"error": "tool execution failed"}, status=500) + + if not isinstance(response, dict): + return web.json_response( + {"error": "tool handler must return a JSON object"}, + status=500, + ) + return web.json_response(response) + async def _handle_chat_completions( self, request: web.Request ) -> web.StreamResponse | web.Response: @@ -212,6 +341,11 @@ async def _handle_chat_completions( except Exception as exc: return web.json_response({"error": f"invalid JSON: {exc}"}, status=400) + tool_defs: dict[str, dict[str, Any]] = dict(context.get("tool_defs", {})) + merged_tools = self._merge_rollout_tools(body.get("tools"), 
tool_defs) + if merged_tools is not None: + body["tools"] = merged_tools + is_streaming = bool(body.get("stream")) request_id = f"req_{uuid.uuid4().hex[:8]}" chunk_queue: asyncio.Queue | None = asyncio.Queue() if is_streaming else None diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index af9629970..11110e34c 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -529,6 +529,43 @@ def test_opencode_interception_gate_uses_server_secret_not_user_key(self): assert envs["OPENAI_API_KEY"] == "gate-secret" session.close() + def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + from openenv.core.harness.agents.interception_server import InterceptionServer + from openenv.core.harness.agents.pi import PI_SPEC + + backend = FakeSandboxBackend() + server = InterceptionServer(port=0, secret="gate-secret") + driver = CLIAgentDriver( + spec=PI_SPEC, + sandbox_backend=backend, + mode="interception_gate", + interception_server=server, + interception_base_url="http://127.0.0.1:8765", + ) + + session = driver.create_session(task=FakeTask(), config=FakeConfig()) + sbx = backend.created[0] + + # Command should force the custom provider backed by models.json. 
+ cmd, _envs = sbx.bg_commands[-1] + assert "--provider openenv" in cmd + + home_models = "/home/user/.pi/agent/models.json" + root_models = "/root/.pi/agent/models.json" + assert home_models in sbx.written + assert root_models in sbx.written + + cfg = json.loads(sbx.written[home_models]) + provider = cfg["providers"]["openenv"] + assert provider["api"] == "openai-completions" + assert provider["apiKey"] == "gate-secret" + assert provider["models"][0]["id"] == "test-model" + assert "/rollout/" in provider["baseUrl"] + assert provider["baseUrl"].endswith("/v1") + + session.close() + def test_create_session_runs_task_setup_shell(self): from openenv.core.harness.agents.cli_driver import CLIAgentDriver diff --git a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py index 45922849c..77d844aff 100644 --- a/tests/core/test_interception_server.py +++ b/tests/core/test_interception_server.py @@ -17,6 +17,22 @@ ) +_ANSWER_TOOL = { + "type": "function", + "function": { + "name": "answer", + "description": "Submit final answer for grading", + "parameters": { + "type": "object", + "properties": { + "answer": {"type": "string"}, + }, + "required": ["answer"], + }, + }, +} + + @pytest.mark.asyncio async def test_interception_server_rejects_unauthorized_requests() -> None: server = InterceptionServer(port=0, secret="secret-token") @@ -122,3 +138,111 @@ async def test_interception_server_unregister_rollout_cancels_pending_request() assert payload["error"] == "rollout cancelled" finally: await server.stop() + + +@pytest.mark.asyncio +async def test_interception_server_tool_endpoint_executes_registered_handler() -> None: + server = InterceptionServer(port=0, secret="secret-token") + await server.start() + server.register_rollout("r1") + seen: dict[str, object] = {} + + async def _handler(arguments: dict) -> dict: + seen["arguments"] = arguments + return {"content": [{"type": "text", "text": "✅"}]} + + server.register_tool_handler("r1", "answer", _handler) 
+ try: + async with aiohttp.ClientSession() as client: + resp = await client.post( + f"http://127.0.0.1:{server.port}/rollout/r1/v1/tools/answer", + headers={"Authorization": "Bearer secret-token"}, + json={"arguments": {"answer": "42"}}, + ) + assert resp.status == 200 + payload = await resp.json() + assert payload["content"][0]["text"] == "✅" + assert seen["arguments"] == {"answer": "42"} + finally: + server.unregister_rollout("r1") + await server.stop() + + +@pytest.mark.asyncio +async def test_interception_server_tool_endpoint_returns_404_for_unknown_tool() -> None: + server = InterceptionServer(port=0, secret="secret-token") + await server.start() + server.register_rollout("r1") + try: + async with aiohttp.ClientSession() as client: + resp = await client.post( + f"http://127.0.0.1:{server.port}/rollout/r1/v1/tools/missing", + headers={"Authorization": "Bearer secret-token"}, + json={"arguments": {}}, + ) + assert resp.status == 404 + finally: + server.unregister_rollout("r1") + await server.stop() + + +@pytest.mark.asyncio +async def test_interception_server_injects_registered_tool_defs_into_intercept() -> ( + None +): + server = InterceptionServer(port=0, secret="secret-token") + await server.start() + queue = server.register_rollout("r1") + + async def _handler(arguments: dict) -> dict: + return {"content": [{"type": "text", "text": str(arguments)}]} + + server.register_tool_handler( + "r1", + "answer", + _handler, + tool_definition=_ANSWER_TOOL, + ) + + try: + async with aiohttp.ClientSession() as client: + request_task = asyncio.create_task( + client.post( + f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions", + headers={"Authorization": "Bearer secret-token"}, + json={ + "messages": [{"role": "user", "content": "grade this"}], + "stream": False, + }, + ) + ) + request_id = await asyncio.wait_for(queue.get(), timeout=1.0) + intercept = server.get_intercept(request_id) + assert intercept is not None + tool_names = { + tool["function"]["name"] + 
for tool in intercept.get("tools", []) + if isinstance(tool, dict) and isinstance(tool.get("function"), dict) + } + assert "answer" in tool_names + + await deliver_response( + intercept, + { + "id": "resp-1", + "model": "test-model", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "done"}, + "finish_reason": "stop", + } + ], + }, + ) + + resp = await request_task + assert resp.status == 200 + finally: + server.unregister_rollout("r1") + await server.stop() From 1b1d9fbff307fb11f450906778063f3930be7604 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 20:53:56 +0530 Subject: [PATCH 20/35] fix: support configurable Pi workdir for command and MCP config --- src/openenv/core/harness/agents/cli_driver.py | 10 +++--- src/openenv/core/harness/agents/pi.py | 6 +++- tests/core/test_cli_agent_driver.py | 35 +++++++++++++++++++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index b499d3c5f..3a6c50f51 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -428,14 +428,14 @@ def _write_mcp_config(self, sandbox: SandboxHandle, config: Any) -> None: self.spec.mcp_config.method == "config_file" and self.spec.mcp_config.path_template ): - workdir = ( - config.sandbox_home + "/workdir" - if hasattr(config, "sandbox_home") - else "/home/user/workdir" - ) home = ( config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user" ) + workdir = ( + config.workdir + if hasattr(config, "workdir") and getattr(config, "workdir") + else f"{home}/workdir" + ) mcp_path = self.spec.mcp_config.path_template.format( workdir=workdir, home=home ) diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py index dcc552842..6d553eee4 100644 --- a/src/openenv/core/harness/agents/pi.py +++ 
b/src/openenv/core/harness/agents/pi.py @@ -49,7 +49,11 @@ def _build_command( home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user" instruction_file = f"{home}/task/instruction.txt" log_file = f"{home}/logs/agent/pi.txt" - workdir = f"{home}/workdir" + workdir = ( + config.workdir + if hasattr(config, "workdir") and getattr(config, "workdir") + else f"{home}/workdir" + ) provider = "" if hasattr(config, "provider") and config.provider: diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index 11110e34c..d27ca00cd 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -156,6 +156,7 @@ class FakeConfig: model: str = "test-model" agent_timeout_s: float = 300.0 sandbox_home: str = "/home/user" + workdir: str | None = None extra_env: dict[str, str] = field(default_factory=dict) @@ -438,6 +439,20 @@ def test_create_session_full_lifecycle(self): session.close() assert sbx._killed + def test_create_session_honors_configured_workdir_for_mcp_file(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec() + backend = FakeSandboxBackend() + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + config = FakeConfig(workdir="/testbed") + session = driver.create_session(task=FakeTask(), config=config) + + sbx = backend.created[0] + assert "/testbed/mcp.json" in sbx.written + session.close() + def test_create_session_skips_install_when_prebaked(self): from openenv.core.harness.agents.cli_driver import CLIAgentDriver @@ -1038,6 +1053,26 @@ class PiConfig: assert "-p @'/home/user with space/task/instruction.txt'" in cmd assert "tee '/home/user with space/logs/agent/pi.txt'" in cmd + def test_build_command_uses_config_workdir_when_present(self): + from openenv.core.harness.agents.pi import PI_SPEC + + @dataclass + class PiConfig: + sandbox_home: str = "/home/user" + workdir: str = "/testbed" + provider: str = 
"openai" + model: str = "model/name" + thinking: str = "off" + + assert PI_SPEC.build_command is not None + cmd = PI_SPEC.build_command( + PI_SPEC, + PiConfig(), + FakeTask(instruction="Write hello.py"), + None, + ) + assert "cd /testbed" in cmd + # Env var resolution From 2f52e4858f3cc8647b89f2d5ae9b78855fdfa1e1 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 20:55:24 +0530 Subject: [PATCH 21/35] fix: Docker host gateway mapping and per-create image override --- src/openenv/core/harness/sandbox/base.py | 7 ++++- .../core/harness/sandbox/docker_backend.py | 17 ++++++++-- .../core/harness/sandbox/e2b_backend.py | 2 ++ .../core/harness/sandbox/hf_backend.py | 3 +- tests/core/test_docker_sandbox_backend.py | 31 +++++++++++++++++++ 5 files changed, 55 insertions(+), 5 deletions(-) diff --git a/src/openenv/core/harness/sandbox/base.py b/src/openenv/core/harness/sandbox/base.py index d84e267e1..22f096310 100644 --- a/src/openenv/core/harness/sandbox/base.py +++ b/src/openenv/core/harness/sandbox/base.py @@ -96,5 +96,10 @@ def create( timeout_s: int = 900, envs: dict[str, str] | None = None, metadata: dict[str, str] | None = None, + image: str | None = None, ) -> SandboxHandle: - """Create and return a new, ready-to-use sandbox.""" + """Create and return a new, ready-to-use sandbox. + + ``image`` is backend-specific and may be ignored by providers that do + not support per-sandbox image selection. 
+ """ diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py index 28447ce2e..a64070a46 100644 --- a/src/openenv/core/harness/sandbox/docker_backend.py +++ b/src/openenv/core/harness/sandbox/docker_backend.py @@ -299,9 +299,16 @@ def __init__( user: str | None = None, ) -> None: self._image = image - self._docker_args = docker_args or [] + self._docker_args = list(docker_args or []) self._user = user + # Linux Docker Engine does not auto-resolve host.docker.internal + # unless we explicitly map it. + if "host.docker.internal:host-gateway" not in self._docker_args: + self._docker_args.extend( + ["--add-host", "host.docker.internal:host-gateway"] + ) + try: subprocess.run( ["docker", "version"], @@ -324,6 +331,7 @@ def create( timeout_s: int = 900, envs: dict[str, str] | None = None, metadata: dict[str, str] | None = None, + image: str | None = None, ) -> DockerSandboxHandle: cmd = [ "docker", @@ -338,7 +346,8 @@ def create( for k, v in (envs or {}).items(): cmd.extend(["-e", f"{k}={v}"]) cmd.extend(self._docker_args) - cmd.extend([self._image, "sleep", str(timeout_s)]) + effective_image = image or self._image + cmd.extend([effective_image, "sleep", str(timeout_s)]) result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) if result.returncode != 0: @@ -347,6 +356,8 @@ def create( ) container_id = result.stdout.strip() _log.info( - "Docker sandbox created: %s (image=%s)", container_id[:12], self._image + "Docker sandbox created: %s (image=%s)", + container_id[:12], + effective_image, ) return DockerSandboxHandle(container_id, user=self._user) diff --git a/src/openenv/core/harness/sandbox/e2b_backend.py b/src/openenv/core/harness/sandbox/e2b_backend.py index 29c9d952d..c0cbf75ba 100644 --- a/src/openenv/core/harness/sandbox/e2b_backend.py +++ b/src/openenv/core/harness/sandbox/e2b_backend.py @@ -184,7 +184,9 @@ def create( timeout_s: int = 900, envs: dict[str, str] | None = None, metadata: 
dict[str, str] | None = None, + image: str | None = None, ) -> SandboxHandle: + del image sbx = Sandbox.create( template=self._template, timeout=timeout_s, diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py index 43ec5ad95..3857ea7e6 100644 --- a/src/openenv/core/harness/sandbox/hf_backend.py +++ b/src/openenv/core/harness/sandbox/hf_backend.py @@ -228,9 +228,10 @@ def create( timeout_s: int = 900, envs: dict[str, str] | None = None, metadata: dict[str, str] | None = None, + image: str | None = None, ) -> SandboxHandle: # `hf-sandbox` does not support metadata at create-time yet. - del metadata + del metadata, image timeout = self._timeout or _format_timeout(timeout_s) last_error: Exception | None = None diff --git a/tests/core/test_docker_sandbox_backend.py b/tests/core/test_docker_sandbox_backend.py index b2eebddd2..c309e63b3 100644 --- a/tests/core/test_docker_sandbox_backend.py +++ b/tests/core/test_docker_sandbox_backend.py @@ -69,6 +69,37 @@ def test_create_sandbox_backend_unknown_raises(self): with pytest.raises(ValueError, match="Unknown sandbox backend"): create_sandbox_backend("bogus") # type: ignore[arg-type] + def test_create_adds_host_gateway_and_supports_image_override(self, monkeypatch): + import openenv.core.harness.sandbox.docker_backend as docker_backend + + calls: list[list[str]] = [] + + def _fake_run(cmd, *args, **kwargs): + calls.append(list(cmd)) + if cmd[:2] == ["docker", "version"]: + return subprocess.CompletedProcess(cmd, 0, "", "") + if cmd[:2] == ["docker", "run"]: + return subprocess.CompletedProcess( + cmd, + 0, + "1234567890abcdef\n", + "", + ) + return subprocess.CompletedProcess(cmd, 0, "", "") + + monkeypatch.setattr(docker_backend.subprocess, "run", _fake_run) + + backend = docker_backend.DockerSandboxBackend(image="base:latest") + handle = backend.create(image="override:latest") + assert handle.sandbox_id == "1234567890ab" + + run_cmds = [cmd for cmd in calls if cmd[:2] 
== ["docker", "run"]] + assert len(run_cmds) == 1 + run_cmd = run_cmds[0] + assert "--add-host" in run_cmd + assert "host.docker.internal:host-gateway" in run_cmd + assert "override:latest" in run_cmd + @pytest.mark.skipif(_DOCKER_AVAILABLE, reason="Only test error when Docker missing") def test_backend_raises_without_docker(self): from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend From f3fede2354f072dc438677eb6d66da5edba4f8ee Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Sat, 16 May 2026 20:57:00 +0530 Subject: [PATCH 22/35] feat: configurable extension directory support for CLI agents --- src/openenv/core/harness/agents/base.py | 8 ++++++ src/openenv/core/harness/agents/cli_driver.py | 26 +++++++++++++++++++ src/openenv/core/harness/agents/pi.py | 1 + tests/core/test_cli_agent_driver.py | 21 +++++++++++++++ 4 files changed, 56 insertions(+) diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py index ded9ba3b8..4ec1c297a 100644 --- a/src/openenv/core/harness/agents/base.py +++ b/src/openenv/core/harness/agents/base.py @@ -206,6 +206,14 @@ class CLIAgentSpec: resolved from the rollout config at runtime. """ + extension_dir_template: str | None = None + """Optional extension install directory template. + + Receives ``{home}`` substitution at runtime (e.g. + ``"{home}/.pi/agent/extensions"``). Drivers may use this to create + extension directories in the correct sandbox user home. 
+ """ + build_command: Callable[..., str] | None = None """``(spec, config, task, mcp_config_path) -> str`` diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 3a6c50f51..2ff07e4d2 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -352,6 +352,7 @@ def _bootstrap_sandbox( self._wait_for_sandbox_ready(sandbox) if not self._agent_already_installed(sandbox): self._install_agent(sandbox) + self._ensure_extension_dir(sandbox, config) self._upload_files(sandbox, task, config) self._write_mcp_config(sandbox, config) setup_shell = task.setup_shell if hasattr(task, "setup_shell") else None @@ -406,6 +407,31 @@ def _install_agent(self, sandbox: SandboxHandle) -> None: label=f"{self.spec.name} install", ) + def _resolve_sandbox_home(self, sandbox: SandboxHandle, config: Any) -> str: + configured = getattr(config, "sandbox_home", None) + if isinstance(configured, str) and configured.strip(): + return configured + try: + result = sandbox.exec('printf %s "$HOME"', timeout=5) + candidate = (result.stdout or "").strip() + if result.exit_code == 0 and candidate: + return candidate + except Exception: + pass + return "/home/user" + + def _ensure_extension_dir(self, sandbox: SandboxHandle, config: Any) -> None: + template = self.spec.extension_dir_template + if not template: + return + home = self._resolve_sandbox_home(sandbox, config) + extension_dir = template.format(home=home) + result = sandbox.exec(f"mkdir -p {shlex.quote(extension_dir)}", timeout=10) + if result.exit_code != 0: + raise RuntimeError( + f"failed to create extension dir {extension_dir!r}: {result.stderr}" + ) + def _upload_files(self, sandbox: SandboxHandle, task: Any, config: Any) -> None: if not self.spec.files: return diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py index 6d553eee4..03946c552 100644 --- a/src/openenv/core/harness/agents/pi.py +++ 
b/src/openenv/core/harness/agents/pi.py @@ -144,6 +144,7 @@ def _parse_events(line: str) -> AgentEvent | None: "PI_SKIP_VERSION_CHECK": "1", "PI_TELEMETRY": "0", }, + extension_dir_template="{home}/.pi/agent/extensions", build_command=_build_command, build_mcp_config=_build_mcp_config, parse_events=_parse_events, diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index d27ca00cd..977bf6703 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -213,6 +213,7 @@ def test_cli_agent_spec_minimal(self): assert spec.files is None assert spec.artifacts is None assert spec.env is None + assert spec.extension_dir_template is None assert spec.build_command is None def test_cli_agent_spec_full(self): @@ -453,6 +454,21 @@ def test_create_session_honors_configured_workdir_for_mcp_file(self): assert "/testbed/mcp.json" in sbx.written session.close() + def test_create_session_creates_extension_dir_when_spec_declares_one(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + + spec = _make_test_spec(extension_dir_template="{home}/.agent/extensions") + backend = FakeSandboxBackend() + driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box") + + session = driver.create_session(task=FakeTask(), config=FakeConfig()) + sbx = backend.created[0] + assert any( + cmd.startswith("mkdir -p /home/user/.agent/extensions") + for cmd in sbx.executed + ) + session.close() + def test_create_session_skips_install_when_prebaked(self): from openenv.core.harness.agents.cli_driver import CLIAgentDriver @@ -1073,6 +1089,11 @@ class PiConfig: ) assert "cd /testbed" in cmd + def test_spec_declares_extension_dir_template(self): + from openenv.core.harness.agents.pi import PI_SPEC + + assert PI_SPEC.extension_dir_template == "{home}/.pi/agent/extensions" + # Env var resolution From 448f6905f258aa1ca0032812b4cb454bd31380ea Mon Sep 17 00:00:00 2001 From: swappy 
<59965507+rycerzes@users.noreply.github.com> Date: Mon, 18 May 2026 13:18:09 +0530 Subject: [PATCH 23/35] fix: thread-safe queue handling --- .../harness/agents/interception_server.py | 96 +++++++++++++++++-- .../core/harness/sandbox/hf_backend.py | 5 +- 2 files changed, 90 insertions(+), 11 deletions(-) diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index 70f8c4247..7bc67fedc 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -164,6 +164,12 @@ def register_rollout( "tool_handlers": {}, "tool_defs": {}, } + active = len(self.active_rollouts) + _log.info( + "interception_rollout_registered rollout_id=%s active_rollouts=%d", + rollout_id, + active, + ) return queue def unregister_rollout(self, rollout_id: str) -> None: @@ -176,7 +182,9 @@ def unregister_rollout(self, rollout_id: str) -> None: matching_intercepts = [self.intercepts[i] for i in matching_ids] for request_id in matching_ids: del self.intercepts[request_id] - self.active_rollouts.pop(rollout_id, None) + removed = self.active_rollouts.pop(rollout_id, None) is not None + active = len(self.active_rollouts) + pending = len(self.intercepts) for intercept in matching_intercepts: fut: asyncio.Future | None = intercept.get("response_future") @@ -189,10 +197,27 @@ def unregister_rollout(self, rollout_id: str) -> None: except asyncio.QueueFull: pass + _log.info( + "interception_rollout_unregistered rollout_id=%s removed=%s " + "active_rollouts=%d pending_intercepts=%d", + rollout_id, + removed, + active, + pending, + ) + def get_intercept(self, request_id: str) -> dict[str, Any] | None: with self._state_lock: return self.intercepts.get(request_id) + def stats(self) -> dict[str, int]: + """Return lightweight runtime counters for health/debug views.""" + with self._state_lock: + return { + "active_rollouts": len(self.active_rollouts), + "pending_intercepts": 
len(self.intercepts), + } + def register_tool_handler( self, rollout_id: str, @@ -269,7 +294,7 @@ def _authorized(self, request: web.Request) -> bool: ) or hmac.compare_digest(api_key, self.secret) async def _handle_health(self, request: web.Request) -> web.Response: - return web.json_response({"status": "ok"}) + return web.json_response({"status": "ok", **self.stats()}) async def _handle_tool_call(self, request: web.Request) -> web.Response: if not self._authorized(request): @@ -433,6 +458,53 @@ async def _stream_response( return resp +def _resolve_future_threadsafe( + future: asyncio.Future, value: Any +) -> None: + """Set a future's result from any thread. + + ``asyncio.Future`` is not thread-safe: calling ``set_result`` from a + thread that is not running the future's event loop can silently fail + to wake the coroutine awaiting it. This helper detects cross-loop + calls and uses ``call_soon_threadsafe`` to schedule the resolution on + the correct loop. + """ + if future.done(): + return + loop = future.get_loop() + try: + running = asyncio.get_running_loop() + except RuntimeError: + running = None + if running is loop: + future.set_result(value) + else: + loop.call_soon_threadsafe(future.set_result, value) + + +def _put_queue_threadsafe( + q: asyncio.Queue, item: Any +) -> None: + """Put an item on an asyncio.Queue from any thread.""" + loop = getattr(q, "_loop", None) + if loop is None: + # Fallback: try put_nowait which is simpler. + try: + q.put_nowait(item) + return + except asyncio.QueueFull: + pass + return + try: + running = asyncio.get_running_loop() + except RuntimeError: + running = None + if running is loop: + q.put_nowait(item) + else: + loop.call_soon_threadsafe(q.put_nowait, item) + + async def deliver_response( intercept: dict[str, Any], response_dict: dict[str, Any] ) -> None: @@ -441,14 +513,20 @@ async def deliver_response( For non-streaming requests, resolves the future directly. 
For streaming requests, synthesizes SSE chunks from the complete response and signals EOF. + + Thread-safe: can be called from any thread, not just the event loop + that owns the future/queue. This is required because the rollout + worker may run ``deliver_response`` from its own ``asyncio.run()`` + in a daemon thread while the ``InterceptionServer``'s aiohttp + handler awaits the future on a different loop. """ is_streaming = intercept.get("stream", False) chunk_queue: asyncio.Queue | None = intercept.get("chunk_queue") future: asyncio.Future | None = intercept.get("response_future") if not is_streaming: - if future and not future.done(): - future.set_result(response_dict) + if future: + _resolve_future_threadsafe(future, response_dict) return if chunk_queue is None: @@ -474,7 +552,7 @@ async def deliver_response( } ], } - await chunk_queue.put(content_chunk) + _put_queue_threadsafe(chunk_queue, content_chunk) finish_chunk = { "id": response_dict.get("id", ""), "object": "chat.completion.chunk", @@ -488,11 +566,11 @@ async def deliver_response( } ], } - await chunk_queue.put(finish_chunk) + _put_queue_threadsafe(chunk_queue, finish_chunk) - await chunk_queue.put(None) - if future and not future.done(): - future.set_result(response_dict) + _put_queue_threadsafe(chunk_queue, None) + if future: + _resolve_future_threadsafe(future, response_dict) __all__ = [ diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py index 3857ea7e6..3b7b060b5 100644 --- a/src/openenv/core/harness/sandbox/hf_backend.py +++ b/src/openenv/core/harness/sandbox/hf_backend.py @@ -231,15 +231,16 @@ def create( image: str | None = None, ) -> SandboxHandle: # `hf-sandbox` does not support metadata at create-time yet. 
- del metadata, image + del metadata timeout = self._timeout or _format_timeout(timeout_s) + effective_image = image or self._image last_error: Exception | None = None for attempt in range(self._create_retries): try: sbx = Sandbox.create( - image=self._image, + image=effective_image, flavor=self._flavor, timeout=timeout, forward_hf_token=self._forward_hf_token, From 37e549d5dc52fa318b88aae16f2edc0a2c68e805 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 18 May 2026 14:18:52 +0530 Subject: [PATCH 24/35] fix: interception gate support in CodingAgentSessionFactory --- envs/coding_agent_env/harness.py | 33 +++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py index 2355260f5..748dcb091 100644 --- a/envs/coding_agent_env/harness.py +++ b/envs/coding_agent_env/harness.py @@ -8,6 +8,8 @@ from __future__ import annotations +import asyncio +import uuid from typing import Any, Literal from openenv.core.harness import ResourceSessionFactory @@ -21,11 +23,7 @@ from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle from .config import CodingAgentConfig -from .opencode_runtime import ( - agent_log_path, - build_env_vars, - build_run_cmd, -) +from .opencode_runtime import agent_log_path, build_env_vars, build_run_cmd from .task import CodingAgentTask @@ -124,12 +122,37 @@ def create( _log.error("factory.create: bootstrap failed: %r", exc) sandbox.kill() raise + + # Wire up interception_gate if the driver is configured for it + base_url_override: str | None = None + interception_rollout_id: str | None = None + interception_queue: asyncio.Queue | None = None + + if self._driver.mode == "interception_gate": + assert self._driver._interception_server is not None + assert self._driver._interception_base_url is not None + rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}" + interception_rollout_id = 
rollout_id + interception_queue = self._driver._interception_server.register_rollout( + rollout_id + ) + base_url_override = ( + f"{self._driver._interception_base_url.rstrip('/')}" + f"/rollout/{rollout_id}/v1" + ) + session = CodingAgentSession( sandbox=sandbox, config=self._config, task=oc_task, verifier=self._verifier, + base_url_override=base_url_override, ) + # Pass interception fields to the parent CLIAgentSession + session._interception_server = self._driver._interception_server + session._interception_rollout_id = interception_rollout_id + session._interception_queue = interception_queue + session.start_agent() return session From 8aa9d18ce39a1e3d7a97d2ddde5c0572d60b3e8c Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 18 May 2026 14:28:35 +0530 Subject: [PATCH 25/35] fix: improve error handling and config propagation across agent pipeline - Wire disable_thinking and max_tokens_cap through CodingAgentConfig - Raise RuntimeError on mkdir/cat failures in docker backend - Propagate QueueFull exceptions instead of silently swallowing - Change CommandResult.exit_code to int | None for bootstrap clarity --- envs/coding_agent_env/config.py | 4 ++++ envs/coding_agent_env/models.py | 9 ++++++-- envs/coding_agent_env/opencode_runtime.py | 15 +++++++++++-- .../server/coding_environment.py | 21 +++++++++++++++---- .../harness/agents/interception_server.py | 17 +++++---------- .../core/harness/sandbox/docker_backend.py | 14 +++++++++++-- 6 files changed, 58 insertions(+), 22 deletions(-) diff --git a/envs/coding_agent_env/config.py b/envs/coding_agent_env/config.py index b3243253e..d70610542 100644 --- a/envs/coding_agent_env/config.py +++ b/envs/coding_agent_env/config.py @@ -45,6 +45,10 @@ class CodingAgentConfig(BaseModel): extra_env: dict[str, str] = Field(default_factory=dict) extra_setup_shell: str | None = None + # --- Model behavior -------------------------------------------------------- + disable_thinking: bool = 
False + max_tokens_cap: int | None = None + # --- Sandbox paths -------------------------------------------------------- # Root directory inside the sandbox where the primitive writes config, # task files, and logs. E2B's default user is ``user`` with home diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py index 2111d84d5..e338a4867 100644 --- a/envs/coding_agent_env/models.py +++ b/envs/coding_agent_env/models.py @@ -35,10 +35,15 @@ class RolloutTurn(BaseModel): class CommandResult(BaseModel): - """Outcome of one bash command in setup/verify.""" + """Outcome of one bash command in setup/verify. + + When ``exit_code`` is ``None``, the command ran during sandbox bootstrap + and its individual exit code was not captured (bootstrap succeeds or fails + atomically). + """ cmd: str - exit_code: int + exit_code: int | None = None stdout: str = "" stderr: str = "" duration_s: float = 0.0 diff --git a/envs/coding_agent_env/opencode_runtime.py b/envs/coding_agent_env/opencode_runtime.py index 49855528b..31285556e 100644 --- a/envs/coding_agent_env/opencode_runtime.py +++ b/envs/coding_agent_env/opencode_runtime.py @@ -52,6 +52,12 @@ def build_opencode_json(config: CodingAgentConfig) -> str: """ provider_name = "intercepted" + model_key = config.model.split("/", 1)[-1] + + model_block: dict[str, Any] = {"name": "Intercepted Model"} + if config.max_tokens_cap is not None: + model_block["limit"] = {"output": config.max_tokens_cap} + provider_block: dict[str, Any] = { "npm": provider_npm_package(config.provider), "name": "Intercepted", @@ -61,16 +67,21 @@ def build_opencode_json(config: CodingAgentConfig) -> str: "timeout": config.request_timeout_ms, }, "models": { - config.model.split("/", 1)[-1]: {"name": "Intercepted Model"}, + model_key: model_block, }, } doc: dict[str, Any] = { "$schema": "https://opencode.ai/config.json", - "model": f"{provider_name}/{config.model.split('/', 1)[-1]}", + "model": f"{provider_name}/{model_key}", "provider": 
{provider_name: provider_block}, } + # Disable thinking/reasoning tokens when requested. AI SDK respects + # the top-level "reasoning" key to control reasoning token generation. + if config.disable_thinking: + doc["reasoning"] = "none" + tools = _build_tools_block(config) if tools: doc["tools"] = tools diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index b1e7f47ef..9000ed4e0 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -22,6 +22,7 @@ from __future__ import annotations import json +import logging import os import time from typing import Any, Optional @@ -50,6 +51,8 @@ HOME = "/home/user" WORKDIR = f"{HOME}/workdir" INSTRUCTION_PATH = f"{HOME}/task/instruction.md" +_log = logging.getLogger(__name__) + REWARD_FILE = f"{HOME}/logs/verifier/reward.txt" PROXY_LOG = f"{HOME}/logs/agent/proxy.log" AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl" @@ -83,21 +86,22 @@ def __init__(self) -> None: # Lazy imports so module import stays cheap and so tests can patch. 
try: from ..models import ( - CommandResult, CodingAgentState, + CommandResult, RolloutResult, RolloutTurn, ) except ImportError: # pragma: no cover from models import ( # type: ignore - CommandResult, CodingAgentState, + CommandResult, RolloutResult, RolloutTurn, ) from openenv.core.harness.agents import get_agent_spec from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory + from coding_agent_env.config import CodingAgentConfig from coding_agent_env.harness import CodingAgentSessionFactory from coding_agent_env.task import CodingAgentTask @@ -374,8 +378,8 @@ def _emit(msg: str) -> None: result.setup_results.append( self._CommandResult( cmd=cmd, - exit_code=0, - stdout="executed during bootstrap", + exit_code=None, + stdout="executed during bootstrap (individual exit code not captured)", stderr="", duration_s=0.0, ) @@ -466,12 +470,21 @@ def _build_agent_config( max_tokens_cap: int, ) -> Any: if agent == "opencode": + if top_logprobs: + _log.warning( + "top_logprobs=%d is not supported for agent='opencode' " + "and will have no effect. Use interception_gate mode for " + "logprob capture.", + top_logprobs, + ) return self._CodingAgentConfig( provider="openai_compatible", base_url=base_url.rstrip("/"), api_key=api_key, model=model, agent_timeout_s=agent_timeout_s, + disable_thinking=disable_thinking, + max_tokens_cap=max_tokens_cap if max_tokens_cap != 4096 else None, ) provider = self._infer_pi_provider(base_url) diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index 7bc67fedc..5e541700d 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -458,9 +458,7 @@ async def _stream_response( return resp -def _resolve_future_threadsafe( - future: asyncio.Future, value: Any -) -> None: +def _resolve_future_threadsafe(future: asyncio.Future, value: Any) -> None: """Set a future's result from any thread. 
``asyncio.Future`` is not thread-safe: calling ``set_result`` from a @@ -482,18 +480,13 @@ def _resolve_future_threadsafe( loop.call_soon_threadsafe(future.set_result, value) -def _put_queue_threadsafe( - q: asyncio.Queue, item: Any -) -> None: +def _put_queue_threadsafe(q: asyncio.Queue, item: Any) -> None: """Put an item on an asyncio.Queue from any thread.""" loop = getattr(q, "_loop", None) if loop is None: - # Fallback: try put_nowait which is simpler. - try: - q.put_nowait(item) - return - except asyncio.QueueFull: - pass + # Fallback: put_nowait which is simpler. Let QueueFull propagate — + # silently dropping items would cause hard-to-debug streaming issues. + q.put_nowait(item) return try: running = asyncio.get_running_loop() diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py index a64070a46..120fb9a11 100644 --- a/src/openenv/core/harness/sandbox/docker_backend.py +++ b/src/openenv/core/harness/sandbox/docker_backend.py @@ -162,12 +162,17 @@ def start_bg( def write_text(self, path: str, content: str) -> None: parent = str(PurePosixPath(path).parent) if parent not in ("", "/"): - subprocess.run( + mkdir_result = subprocess.run( ["docker", "exec", self._container_id, "mkdir", "-p", parent], capture_output=True, timeout=10, ) - subprocess.run( + if mkdir_result.returncode != 0: + raise RuntimeError( + f"Failed to create directory {parent!r} in container " + f"{self._container_id}: {mkdir_result.stderr.decode(errors='replace')}" + ) + write_result = subprocess.run( [ "docker", "exec", @@ -181,6 +186,11 @@ def write_text(self, path: str, content: str) -> None: capture_output=True, timeout=30, ) + if write_result.returncode != 0: + raise RuntimeError( + f"Failed to write file {path!r} in container " + f"{self._container_id}: {write_result.stderr.decode(errors='replace')}" + ) def read_text(self, path: str) -> str: result = subprocess.run( From 61e5524762182be06d43b290895d33f0e3e66b09 Mon Sep 17 
00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 18 May 2026 14:35:18 +0530 Subject: [PATCH 26/35] fix: whitespace secret validation + conditional /root/ write --- src/openenv/core/harness/agents/cli_driver.py | 5 ++++- src/openenv/core/harness/agents/interception_server.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 2ff07e4d2..587362746 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -529,7 +529,10 @@ def _write_pi_models_config( }, indent=2, ) - for path in {f"{home}/.pi/agent/models.json", "/root/.pi/agent/models.json"}: + paths = {f"{home}/.pi/agent/models.json"} + if home == "/root": + paths.add("/root/.pi/agent/models.json") + for path in paths: sandbox.write_text(path, content) def _resolve_env_vars( diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index 5e541700d..19b05bb95 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -85,6 +85,8 @@ def __init__( self.port = port self.host = host self.secret = secret or secrets.token_urlsafe(32) + if not self.secret.strip(): + raise ValueError("InterceptionServer secret must not be blank.") self._app: web.Application | None = None self._runner: web.AppRunner | None = None self._site: web.TCPSite | None = None From a2b43887c0b207837fb67c1d5d96d8e9c3bdf0fb Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 18 May 2026 15:17:50 +0530 Subject: [PATCH 27/35] fix: cross-loop safe request queue via stdlib queue.Queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit replace asyncio.Queue with stdlib queue.Queue for the request notification path (server → training loop). 
This makes both directions of the InterceptionServer cross-loop/cross-thread safe: - Request notifications: queue.Queue (inherently thread-safe) - Response delivery: asyncio.Future via _resolve_future_threadsafe (already cross-loop safe) The consumer (next_request) uses asyncio.to_thread(q.get, timeout=...) to await without blocking the event loop. This follows the same pattern used by OpenClaw-RL at scale. chunk_queue (internal SSE streaming) remains asyncio.Queue since both producer and consumer run on the server's own event loop. --- src/openenv/core/harness/agents/cli_driver.py | 17 +++++++---------- .../core/harness/agents/interception_server.py | 17 +++++++++-------- tests/core/test_cli_agent_driver.py | 12 ++++++------ tests/core/test_interception_server.py | 6 +++--- 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 587362746..a2724162e 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -20,6 +20,7 @@ import asyncio import json import logging +import queue as _queue_mod import shlex import time import uuid @@ -36,11 +37,7 @@ from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle from .base import CLIAgentSpec -from .interception_server import ( - deliver_response, - InterceptionServer, - ToolHandler, -) +from .interception_server import deliver_response, InterceptionServer, ToolHandler _log = logging.getLogger(__name__) @@ -76,7 +73,7 @@ def __init__( agent_bg_job: BgJob | None = None, interception_server: InterceptionServer | None = None, interception_rollout_id: str | None = None, - interception_queue: asyncio.Queue | None = None, + interception_queue: _queue_mod.Queue[str] | None = None, ) -> None: self.spec = spec self.sandbox = sandbox @@ -204,14 +201,14 @@ async def next_request( f"{self.spec.name} interception_gate: no request within timeout" ) try: - 
request_id = await asyncio.wait_for( - self._interception_queue.get(), + request_id = await asyncio.to_thread( + self._interception_queue.get, timeout=min(remaining, 1.0), ) intercept = server.get_intercept(request_id) if intercept is not None: return intercept - except asyncio.TimeoutError: + except _queue_mod.Empty: pass if self._agent_bg_job is not None: @@ -317,7 +314,7 @@ def create_session( base_url_override: str | None = None interception_rollout_id: str | None = None - interception_queue: asyncio.Queue | None = None + interception_queue: _queue_mod.Queue[str] | None = None if self.mode == "interception_gate": assert self._interception_server is not None diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index 19b05bb95..a71082e69 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -31,11 +31,11 @@ # Docker: base_url = f"http://host.docker.internal:{server.port}" # Remote: base_url = your_tunnel_or_public_url - queue = server.register_rollout(rollout_id) + request_queue = server.register_rollout(rollout_id) # Agent runs with OPENAI_BASE_URL = f"{base_url}/rollout/{rollout_id}/v1" while True: - request_id = await asyncio.wait_for(queue.get(), timeout=...) + request_id = await asyncio.to_thread(request_queue.get, timeout=...) 
intercept = server.get_intercept(request_id) if intercept is None: continue @@ -52,6 +52,7 @@ import hmac import json import logging +import queue as _queue_mod import secrets import threading import time @@ -157,11 +158,11 @@ def register_rollout( self, rollout_id: str, state: dict[str, Any] | None = None, - ) -> asyncio.Queue: - queue: asyncio.Queue = asyncio.Queue() + ) -> _queue_mod.Queue[str]: + request_queue: _queue_mod.Queue[str] = _queue_mod.Queue() with self._state_lock: self.active_rollouts[rollout_id] = { - "request_id_queue": queue, + "request_id_queue": request_queue, "state": state, "tool_handlers": {}, "tool_defs": {}, @@ -172,7 +173,7 @@ def register_rollout( rollout_id, active, ) - return queue + return request_queue def unregister_rollout(self, rollout_id: str) -> None: with self._state_lock: @@ -393,8 +394,8 @@ async def _handle_chat_completions( if context is None: return web.json_response({"error": "rollout not found"}, status=404) self.intercepts[request_id] = intercept - request_queue: asyncio.Queue = context["request_id_queue"] - await request_queue.put(request_id) + request_queue: _queue_mod.Queue[str] = context["request_id_queue"] + request_queue.put_nowait(request_id) if is_streaming: return await self._stream_response(request, intercept) diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index 977bf6703..18854fe7e 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -18,6 +18,7 @@ from __future__ import annotations import json +import queue as _queue_mod from dataclasses import dataclass, field from typing import Any @@ -585,7 +586,8 @@ def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self) home_models = "/home/user/.pi/agent/models.json" root_models = "/root/.pi/agent/models.json" assert home_models in sbx.written - assert root_models in sbx.written + # /root/ path is only written when sandbox_home == "/root" + assert root_models not 
in sbx.written cfg = json.loads(sbx.written[home_models]) provider = cfg["providers"]["openenv"] @@ -786,15 +788,13 @@ def test_close_kills_sandbox_and_jobs(self): @pytest.mark.asyncio async def test_next_request_handles_missing_intercept_without_keyerror(self): - import asyncio - from openenv.core.harness.agents.cli_driver import CLIAgentSession from openenv.core.harness.agents.interception_server import InterceptionServer spec = _make_test_spec() sbx = FakeSandbox() - queue: asyncio.Queue[str] = asyncio.Queue() - await queue.put("req_missing") + q: _queue_mod.Queue[str] = _queue_mod.Queue() + q.put("req_missing") session = CLIAgentSession( spec=spec, @@ -804,7 +804,7 @@ async def test_next_request_handles_missing_intercept_without_keyerror(self): agent_bg_job=FakeBgJob(), interception_server=InterceptionServer(secret="s"), interception_rollout_id="rollout-1", - interception_queue=queue, + interception_queue=q, ) # Missing request IDs can happen if unregister_rollout races with queue.get(). 
diff --git a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py index 77d844aff..73421e1a7 100644 --- a/tests/core/test_interception_server.py +++ b/tests/core/test_interception_server.py @@ -81,7 +81,7 @@ async def test_interception_server_non_stream_roundtrip_cleans_intercept() -> No }, ) ) - request_id = await asyncio.wait_for(queue.get(), timeout=1.0) + request_id = await asyncio.to_thread(queue.get, timeout=1.0) intercept = server.get_intercept(request_id) assert intercept is not None @@ -129,7 +129,7 @@ async def test_interception_server_unregister_rollout_cancels_pending_request() }, ) ) - _request_id = await asyncio.wait_for(queue.get(), timeout=1.0) + _request_id = await asyncio.to_thread(queue.get, timeout=1.0) server.unregister_rollout("r1") resp = await request_task @@ -216,7 +216,7 @@ async def _handler(arguments: dict) -> dict: }, ) ) - request_id = await asyncio.wait_for(queue.get(), timeout=1.0) + request_id = await asyncio.to_thread(queue.get, timeout=1.0) intercept = server.get_intercept(request_id) assert intercept is not None tool_names = { From b10a4483a2309d7bdbd5fa5aacb5cfe34a082209 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 18 May 2026 21:13:48 +0530 Subject: [PATCH 28/35] fix: replace asyncio.Queue with queue.Queue for thread-safe request handling - soak test --- envs/coding_agent_env/harness.py | 4 +- tests/core/test_cli_agent_driver.py | 76 +++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py index 748dcb091..379a055bb 100644 --- a/envs/coding_agent_env/harness.py +++ b/envs/coding_agent_env/harness.py @@ -8,7 +8,7 @@ from __future__ import annotations -import asyncio +import queue as _queue_mod import uuid from typing import Any, Literal @@ -126,7 +126,7 @@ def create( # Wire up interception_gate if the driver is configured for it base_url_override: str 
| None = None interception_rollout_id: str | None = None - interception_queue: asyncio.Queue | None = None + interception_queue: _queue_mod.Queue[str] | None = None if self._driver.mode == "interception_gate": assert self._driver._interception_server is not None diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index 18854fe7e..6c1c1511e 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -17,8 +17,11 @@ from __future__ import annotations +import asyncio import json import queue as _queue_mod +import threading +import time from dataclasses import dataclass, field from typing import Any @@ -810,6 +813,79 @@ async def test_next_request_handles_missing_intercept_without_keyerror(self): # Missing request IDs can happen if unregister_rollout races with queue.get(). assert await session.next_request(timeout_s=0.2) is None + def test_next_request_soak_cross_loop_queue_get(self): + """Soak test cross-loop request dequeueing via queue.Queue. + + Exercises the worker pattern that used to be unsafe with asyncio.Queue: + repeatedly call next_request() from fresh event loops (asyncio.run) + while request IDs are pushed from another thread. 
+ """ + from openenv.core.harness.agents.cli_driver import CLIAgentSession + from openenv.core.harness.agents.interception_server import InterceptionServer + + spec = _make_test_spec() + sbx = FakeSandbox() + server = InterceptionServer(secret="s") + request_queue = server.register_rollout("rollout-soak") + + session = CLIAgentSession( + spec=spec, + sandbox=sbx, + task=FakeTask(), + config=FakeConfig(), + interception_server=server, + interception_rollout_id="rollout-soak", + interception_queue=request_queue, + ) + + total_requests = 200 + consumed: list[str] = [] + failures: list[BaseException] = [] + + def _consumer() -> None: + try: + for _ in range(total_requests): + intercept = asyncio.run(session.next_request(timeout_s=2.0)) + assert intercept is not None + request_id = intercept["request_id"] + consumed.append(request_id) + with server._state_lock: + server.intercepts.pop(request_id, None) + except BaseException as exc: # pragma: no cover - assertion path + failures.append(exc) + + def _producer() -> None: + try: + for i in range(total_requests): + request_id = f"req_soak_{i:04d}" + with server._state_lock: + server.intercepts[request_id] = { + "request_id": request_id, + "messages": [{"role": "user", "content": "ping"}], + } + request_queue.put_nowait(request_id) + if i % 10 == 0: + time.sleep(0.001) + except BaseException as exc: # pragma: no cover - unexpected + failures.append(exc) + + consumer_t = threading.Thread(target=_consumer, name="soak-consumer") + producer_t = threading.Thread(target=_producer, name="soak-producer") + + consumer_t.start() + producer_t.start() + + producer_t.join(timeout=10) + consumer_t.join(timeout=15) + + assert not producer_t.is_alive(), "producer thread hung" + assert not consumer_t.is_alive(), "consumer thread hung" + assert not failures + assert len(consumed) == total_requests + assert len(set(consumed)) == total_requests + + session.close() + class TestCLIAgentSessionFactory: """Tests for the ResourceSessionFactory 
wrapper.""" From 659288b5e8fd0680253d65ba5a31648a13c242da Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 18 May 2026 21:23:35 +0530 Subject: [PATCH 29/35] fix: pi config discovery for CLIAgentDriver to be independent of runtime user's $HOME --- src/openenv/core/harness/agents/cli_driver.py | 12 +++---- tests/core/test_cli_agent_driver.py | 32 +++++++++++++++++-- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index a2724162e..831d930af 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -494,6 +494,10 @@ def _start_agent( else: cmd = " ".join(shlex.quote(c) for c in self.spec.base_command) envs = self._resolve_env_vars(config, base_url_override=base_url_override) + if self.spec.name == "pi": + home = self._resolve_sandbox_home(sandbox, config) + # Make pi config discovery independent of the runtime user's $HOME. 
+ envs["PI_CODING_AGENT_DIR"] = f"{home}/.pi/agent" if self.mode == "interception_gate" and self._interception_server is not None: envs["OPENAI_API_KEY"] = self._interception_server.secret envs["ANTHROPIC_API_KEY"] = self._interception_server.secret @@ -507,7 +511,7 @@ def _write_pi_models_config( rollout_url: str, api_key: str, ) -> None: - home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user" + home = self._resolve_sandbox_home(sandbox, config) model = config.model if hasattr(config, "model") else "model" content = json.dumps( { @@ -526,11 +530,7 @@ def _write_pi_models_config( }, indent=2, ) - paths = {f"{home}/.pi/agent/models.json"} - if home == "/root": - paths.add("/root/.pi/agent/models.json") - for path in paths: - sandbox.write_text(path, content) + sandbox.write_text(f"{home}/.pi/agent/models.json", content) def _resolve_env_vars( self, diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index 6c1c1511e..7338fc323 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -583,13 +583,14 @@ def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self) sbx = backend.created[0] # Command should force the custom provider backed by models.json. 
- cmd, _envs = sbx.bg_commands[-1] + cmd, envs = sbx.bg_commands[-1] assert "--provider openenv" in cmd + assert envs is not None + assert envs["PI_CODING_AGENT_DIR"] == "/home/user/.pi/agent" home_models = "/home/user/.pi/agent/models.json" root_models = "/root/.pi/agent/models.json" assert home_models in sbx.written - # /root/ path is only written when sandbox_home == "/root" assert root_models not in sbx.written cfg = json.loads(sbx.written[home_models]) @@ -602,6 +603,33 @@ def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self) session.close() + def test_pi_interception_gate_uses_explicit_pi_config_dir(self): + from openenv.core.harness.agents.cli_driver import CLIAgentDriver + from openenv.core.harness.agents.interception_server import InterceptionServer + from openenv.core.harness.agents.pi import PI_SPEC + + backend = FakeSandboxBackend() + server = InterceptionServer(port=0, secret="gate-secret") + driver = CLIAgentDriver( + spec=PI_SPEC, + sandbox_backend=backend, + mode="interception_gate", + interception_server=server, + interception_base_url="http://127.0.0.1:8765", + ) + + config = FakeConfig(sandbox_home="/custom/home") + session = driver.create_session(task=FakeTask(), config=config) + sbx = backend.created[0] + + _cmd, envs = sbx.bg_commands[-1] + assert envs is not None + assert envs["PI_CODING_AGENT_DIR"] == "/custom/home/.pi/agent" + assert "/custom/home/.pi/agent/models.json" in sbx.written + assert "/root/.pi/agent/models.json" not in sbx.written + + session.close() + def test_create_session_runs_task_setup_shell(self): from openenv.core.harness.agents.cli_driver import CLIAgentDriver From 5136337e287e31e07724c3e6a891d6ca72359005 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Wed, 20 May 2026 10:48:13 +0530 Subject: [PATCH 30/35] fix: interception params and update max_tokens_cap validation --- envs/coding_agent_env/harness.py | 32 ++++++++++++------- 
.../server/coding_environment.py | 2 +- tests/envs/test_coding_agent_env.py | 27 ++++++++++++++++ 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py index 379a055bb..de4ec91dd 100644 --- a/envs/coding_agent_env/harness.py +++ b/envs/coding_agent_env/harness.py @@ -36,6 +36,9 @@ def __init__( task: CodingAgentTask, verifier: Verifier | None = None, base_url_override: str | None = None, + interception_server: InterceptionServer | None = None, + interception_rollout_id: str | None = None, + interception_queue: _queue_mod.Queue[str] | None = None, ) -> None: super().__init__( spec=OPENCODE_SPEC, @@ -44,6 +47,9 @@ def __init__( config=config, verifier=verifier, base_url_override=base_url_override, + interception_server=interception_server, + interception_rollout_id=interception_rollout_id, + interception_queue=interception_queue, ) def fetch_trace(self) -> str: @@ -129,16 +135,21 @@ def create( interception_queue: _queue_mod.Queue[str] | None = None if self._driver.mode == "interception_gate": - assert self._driver._interception_server is not None - assert self._driver._interception_base_url is not None + interception_server = self._driver._interception_server + if interception_server is None: + raise RuntimeError( + "interception_gate mode requires an InterceptionServer" + ) + interception_base_url = self._driver._interception_base_url + if interception_base_url is None: + raise RuntimeError( + "interception_gate mode requires interception_base_url" + ) rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}" interception_rollout_id = rollout_id - interception_queue = self._driver._interception_server.register_rollout( - rollout_id - ) + interception_queue = interception_server.register_rollout(rollout_id) base_url_override = ( - f"{self._driver._interception_base_url.rstrip('/')}" - f"/rollout/{rollout_id}/v1" + f"{interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1" ) session 
= CodingAgentSession( @@ -147,11 +158,10 @@ def create( task=oc_task, verifier=self._verifier, base_url_override=base_url_override, + interception_server=self._driver._interception_server, + interception_rollout_id=interception_rollout_id, + interception_queue=interception_queue, ) - # Pass interception fields to the parent CLIAgentSession - session._interception_server = self._driver._interception_server - session._interception_rollout_id = interception_rollout_id - session._interception_queue = interception_queue session.start_agent() return session diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index 9000ed4e0..111d417b8 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -484,7 +484,7 @@ def _build_agent_config( model=model, agent_timeout_s=agent_timeout_s, disable_thinking=disable_thinking, - max_tokens_cap=max_tokens_cap if max_tokens_cap != 4096 else None, + max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None, ) provider = self._infer_pi_provider(base_url) diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py index 905713e7a..6397a1060 100644 --- a/tests/envs/test_coding_agent_env.py +++ b/tests/envs/test_coding_agent_env.py @@ -183,6 +183,33 @@ def test_build_agent_config_opencode() -> None: assert isinstance(cfg, env._CodingAgentConfig) assert cfg.model == "gpt-4o-mini" assert cfg.agent_timeout_s == 123.0 + assert cfg.max_tokens_cap == 2048 + + cfg_4096 = env._build_agent_config( + agent="opencode", + mode="black_box", + base_url="https://api.openai.com/v1", + api_key="sk-test", + model="gpt-4o-mini", + agent_timeout_s=123.0, + disable_thinking=True, + top_logprobs=7, + max_tokens_cap=4096, + ) + assert cfg_4096.max_tokens_cap == 4096 + + cfg_uncapped = env._build_agent_config( + agent="opencode", + mode="black_box", + base_url="https://api.openai.com/v1", + 
api_key="sk-test", + model="gpt-4o-mini", + agent_timeout_s=123.0, + disable_thinking=True, + top_logprobs=7, + max_tokens_cap=0, + ) + assert cfg_uncapped.max_tokens_cap is None def test_build_agent_config_pi() -> None: From 8137b154adb78f305336348ef5735632bb279ecd Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Wed, 20 May 2026 10:59:40 +0530 Subject: [PATCH 31/35] refactor: remove RolloutTurn references --- envs/coding_agent_env/README.md | 6 +- envs/coding_agent_env/__init__.py | 3 +- envs/coding_agent_env/models.py | 21 ------ .../server/coding_environment.py | 18 +---- envs/coding_agent_env/server/gradio_ui.py | 70 +++---------------- tests/envs/test_coding_agent_env.py | 15 +--- 6 files changed, 15 insertions(+), 118 deletions(-) diff --git a/envs/coding_agent_env/README.md b/envs/coding_agent_env/README.md index 7825e5c25..347afdd05 100644 --- a/envs/coding_agent_env/README.md +++ b/envs/coding_agent_env/README.md @@ -200,8 +200,8 @@ directly. | `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. | Returns `RolloutResult` JSON with: `reward`, `setup_results[]`, -`verify_results[]`, `files{}`, `agent_log_tail`, -`proxy_log_tail`, `wall_s`, `agent_exit_code`, `sandbox_id`, `error`. +`verify_results[]`, `files{}`, `agent_log_tail`, `wall_s`, +`agent_exit_code`, `sandbox_id`, `error`. 
## Two Operating Modes @@ -259,7 +259,7 @@ coding_agent_env/ ├── __init__.py # re-exports primitive + client + models │ ├── client.py # CodingAgentEnv(MCPToolClient) -├── models.py # RolloutResult / RolloutTurn / CodingAgentState +├── models.py # RolloutResult / CodingAgentState │ ├── config.py # CodingAgentConfig (primitive) ├── harness.py # CodingAgentSession / CodingAgentSessionFactory (CLI-only) diff --git a/envs/coding_agent_env/__init__.py b/envs/coding_agent_env/__init__.py index 6b839e7ea..bc04e7236 100644 --- a/envs/coding_agent_env/__init__.py +++ b/envs/coding_agent_env/__init__.py @@ -25,7 +25,7 @@ from .client import CodingAgentEnv from .config import CodingAgentConfig, Provider from .harness import CodingAgentSession, CodingAgentSessionFactory -from .models import CommandResult, CodingAgentState, RolloutResult, RolloutTurn +from .models import CommandResult, CodingAgentState, RolloutResult from .task import CodingAgentTask try: @@ -42,7 +42,6 @@ "CommandResult", "CodingAgentState", "RolloutResult", - "RolloutTurn", # Harness primitive "CodingAgentConfig", "CodingAgentSession", diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py index e338a4867..2bf19925e 100644 --- a/envs/coding_agent_env/models.py +++ b/envs/coding_agent_env/models.py @@ -14,26 +14,10 @@ from __future__ import annotations -from typing import Any - from openenv.core.env_server.types import State from pydantic import BaseModel, Field -class RolloutTurn(BaseModel): - """One intercepted LLM turn shape (trainer-owned in interception_gate mode).""" - - turn: int - finish_reason: str | None = None - completion_tokens: list[str] = Field(default_factory=list) - completion_token_ids: list[int] = Field(default_factory=list) - per_token_logps: list[float] = Field(default_factory=list) - latency_s: float = 0.0 - timestamp: float = 0.0 - upstream_status: int | None = None - upstream_error: dict[str, Any] | None = None - - class CommandResult(BaseModel): """Outcome of one 
bash command in setup/verify. @@ -66,17 +50,12 @@ class RolloutResult(BaseModel): setup_results: list[CommandResult] = Field(default_factory=list) verify_results: list[CommandResult] = Field(default_factory=list) - # Per-turn LLM trajectory placeholder. Capture is trainer-owned in - # interception_gate mode; environment currently leaves this empty. - proxy_turns: list[RolloutTurn] = Field(default_factory=list) - # Filesystem the agent produced (path -> contents, truncated) files: dict[str, str] = Field(default_factory=dict) files_extra: list[str] = Field(default_factory=list) # Diagnostic tails agent_log_tail: str = "" - proxy_log_tail: str = "" # Error surfacing error: str | None = None diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py index 111d417b8..9174666e7 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/coding_agent_env/server/coding_environment.py @@ -54,7 +54,6 @@ _log = logging.getLogger(__name__) REWARD_FILE = f"{HOME}/logs/verifier/reward.txt" -PROXY_LOG = f"{HOME}/logs/agent/proxy.log" AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl" VERIFY_TIMEOUT_S = 120 _SUPPORTED_AGENTS = ("opencode", "pi") @@ -89,14 +88,12 @@ def __init__(self) -> None: CodingAgentState, CommandResult, RolloutResult, - RolloutTurn, ) except ImportError: # pragma: no cover from models import ( # type: ignore CodingAgentState, CommandResult, RolloutResult, - RolloutTurn, ) from openenv.core.harness.agents import get_agent_spec @@ -113,7 +110,6 @@ def __init__(self) -> None: self._CommandResult = CommandResult self._RolloutResult = RolloutResult - self._RolloutTurn = RolloutTurn self._CodingAgentState = CodingAgentState self._CodingAgentConfig = CodingAgentConfig self._CodingAgentSessionFactory = CodingAgentSessionFactory @@ -418,24 +414,18 @@ def _emit(msg: str) -> None: else: result.reward = None - # Collect filesystem + proxy trace. 
- _emit("collecting workdir files + proxy trace + logs") + # Collect filesystem + agent log tail. + _emit("collecting workdir files + logs") result.files, result.files_extra = self._collect_files(session.sandbox) - result.proxy_turns = self._collect_proxy_turns(session) - result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:] result.agent_log_tail = self._collect_agent_log_tail(session, agent) _emit( f"collected: {len(result.files)} file(s), " - f"{len(result.proxy_turns)} proxy turn(s), " f"reward={'%.2f' % result.reward if result.reward is not None else 'n/a'}" ) except Exception as exc: # noqa: BLE001 result.error = f"{type(exc).__name__}: {exc}" _emit(f"ERROR: {result.error}") if session is not None: - result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[ - -2000: - ] result.agent_log_tail = self._collect_agent_log_tail(session, agent) finally: if session is not None: @@ -607,10 +597,6 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]: extras.append(path) return files, extras - def _collect_proxy_turns(self, session: Any) -> list[Any]: - """Logprob capture is now owned by the training loop via interception_gate.""" - return [] - @staticmethod def _safe_read(sandbox: Any, path: str) -> str: try: diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py index 82f130ce3..ea9cdb81f 100644 --- a/envs/coding_agent_env/server/gradio_ui.py +++ b/envs/coding_agent_env/server/gradio_ui.py @@ -19,8 +19,7 @@ agent_timeout_s, template). - Preset buttons for the ready-made example tasks. - Run button → result panel with reward, setup/verify per-command - results, file outputs, logprob stats, agent + proxy log tails, - and the raw RolloutResult JSON. + results, file outputs, agent log tail, and the raw RolloutResult JSON. 
""" from __future__ import annotations @@ -156,51 +155,6 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]: return rows -def _logprobs_md(turns: list[dict[str, Any]]) -> str: - if not turns: - return "_No proxy turns captured._\n\nLogprob capture is handled by the training loop via `interception_gate` mode." - n = len(turns) - productive = sum(1 for t in turns if t.get("completion_tokens")) - total_toks = sum(len(t.get("completion_tokens") or []) for t in turns) - all_lps = [ - float(x) - for t in turns - for x in (t.get("per_token_logps") or []) - if x is not None - ] - mean_lp = (sum(all_lps) / len(all_lps)) if all_lps else None - lines = [ - f"**turns**: `{n}` · **productive**: `{productive}` · " - f"**total_completion_tokens**: `{total_toks}`", - ] - if mean_lp is not None: - lines.append(f"**mean_logprob**: `{mean_lp:+.4f}`") - finishes: dict[str, int] = {} - for t in turns: - f = t.get("finish_reason") or "unknown" - finishes[f] = finishes.get(f, 0) + 1 - if finishes: - lines.append( - "**finish_reasons**: " - + " ".join(f"`{k}={v}`" for k, v in finishes.items()) - ) - productive_rows = [t for t in turns if t.get("completion_tokens")] - if productive_rows: - first = productive_rows[0] - toks = first["completion_tokens"][:10] - lps = first.get("per_token_logps") or [] - lines.append( - "\n**first productive turn (first 10 tokens)**\n\n" - "```\n" - + "\n".join( - f" {tok!r:<14} {lp:+.3f}" if i < len(lps) else f" {tok!r:<14} -" - for i, (tok, lp) in enumerate(zip(toks, lps + [None] * len(toks))) - ) - + "\n```" - ) - return "\n\n".join(lines) - - def _live_status_md( agent: str, endpoint_kind: str, @@ -292,9 +246,9 @@ def run( """Generator handler — yields incremental UI updates. Each ``yield`` is a tuple matching ``outputs=[...]``: - (summary_md, setup_table, verify_table, files_md, logprobs_md, - logs_md, raw_json). Early yields keep summary_md as a live phase - log while the rollout runs; the final yield populates everything. 
+ (summary_md, setup_table, verify_table, files_md, logs_md, + raw_json). Early yields keep summary_md as a live phase log while + the rollout runs; the final yield populates everything. """ import queue import threading @@ -308,7 +262,7 @@ def run( ) except ValueError as exc: err = f"endpoint resolution failed: {exc}" - yield (f"### error\n\n```\n{err}\n```", [], [], "", "", "", {"error": err}) + yield (f"### error\n\n```\n{err}\n```", [], [], "", "", {"error": err}) return # Translate "auto" / "on" / "off" into bool / None. @@ -369,7 +323,6 @@ def _worker(): [], "", "", - "", {}, ) @@ -397,7 +350,7 @@ def _worker(): elapsed, status_lines, ) - yield (md, [], [], "", "", "", {}) + yield (md, [], [], "", "", {}) # Drain any final messages still in the queue. while not status_q.empty(): @@ -415,7 +368,6 @@ def _worker(): [], [], "", - "", _live_status_md( agent, resolved.kind, @@ -434,7 +386,6 @@ def _worker(): _command_rows(result.get("setup_results") or []), _command_rows(result.get("verify_results") or []), _files_md(result.get("files") or {}), - _logprobs_md(result.get("proxy_turns") or []), ( "### live phase log\n\n" + _live_status_md( @@ -445,8 +396,7 @@ def _worker(): time.time() - t_start, status_lines, ) - + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n" - f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```" + + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```" ), result, ) @@ -460,8 +410,7 @@ def apply_preset(name: str) -> tuple[str, str, str]: gr.Markdown( "Run one coding-agent rollout in an E2B sandbox against your chosen " "LLM endpoint. Pick an agent + endpoint, write the task as " - "`(instruction, setup, verify)`, and inspect reward + per-token " - "logprobs." + "`(instruction, setup, verify)`, and inspect reward + logs." 
) gr.Markdown(_catalog_banner()) @@ -563,8 +512,6 @@ def apply_preset(name: str) -> tuple[str, str, str]: ) with gr.Tab("Files"): files_md = gr.Markdown("") - with gr.Tab("Logprobs"): - logprobs_md = gr.Markdown("") with gr.Tab("Logs"): logs_md = gr.Markdown("") with gr.Tab("Raw JSON"): @@ -604,7 +551,6 @@ def apply_preset(name: str) -> tuple[str, str, str]: setup_table, verify_table, files_md, - logprobs_md, logs_md, raw_json, ], diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py index 6397a1060..fa3dcae79 100644 --- a/tests/envs/test_coding_agent_env.py +++ b/tests/envs/test_coding_agent_env.py @@ -55,7 +55,6 @@ def test_public_api_imports() -> None: E2BSandboxBackend, Provider, RolloutResult, - RolloutTurn, SandboxBackend, SandboxHandle, ) @@ -280,7 +279,7 @@ def test_build_session_factory_requires_e2b_dependency() -> None: def test_rollout_result_serializes_round_trip() -> None: - from coding_agent_env import CommandResult, RolloutResult, RolloutTurn + from coding_agent_env import CommandResult, RolloutResult r = RolloutResult( task_id="t1", @@ -291,22 +290,12 @@ def test_rollout_result_serializes_round_trip() -> None: mode="black_box", setup_results=[CommandResult(cmd="pip install pandas", exit_code=0)], verify_results=[CommandResult(cmd="pytest", exit_code=1, stderr="boom")], - proxy_turns=[ - RolloutTurn( - turn=1, - finish_reason="stop", - completion_tokens=["hi"], - per_token_logps=[-0.1], - latency_s=0.2, - ) - ], files={"/home/user/workdir/x.py": "print('x')"}, ) blob = r.model_dump_json() rebuilt = RolloutResult.model_validate_json(blob) assert rebuilt.reward == 0.75 assert rebuilt.verify_results[0].exit_code == 1 - assert rebuilt.proxy_turns[0].completion_tokens == ["hi"] def test_coding_agent_task_coerce_str() -> None: @@ -402,8 +391,6 @@ async def _go() -> RolloutResult: assert result.reward == 1.0, ( f"expected reward=1.0 got {result.reward}: {result.error}" ) - # proxy_turns is now always empty — logprob capture 
is trainer-owned - # via interception_gate mode, not captured by the environment. assert any(f.endswith("/binary_search.py") for f in result.files), ( f"expected binary_search.py in workdir, got {list(result.files)}" ) From 3c4ffa4a6a0fb3d120e136936d3627726193c4ba Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Wed, 20 May 2026 11:17:11 +0530 Subject: [PATCH 32/35] feat: add tool name allowlist validation --- .../harness/agents/interception_server.py | 70 +++++++++++++++++- tests/core/test_interception_server.py | 71 ++++++++++++++++++- 2 files changed, 136 insertions(+), 5 deletions(-) diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index a71082e69..fa735f0c0 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -24,7 +24,7 @@ Usage — training loop:: - server = InterceptionServer(port=8765) + server = InterceptionServer(port=8765, tool_name_allowlist={"answer"}) await server.start() # Make the server reachable — your responsibility. 
@@ -53,12 +53,15 @@ import json import logging import queue as _queue_mod +import re import secrets import threading import time import uuid from typing import Any, Awaitable, Callable +from openenv.core.env_server.mcp_types import RESERVED_TOOL_NAMES + from aiohttp import web @@ -66,6 +69,7 @@ _KEEPALIVE_INTERVAL_S = 3.0 _MAX_REQUEST_BODY = 16 * 1024 * 1024 +_TOOL_NAME_RE = re.compile(r"^[A-Za-z0-9_-]{1,64}$") ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]]] @@ -82,12 +86,25 @@ def __init__( port: int = 0, secret: str | None = None, host: str = "127.0.0.1", + tool_name_allowlist: set[str] | None = None, ) -> None: self.port = port self.host = host self.secret = secret or secrets.token_urlsafe(32) if not self.secret.strip(): raise ValueError("InterceptionServer secret must not be blank.") + normalized_allowlist: set[str] = set() + for raw_name in tool_name_allowlist or set(): + name = raw_name.strip() + if not name: + raise ValueError("tool_name_allowlist must not include blank names") + if not _TOOL_NAME_RE.fullmatch(name): + raise ValueError( + "tool_name_allowlist entries must match " + f"^[A-Za-z0-9_-]{{1,64}}$ (got {raw_name!r})" + ) + normalized_allowlist.add(name) + self._tool_name_allowlist = frozenset(normalized_allowlist) self._app: web.Application | None = None self._runner: web.AppRunner | None = None self._site: web.TCPSite | None = None @@ -237,16 +254,25 @@ def register_tool_handler( Optionally provide ``tool_definition`` (OpenAI tool schema). Registered schemas are injected into intercepted chat-completion requests for the rollout when the incoming request does not already include the tool. + + Only tool names explicitly configured in ``tool_name_allowlist`` are + accepted. Control-plane names (``reset``, ``step``, ``state``, + ``close``) are always rejected to preserve the dual API boundary. 
""" + normalized_name = self._validate_tool_registration( + tool_name, + tool_definition=tool_definition, + ) + with self._state_lock: context = self.active_rollouts.get(rollout_id) if context is None: raise KeyError(f"rollout not found: {rollout_id}") handlers: dict[str, ToolHandler] = context["tool_handlers"] - handlers[tool_name] = handler + handlers[normalized_name] = handler if tool_definition is not None: tool_defs: dict[str, dict[str, Any]] = context["tool_defs"] - tool_defs[tool_name] = tool_definition + tool_defs[normalized_name] = tool_definition def unregister_tool_handler(self, rollout_id: str, tool_name: str) -> None: with self._state_lock: @@ -268,6 +294,44 @@ def _tool_name(tool: dict[str, Any]) -> str | None: name = function.get("name") return name if isinstance(name, str) and name else None + def _validate_tool_registration( + self, + tool_name: str, + *, + tool_definition: dict[str, Any] | None, + ) -> str: + normalized = tool_name.strip() + if not normalized: + raise ValueError("tool_name must not be blank") + if not _TOOL_NAME_RE.fullmatch(normalized): + raise ValueError( + f"tool_name must match ^[A-Za-z0-9_-]{{1,64}}$ (got {tool_name!r})" + ) + if normalized.lower() in RESERVED_TOOL_NAMES: + raise ValueError( + "Interception tool name is reserved for infrastructure/control " + f"APIs: {normalized!r}" + ) + if normalized not in self._tool_name_allowlist: + raise ValueError( + "Interception tool name is not in the configured allowlist: " + f"{normalized!r}" + ) + + if tool_definition is not None: + definition_name = self._tool_name(tool_definition) + if definition_name is None: + raise ValueError( + "tool_definition must be an OpenAI tool schema with function.name" + ) + if definition_name != normalized: + raise ValueError( + "tool_definition.function.name must exactly match tool_name " + f"({definition_name!r} != {normalized!r})" + ) + + return normalized + def _merge_rollout_tools( self, tools: Any, diff --git 
a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py index 73421e1a7..41ef38fe5 100644 --- a/tests/core/test_interception_server.py +++ b/tests/core/test_interception_server.py @@ -142,7 +142,11 @@ async def test_interception_server_unregister_rollout_cancels_pending_request() @pytest.mark.asyncio async def test_interception_server_tool_endpoint_executes_registered_handler() -> None: - server = InterceptionServer(port=0, secret="secret-token") + server = InterceptionServer( + port=0, + secret="secret-token", + tool_name_allowlist={"answer"}, + ) await server.start() server.register_rollout("r1") seen: dict[str, object] = {} @@ -186,11 +190,74 @@ async def test_interception_server_tool_endpoint_returns_404_for_unknown_tool() await server.stop() +def test_interception_server_rejects_reserved_tool_name_registration() -> None: + server = InterceptionServer( + port=0, + secret="secret-token", + tool_name_allowlist={"reset"}, + ) + server.register_rollout("r1") + + async def _handler(arguments: dict) -> dict: + return {"ok": True} + + with pytest.raises(ValueError, match="reserved"): + server.register_tool_handler("r1", "reset", _handler) + + +def test_interception_server_rejects_tool_definition_name_mismatch() -> None: + server = InterceptionServer( + port=0, + secret="secret-token", + tool_name_allowlist={"answer"}, + ) + server.register_rollout("r1") + + async def _handler(arguments: dict) -> dict: + return {"ok": True} + + mismatched = { + "type": "function", + "function": { + "name": "not_answer", + "description": "Mismatch", + "parameters": {"type": "object", "properties": {}}, + }, + } + + with pytest.raises(ValueError, match="must exactly match"): + server.register_tool_handler( + "r1", + "answer", + _handler, + tool_definition=mismatched, + ) + + +def test_interception_server_rejects_tool_not_in_allowlist() -> None: + server = InterceptionServer( + port=0, + secret="secret-token", + tool_name_allowlist={"answer"}, + ) + 
server.register_rollout("r1") + + async def _handler(arguments: dict) -> dict: + return {"ok": True} + + with pytest.raises(ValueError, match="allowlist"): + server.register_tool_handler("r1", "search", _handler) + + @pytest.mark.asyncio async def test_interception_server_injects_registered_tool_defs_into_intercept() -> ( None ): - server = InterceptionServer(port=0, secret="secret-token") + server = InterceptionServer( + port=0, + secret="secret-token", + tool_name_allowlist={"answer"}, + ) await server.start() queue = server.register_rollout("r1") From 151d1abc9d73e0508f008149337704b986a335ec Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Wed, 20 May 2026 11:23:27 +0530 Subject: [PATCH 33/35] feat: provider-specific env var handling for Pi agent --- src/openenv/core/harness/agents/pi.py | 56 ++++++++++++++++++++++----- tests/core/test_harness_adapters.py | 38 ++++++++++++++++-- 2 files changed, 82 insertions(+), 12 deletions(-) diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py index 03946c552..060b41dbd 100644 --- a/src/openenv/core/harness/agents/pi.py +++ b/src/openenv/core/harness/agents/pi.py @@ -11,8 +11,9 @@ pi --no-session --no-context-files --provider

--model --thinking off \\ -p @/home/user/task/instruction.txt 2>&1 | tee /home/user/logs/agent/pi.txt -The provider and model are passed as CLI flags so the spec's ``env`` dict -only needs auth credentials (``HF_TOKEN``, ``OPENAI_API_KEY``, etc.). +The provider and model are passed as CLI flags. Provider-specific credentials +are exported via ``build_env_vars`` according to Pi's provider docs +(``HF_TOKEN`` for ``huggingface``, ``OPENAI_API_KEY`` for ``openai``, etc.). Registered on import:: @@ -111,6 +112,48 @@ def _parse_events(line: str) -> AgentEvent | None: return AgentEvent(type="assistant", data=data, raw=line) +def _provider_api_key_env(provider: str) -> str: + provider_key = provider.strip().lower() + env_by_provider = { + # https://github.com/earendil-works/pi/tree/main/packages/coding-agent#providers--models + "openai": "OPENAI_API_KEY", + "openenv": "OPENAI_API_KEY", + "huggingface": "HF_TOKEN", + "anthropic": "ANTHROPIC_API_KEY", + "gemini": "GEMINI_API_KEY", + "google": "GEMINI_API_KEY", + } + env_name = env_by_provider.get(provider_key) + if env_name is None: + raise ValueError( + f"Unsupported pi provider {provider!r}; expected one of " + f"{sorted(env_by_provider)}" + ) + return env_name + + +def _build_env_vars(spec: CLIAgentSpec, config: Any) -> dict[str, str]: + provider = config.provider if hasattr(config, "provider") else "openai" + if not isinstance(provider, str) or not provider.strip(): + provider = "openai" + api_key = config.api_key if hasattr(config, "api_key") else "" + base_url = config.base_url if hasattr(config, "base_url") else "" + extra_env = config.extra_env if hasattr(config, "extra_env") else {} + + env = dict(extra_env) + env["PI_SKIP_VERSION_CHECK"] = "1" + env["PI_TELEMETRY"] = "0" + + if base_url: + env["OPENAI_BASE_URL"] = base_url + + key_env_var = _provider_api_key_env(provider) + if api_key: + env[key_env_var] = api_key + + return env + + PI_SPEC = CLIAgentSpec( name="pi", install_check_cmd=["pi", "--version"], @@ 
-137,17 +180,12 @@ def _parse_events(line: str) -> AgentEvent | None: artifacts={ "agent_log": ArtifactSpec(path="/home/user/logs/agent/pi.txt"), }, - env={ - "HF_TOKEN": "{api_key}", - "OPENAI_API_KEY": "{api_key}", - "OPENAI_BASE_URL": "{base_url}", - "PI_SKIP_VERSION_CHECK": "1", - "PI_TELEMETRY": "0", - }, + env=None, extension_dir_template="{home}/.pi/agent/extensions", build_command=_build_command, build_mcp_config=_build_mcp_config, parse_events=_parse_events, + build_env_vars=_build_env_vars, ) register_agent(PI_SPEC) diff --git a/tests/core/test_harness_adapters.py b/tests/core/test_harness_adapters.py index f5e1dc260..1766b8ad4 100644 --- a/tests/core/test_harness_adapters.py +++ b/tests/core/test_harness_adapters.py @@ -47,9 +47,41 @@ def test_fields(self): assert PI_SPEC.mcp_config.method == "config_file" assert PI_SPEC.mcp_config.path_template is not None assert ".mcp.json" in PI_SPEC.mcp_config.path_template - assert PI_SPEC.env is not None - assert "HF_TOKEN" in PI_SPEC.env - assert "PI_SKIP_VERSION_CHECK" in PI_SPEC.env + assert PI_SPEC.build_env_vars is not None + + def test_build_env_vars_provider_specific_api_key(self): + from openenv.core.harness.agents.pi import PI_SPEC + + @dataclass + class PiConfig: + provider: str + api_key: str = "secret" + base_url: str = "https://api.example.com/v1" + extra_env: dict[str, str] = field(default_factory=dict) + + assert PI_SPEC.build_env_vars is not None + + hf_env = PI_SPEC.build_env_vars(PI_SPEC, PiConfig(provider="huggingface")) + assert hf_env["HF_TOKEN"] == "secret" + assert "OPENAI_API_KEY" not in hf_env + + oa_env = PI_SPEC.build_env_vars(PI_SPEC, PiConfig(provider="openai")) + assert oa_env["OPENAI_API_KEY"] == "secret" + assert "HF_TOKEN" not in oa_env + + def test_build_env_vars_rejects_unknown_provider(self): + from openenv.core.harness.agents.pi import PI_SPEC + + @dataclass + class PiConfig: + provider: str = "unknown" + api_key: str = "secret" + base_url: str = "https://api.example.com/v1" + 
extra_env: dict[str, str] = field(default_factory=dict) + + assert PI_SPEC.build_env_vars is not None + with pytest.raises(ValueError, match="Unsupported pi provider"): + PI_SPEC.build_env_vars(PI_SPEC, PiConfig()) def test_build_command(self): from openenv.core.harness.agents.pi import PI_SPEC From 39624900aa00ae36b716b0cfdb707b2497f1aeb7 Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 1 Jun 2026 14:39:31 +0530 Subject: [PATCH 34/35] chore: exit notification handling and build interception rollout URL --- src/openenv/core/harness/agents/cli_driver.py | 38 +++++++++++++++++-- .../harness/agents/interception_server.py | 31 +++++++++++++++ src/openenv/core/harness/agents/pi.py | 2 +- 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py index 831d930af..80d482ef3 100644 --- a/src/openenv/core/harness/agents/cli_driver.py +++ b/src/openenv/core/harness/agents/cli_driver.py @@ -45,6 +45,11 @@ Verifier = Callable[..., VerifyResult] +def build_interception_rollout_url(base_url: str, rollout_id: str) -> str: + """Build OpenAI-compatible interception endpoint for one rollout.""" + return f"{base_url.rstrip('/')}/rollout/{rollout_id}/v1" + + class _ConfigOverrideView: """Read-only attribute view with optional overrides.""" @@ -205,6 +210,9 @@ async def next_request( self._interception_queue.get, timeout=min(remaining, 1.0), ) + # None sentinel = agent process exited (sent by /exit endpoint) + if request_id is None: + return None intercept = server.get_intercept(request_id) if intercept is not None: return intercept @@ -322,8 +330,9 @@ def create_session( rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}" interception_rollout_id = rollout_id interception_queue = self._interception_server.register_rollout(rollout_id) - base_url_override = ( - f"{self._interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1" + 
base_url_override = build_interception_rollout_url( + self._interception_base_url, + rollout_id, ) agent_bg_job = self._start_agent( @@ -501,6 +510,23 @@ def _start_agent( if self.mode == "interception_gate" and self._interception_server is not None: envs["OPENAI_API_KEY"] = self._interception_server.secret envs["ANTHROPIC_API_KEY"] = self._interception_server.secret + + # Append an exit notification so the InterceptionServer detects + # agent exit immediately instead of waiting for the full timeout. + # The /exit endpoint enqueues a None sentinel on the request queue, + # causing next_request() to return None. + if base_url_override: + exit_url = f"{base_url_override.rstrip('/')}/exit" + auth_header = ( + "Authorization: Bearer " + f"{self._interception_server.secret}" + ) + cmd = ( + f"{{ {cmd} ; }} ; " + f"curl -sf -X POST -H {shlex.quote(auth_header)} " + f"{shlex.quote(exit_url)} || true" + ) + return sandbox.start_bg(cmd, envs=envs) def _write_pi_models_config( @@ -631,4 +657,10 @@ def create( ) -__all__ = ["CLIAgentDriver", "CLIAgentSession", "CLIAgentSessionFactory", "Verifier"] +__all__ = [ + "CLIAgentDriver", + "CLIAgentSession", + "CLIAgentSessionFactory", + "Verifier", + "build_interception_rollout_url", +] diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py index fa735f0c0..97573b352 100644 --- a/src/openenv/core/harness/agents/interception_server.py +++ b/src/openenv/core/harness/agents/interception_server.py @@ -126,6 +126,10 @@ async def start(self) -> None: "/rollout/{rollout_id}/v1/tools/{tool_name}", self._handle_tool_call, ) + app.router.add_post( + "/rollout/{rollout_id}/v1/exit", + self._handle_exit, + ) app.router.add_get("/health", self._handle_health) runner = web.AppRunner(app) await runner.setup() @@ -363,6 +367,33 @@ def _authorized(self, request: web.Request) -> bool: async def _handle_health(self, request: web.Request) -> web.Response: return 
web.json_response({"status": "ok", **self.stats()}) + async def _handle_exit(self, request: web.Request) -> web.Response: + """Handle agent process exit notification. + + Called by the sandbox entrypoint after the agent process exits. + Enqueues a sentinel ``None`` on the rollout's request queue so that + ``next_request()`` returns immediately instead of waiting for the + full timeout. + """ + rollout_id = request.match_info["rollout_id"] + with self._state_lock: + rollout = self.active_rollouts.get(rollout_id) + if rollout is None: + return web.json_response({"status": "ignored", "reason": "unknown rollout_id"}) + + queue = rollout.get("request_id_queue") + if queue is not None: + try: + queue.put_nowait(None) # sentinel: signals "agent exited" + except Exception: + pass + + _log.info( + "interception_exit_signal rollout_id=%s", + rollout_id, + ) + return web.json_response({"status": "ok"}) + async def _handle_tool_call(self, request: web.Request) -> web.Response: if not self._authorized(request): return web.json_response({"error": "Unauthorized"}, status=401) diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py index 060b41dbd..a2fdd7537 100644 --- a/src/openenv/core/harness/agents/pi.py +++ b/src/openenv/core/harness/agents/pi.py @@ -166,7 +166,7 @@ def _build_env_vars(spec: CLIAgentSpec, config: Any) -> dict[str, str]: setup=( "set -e && " "apt-get update -qq && apt-get install -y -qq curl ca-certificates gnupg && " - "curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && " + "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && " "apt-get install -y -qq nodejs && " "curl -fsSL https://pi.dev/install.sh | sh && " "mkdir -p /home/user/logs/agent /home/user/task /home/user/workdir && " From 88f6a55354b20a6c236eec3c676f01e3ee60991b Mon Sep 17 00:00:00 2001 From: swappy <59965507+rycerzes@users.noreply.github.com> Date: Mon, 1 Jun 2026 15:57:19 +0530 Subject: [PATCH 35/35] refactor(opencode_env): migrate to core 
harness --- docs/source/environments.md | 8 +- docs/source/environments/coding_agent.md | 2 - docs/source/environments/opencode.md | 2 + envs/coding_agent_env/harness.py | 178 ----- .../.dockerignore | 0 .../.gitignore | 0 .../README.md | 113 ++- .../__init__.py | 33 +- .../client.py | 43 +- .../config.py | 15 +- envs/opencode_env/harness.py | 342 +++++++++ .../models.py | 28 +- .../opencode_runtime.py | 24 +- .../openenv.yaml | 2 +- .../pyproject.toml | 16 +- .../sandbox/__init__.py | 0 .../sandbox/build_template.py | 4 +- envs/opencode_env/sandbox/interception.py | 661 ++++++++++++++++++ .../server/Dockerfile | 6 +- .../server/__init__.py | 2 +- .../server/app.py | 22 +- .../server/catalog.py | 0 .../server/gradio_ui.py | 54 +- .../server/opencode_environment.py} | 195 ++---- .../task.py | 14 +- .../uv.lock | 2 +- ...t_env_simple.py => opencode_env_simple.py} | 24 +- tests/core/test_cli_agent_driver.py | 2 +- ...ding_agent_env.py => test_opencode_env.py} | 154 ++-- 29 files changed, 1349 insertions(+), 597 deletions(-) delete mode 100644 docs/source/environments/coding_agent.md create mode 100644 docs/source/environments/opencode.md delete mode 100644 envs/coding_agent_env/harness.py rename envs/{coding_agent_env => opencode_env}/.dockerignore (100%) rename envs/{coding_agent_env => opencode_env}/.gitignore (100%) rename envs/{coding_agent_env => opencode_env}/README.md (64%) rename envs/{coding_agent_env => opencode_env}/__init__.py (59%) rename envs/{coding_agent_env => opencode_env}/client.py (78%) rename envs/{coding_agent_env => opencode_env}/config.py (78%) create mode 100644 envs/opencode_env/harness.py rename envs/{coding_agent_env => opencode_env}/models.py (67%) rename envs/{coding_agent_env => opencode_env}/opencode_runtime.py (87%) rename envs/{coding_agent_env => opencode_env}/openenv.yaml (76%) rename envs/{coding_agent_env => opencode_env}/pyproject.toml (71%) rename envs/{coding_agent_env => opencode_env}/sandbox/__init__.py (100%) rename 
envs/{coding_agent_env => opencode_env}/sandbox/build_template.py (94%) create mode 100644 envs/opencode_env/sandbox/interception.py rename envs/{coding_agent_env => opencode_env}/server/Dockerfile (91%) rename envs/{coding_agent_env => opencode_env}/server/__init__.py (79%) rename envs/{coding_agent_env => opencode_env}/server/app.py (81%) rename envs/{coding_agent_env => opencode_env}/server/catalog.py (100%) rename envs/{coding_agent_env => opencode_env}/server/gradio_ui.py (92%) rename envs/{coding_agent_env/server/coding_environment.py => opencode_env/server/opencode_environment.py} (76%) rename envs/{coding_agent_env => opencode_env}/task.py (73%) rename envs/{coding_agent_env => opencode_env}/uv.lock (99%) rename examples/{coding_agent_env_simple.py => opencode_env_simple.py} (80%) rename tests/envs/{test_coding_agent_env.py => test_opencode_env.py} (71%) diff --git a/docs/source/environments.md b/docs/source/environments.md index 58f36c155..207df4e8c 100644 --- a/docs/source/environments.md +++ b/docs/source/environments.md @@ -549,13 +549,13 @@ AgentWorldModel-1K — 1,000 synthetic MCP tool-use environments with 10,000 tas ``` ```` -````{grid-item-card} Coding Agent +````{grid-item-card} OpenCode :class-card: sd-border-1 -`coding_agent_env` runs coding-agent harnesses (currently OpenCode + Pi) inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, optionally capturing per-token logpr... +`opencode_env` runs the OpenCode coding agent inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, with trainer-owned interception for RL workflows. 
+++ -```{button-link} environments/coding_agent.html +```{button-link} environments/opencode.html :color: primary :outline: @@ -633,5 +633,5 @@ environments/tbench2 environments/unity environments/wildfire environments/agent_world_model -environments/coding_agent +environments/opencode ``` diff --git a/docs/source/environments/coding_agent.md b/docs/source/environments/coding_agent.md deleted file mode 100644 index 2903e2322..000000000 --- a/docs/source/environments/coding_agent.md +++ /dev/null @@ -1,2 +0,0 @@ -```{include} ../../../envs/coding_agent_env/README.md -``` diff --git a/docs/source/environments/opencode.md b/docs/source/environments/opencode.md new file mode 100644 index 000000000..9a93ebe33 --- /dev/null +++ b/docs/source/environments/opencode.md @@ -0,0 +1,2 @@ +```{include} ../../../envs/opencode_env/README.md +``` diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py deleted file mode 100644 index de4ec91dd..000000000 --- a/envs/coding_agent_env/harness.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -"""Coding-agent session factory + session — backed by CLIAgentDriver.""" - -from __future__ import annotations - -import queue as _queue_mod -import uuid -from typing import Any, Literal - -from openenv.core.harness import ResourceSessionFactory -from openenv.core.harness.agents.cli_driver import ( - CLIAgentDriver, - CLIAgentSession, - Verifier, -) -from openenv.core.harness.agents.interception_server import InterceptionServer -from openenv.core.harness.agents.opencode import OPENCODE_SPEC -from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle - -from .config import CodingAgentConfig -from .opencode_runtime import agent_log_path, build_env_vars, build_run_cmd -from .task import CodingAgentTask - - -class CodingAgentSession(CLIAgentSession): - def __init__( - self, - *, - sandbox: SandboxHandle, - config: CodingAgentConfig, - task: CodingAgentTask, - verifier: Verifier | None = None, - base_url_override: str | None = None, - interception_server: InterceptionServer | None = None, - interception_rollout_id: str | None = None, - interception_queue: _queue_mod.Queue[str] | None = None, - ) -> None: - super().__init__( - spec=OPENCODE_SPEC, - sandbox=sandbox, - task=task, - config=config, - verifier=verifier, - base_url_override=base_url_override, - interception_server=interception_server, - interception_rollout_id=interception_rollout_id, - interception_queue=interception_queue, - ) - - def fetch_trace(self) -> str: - return self.sandbox.read_text(agent_log_path(self.config)) - - def wait_for_completion(self, timeout_s: float | None = None) -> int: - budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s - if self._agent_bg_job is None: - raise RuntimeError("Agent not started.") - return self._agent_bg_job.wait(timeout=budget) - - def start_agent(self) -> None: - if self._agent_bg_job is not None: - return - cmd = build_run_cmd(self.config) - envs = build_env_vars(self.config, base_url_override=self._base_url_override) - 
self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs) - - -class CodingAgentSessionFactory(ResourceSessionFactory): - def __init__( - self, - *, - config: CodingAgentConfig, - sandbox_backend: SandboxBackend, - mode: Literal["black_box", "interception_gate"] = "black_box", - verifier: Verifier | None = None, - install_timeout_s: int = 240, - setup_timeout_s: int = 300, - interception_server: InterceptionServer | None = None, - interception_base_url: str | None = None, - ) -> None: - if mode not in {"black_box", "interception_gate"}: - raise ValueError(f"Unknown mode: {mode!r}") - self._config = config - self._backend = sandbox_backend - self._verifier = verifier - self._driver = CLIAgentDriver( - spec=OPENCODE_SPEC, - sandbox_backend=sandbox_backend, - mode=mode, - install_timeout_s=install_timeout_s, - setup_timeout_s=setup_timeout_s, - interception_server=interception_server, - interception_base_url=interception_base_url, - ) - - def create( - self, - task: Any, - seed: int | None = None, - episode_id: str | None = None, - ) -> CodingAgentSession: - import logging - - _log = logging.getLogger(__name__) - oc_task = CodingAgentTask.coerce(task) - setup_parts: list[str] = [] - if self._config.extra_setup_shell: - setup_parts.append(self._config.extra_setup_shell) - if oc_task.setup_shell: - setup_parts.append(oc_task.setup_shell) - if setup_parts: - oc_task = oc_task.model_copy( - update={"setup_shell": "set -e\n" + "\n".join(setup_parts)} - ) - - sandbox_timeout = int(self._config.agent_timeout_s) + 300 - sandbox = self._backend.create( - timeout_s=sandbox_timeout, - metadata={"episode_id": episode_id} if episode_id else None, - ) - try: - self._bootstrap_sandbox(sandbox, oc_task) - except Exception as exc: - _log.error("factory.create: bootstrap failed: %r", exc) - sandbox.kill() - raise - - # Wire up interception_gate if the driver is configured for it - base_url_override: str | None = None - interception_rollout_id: str | None = None - interception_queue: 
_queue_mod.Queue[str] | None = None - - if self._driver.mode == "interception_gate": - interception_server = self._driver._interception_server - if interception_server is None: - raise RuntimeError( - "interception_gate mode requires an InterceptionServer" - ) - interception_base_url = self._driver._interception_base_url - if interception_base_url is None: - raise RuntimeError( - "interception_gate mode requires interception_base_url" - ) - rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}" - interception_rollout_id = rollout_id - interception_queue = interception_server.register_rollout(rollout_id) - base_url_override = ( - f"{interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1" - ) - - session = CodingAgentSession( - sandbox=sandbox, - config=self._config, - task=oc_task, - verifier=self._verifier, - base_url_override=base_url_override, - interception_server=self._driver._interception_server, - interception_rollout_id=interception_rollout_id, - interception_queue=interception_queue, - ) - - session.start_agent() - return session - - def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: CodingAgentTask) -> None: - self._driver.bootstrap_sandbox(sandbox, task, self._config) - - -__all__ = [ - "CodingAgentSession", - "CodingAgentSessionFactory", - "CodingAgentTask", - "Verifier", -] diff --git a/envs/coding_agent_env/.dockerignore b/envs/opencode_env/.dockerignore similarity index 100% rename from envs/coding_agent_env/.dockerignore rename to envs/opencode_env/.dockerignore diff --git a/envs/coding_agent_env/.gitignore b/envs/opencode_env/.gitignore similarity index 100% rename from envs/coding_agent_env/.gitignore rename to envs/opencode_env/.gitignore diff --git a/envs/coding_agent_env/README.md b/envs/opencode_env/README.md similarity index 64% rename from envs/coding_agent_env/README.md rename to envs/opencode_env/README.md index 347afdd05..6840bd3fd 100644 --- a/envs/coding_agent_env/README.md +++ b/envs/opencode_env/README.md @@ -1,5 +1,5 
@@ --- -title: Coding Agent Environment Server +title: OpenCode Environment Server emoji: 🛠️ colorFrom: indigo colorTo: purple @@ -9,33 +9,33 @@ app_port: 8000 base_path: /web tags: - openenv -short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B +short_description: OpenCode coding agent in an E2B sandbox --- -# Coding Agent Environment for OpenEnv +# OpenCode Environment for OpenEnv -`coding_agent_env` runs coding-agent harnesses (currently -[OpenCode](https://opencode.ai) and [Pi](https://github.com/badlogic/pi-mono)) +`opencode_env` runs the [OpenCode](https://opencode.ai) coding agent inside an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible -LLM endpoint with optional trainer-owned interception for RL training. +LLM endpoint, optionally capturing per-token logprobs through a transparent +in-sandbox proxy for RL training data. -**🚀 Try it live**: [`AdithyaSK/coding-agent-env`](https://huggingface.co/spaces/AdithyaSK/coding-agent-env) +**🚀 Try it live**: [`AdithyaSK/opencode-env`](https://huggingface.co/spaces/AdithyaSK/opencode-env) The deployed Space exposes: -- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward. -- **MCP tool API** at [`/mcp`](https://adithyask-coding-agent-env.hf.space/mcp) — programmatic `run_rollout` calls. -- **OpenAPI docs** at [`/docs`](https://adithyask-coding-agent-env.hf.space/docs). -- **Health** at [`/health`](https://adithyask-coding-agent-env.hf.space/health). +- **Web UI** at [`/web`](https://adithyask-opencode-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward. +- **MCP tool API** at [`/mcp`](https://adithyask-opencode-env.hf.space/mcp) — programmatic `run_rollout` calls. +- **OpenAPI docs** at [`/docs`](https://adithyask-opencode-env.hf.space/docs). +- **Health** at [`/health`](https://adithyask-opencode-env.hf.space/health). 
The env is **task-agnostic** — every rollout is configured at call-time with a uniform Task shape: - - **`instruction`** — prompt for the agent - - **`setup`** — list of bash commands run *before* the agent (pip + - **`instruction`** — prompt for OpenCode + - **`setup`** — list of bash commands run *before* OpenCode (pip install, git clone, file downloads — anything you need staged in the sandbox) - - **`verify`** — list of bash commands run *after* the agent (asserts, + - **`verify`** — list of bash commands run *after* OpenCode (asserts, pytest invocations, score-file writes) Reward = `passed_verify / total_verify` unless any `verify` command writes @@ -48,21 +48,20 @@ a float to `/home/user/logs/verifier/reward.txt` (override). ```python import asyncio import os -from coding_agent_env import CodingAgentEnv -from coding_agent_env.client import _extract_text -from coding_agent_env.models import RolloutResult +from opencode_env import OpenCodeEnv +from opencode_env.client import _extract_text +from opencode_env.models import RolloutResult async def main(): - SPACE = "https://adithyask-coding-agent-env.hf.space" + SPACE = "https://adithyask-opencode-env.hf.space" - async with CodingAgentEnv(base_url=SPACE) as env: + async with OpenCodeEnv(base_url=SPACE) as env: await env.reset() # The MCP tool returns JSON; deserialize via the typed model. 
raw = await env.call_tool( "run_rollout", - agent="opencode", # opencode | pi endpoint="openai", # vllm | openai | hf_router api_key=os.environ["OPENAI_API_KEY"], # or set as a Space secret instruction=( @@ -77,7 +76,7 @@ async def main(): "import binary_search; " "assert binary_search.binary_search([1,2,3], 2) == 1; print('OK')\"", ], - template="coding-agent-rl", # prebaked E2B template + template="opencode-rl", # prebaked E2B template task_id="binary_search_v1", ) result = RolloutResult.model_validate_json(_extract_text(raw)) @@ -102,10 +101,10 @@ wall: 19.8 s ```python import os -from coding_agent_env import CodingAgentEnv +from opencode_env import OpenCodeEnv # .sync() returns a synchronous wrapper around the async client. -with CodingAgentEnv(base_url="https://adithyask-coding-agent-env.hf.space").sync() as env: +with OpenCodeEnv(base_url="https://adithyask-opencode-env.hf.space").sync() as env: env.reset() # MCP tools are reachable via env.call_tool(...) / env.step(...) sync-wrapped. # See the async example above for the full run_rollout signature. 
@@ -120,12 +119,12 @@ For trainers that want to drive a sandbox directly without an HTTP boundary: ```python import os -from coding_agent_env import ( - CodingAgentConfig, CodingAgentSessionFactory, CodingAgentTask, E2BSandboxBackend, +from opencode_env import ( + OpenCodeConfig, OpenCodeSessionFactory, OpenCodeTask, E2BSandboxBackend, ) -factory = CodingAgentSessionFactory( - config=CodingAgentConfig( +factory = OpenCodeSessionFactory( + config=OpenCodeConfig( provider="openai_compatible", base_url="https://api.openai.com/v1", api_key=os.environ["OPENAI_API_KEY"], @@ -134,7 +133,7 @@ factory = CodingAgentSessionFactory( sandbox_backend=E2BSandboxBackend(), mode="interception_gate", # trainer-owned interception mode ) -session = factory.create(task=CodingAgentTask(instruction="...")) +session = factory.create(task=OpenCodeTask(instruction="...")) session.wait_for_completion() session.close() ``` @@ -145,22 +144,22 @@ The Dockerfile lives at `server/Dockerfile`. Use the `openenv` CLI from the env root: ```bash -cd envs/coding_agent_env +cd envs/opencode_env openenv validate # check pyproject.toml + openenv.yaml + server/app.py + uv.lock -openenv build -t coding-agent-env # builds the image (uses server/Dockerfile) +openenv build -t opencode-env # builds the image (uses server/Dockerfile) # run locally with E2B credentials -docker run -p 8000:8000 -e E2B_API_KEY=e2b_... coding-agent-env +docker run -p 8000:8000 -e E2B_API_KEY=e2b_... 
opencode-env # push to HF Spaces (Docker variant) -openenv push --repo-id /coding-agent-env +openenv push --repo-id <your-hf-username>/opencode-env ``` Or build directly without the CLI: ```bash -docker build -t coding-agent-env -f envs/coding_agent_env/server/Dockerfile envs/coding_agent_env +docker build -t opencode-env -f envs/opencode_env/server/Dockerfile envs/opencode_env ``` The image: @@ -173,7 +172,7 @@ The image: ## The MCP Tool: `run_rollout` -Single tool, with an ``agent`` selector plus two ways to specify the LLM endpoint: +Single tool, with two ways to specify the LLM endpoint: **Option A — endpoint shorthand (recommended)**: pass `endpoint="vllm"` (or `"openai"` / `"hf_router"`). The server resolves @@ -185,30 +184,31 @@ directly. | Arg | Type | Default | Notes | |---|---|---|---| -| `agent` | `str` | `"opencode"` | Harness to run: `"opencode"` or `"pi"`. | | `endpoint` | `str` | `""` | One of `"vllm"` / `"openai"` / `"hf_router"`. | | `base_url` / `api_key` / `model` | `str` | `""` | Override / supply explicitly. | -| `instruction` | `str` | required | Prompt passed to the selected harness CLI. | -| `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. | -| `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. | +| `instruction` | `str` | required | Prompt passed to OpenCode. | +| `setup` | `list[str]` | `[]` | Bash commands run **before** OpenCode. | +| `verify` | `list[str]` | `[]` | Bash commands run **after** OpenCode. | | `task_id` | `str` | `""` | Echoed back in result. | -| `mode` | `str` | `"black_box"` | Or `"interception_gate"` for trainer-owned generation. | +| `mode` | `str` | `"transparent_proxy"` | Or `"black_box"` for direct LLM calls. In-process trainers can also construct `OpenCodeSessionFactory(mode="interception_gate", ...)`. | | `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. | | `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. 
| -| `top_logprobs` | `int` | `5` | Reserved for trainer-owned interception workflows. | -| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for the selected harness. | -| `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. | +| `top_logprobs` | `int` | `5` | Per-token top-k logprobs requested in `transparent_proxy` mode. | +| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for OpenCode. | +| `template` | `str` | `""` | E2B template name; `"opencode-rl"` skips ~2 min of install per rollout. | Returns `RolloutResult` JSON with: `reward`, `setup_results[]`, -`verify_results[]`, `files{}`, `agent_log_tail`, `wall_s`, +`verify_results[]`, `proxy_turns[]` (logprob records in transparent-proxy +mode), `files{}`, `agent_log_tail`, `proxy_log_tail`, `wall_s`, `agent_exit_code`, `sandbox_id`, `error`. ## Two Operating Modes | Mode | What it does | Best for | |---|---|---| -| **`black_box`** (default) | The selected harness talks directly to `base_url`. | Smoke tests, eval, SFT data collection. | -| **`interception_gate`** | Agent calls are routed through trainer-host interception endpoints. Trainer owns forward pass + trajectory capture. | RL training with trainer-owned generation. | +| **`transparent_proxy`** (default) | OpenCode talks to an in-sandbox proxy. The proxy forwards to `base_url`, requests logprobs, strips them before returning to OpenCode, and records `proxy_turns`. | RL data collection, GRPO-style traces. | +| **`black_box`** | OpenCode talks directly to `base_url`. No logprob capture. | Smoke tests, eval, SFT data collection. | +| **`interception_gate`** | Available through the in-process `OpenCodeSessionFactory`; OpenCode calls are routed through trainer-host interception endpoints. | Trainer-owned generation. | ## Environment Variables @@ -237,20 +237,20 @@ sibling `.env` file; on HF Spaces, set them as **Space secrets**. 
## Pre-baked E2B Template The first rollout in a fresh E2B sandbox spends ~2 min installing -harness tooling. Build a one-time template that ships those pre-installed: +OpenCode tooling. Build a one-time template that ships it pre-installed: ```bash -.venv/bin/python envs/coding_agent_env/sandbox/build_template.py -# → builds `coding-agent-rl` template in your E2B account (~1m20s, one-time) +.venv/bin/python envs/opencode_env/sandbox/build_template.py +# → builds `opencode-rl` template in your E2B account (~1m20s, one-time) ``` -After this, pass `template="coding-agent-rl"` on every `run_rollout` call — +After this, pass `template="opencode-rl"` on every `run_rollout` call — each rollout drops to ~20–30s end-to-end. ## Project Structure ``` -coding_agent_env/ +opencode_env/ ├── README.md # this file ├── openenv.yaml # OpenEnv space spec ├── pyproject.toml # deps + ``server`` entrypoint @@ -258,18 +258,18 @@ coding_agent_env/ ├── .gitignore / .dockerignore # excludes .env / __pycache__ ├── __init__.py # re-exports primitive + client + models │ -├── client.py # CodingAgentEnv(MCPToolClient) -├── models.py # RolloutResult / CodingAgentState +├── client.py # OpenCodeEnv(MCPToolClient) +├── models.py # RolloutResult / OpenCodeState │ -├── config.py # CodingAgentConfig (primitive) -├── harness.py # CodingAgentSession / CodingAgentSessionFactory (CLI-only) +├── config.py # OpenCodeConfig (primitive) +├── harness.py # OpenCodeSession / OpenCodeSessionFactory (CLI-only) ├── opencode_runtime.py # opencode.json builder + cmds -├── task.py # CodingAgentTask +├── task.py # OpenCodeTask │ ├── server/ │ ├── __init__.py │ ├── app.py # FastAPI factory; mounts Gradio at /web -│ ├── coding_environment.py # MCPEnvironment with single ``run_rollout`` tool +│ ├── opencode_environment.py # MCPEnvironment with single ``run_rollout`` tool │ ├── gradio_ui.py # the /web Gradio Blocks UI │ ├── catalog.py # endpoint shorthand resolver │ └── Dockerfile # multi-stage uv build (used by ``openenv 
build``) @@ -291,6 +291,5 @@ src/openenv/core/harness/sandbox/ - [OpenEnv docs](https://meta-pytorch.org/OpenEnv/) - [OpenCode CLI](https://opencode.ai/docs/cli/) -- [Pi](https://github.com/badlogic/pi-mono) - [E2B Python SDK](https://e2b.dev/docs) diff --git a/envs/coding_agent_env/__init__.py b/envs/opencode_env/__init__.py similarity index 59% rename from envs/coding_agent_env/__init__.py rename to envs/opencode_env/__init__.py index bc04e7236..ea72f4fe5 100644 --- a/envs/coding_agent_env/__init__.py +++ b/envs/opencode_env/__init__.py @@ -4,29 +4,29 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Coding-agent environment for OpenEnv. +"""OpenCode environment for OpenEnv. Two layers in this package: -1. **Harness primitive** -- :class:`CodingAgentSessionFactory` / - :class:`CodingAgentSession` / :class:`CodingAgentConfig` / +1. **Harness primitive** -- :class:`OpenCodeSessionFactory` / + :class:`OpenCodeSession` / :class:`OpenCodeConfig` / :class:`E2BSandboxBackend`. Built on the generic :class:`CLIAgentDriver` from ``openenv.core.harness.agents``. -2. **Deployable env** -- :class:`CodingAgentEnv` (MCP client) talks to the +2. **Deployable env** -- :class:`OpenCodeEnv` (MCP client) talks to the FastAPI server at ``server/app.py`` over HTTP. Use this when the - sandbox + agent live behind an HTTP boundary (e.g. an HF Space). + sandbox + OpenCode live behind an HTTP boundary (e.g. an HF Space). See ``client.py`` and ``server/``. 
""" from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle -from .client import CodingAgentEnv -from .config import CodingAgentConfig, Provider -from .harness import CodingAgentSession, CodingAgentSessionFactory -from .models import CommandResult, CodingAgentState, RolloutResult -from .task import CodingAgentTask +from .client import OpenCodeEnv +from .config import OpenCodeConfig, Provider +from .harness import OpenCodeSession, OpenCodeSessionFactory +from .models import CommandResult, OpenCodeState, RolloutResult, RolloutTurn +from .task import OpenCodeTask try: from openenv.core.harness.sandbox import E2BSandboxBackend @@ -35,18 +35,19 @@ __all__ = [ # Deployed-env client - "CodingAgentEnv", + "OpenCodeEnv", "CallToolAction", "ListToolsAction", # HTTP API models "CommandResult", - "CodingAgentState", + "OpenCodeState", "RolloutResult", + "RolloutTurn", # Harness primitive - "CodingAgentConfig", - "CodingAgentSession", - "CodingAgentSessionFactory", - "CodingAgentTask", + "OpenCodeConfig", + "OpenCodeSession", + "OpenCodeSessionFactory", + "OpenCodeTask", "Provider", # Sandbox backend "E2BSandboxBackend", diff --git a/envs/coding_agent_env/client.py b/envs/opencode_env/client.py similarity index 78% rename from envs/coding_agent_env/client.py rename to envs/opencode_env/client.py index 492060a25..e11599b5e 100644 --- a/envs/coding_agent_env/client.py +++ b/envs/opencode_env/client.py @@ -4,17 +4,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Client for the deployed coding_agent_env server. +"""Client for the deployed opencode_env server. 
-The server exposes a single MCP tool ``run_rollout`` that runs one coding-agent -rollout (OpenCode or Pi) in an E2B sandbox and returns a JSON-serialized +The server exposes a single MCP tool ``run_rollout`` that runs one OpenCode +rollout in an E2B sandbox and returns a JSON-serialized :class:`RolloutResult`. Example:: - from coding_agent_env import CodingAgentEnv + from opencode_env import OpenCodeEnv - with CodingAgentEnv(base_url="https://your-space.hf.space") as env: + with OpenCodeEnv(base_url="https://your-space.hf.space") as env: env.reset() result = env.run_rollout( base_url="https://api.openai.com/v1", @@ -41,8 +41,8 @@ from models import RolloutResult # type: ignore -class CodingAgentEnv(MCPToolClient): - """Typed client for the coding_agent_env MCP server. +class OpenCodeEnv(MCPToolClient): + """Typed client for the opencode_env MCP server. Inherits ``reset`` / ``call_tool`` / ``list_tools`` / ``from_docker_image`` / context-manager semantics from :class:`MCPToolClient`. @@ -51,8 +51,7 @@ class CodingAgentEnv(MCPToolClient): def run_rollout( self, *, - # Agent + endpoint — pass either shorthand endpoint or explicit fields. - agent: str = "opencode", # "opencode" | "pi" + # Endpoint — pass either shorthand endpoint or explicit fields. endpoint: str = "", # "vllm" | "openai" | "hf_router" base_url: str = "", api_key: str = "", @@ -63,50 +62,48 @@ def run_rollout( verify: list[str] | None = None, # Bookkeeping / tunables task_id: str = "", - mode: str = "black_box", + mode: str = "transparent_proxy", disable_thinking: bool | None = None, max_tokens_cap: int = 4096, top_logprobs: int = 5, agent_timeout_s: float = 600.0, template: str = "", ) -> RolloutResult: - """Run one coding-agent rollout and return the typed result. + """Run one opencode rollout and return the typed result. Args: - agent: Harness CLI to run in sandbox (``"opencode"`` or ``"pi"``). base_url: OpenAI-compatible LLM endpoint (with trailing /v1). api_key: Bearer token for the LLM. 
Use ``"intercepted"`` for vLLM if it doesn't enforce auth. model: Model id understood by the LLM endpoint (e.g. ``"gpt-4o-mini"``, ``"Qwen/Qwen3.5-4B"``, ``"Qwen/Qwen3-4B-Instruct-2507:nscale"``). - instruction: Prompt passed to the selected harness CLI. - setup: Bash commands run sequentially **before** the agent starts. + instruction: Prompt passed to OpenCode. + setup: Bash commands run sequentially **before** OpenCode starts. Each command runs in the sandbox; non-zero exit aborts setup. - verify: Bash commands run sequentially **after** the agent exits. + verify: Bash commands run sequentially **after** OpenCode exits. Reward = ``passed_count / total`` unless any command writes a float to ``/home/user/logs/verifier/reward.txt`` (override). task_id: Echoed back in the result for traceability. - mode: ``"black_box"`` (agent talks directly to the LLM) or - ``"interception_gate"`` (LLM calls routed to trainer-side - InterceptionServer for trainer-owned generation). + mode: ``"transparent_proxy"`` (default, captures logprobs) or + ``"black_box"`` (OpenCode talks directly to the LLM). disable_thinking: Inject ``chat_template_kwargs.enable_thinking=false`` on forwarded requests. Needed for Qwen3.5 vLLM; harmless on Instruct variants; rejected by OpenAI direct. max_tokens_cap: Clamp on per-turn ``max_tokens``. - top_logprobs: Reserved for trainer-owned interception workflows. - agent_timeout_s: Hard wall-clock budget for one agent run. - template: E2B template name (e.g. ``"coding-agent-rl"``). Empty + top_logprobs: Per-token top-k logprobs requested in + ``transparent_proxy`` mode. + agent_timeout_s: Hard wall-clock budget for one OpenCode run. + template: E2B template name (e.g. ``"opencode-rl"``). Empty string uses the default (slow) base image. Returns: - A :class:`RolloutResult` with reward, file outputs, + A :class:`RolloutResult` with reward, proxy_turns, file outputs, setup/verify results, and diagnostic tails. 
""" raw = self.call_tool( "run_rollout", - agent=agent, endpoint=endpoint, base_url=base_url, api_key=api_key, diff --git a/envs/coding_agent_env/config.py b/envs/opencode_env/config.py similarity index 78% rename from envs/coding_agent_env/config.py rename to envs/opencode_env/config.py index d70610542..1e1a8b167 100644 --- a/envs/coding_agent_env/config.py +++ b/envs/opencode_env/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Configuration model for the coding-agent harness primitive.""" +"""Configuration model for the OpenCode harness primitive.""" from __future__ import annotations @@ -16,8 +16,8 @@ Provider = Literal["openai_compatible", "openai", "anthropic"] -class CodingAgentConfig(BaseModel): - """All configuration required to launch one coding-agent rollout in a sandbox. +class OpenCodeConfig(BaseModel): + """All configuration required to launch one OpenCode rollout in a sandbox. Field names are provider-agnostic. The primitive maps ``provider`` onto the correct ``opencode.json`` provider block (``@ai-sdk/openai-compatible``, @@ -46,9 +46,18 @@ class CodingAgentConfig(BaseModel): extra_setup_shell: str | None = None # --- Model behavior -------------------------------------------------------- + # Direct OpenCode config knobs (black_box / interception_gate). disable_thinking: bool = False max_tokens_cap: int | None = None + # --- Transparent-proxy logprob capture ------------------------------------ + # Compatibility knobs for the HTTP env's logprob-capturing mode. The proxy + # requests OpenAI-compatible logprobs upstream, records them, and strips + # them before returning the response to OpenCode. 
+ proxy_max_tokens_cap: int | None = 16384 + proxy_top_logprobs: int = 5 + proxy_disable_thinking: bool = False + # --- Sandbox paths -------------------------------------------------------- # Root directory inside the sandbox where the primitive writes config, # task files, and logs. E2B's default user is ``user`` with home diff --git a/envs/opencode_env/harness.py b/envs/opencode_env/harness.py new file mode 100644 index 000000000..ca5c294c2 --- /dev/null +++ b/envs/opencode_env/harness.py @@ -0,0 +1,342 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""OpenCode session factory + session backed by CLIAgentDriver.""" + +from __future__ import annotations + +import json +import queue as _queue_mod +import shlex +import uuid +from pathlib import Path +from typing import Any, Literal + +from openenv.core.harness import ResourceSessionFactory +from openenv.core.harness.agents.cli_driver import ( + CLIAgentDriver, + CLIAgentSession, + Verifier, + build_interception_rollout_url, +) +from openenv.core.harness.agents.interception_server import InterceptionServer +from openenv.core.harness.agents.opencode import OPENCODE_SPEC +from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle + +from .config import OpenCodeConfig +from .opencode_runtime import ( + agent_log_path, + build_env_vars, + build_opencode_json, + build_run_cmd, + opencode_config_path, +) +from .task import OpenCodeTask + + +# Inside-sandbox transparent proxy paths. 
+_PROXY_PORT = 7000 +_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl" +_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log" +_PROXY_SOURCE_PATH = Path(__file__).parent / "sandbox" / "interception.py" + + +class OpenCodeSession(CLIAgentSession): + def __init__( + self, + *, + sandbox: SandboxHandle, + config: OpenCodeConfig, + task: OpenCodeTask, + verifier: Verifier | None = None, + base_url_override: str | None = None, + agent_bg_job: BgJob | None = None, + proxy_trace_path: str | None = None, + proxy_bg_job: BgJob | None = None, + interception_server: InterceptionServer | None = None, + interception_rollout_id: str | None = None, + interception_queue: _queue_mod.Queue[str | None] | None = None, + ) -> None: + super().__init__( + spec=OPENCODE_SPEC, + sandbox=sandbox, + task=task, + config=config, + verifier=verifier, + base_url_override=base_url_override, + agent_bg_job=agent_bg_job, + interception_server=interception_server, + interception_rollout_id=interception_rollout_id, + interception_queue=interception_queue, + ) + self._proxy_trace_path = proxy_trace_path + self._proxy_bg_job = proxy_bg_job + + def fetch_trace(self) -> str: + return self.sandbox.read_text(agent_log_path(self.config)) + + def fetch_proxy_trace(self) -> list[dict[str, Any]]: + """Return per-turn proxy-captured records (transparent_proxy only).""" + if self._proxy_trace_path is None: + return [] + try: + content = self.sandbox.read_text(self._proxy_trace_path) + except Exception: + return [] + records: list[dict[str, Any]] = [] + for line in content.splitlines(): + line = line.strip() + if not line: + continue + records.append(json.loads(line)) + return records + + def close(self) -> None: + if self._proxy_bg_job is not None: + try: + self._proxy_bg_job.kill() + except Exception: + pass + self._proxy_bg_job = None + super().close() + + def wait_for_completion(self, timeout_s: float | None = None) -> int: + budget = timeout_s if timeout_s is not None else 
self.config.agent_timeout_s + if self._agent_bg_job is None: + raise RuntimeError("Agent not started.") + return self._agent_bg_job.wait(timeout=budget) + + def start_agent(self) -> None: + if self._agent_bg_job is not None: + return + cmd = build_run_cmd(self.config) + envs = build_env_vars(self.config, base_url_override=self._base_url_override) + self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs) + + +class OpenCodeSessionFactory(ResourceSessionFactory): + def __init__( + self, + *, + config: OpenCodeConfig, + sandbox_backend: SandboxBackend, + mode: Literal[ + "black_box", "transparent_proxy", "interception_gate" + ] = "transparent_proxy", + verifier: Verifier | None = None, + install_timeout_s: int = 240, + setup_timeout_s: int = 300, + interception_server: InterceptionServer | None = None, + interception_base_url: str | None = None, + ) -> None: + if mode not in {"black_box", "transparent_proxy", "interception_gate"}: + raise ValueError(f"Unknown mode: {mode!r}") + self._config = config + self._backend = sandbox_backend + self._mode = mode + self._verifier = verifier + driver_mode: Literal["black_box", "interception_gate"] = ( + "black_box" if mode == "transparent_proxy" else mode + ) + self._driver = CLIAgentDriver( + spec=OPENCODE_SPEC, + sandbox_backend=sandbox_backend, + mode=driver_mode, + install_timeout_s=install_timeout_s, + setup_timeout_s=setup_timeout_s, + interception_server=interception_server, + interception_base_url=interception_base_url, + ) + + def create( + self, + task: Any, + seed: int | None = None, + episode_id: str | None = None, + ) -> OpenCodeSession: + import logging + + _log = logging.getLogger(__name__) + oc_task = OpenCodeTask.coerce(task) + setup_parts: list[str] = [] + if self._config.extra_setup_shell: + setup_parts.append(self._config.extra_setup_shell) + if oc_task.setup_shell: + setup_parts.append(oc_task.setup_shell) + if setup_parts: + oc_task = oc_task.model_copy( + update={"setup_shell": "set -e\n" + 
"\n".join(setup_parts)} + ) + + sandbox_timeout = int(self._config.agent_timeout_s) + 300 + sandbox = self._backend.create( + timeout_s=sandbox_timeout, + metadata={"episode_id": episode_id} if episode_id else None, + ) + try: + self._bootstrap_sandbox(sandbox, oc_task) + except Exception as exc: + _log.error("factory.create: bootstrap failed: %r", exc) + sandbox.kill() + raise + + base_url_override: str | None = None + interception_rollout_id: str | None = None + interception_queue: _queue_mod.Queue[str | None] | None = None + proxy_trace_path: str | None = None + proxy_bg_job: BgJob | None = None + + if self._mode == "interception_gate": + interception_server = self._driver._interception_server + if interception_server is None: + raise RuntimeError( + "interception_gate mode requires an InterceptionServer" + ) + interception_base_url = self._driver._interception_base_url + if interception_base_url is None: + raise RuntimeError( + "interception_gate mode requires interception_base_url" + ) + rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}" + interception_rollout_id = rollout_id + interception_queue = interception_server.register_rollout(rollout_id) + base_url_override = build_interception_rollout_url( + interception_base_url, + rollout_id, + ) + elif self._mode == "transparent_proxy": + proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy( + sandbox + ) + + run_config = self._config + if base_url_override is not None: + api_key = self._config.api_key + if self._mode == "interception_gate": + assert self._driver._interception_server is not None + api_key = self._driver._interception_server.secret + run_config = self._config.model_copy( + update={ + "provider": "openai_compatible", + "base_url": base_url_override, + "api_key": api_key, + } + ) + sandbox.write_text( + opencode_config_path(self._config), + build_opencode_json(run_config), + ) + agent_bg_job = self._driver._start_agent( + sandbox, + oc_task, + run_config, + 
base_url_override=base_url_override, + ) + + return OpenCodeSession( + sandbox=sandbox, + config=run_config, + task=oc_task, + verifier=self._verifier, + base_url_override=base_url_override, + agent_bg_job=agent_bg_job, + proxy_trace_path=proxy_trace_path, + proxy_bg_job=proxy_bg_job, + interception_server=self._driver._interception_server, + interception_rollout_id=interception_rollout_id, + interception_queue=interception_queue, + ) + + def _start_proxy( + self, + sandbox: SandboxHandle, + ) -> tuple[BgJob, str, str]: + """Start the in-sandbox logprob-capturing proxy.""" + proxy_already_present = sandbox.exists("/home/user/proxy/interception.py") + + if not proxy_already_present: + self._driver._exec_with_retry( + sandbox, + "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' " + "'httpx>=0.27' 2>&1 | tail -20", + timeout=180, + attempts=3, + backoff_s=2.0, + label="proxy deps install", + ) + sandbox.write_text( + "/home/user/proxy/interception.py", + _PROXY_SOURCE_PATH.read_text(), + ) + sandbox.write_text("/home/user/proxy/__init__.py", "") + + proxy_args = [ + "python", + "interception.py", + "--upstream-url", + self._config.base_url, + "--trace", + _PROXY_TRACE_PATH, + "--port", + str(_PROXY_PORT), + "--top-logprobs", + str(self._config.proxy_top_logprobs), + ] + if self._config.proxy_max_tokens_cap is not None: + proxy_args.extend( + ["--max-tokens-cap", str(self._config.proxy_max_tokens_cap)] + ) + if self._config.proxy_disable_thinking: + proxy_args.append("--disable-thinking") + if self._config.model: + proxy_args.extend(["--model-override", self._config.model]) + + quoted_proxy_args = " ".join(shlex.quote(arg) for arg in proxy_args) + proxy_cmd = ( + "cd /home/user/proxy && " + f"{quoted_proxy_args} " + f"> {shlex.quote(_PROXY_LOG_PATH)} 2>&1" + ) + proxy_env = {"OPENCODE_UPSTREAM_API_KEY": self._config.api_key} + proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env) + + import time + + attempts = 120 + interval_s = 0.5 + for _ in 
range(attempts): + r = sandbox.exec( + f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz", + timeout=5, + ) + if r.exit_code == 0: + break + time.sleep(interval_s) + else: + log = "" + try: + log = sandbox.read_text(_PROXY_LOG_PATH) + except Exception: + pass + proxy_job.kill() + raise RuntimeError( + f"proxy did not start within {attempts * interval_s:.0f}s. " + f"log:\n{log[-2000:]}" + ) + + base_url_override = f"http://127.0.0.1:{_PROXY_PORT}/v1" + return proxy_job, base_url_override, _PROXY_TRACE_PATH + + def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: OpenCodeTask) -> None: + self._driver.bootstrap_sandbox(sandbox, task, self._config) + + +__all__ = [ + "OpenCodeSession", + "OpenCodeSessionFactory", + "OpenCodeTask", + "Verifier", +] diff --git a/envs/coding_agent_env/models.py b/envs/opencode_env/models.py similarity index 67% rename from envs/coding_agent_env/models.py rename to envs/opencode_env/models.py index 2bf19925e..d2b023839 100644 --- a/envs/coding_agent_env/models.py +++ b/envs/opencode_env/models.py @@ -4,20 +4,36 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Pydantic models for the deployed coding_agent_env HTTP server. +"""Pydantic models for the deployed opencode_env HTTP server. The server exposes a single MCP tool ``run_rollout`` that takes a Task (instruction + setup commands + verify commands) plus an LLM endpoint -config, runs one coding-agent rollout end-to-end inside an E2B sandbox, and +config, runs one OpenCode rollout end-to-end inside an E2B sandbox, and returns a :class:`RolloutResult` JSON. 
""" from __future__ import annotations +from typing import Any + from openenv.core.env_server.types import State from pydantic import BaseModel, Field +class RolloutTurn(BaseModel): + """One intercepted LLM turn captured by transparent-proxy mode.""" + + turn: int + finish_reason: str | None = None + completion_tokens: list[str] = Field(default_factory=list) + completion_token_ids: list[int] = Field(default_factory=list) + per_token_logps: list[float] = Field(default_factory=list) + latency_s: float = 0.0 + timestamp: float = 0.0 + upstream_status: int | None = None + upstream_error: dict[str, Any] | None = None + + class CommandResult(BaseModel): """Outcome of one bash command in setup/verify. @@ -50,19 +66,23 @@ class RolloutResult(BaseModel): setup_results: list[CommandResult] = Field(default_factory=list) verify_results: list[CommandResult] = Field(default_factory=list) + # Per-turn LLM trajectory (empty outside transparent_proxy mode) + proxy_turns: list[RolloutTurn] = Field(default_factory=list) + # Filesystem the agent produced (path -> contents, truncated) files: dict[str, str] = Field(default_factory=dict) files_extra: list[str] = Field(default_factory=list) # Diagnostic tails agent_log_tail: str = "" + proxy_log_tail: str = "" # Error surfacing error: str | None = None -class CodingAgentState(State): - """Per-session env state across calls to one CodingAgentEnvironment instance. +class OpenCodeState(State): + """Per-session env state across calls to one OpenCodeEnvironment instance. Each HTTP session gets its own env (``SUPPORTS_CONCURRENT_SESSIONS=True`` on the server class), so this state is per-session. 
diff --git a/envs/coding_agent_env/opencode_runtime.py b/envs/opencode_env/opencode_runtime.py similarity index 87% rename from envs/coding_agent_env/opencode_runtime.py rename to envs/opencode_env/opencode_runtime.py index 31285556e..0f1484e3a 100644 --- a/envs/coding_agent_env/opencode_runtime.py +++ b/envs/opencode_env/opencode_runtime.py @@ -16,34 +16,34 @@ import json from typing import Any -from .config import CodingAgentConfig, provider_npm_package +from .config import OpenCodeConfig, provider_npm_package -def opencode_config_path(config: CodingAgentConfig) -> str: +def opencode_config_path(config: OpenCodeConfig) -> str: return f"{config.sandbox_home}/.config/opencode/opencode.json" -def instruction_path(config: CodingAgentConfig) -> str: +def instruction_path(config: OpenCodeConfig) -> str: return f"{config.sandbox_home}/task/instruction.md" -def agent_log_path(config: CodingAgentConfig) -> str: +def agent_log_path(config: OpenCodeConfig) -> str: return f"{config.sandbox_home}/logs/agent/opencode.jsonl" -def system_prompt_path(config: CodingAgentConfig) -> str: +def system_prompt_path(config: OpenCodeConfig) -> str: return f"{config.sandbox_home}/task/system.md" -def verifier_reward_path(config: CodingAgentConfig) -> str: +def verifier_reward_path(config: OpenCodeConfig) -> str: return f"{config.sandbox_home}/logs/verifier/reward.txt" -def workdir_path(config: CodingAgentConfig) -> str: +def workdir_path(config: OpenCodeConfig) -> str: return f"{config.sandbox_home}/workdir" -def build_opencode_json(config: CodingAgentConfig) -> str: +def build_opencode_json(config: OpenCodeConfig) -> str: """Return the serialized ``opencode.json`` the sandbox should install. 
Provider block is keyed by a stable internal name (``intercepted``) so the @@ -90,7 +90,7 @@ def build_opencode_json(config: CodingAgentConfig) -> str: return json.dumps(doc, indent=2) -def build_install_cmd(config: CodingAgentConfig) -> str: +def build_install_cmd(config: OpenCodeConfig) -> str: """Return the shell command that installs OpenCode + ensures PATH. The upstream installer honors ``OPENCODE_VERSION=x.y.z`` for pinning; @@ -110,7 +110,7 @@ def build_install_cmd(config: CodingAgentConfig) -> str: ) -def build_run_cmd(config: CodingAgentConfig) -> str: +def build_run_cmd(config: OpenCodeConfig) -> str: """Return the shell command that launches OpenCode against a task.""" format_flag = "--format json" if config.run_format == "json" else "" @@ -123,7 +123,7 @@ def build_run_cmd(config: CodingAgentConfig) -> str: def build_env_vars( - config: CodingAgentConfig, *, base_url_override: str | None = None + config: OpenCodeConfig, *, base_url_override: str | None = None ) -> dict[str, str]: """Return env vars to set on the OpenCode process. 
@@ -140,7 +140,7 @@ def build_env_vars( return env -def _build_tools_block(config: CodingAgentConfig) -> dict[str, bool]: +def _build_tools_block(config: OpenCodeConfig) -> dict[str, bool]: """Translate enabled/disabled lists into opencode's ``tools`` map.""" if config.enabled_tools is not None: diff --git a/envs/coding_agent_env/openenv.yaml b/envs/opencode_env/openenv.yaml similarity index 76% rename from envs/coding_agent_env/openenv.yaml rename to envs/opencode_env/openenv.yaml index be34c3a51..2a534a088 100644 --- a/envs/coding_agent_env/openenv.yaml +++ b/envs/opencode_env/openenv.yaml @@ -1,5 +1,5 @@ spec_version: 1 -name: coding_agent_env +name: opencode_env type: space runtime: fastapi app: server.app:app diff --git a/envs/coding_agent_env/pyproject.toml b/envs/opencode_env/pyproject.toml similarity index 71% rename from envs/coding_agent_env/pyproject.toml rename to envs/opencode_env/pyproject.toml index d935a0bf5..a72ade07d 100644 --- a/envs/coding_agent_env/pyproject.toml +++ b/envs/opencode_env/pyproject.toml @@ -9,9 +9,9 @@ requires = ["setuptools>=45", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "openenv-coding-agent-env" +name = "openenv-opencode-env" version = "0.1.0" -description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints." +description = "OpenCode environment for OpenEnv — runs the OpenCode CLI in an E2B sandbox against OpenAI-compatible LLM endpoints." requires-python = ">=3.10" dependencies = [ # Core OpenEnv (server + MCP). 0.3.0 ships the harness runtime. @@ -40,16 +40,16 @@ dev = [ [project.scripts] # Server entrypoint — enables ``uv run --project . server``. 
-server = "coding_agent_env.server.app:main" +server = "opencode_env.server.app:main" [tool.setuptools] include-package-data = true packages = [ - "coding_agent_env", - "coding_agent_env.sandbox", - "coding_agent_env.server", + "opencode_env", + "opencode_env.sandbox", + "opencode_env.server", ] -package-dir = { "coding_agent_env" = ".", "coding_agent_env.sandbox" = "sandbox", "coding_agent_env.server" = "server" } +package-dir = { "opencode_env" = ".", "opencode_env.sandbox" = "sandbox", "opencode_env.server" = "server" } [tool.setuptools.package-data] -coding_agent_env = ["**/*.md"] +opencode_env = ["**/*.md"] diff --git a/envs/coding_agent_env/sandbox/__init__.py b/envs/opencode_env/sandbox/__init__.py similarity index 100% rename from envs/coding_agent_env/sandbox/__init__.py rename to envs/opencode_env/sandbox/__init__.py diff --git a/envs/coding_agent_env/sandbox/build_template.py b/envs/opencode_env/sandbox/build_template.py similarity index 94% rename from envs/coding_agent_env/sandbox/build_template.py rename to envs/opencode_env/sandbox/build_template.py index 01978b520..67cf0756d 100644 --- a/envs/coding_agent_env/sandbox/build_template.py +++ b/envs/opencode_env/sandbox/build_template.py @@ -60,10 +60,10 @@ def build_template(name: str, *, skip_cache: bool = False) -> str: def main(argv: list[str] | None = None) -> int: p = argparse.ArgumentParser(prog="build_e2b_template") - p.add_argument("--name", default="coding-agent-rl") + p.add_argument("--name", default="opencode-rl") p.add_argument("--skip-cache", action="store_true") args = p.parse_args(argv) - _load_env(_REPO_ROOT / "envs" / "coding_agent_env" / "sandbox" / ".env") + _load_env(_REPO_ROOT / "envs" / "opencode_env" / "sandbox" / ".env") if not os.environ.get("E2B_API_KEY"): print("ERROR: E2B_API_KEY required.", file=sys.stderr) return 2 diff --git a/envs/opencode_env/sandbox/interception.py b/envs/opencode_env/sandbox/interception.py new file mode 100644 index 000000000..131d41024 --- /dev/null 
+++ b/envs/opencode_env/sandbox/interception.py @@ -0,0 +1,661 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Transparent OpenAI-compatible forwarding proxy with logprob capture. + +The proxy is a small FastAPI app that OpenCode talks to instead of the upstream +LLM endpoint. It: + +1. Forwards every ``POST /v1/chat/completions`` request to the real upstream + URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream + returns per-token logprobs. +2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines + trace file. +3. Returns the upstream response to OpenCode verbatim (minus the ``logprobs`` + field, which we strip so the CLI never sees anything unexpected). + +The proxy is stateless beyond the trace file. One proxy instance runs per +session, normally inside the sandbox on ``localhost:7000``. + +Run standalone:: + + OPENCODE_UPSTREAM_API_KEY=... 
python -m opencode_env.interception \\ + --upstream-url https://vllm.example/v1 \\ + --trace /tmp/trace.jsonl \\ + --port 7000 +""" + +from __future__ import annotations + +import argparse +import asyncio +import copy +import json +import logging +import os +import socket +import threading +import time +from contextlib import asynccontextmanager, closing +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import httpx +import uvicorn +from fastapi import FastAPI, Request, Response +from fastapi.responses import JSONResponse, StreamingResponse + + +CHAT_COMPLETIONS_PATH = "/v1/chat/completions" +_LOG = logging.getLogger(__name__) + + +@dataclass +class ProxyConfig: + """Runtime configuration for one :class:`InterceptionProxy`.""" + + upstream_url: str + upstream_api_key: str = "intercepted" + trace_path: str = "/tmp/opencode-proxy-trace.jsonl" + host: str = "127.0.0.1" + port: int = 7000 + top_logprobs: int = 5 + request_timeout_s: float = 600.0 + # Cap ``max_tokens`` before forwarding. OpenCode historically asks for very + # large values (e.g. 32000) that exceed gpt-4o-mini's 16384 cap; capping + # here avoids spurious upstream 400s without requiring the caller to know + # per-model limits. + max_tokens_cap: int | None = 16384 + # Disable Qwen-style reasoning/thinking by injecting + # ``chat_template_kwargs.enable_thinking=false`` into forwarded requests. + disable_thinking: bool = False + # Override the ``model`` field on every forwarded request. Some opencode + # builds emit a stripped model id (e.g. ``Qwen3.5-4B`` instead of the + # ``Qwen/Qwen3.5-4B`` the upstream serves) for their internal + # title-generation call. Setting this to the exact upstream model id + # bypasses that mismatch. 
+ model_override: str | None = None + + +@dataclass +class TurnRecord: + """One intercepted turn, written to the trace file as JSON-lines.""" + + turn: int + request: dict[str, Any] + response: dict[str, Any] + logprobs: list[dict[str, Any]] | None + completion_tokens: list[str] + completion_token_ids: list[int] + per_token_logps: list[float] + finish_reason: str | None + latency_s: float + timestamp: float = field(default_factory=time.time) + + def to_json(self) -> str: + return json.dumps(self.__dict__, default=str) + + +def _build_app(cfg: ProxyConfig) -> FastAPI: + """Construct the FastAPI app that serves one proxy session.""" + + state: dict[str, Any] = {"turn": 0, "lock": asyncio.Lock()} + + # HTTP client reused across requests. ``None`` auth header — we let each + # request carry its own ``Authorization`` populated from ``upstream_api_key``. + client = httpx.AsyncClient(timeout=cfg.request_timeout_s) + trace_file = open(cfg.trace_path, "a", buffering=1) + + @asynccontextmanager + async def lifespan(_: FastAPI) -> Any: + try: + yield + finally: + await client.aclose() + trace_file.close() + + app = FastAPI(title="opencode-interception-proxy", lifespan=lifespan) + + @app.get("/healthz") + def healthz() -> dict[str, str]: + return {"status": "ok"} + + @app.post(CHAT_COMPLETIONS_PATH) + async def chat_completions(request: Request) -> Response: + raw_body = await request.body() + try: + body = json.loads(raw_body) + except json.JSONDecodeError: + return JSONResponse( + status_code=400, content={"error": "invalid json body"} + ) + + forwarded_body = _prepare_forwarded_body(body, cfg) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {cfg.upstream_api_key}", + } + upstream_url = _resolve_upstream_url(cfg.upstream_url) + + async with state["lock"]: + state["turn"] += 1 + turn_idx = state["turn"] + + if forwarded_body.get("stream"): + return await _proxy_streaming( + client=client, + upstream_url=upstream_url, + headers=headers, + 
forwarded_body=forwarded_body, + original_body=body, + trace_file=trace_file, + turn_idx=turn_idx, + ) + return await _proxy_unary( + client=client, + upstream_url=upstream_url, + headers=headers, + forwarded_body=forwarded_body, + original_body=body, + trace_file=trace_file, + turn_idx=turn_idx, + ) + + return app + + +def _prepare_forwarded_body(body: dict[str, Any], cfg: ProxyConfig) -> dict[str, Any]: + """Return the body we actually send upstream. + + - Injects ``logprobs=true`` + ``top_logprobs`` so the upstream emits + per-token logprobs. + - Caps ``max_tokens`` / ``max_completion_tokens`` to ``max_tokens_cap``. + - For models that reject ``max_tokens`` (e.g. gpt-5.x), translates to + ``max_completion_tokens``. + """ + forwarded = copy.deepcopy(body) + forwarded.setdefault("logprobs", True) + forwarded.setdefault("top_logprobs", cfg.top_logprobs) + + # GPT-5.x and newer: ``max_tokens`` is rejected; must use + # ``max_completion_tokens``. Detect via model string so we don't break + # gpt-4.x or vLLM-hosted models that accept ``max_tokens``. + model = str(forwarded.get("model", "")) + needs_translation = _model_uses_max_completion_tokens(model) + if needs_translation and "max_tokens" in forwarded: + value = forwarded.pop("max_tokens") + forwarded.setdefault("max_completion_tokens", value) + + if cfg.max_tokens_cap is not None: + for key in ("max_tokens", "max_completion_tokens"): + value = forwarded.get(key) + if isinstance(value, int) and value > cfg.max_tokens_cap: + forwarded[key] = cfg.max_tokens_cap + + if cfg.disable_thinking: + # vLLM applies chat_template_kwargs to the tokenizer's chat template + # for Qwen3/Qwen3.5 models, turning off ... generation. 
+ extra = forwarded.setdefault("chat_template_kwargs", {}) + extra.setdefault("enable_thinking", False) + + if cfg.model_override: + forwarded["model"] = cfg.model_override + + return forwarded + + +def _model_uses_max_completion_tokens(model: str) -> bool: + """Heuristic: ``True`` for models that reject ``max_tokens``.""" + # Strip a provider prefix opencode may have prepended (e.g. "intercepted/"). + bare = model.split("/", 1)[-1].lower() + return bare.startswith(("gpt-5", "o1", "o3", "o4")) + + +def _resolve_upstream_url(upstream: str) -> str: + """Build the fully qualified chat-completions URL from a base URL.""" + base = upstream.rstrip("/") + if base.endswith("/v1"): + return f"{base}/chat/completions" + return f"{base}{CHAT_COMPLETIONS_PATH}" + + +async def _proxy_unary( + *, + client: httpx.AsyncClient, + upstream_url: str, + headers: dict[str, str], + forwarded_body: dict[str, Any], + original_body: dict[str, Any], + trace_file: Any, + turn_idx: int, +) -> Response: + start = time.time() + upstream_response = await client.post( + upstream_url, content=json.dumps(forwarded_body), headers=headers + ) + latency = time.time() - start + try: + response_json = upstream_response.json() + except Exception: + return Response( + content=upstream_response.content, + status_code=upstream_response.status_code, + media_type=upstream_response.headers.get( + "content-type", "application/json" + ), + ) + + record = _build_turn_record( + turn_idx=turn_idx, + request_body=forwarded_body, + response_json=response_json, + latency_s=latency, + ) + trace_file.write(record.to_json() + "\n") + sanitized = _strip_logprobs(response_json) + return JSONResponse(content=sanitized, status_code=upstream_response.status_code) + + +async def _proxy_streaming( + *, + client: httpx.AsyncClient, + upstream_url: str, + headers: dict[str, str], + forwarded_body: dict[str, Any], + original_body: dict[str, Any], + trace_file: Any, + turn_idx: int, +) -> Response: + """Forward an SSE stream while 
accumulating the full response. + + Opens the upstream stream and inspects the status. On non-2xx, reads the + full body (an error JSON, not SSE) and returns it to the caller as a + regular JSON response — previously we silently emitted an empty + ``text/event-stream`` which opencode interpreted as an empty assistant + turn. Both the error body and the latency are written to the trace file + so debugging a broken rollout doesn't require another round-trip. + """ + + start = time.time() + + # Open the stream outside the generator so we can branch on status before + # committing to a streaming response shape. + upstream_cm = client.stream( + "POST", + upstream_url, + content=json.dumps(forwarded_body), + headers=headers, + ) + upstream = await upstream_cm.__aenter__() + + if upstream.status_code >= 400: + # Upstream responded with an error body (not SSE). Read it fully and + # return as a non-streaming JSON payload. + error_bytes = await upstream.aread() + await upstream_cm.__aexit__(None, None, None) + latency = time.time() - start + try: + error_json = json.loads(error_bytes.decode() or "{}") + except Exception: + error_json = {"error": error_bytes.decode(errors="replace")[:4000]} + record = _build_turn_record( + turn_idx=turn_idx, + request_body=forwarded_body, + response_json={ + "choices": [], + "usage": None, + "upstream_status": upstream.status_code, + "upstream_error": error_json, + }, + latency_s=latency, + ) + trace_file.write(record.to_json() + "\n") + _LOG.warning( + "proxy turn %s: upstream %s: %s", + turn_idx, + upstream.status_code, + str(error_json)[:400], + ) + return JSONResponse(content=error_json, status_code=upstream.status_code) + + async def _stream() -> Any: + accumulated: dict[str, Any] = { + "content_by_idx": {}, + "tool_calls_by_idx": {}, + "finish_by_idx": {}, + "logprobs_by_idx": {}, + } + last_chunk: dict[str, Any] = {} + try: + async for line in upstream.aiter_lines(): + if not line: + yield "\n" + continue + yield line + "\n" + if not 
line.startswith("data:"): + continue + data = line[len("data:"):].strip() + if data == "[DONE]": + continue + try: + chunk = json.loads(data) + except json.JSONDecodeError: + continue + last_chunk = chunk + _accumulate_stream_chunk(chunk, accumulated) + finally: + await upstream_cm.__aexit__(None, None, None) + + latency = time.time() - start + response_json = _assemble_streamed_response(last_chunk, accumulated) + record = _build_turn_record( + turn_idx=turn_idx, + request_body=forwarded_body, + response_json=response_json, + latency_s=latency, + ) + trace_file.write(record.to_json() + "\n") + + return StreamingResponse(_stream(), media_type="text/event-stream") + + +def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None: + for choice in chunk.get("choices", []) or []: + idx = choice.get("index", 0) + delta = choice.get("delta") or {} + content = delta.get("content") + if content: + acc["content_by_idx"].setdefault(idx, []).append(content) + # HF-Router's Qwen thinking mode streams the chain-of-thought under a + # separate ``reasoning`` field (per Together/Scaleway). Accumulate it + # so the assembled response surfaces it — otherwise it's dropped and + # proxy_turn observability is lost for thinking-mode rollouts. 
+ reasoning = delta.get("reasoning") + if reasoning: + acc.setdefault("reasoning_by_idx", {}).setdefault(idx, []).append(reasoning) + for tc in delta.get("tool_calls") or []: + tc_idx = tc.get("index", 0) + bucket = acc["tool_calls_by_idx"].setdefault( + (idx, tc_idx), + {"id": None, "type": "function", "function": {"name": "", "arguments": ""}}, + ) + if tc.get("id"): + bucket["id"] = tc["id"] + fn = tc.get("function") or {} + if fn.get("name"): + bucket["function"]["name"] += fn["name"] + if fn.get("arguments"): + bucket["function"]["arguments"] += fn["arguments"] + if choice.get("finish_reason"): + acc["finish_by_idx"][idx] = choice["finish_reason"] + lp = choice.get("logprobs") or {} + content_lp = lp.get("content") + if content_lp: + acc["logprobs_by_idx"].setdefault(idx, []).extend(content_lp) + + +def _assemble_streamed_response( + last_chunk: dict[str, Any], acc: dict[str, Any] +) -> dict[str, Any]: + indices = sorted( + set(acc["content_by_idx"]) + | set(acc["finish_by_idx"]) + | {k[0] for k in acc["tool_calls_by_idx"]} + | set(acc["logprobs_by_idx"]) + | {0} + ) + choices: list[dict[str, Any]] = [] + for idx in indices: + tool_calls = [ + acc["tool_calls_by_idx"][k] + for k in sorted(acc["tool_calls_by_idx"]) + if k[0] == idx + ] + message: dict[str, Any] = {"role": "assistant"} + content = "".join(acc["content_by_idx"].get(idx, [])) + if content: + message["content"] = content + reasoning = "".join((acc.get("reasoning_by_idx") or {}).get(idx, [])) + if reasoning: + message["reasoning"] = reasoning + if tool_calls: + message["tool_calls"] = tool_calls + choice: dict[str, Any] = { + "index": idx, + "message": message, + "finish_reason": acc["finish_by_idx"].get(idx), + } + if acc["logprobs_by_idx"].get(idx): + choice["logprobs"] = {"content": acc["logprobs_by_idx"][idx]} + choices.append(choice) + return { + "id": last_chunk.get("id", ""), + "object": "chat.completion", + "model": last_chunk.get("model", ""), + "choices": choices, + "usage": 
last_chunk.get("usage"), + } + + +def _build_turn_record( + *, + turn_idx: int, + request_body: dict[str, Any], + response_json: dict[str, Any], + latency_s: float, +) -> TurnRecord: + """Extract per-token logprobs into a normalized :class:`TurnRecord`.""" + + choice = (response_json.get("choices") or [{}])[0] + logprobs_field = choice.get("logprobs") or {} + content_lp = logprobs_field.get("content") or [] + + tokens: list[str] = [] + token_ids: list[int] = [] + per_token_logps: list[float] = [] + for entry in content_lp: + tokens.append(entry.get("token", "")) + # OpenAI returns no raw token ids; vLLM returns them as ``token_id``. + token_id = entry.get("token_id") + if token_id is not None: + token_ids.append(int(token_id)) + lp = entry.get("logprob") + if lp is not None: + per_token_logps.append(float(lp)) + + return TurnRecord( + turn=turn_idx, + request=request_body, + response=response_json, + logprobs=content_lp, + completion_tokens=tokens, + completion_token_ids=token_ids, + per_token_logps=per_token_logps, + finish_reason=choice.get("finish_reason"), + latency_s=latency_s, + ) + + +def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]: + """Return a copy of the response with ``choices[*].logprobs`` removed.""" + + out = dict(response_json) + choices = out.get("choices") + if isinstance(choices, list): + out["choices"] = [ + {k: v for k, v in (ch or {}).items() if k != "logprobs"} + for ch in choices + ] + return out + + +# --------------------------------------------------------------------------- +# Standalone runner (used inside the sandbox) +# --------------------------------------------------------------------------- + + +def serve(cfg: ProxyConfig) -> None: + """Start the proxy and block (for use as the sandbox-side entry point).""" + + app = _build_app(cfg) + uvicorn.run(app, host=cfg.host, port=cfg.port, log_level="warning") + + +class InterceptionProxy: + """Thread-backed controller for running the proxy locally. 
+ + Used by unit tests and by any in-process driver that wants a short-lived + proxy on the local machine. Inside a sandbox we invoke :func:`serve` + directly via ``python -m opencode_env.interception``. + """ + + def __init__(self, cfg: ProxyConfig) -> None: + self._cfg = cfg + self._server: uvicorn.Server | None = None + self._thread: threading.Thread | None = None + self._ready = threading.Event() + + @property + def url(self) -> str: + return f"http://{self._cfg.host}:{self._cfg.port}/v1" + + @property + def config(self) -> ProxyConfig: + return self._cfg + + def start(self) -> None: + app = _build_app(self._cfg) + config = uvicorn.Config( + app, + host=self._cfg.host, + port=self._cfg.port, + log_level="warning", + lifespan="on", + ) + self._server = uvicorn.Server(config) + self._thread = threading.Thread( + target=self._run_server, daemon=True + ) + self._thread.start() + # Wait for the server to accept connections. + deadline = time.time() + 10 + while time.time() < deadline: + if _port_open(self._cfg.host, self._cfg.port): + self._ready.set() + return + time.sleep(0.05) + raise RuntimeError("InterceptionProxy failed to start within 10s") + + def _run_server(self) -> None: + assert self._server is not None + self._server.run() + + def stop(self) -> None: + if self._server is None: + return + self._server.should_exit = True + if self._thread is not None: + self._thread.join(timeout=5) + self._server = None + self._thread = None + + def __enter__(self) -> "InterceptionProxy": + self.start() + return self + + def __exit__(self, *exc) -> None: + self.stop() + + +def _port_open(host: str, port: int) -> bool: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.settimeout(0.2) + return s.connect_ex((host, port)) == 0 + + +# --------------------------------------------------------------------------- +# Trace reader (used by the session to pull captured turns back) +# --------------------------------------------------------------------------- 
+ + +def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]: + """Read a proxy trace file into a list of dicts.""" + + trace: list[dict[str, Any]] = [] + p = Path(path) + if not p.exists(): + return trace + for line in p.read_text().splitlines(): + line = line.strip() + if not line: + continue + trace.append(json.loads(line)) + return trace + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser(prog="opencode_env.interception") + parser.add_argument("--upstream-url", required=True) + parser.add_argument( + "--upstream-api-key", + default=None, + help=( + "Upstream API key. Prefer OPENCODE_UPSTREAM_API_KEY so the key " + "does not appear in process argv." + ), + ) + parser.add_argument("--trace", default="/tmp/opencode-proxy-trace.jsonl") + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=7000) + parser.add_argument("--top-logprobs", type=int, default=5) + parser.add_argument("--request-timeout", type=float, default=600.0) + parser.add_argument( + "--max-tokens-cap", + type=int, + default=None, + help="Clamp max_tokens/max_completion_tokens on forwarded requests.", + ) + parser.add_argument( + "--disable-thinking", + action="store_true", + help="Inject chat_template_kwargs.enable_thinking=false (Qwen3/Qwen3.5).", + ) + parser.add_argument( + "--model-override", + default=None, + help="Rewrite the `model` field on every forwarded request.", + ) + args = parser.parse_args() + upstream_api_key = ( + args.upstream_api_key + or os.environ.get("OPENCODE_UPSTREAM_API_KEY") + or os.environ.get("UPSTREAM_API_KEY") + or "intercepted" + ) + + cfg = ProxyConfig( + upstream_url=args.upstream_url, + upstream_api_key=upstream_api_key, + trace_path=args.trace, + host=args.host, + port=args.port, + top_logprobs=args.top_logprobs, + 
request_timeout_s=args.request_timeout, + max_tokens_cap=args.max_tokens_cap, + disable_thinking=args.disable_thinking, + model_override=args.model_override, + ) + serve(cfg) + + +if __name__ == "__main__": + main() diff --git a/envs/coding_agent_env/server/Dockerfile b/envs/opencode_env/server/Dockerfile similarity index 91% rename from envs/coding_agent_env/server/Dockerfile rename to envs/opencode_env/server/Dockerfile index 97e880343..ad8319423 100644 --- a/envs/coding_agent_env/server/Dockerfile +++ b/envs/opencode_env/server/Dockerfile @@ -4,14 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # -# coding_agent_env Dockerfile — mirrors the standard OpenEnv multi-stage uv +# opencode_env Dockerfile — mirrors the standard OpenEnv multi-stage uv # build used by echo_env / repl_env / jupyter_agent. # # Build: -# docker build -t coding-agent-env . +# docker build -t opencode-env . # # Run: -# docker run -p 8000:8000 -e E2B_API_KEY=e2b_... coding-agent-env +# docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest FROM ${BASE_IMAGE} AS builder diff --git a/envs/coding_agent_env/server/__init__.py b/envs/opencode_env/server/__init__.py similarity index 79% rename from envs/coding_agent_env/server/__init__.py rename to envs/opencode_env/server/__init__.py index 2eac4fb05..56363edaa 100644 --- a/envs/coding_agent_env/server/__init__.py +++ b/envs/opencode_env/server/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-"""Server-side for the deployed coding_agent_env.""" +"""Server-side for the deployed opencode_env.""" diff --git a/envs/coding_agent_env/server/app.py b/envs/opencode_env/server/app.py similarity index 81% rename from envs/coding_agent_env/server/app.py rename to envs/opencode_env/server/app.py index df40b507f..0757ef229 100644 --- a/envs/coding_agent_env/server/app.py +++ b/envs/opencode_env/server/app.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""FastAPI app for the coding_agent_env MCP server. +"""FastAPI app for the opencode_env MCP server. Mirrors the standard OpenEnv pattern (echo_env / repl_env / jupyter_agent) plus the custom Gradio UI mounted at ``/web`` per the @@ -16,7 +16,7 @@ E2B_API_KEY=... uvicorn server.app:app --host 0.0.0.0 --port 8000 # Docker: - docker run -p 8000:8000 -e E2B_API_KEY=... coding-agent-env + docker run -p 8000:8000 -e E2B_API_KEY=... opencode-env # HF Space: deploys via the root ``Dockerfile``. @@ -58,13 +58,13 @@ def _load_env_file() -> None: from openenv.core.env_server.http_server import create_app from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation - from .gradio_ui import coding_agent_gradio_builder - from .coding_environment import CodingAgentEnvironment + from .gradio_ui import opencode_gradio_builder + from .opencode_environment import OpenCodeEnvironment except ImportError: # pragma: no cover from openenv.core.env_server.http_server import create_app from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation - from server.gradio_ui import coding_agent_gradio_builder # type: ignore - from server.coding_environment import CodingAgentEnvironment # type: ignore + from server.gradio_ui import opencode_gradio_builder # type: ignore + from server.opencode_environment import OpenCodeEnvironment # type: ignore # Always expose the Gradio UI at /web. 
Set ENABLE_WEB_INTERFACE=false to @@ -80,22 +80,22 @@ def _custom_gradio_builder( title, quick_start_md, ): - """Hand off to ``server.gradio_ui.coding_agent_gradio_builder``.""" - return coding_agent_gradio_builder( + """Hand off to ``server.gradio_ui.opencode_gradio_builder``.""" + return opencode_gradio_builder( web_manager, action_fields, metadata, is_chat_env, - title or "coding_agent_env", + title or "opencode_env", quick_start_md, ) app = create_app( - CodingAgentEnvironment, + OpenCodeEnvironment, CallToolAction, CallToolObservation, - env_name="coding_agent_env", + env_name="opencode_env", max_concurrent_envs=int(os.getenv("MAX_CONCURRENT_ENVS", "4")), gradio_builder=_custom_gradio_builder, ) diff --git a/envs/coding_agent_env/server/catalog.py b/envs/opencode_env/server/catalog.py similarity index 100% rename from envs/coding_agent_env/server/catalog.py rename to envs/opencode_env/server/catalog.py diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/opencode_env/server/gradio_ui.py similarity index 92% rename from envs/coding_agent_env/server/gradio_ui.py rename to envs/opencode_env/server/gradio_ui.py index ea9cdb81f..bb4340aef 100644 --- a/envs/coding_agent_env/server/gradio_ui.py +++ b/envs/opencode_env/server/gradio_ui.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Minimal Gradio UI for coding_agent_env. +"""Minimal Gradio UI for opencode_env. Mounts under the standard OpenEnv ``/web`` path via the ``gradio_builder=`` callback documented at @@ -19,7 +19,7 @@ agent_timeout_s, template). - Preset buttons for the ready-made example tasks. - Run button → result panel with reward, setup/verify per-command - results, file outputs, agent log tail, and the raw RolloutResult JSON. + results, file outputs, proxy/OpenCode log tails, and the raw RolloutResult JSON. 
""" from __future__ import annotations @@ -31,14 +31,14 @@ try: from .catalog import catalog_summary, ENDPOINT_KINDS, resolve_endpoint - from .coding_environment import CodingAgentEnvironment + from .opencode_environment import OpenCodeEnvironment except ImportError: # pragma: no cover from server.catalog import ( # type: ignore catalog_summary, ENDPOINT_KINDS, resolve_endpoint, ) - from server.coding_environment import CodingAgentEnvironment # type: ignore + from server.opencode_environment import OpenCodeEnvironment # type: ignore # ──────────────────────────────────────────────────────────────────────────── @@ -156,7 +156,6 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]: def _live_status_md( - agent: str, endpoint_kind: str, model: str, mode: str, @@ -166,7 +165,7 @@ def _live_status_md( """Render a live phase log (latest at the bottom) with elapsed timestamps.""" head = ( f"### running… `elapsed={elapsed_s:.1f}s`\n\n" - f"_agent=`{agent}` endpoint=`{endpoint_kind}` model=`{model}` mode=`{mode}`_\n\n" + f"_endpoint=`{endpoint_kind}` model=`{model}` mode=`{mode}`_\n\n" ) if not lines: body = "_(waiting for first phase update…)_" @@ -210,7 +209,7 @@ def _catalog_banner() -> str: # ──────────────────────────────────────────────────────────────────────────── -def coding_agent_gradio_builder( +def opencode_gradio_builder( web_manager, # noqa: ARG001 (unused: we instantiate the env directly) action_fields, # noqa: ARG001 metadata, # noqa: ARG001 @@ -218,17 +217,16 @@ def coding_agent_gradio_builder( title, quick_start_md, # noqa: ARG001 ) -> gr.Blocks: - """Build the coding_agent_env console. + """Build the opencode_env console. Compatible with ``create_app(..., gradio_builder=...)``. 
We ignore - ``web_manager`` and instantiate :class:`CodingAgentEnvironment` ourselves - inside the run handler — coding_agent_env's run_rollout doesn't need any + ``web_manager`` and instantiate :class:`OpenCodeEnvironment` ourselves + inside the run handler — opencode_env's run_rollout doesn't need any per-session state beyond the env's own bookkeeping, and instantiating is cheap (no sandbox is created until the tool fires). """ def run( - agent: str, endpoint: str, model: str, base_url: str, @@ -273,7 +271,7 @@ def run( else: dt = None - env = CodingAgentEnvironment() + env = OpenCodeEnvironment() # The worker fires _run_rollout_impl in a background thread and # streams progress messages into a queue; this generator polls the @@ -287,7 +285,6 @@ def _cb(msg: str) -> None: def _worker(): try: payload = env._run_rollout_impl( - agent=agent, base_url=resolved.base_url, api_key=resolved.api_key, model=resolved.model, @@ -318,7 +315,7 @@ def _worker(): # First yield: announce we've started. Empty result panels. yield ( - f"### running…\n\n_agent=`{agent}` endpoint=`{resolved.kind}` model=`{resolved.model}` mode=`{mode}`_", + f"### running…\n\n_endpoint=`{resolved.kind}` model=`{resolved.model}` mode=`{mode}`_", [], [], "", @@ -343,7 +340,6 @@ def _worker(): # Render the live status pane. 
elapsed = time.time() - t_start md = _live_status_md( - agent, resolved.kind, resolved.model, mode, @@ -369,7 +365,6 @@ def _worker(): [], "", _live_status_md( - agent, resolved.kind, resolved.model, mode, @@ -389,13 +384,13 @@ def _worker(): ( "### live phase log\n\n" + _live_status_md( - agent, resolved.kind, resolved.model, mode, time.time() - t_start, status_lines, ) + + f"\n\n### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:3000]}\n```" + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```" ), result, @@ -405,23 +400,17 @@ def apply_preset(name: str) -> tuple[str, str, str]: p = PRESETS.get(name) or {"instruction": "", "setup": "", "verify": ""} return p["instruction"], p["setup"], p["verify"] - with gr.Blocks(title=title or "coding_agent_env") as app: - gr.Markdown(f"# {title or 'coding_agent_env'}") + with gr.Blocks(title=title or "opencode_env") as app: + gr.Markdown(f"# {title or 'opencode_env'}") gr.Markdown( - "Run one coding-agent rollout in an E2B sandbox against your chosen " - "LLM endpoint. Pick an agent + endpoint, write the task as " + "Run one OpenCode rollout in an E2B sandbox against your chosen " + "LLM endpoint. Pick an endpoint, write the task as " "`(instruction, setup, verify)`, and inspect reward + logs." 
) gr.Markdown(_catalog_banner()) with gr.Row(): - agent = gr.Dropdown( - choices=["opencode", "pi"], - value="opencode", - label="Agent", - scale=1, - ) endpoint = gr.Dropdown( choices=list(ENDPOINT_KINDS), value="openai", @@ -447,19 +436,19 @@ def apply_preset(name: str) -> tuple[str, str, str]: ) instruction = gr.Textbox( - label="Instruction (the prompt the selected agent runs)", + label="Instruction (the prompt OpenCode runs)", lines=4, value=PRESETS["binary_search"]["instruction"], ) with gr.Row(): setup_text = gr.Textbox( - label="Setup (one bash command per line — runs BEFORE the agent)", + label="Setup (one bash command per line — runs BEFORE OpenCode)", lines=5, value=PRESETS["binary_search"]["setup"], ) verify_text = gr.Textbox( - label="Verify (one bash command per line — runs AFTER the agent)", + label="Verify (one bash command per line — runs AFTER OpenCode)", lines=5, value=PRESETS["binary_search"]["verify"], ) @@ -472,8 +461,8 @@ def apply_preset(name: str) -> tuple[str, str, str]: with gr.Accordion("Tunables", open=False): with gr.Row(): mode = gr.Dropdown( - choices=["black_box", "interception_gate"], - value="black_box", + choices=["transparent_proxy", "black_box"], + value="transparent_proxy", label="mode", ) disable_thinking = gr.Dropdown( @@ -531,7 +520,6 @@ def apply_preset(name: str) -> tuple[str, str, str]: run_btn.click( fn=run, inputs=[ - agent, endpoint, model, base_url, diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/opencode_env/server/opencode_environment.py similarity index 76% rename from envs/coding_agent_env/server/coding_environment.py rename to envs/opencode_env/server/opencode_environment.py index 9174666e7..52ae27b4d 100644 --- a/envs/coding_agent_env/server/coding_environment.py +++ b/envs/opencode_env/server/opencode_environment.py @@ -4,19 +4,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-"""Coding-agent MCP environment. +"""OpenCode MCP environment. Single MCP tool ``run_rollout`` with a uniform task shape: - - ``instruction`` — prompt for the selected agent - - ``setup`` — bash commands run BEFORE the agent (in the sandbox) - - ``verify`` — bash commands run AFTER the agent + - ``instruction`` — prompt for OpenCode + - ``setup`` — bash commands run BEFORE OpenCode (in the sandbox) + - ``verify`` — bash commands run AFTER OpenCode Reward = ``passed_verify_commands / total`` unless a verify command writes a float to ``/home/user/logs/verifier/reward.txt`` (override). Returns a JSON-serialized :class:`RolloutResult` with reward, -setup/verify command results, and file outputs. +setup/verify command results, transparent-proxy logprob turns, and file outputs. """ from __future__ import annotations @@ -29,7 +29,6 @@ from uuid import uuid4 from fastmcp import FastMCP -from pydantic import BaseModel, Field try: from openenv.core.env_server.mcp_environment import MCPEnvironment @@ -42,7 +41,7 @@ from server.catalog import ENDPOINT_KINDS, resolve_endpoint # type: ignore -# One rollout (sandbox cold start + harness install + agent run + +# One rollout (sandbox cold start + OpenCode install + agent run + # verifier) typically takes 30-180s; can spike to ~600s under load. Override # OpenEnv's 30s MCP-tool default so the server doesn't cut us off. 
_RUN_ROLLOUT_TIMEOUT_S = 900.0 @@ -54,29 +53,12 @@ _log = logging.getLogger(__name__) REWARD_FILE = f"{HOME}/logs/verifier/reward.txt" +PROXY_LOG = f"{HOME}/logs/agent/proxy.log" AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl" VERIFY_TIMEOUT_S = 120 -_SUPPORTED_AGENTS = ("opencode", "pi") -_AGENT_LOG_BY_AGENT: dict[str, str] = { - "opencode": f"{HOME}/logs/agent/opencode.jsonl", - "pi": f"{HOME}/logs/agent/pi.txt", -} -class _GenericAgentConfig(BaseModel): - """Minimal config shape for CLIAgentSessionFactory-backed agents.""" - - base_url: str - api_key: str - model: str - agent_timeout_s: float = 600.0 - sandbox_home: str = HOME - provider: str | None = None - thinking: str | None = "off" - extra_env: dict[str, str] = Field(default_factory=dict) - - -class CodingAgentEnvironment(MCPEnvironment): +class OpenCodeEnvironment(MCPEnvironment): """Per-session environment exposing a single ``run_rollout`` MCP tool.""" SUPPORTS_CONCURRENT_SESSIONS = True @@ -85,23 +67,22 @@ def __init__(self) -> None: # Lazy imports so module import stays cheap and so tests can patch. 
try: from ..models import ( - CodingAgentState, + OpenCodeState, CommandResult, RolloutResult, + RolloutTurn, ) except ImportError: # pragma: no cover from models import ( # type: ignore - CodingAgentState, + OpenCodeState, CommandResult, RolloutResult, + RolloutTurn, ) - from openenv.core.harness.agents import get_agent_spec - from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory - - from coding_agent_env.config import CodingAgentConfig - from coding_agent_env.harness import CodingAgentSessionFactory - from coding_agent_env.task import CodingAgentTask + from opencode_env.config import OpenCodeConfig + from opencode_env.harness import OpenCodeSessionFactory + from opencode_env.task import OpenCodeTask try: from openenv.core.harness.sandbox import E2BSandboxBackend @@ -110,13 +91,12 @@ def __init__(self) -> None: self._CommandResult = CommandResult self._RolloutResult = RolloutResult - self._CodingAgentState = CodingAgentState - self._CodingAgentConfig = CodingAgentConfig - self._CodingAgentSessionFactory = CodingAgentSessionFactory - self._CodingAgentTask = CodingAgentTask + self._RolloutTurn = RolloutTurn + self._OpenCodeState = OpenCodeState + self._OpenCodeConfig = OpenCodeConfig + self._OpenCodeSessionFactory = OpenCodeSessionFactory + self._OpenCodeTask = OpenCodeTask self._E2BSandboxBackend = E2BSandboxBackend - self._CLIAgentSessionFactory = CLIAgentSessionFactory - self._get_agent_spec = get_agent_spec # Don't raise on missing E2B_API_KEY here — OpenEnv's web-interface # layer instantiates the env at import time for schema introspection, @@ -124,14 +104,12 @@ def __init__(self) -> None: # just exploring. The real check happens lazily in # ``_run_rollout_impl`` (any rollout without creds fails fast there # with a clear error in the result payload). 
- self._state = self._CodingAgentState(episode_id=str(uuid4())) + self._state = self._OpenCodeState(episode_id=str(uuid4())) - mcp = FastMCP("coding_agent_env") + mcp = FastMCP("opencode_env") @mcp.tool def run_rollout( - # Agent + endpoint. - agent: str = "opencode", # Endpoint — either a shorthand (resolved from env vars + catalog # defaults) OR explicit base_url+api_key+model. Explicit fields # always win over the catalog. @@ -145,24 +123,21 @@ def run_rollout( verify: Optional[list[str]] = None, # Bookkeeping / tunables task_id: str = "", - mode: str = "black_box", + mode: str = "transparent_proxy", disable_thinking: Optional[bool] = None, max_tokens_cap: int = 4096, top_logprobs: int = 5, agent_timeout_s: float = 600.0, template: str = "", ) -> str: - """Run one coding-agent rollout end-to-end. - - ``agent`` selects the harness CLI to run inside the sandbox. - Currently supported: ``"opencode"``, ``"pi"``. + """Run one OpenCode rollout end-to-end. ``endpoint`` is the shorthand selector (one of ``"vllm"`` / ``"openai"`` / ``"hf_router"``) — the server resolves base_url / api_key / model from env vars + catalog defaults. Pass any of those explicitly to override. - See ``coding_agent_env.client.CodingAgentEnv.run_rollout`` for full + See ``opencode_env.client.OpenCodeEnv.run_rollout`` for full arg docs. Returns a JSON-serialized ``RolloutResult``. """ # Resolve via catalog when shorthand is provided. 
@@ -179,11 +154,6 @@ def run_rollout( if disable_thinking_resolved is None: disable_thinking_resolved = False - agent = (agent or "opencode").strip() - if agent not in _SUPPORTED_AGENTS: - raise ValueError( - f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}" - ) if not (base_url and api_key and model): raise ValueError( "must provide either ``endpoint`` (one of " @@ -193,7 +163,6 @@ def run_rollout( raise ValueError("instruction is required") return self._run_rollout_impl( - agent=agent, base_url=base_url, api_key=api_key, model=model, @@ -219,14 +188,14 @@ def reset( episode_id: Optional[str] = None, **_: Any, ) -> Observation: - self._state = self._CodingAgentState(episode_id=episode_id or str(uuid4())) + self._state = self._OpenCodeState(episode_id=episode_id or str(uuid4())) return Observation( done=False, reward=None, metadata={ "status": "ready", "message": ( - "coding_agent_env ready. Call run_rollout(agent=..., ...) with a task." + "opencode_env ready. Call run_rollout(...) with a task." ), }, ) @@ -277,7 +246,6 @@ def state(self) -> Any: def _run_rollout_impl( self, *, - agent: str, base_url: str, api_key: str, model: str, @@ -318,11 +286,9 @@ def _emit(msg: str) -> None: _emit("error: E2B_API_KEY missing on server") return result.model_dump_json() - _emit(f"resolving config (agent={agent}, model={model}, mode={mode})") + _emit(f"resolving config (model={model}, mode={mode})") config = self._build_agent_config( - agent=agent, - mode=mode, base_url=base_url, api_key=api_key, model=model, @@ -341,22 +307,18 @@ def _emit(msg: str) -> None: # ``set -e`` makes the script abort on the first failing command. 
setup_shell = "set -e\n" + "\n".join(setup) - rollout_task = self._CodingAgentTask( + rollout_task = self._OpenCodeTask( instruction=instruction, setup_shell=setup_shell, - metadata={"task_id": task_id, "agent": agent}, + metadata={"task_id": task_id}, ) session = None try: factory = self._build_session_factory( - agent=agent, config=config, mode=mode, template=template, - disable_thinking=disable_thinking, - top_logprobs=top_logprobs, - max_tokens_cap=max_tokens_cap, ) _emit( f"creating E2B sandbox (template={template or 'default'}) — " @@ -384,7 +346,7 @@ def _emit(msg: str) -> None: # Block until the agent is done. if result.error is None: _emit( - f"agent running — {agent} CLI in sandbox " + "agent running — OpenCode CLI in sandbox " f"(timeout {int(agent_timeout_s)}s)" ) try: @@ -414,19 +376,23 @@ def _emit(msg: str) -> None: else: result.reward = None - # Collect filesystem + agent log tail. + # Collect filesystem + logs + transparent-proxy trace. _emit("collecting workdir files + logs") result.files, result.files_extra = self._collect_files(session.sandbox) - result.agent_log_tail = self._collect_agent_log_tail(session, agent) + result.proxy_turns = self._collect_proxy_turns(session) + result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:] + result.agent_log_tail = self._collect_agent_log_tail(session) _emit( f"collected: {len(result.files)} file(s), " + f"{len(result.proxy_turns)} proxy turn(s), " f"reward={'%.2f' % result.reward if result.reward is not None else 'n/a'}" ) except Exception as exc: # noqa: BLE001 result.error = f"{type(exc).__name__}: {exc}" _emit(f"ERROR: {result.error}") if session is not None: - result.agent_log_tail = self._collect_agent_log_tail(session, agent) + result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:] + result.agent_log_tail = self._collect_agent_log_tail(session) finally: if session is not None: try: @@ -449,8 +415,6 @@ def _emit(msg: str) -> None: def _build_agent_config( self, 
*, - agent: str, - mode: str, base_url: str, api_key: str, model: str, @@ -459,44 +423,26 @@ def _build_agent_config( top_logprobs: int, max_tokens_cap: int, ) -> Any: - if agent == "opencode": - if top_logprobs: - _log.warning( - "top_logprobs=%d is not supported for agent='opencode' " - "and will have no effect. Use interception_gate mode for " - "logprob capture.", - top_logprobs, - ) - return self._CodingAgentConfig( - provider="openai_compatible", - base_url=base_url.rstrip("/"), - api_key=api_key, - model=model, - agent_timeout_s=agent_timeout_s, - disable_thinking=disable_thinking, - max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None, - ) - - provider = self._infer_pi_provider(base_url) - return _GenericAgentConfig( + cap = max_tokens_cap if max_tokens_cap > 0 else None + return self._OpenCodeConfig( + provider="openai_compatible", base_url=base_url.rstrip("/"), api_key=api_key, model=model, agent_timeout_s=agent_timeout_s, - provider=provider, - thinking="off" if disable_thinking else None, + disable_thinking=disable_thinking, + max_tokens_cap=cap, + proxy_disable_thinking=disable_thinking, + proxy_top_logprobs=max(0, int(top_logprobs)), + proxy_max_tokens_cap=cap, ) def _build_session_factory( self, *, - agent: str, config: Any, mode: str, template: str, - disable_thinking: bool, - top_logprobs: int, - max_tokens_cap: int, ) -> Any: if self._E2BSandboxBackend is None: raise RuntimeError( @@ -508,35 +454,14 @@ def _build_session_factory( backend_kwargs["template"] = template backend = self._E2BSandboxBackend(**backend_kwargs) - if agent == "opencode": - return self._CodingAgentSessionFactory( - config=config, - sandbox_backend=backend, - mode=mode, - verifier=None, - ) - - spec = self._get_agent_spec(agent) - return self._CLIAgentSessionFactory( - spec=spec, + return self._OpenCodeSessionFactory( config=config, sandbox_backend=backend, mode=mode, verifier=None, ) - @staticmethod - def _infer_pi_provider(base_url: str) -> str: - url = (base_url or 
"").lower() - if "router.huggingface.co" in url: - return "huggingface" - if "anthropic" in url: - return "anthropic" - if "googleapis.com" in url or "generativelanguage" in url: - return "gemini" - return "openai" - - def _collect_agent_log_tail(self, session: Any, agent: str) -> str: + def _collect_agent_log_tail(self, session: Any) -> str: if hasattr(session, "collect_artifacts"): try: artifacts = session.collect_artifacts() @@ -547,8 +472,7 @@ def _collect_agent_log_tail(self, session: Any, agent: str) -> str: return json.dumps(val, default=str)[-2000:] except Exception: pass - path = _AGENT_LOG_BY_AGENT.get(agent, AGENT_LOG) - return self._safe_read(session.sandbox, path)[-2000:] + return self._safe_read(session.sandbox, AGENT_LOG)[-2000:] # ── Helpers ──────────────────────────────────────────────────────────── @@ -597,6 +521,27 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]: extras.append(path) return files, extras + def _collect_proxy_turns(self, session: Any) -> list[Any]: + turns: list[Any] = [] + if not hasattr(session, "fetch_proxy_trace"): + return turns + for rec in session.fetch_proxy_trace(): + response = rec.get("response") or {} + turns.append( + self._RolloutTurn( + turn=int(rec.get("turn") or 0), + finish_reason=rec.get("finish_reason"), + completion_tokens=list(rec.get("completion_tokens") or []), + completion_token_ids=list(rec.get("completion_token_ids") or []), + per_token_logps=list(rec.get("per_token_logps") or []), + latency_s=float(rec.get("latency_s") or 0.0), + timestamp=float(rec.get("timestamp") or 0.0), + upstream_status=response.get("upstream_status"), + upstream_error=response.get("upstream_error"), + ) + ) + return turns + @staticmethod def _safe_read(sandbox: Any, path: str) -> str: try: diff --git a/envs/coding_agent_env/task.py b/envs/opencode_env/task.py similarity index 73% rename from envs/coding_agent_env/task.py rename to envs/opencode_env/task.py index 8633eb7aa..f9d208d84 100644 --- 
a/envs/coding_agent_env/task.py +++ b/envs/opencode_env/task.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Task payload accepted by :class:`CodingAgentSessionFactory`.""" +"""Task payload accepted by :class:`OpenCodeSessionFactory`.""" from __future__ import annotations @@ -13,8 +13,8 @@ from pydantic import BaseModel, Field -class CodingAgentTask(BaseModel): - """One task for a coding-agent rollout. +class OpenCodeTask(BaseModel): + """One task for an OpenCode rollout. The primitive only needs ``instruction`` (the prompt handed to ``opencode run``). Callers may attach ``setup_shell`` (run once inside the sandbox @@ -29,8 +29,8 @@ class CodingAgentTask(BaseModel): metadata: dict[str, Any] = Field(default_factory=dict) @classmethod - def coerce(cls, value: Any) -> "CodingAgentTask": - """Accept a bare string, a dict, or an existing ``CodingAgentTask``.""" + def coerce(cls, value: Any) -> "OpenCodeTask": + """Accept a bare string, a dict, or an existing ``OpenCodeTask``.""" if isinstance(value, cls): return value if isinstance(value, str): @@ -38,6 +38,6 @@ def coerce(cls, value: Any) -> "CodingAgentTask": if isinstance(value, dict): return cls(**value) raise TypeError( - f"Cannot coerce {type(value).__name__} to CodingAgentTask; " - "pass a str, dict, or CodingAgentTask." + f"Cannot coerce {type(value).__name__} to OpenCodeTask; " + "pass a str, dict, or OpenCodeTask." ) diff --git a/envs/coding_agent_env/uv.lock b/envs/opencode_env/uv.lock similarity index 99% rename from envs/coding_agent_env/uv.lock rename to envs/opencode_env/uv.lock index aa35531cc..aa802ee9d 100644 --- a/envs/coding_agent_env/uv.lock +++ b/envs/opencode_env/uv.lock @@ -1664,7 +1664,7 @@ wheels = [ ] [[package]] -name = "openenv-coding-agent-env" +name = "openenv-opencode-env" version = "0.1.0" source = { editable = "." 
} dependencies = [ diff --git a/examples/coding_agent_env_simple.py b/examples/opencode_env_simple.py similarity index 80% rename from examples/coding_agent_env_simple.py rename to examples/opencode_env_simple.py index caf81bad8..660421fdd 100644 --- a/examples/coding_agent_env_simple.py +++ b/examples/opencode_env_simple.py @@ -5,16 +5,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""End-to-end coding_agent_env example: write binary_search.py and verify it. +"""End-to-end opencode_env example: write binary_search.py and verify it. -Hits the deployed HF Space ``AdithyaSK/coding-agent-env`` (override via -``CODING_AGENT_ENV_SPACE`` env var to point at your own Space or a local +Hits the deployed HF Space ``AdithyaSK/opencode-env`` (override via +``OPENCODE_ENV_SPACE`` env var to point at your own Space or a local container). The single MCP tool ``run_rollout`` does: - 1. Spawns a fresh E2B sandbox (using the prebaked ``coding-agent-rl`` + 1. Spawns a fresh E2B sandbox (using the prebaked ``opencode-rl`` template — falls back to a cold install if the template isn't present in your E2B account). - 2. Runs the selected harness CLI with the instruction. + 2. Runs OpenCode with the instruction. 3. Executes the verify bash commands; reward = passed / total. 4. Returns a ``RolloutResult`` with reward + produced file contents. 
@@ -26,7 +26,7 @@ Usage:: - PYTHONPATH=src:envs uv run python examples/coding_agent_env_simple.py + PYTHONPATH=src:envs uv run python examples/opencode_env_simple.py Expected output (~20s with the prebaked template):: @@ -45,13 +45,13 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "envs")) -from coding_agent_env import CodingAgentEnv # noqa: E402 -from coding_agent_env.client import _extract_text # noqa: E402 -from coding_agent_env.models import RolloutResult # noqa: E402 +from opencode_env import OpenCodeEnv # noqa: E402 +from opencode_env.client import _extract_text # noqa: E402 +from opencode_env.models import RolloutResult # noqa: E402 SPACE = os.environ.get( - "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space" + "OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space" ) INSTRUCTION = ( @@ -89,7 +89,7 @@ async def main() -> int: print(f"Instruction: {INSTRUCTION.splitlines()[0]} ...") print() - async with CodingAgentEnv(base_url=SPACE) as env: + async with OpenCodeEnv(base_url=SPACE) as env: await env.reset() raw = await env.call_tool( "run_rollout", @@ -99,7 +99,7 @@ async def main() -> int: instruction=INSTRUCTION, setup=[], # no setup commands verify=VERIFY, - template="coding-agent-rl", # prebaked E2B template + template="opencode-rl", # prebaked E2B template task_id="binary_search_simple", agent_timeout_s=600, ) diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py index 7338fc323..f174b6733 100644 --- a/tests/core/test_cli_agent_driver.py +++ b/tests/core/test_cli_agent_driver.py @@ -29,7 +29,7 @@ from openenv.core.harness.sandbox.base import ExecResult, SandboxHandle -# Fake sandbox infrastructure (mirrors test_coding_agent_env.py pattern) +# Fake sandbox infrastructure (mirrors test_opencode_env.py pattern) @dataclass diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_opencode_env.py 
similarity index 71% rename from tests/envs/test_coding_agent_env.py rename to tests/envs/test_opencode_env.py index fa3dcae79..701d562e9 100644 --- a/tests/envs/test_coding_agent_env.py +++ b/tests/envs/test_opencode_env.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Smoke tests for ``coding_agent_env``. +"""Smoke tests for ``opencode_env``. The default suite runs in CI without any external dependencies (no E2B, no LLM, no network). It covers: @@ -13,7 +13,7 @@ - The endpoint catalog (`vllm` / `openai` / `hf_router`) resolves explicit + env-var + default-value precedence correctly. - Pydantic models accept their expected shapes. - - The `CodingAgentTask` coercion helper handles str / dict / `CodingAgentTask`. + - The `OpenCodeTask` coercion helper handles str / dict / `OpenCodeTask`. A second class is marked ``@pytest.mark.integration`` and exercises the deployed Space end-to-end. It only runs when ``E2B_API_KEY`` and at least @@ -44,17 +44,18 @@ def test_public_api_imports() -> None: """Top-level package re-exports the documented surface.""" - from coding_agent_env import ( # noqa: F401 - CodingAgentConfig, - CodingAgentEnv, - CodingAgentSession, - CodingAgentSessionFactory, - CodingAgentState, - CodingAgentTask, + from opencode_env import ( # noqa: F401 + OpenCodeConfig, + OpenCodeEnv, + OpenCodeSession, + OpenCodeSessionFactory, + OpenCodeState, + OpenCodeTask, CommandResult, E2BSandboxBackend, Provider, RolloutResult, + RolloutTurn, SandboxBackend, SandboxHandle, ) @@ -62,14 +63,14 @@ def test_public_api_imports() -> None: def test_server_modules_import() -> None: """Server-side modules (FastAPI app, MCP env, catalog) import cleanly.""" - from coding_agent_env.server.app import app # noqa: F401 - from coding_agent_env.server.catalog import ( # noqa: F401 + from opencode_env.server.app import app # noqa: F401 + from opencode_env.server.catalog import ( # 
noqa: F401 catalog_summary, ENDPOINT_KINDS, resolve_endpoint, ) - from coding_agent_env.server.coding_environment import ( # noqa: F401 - CodingAgentEnvironment, + from opencode_env.server.opencode_environment import ( # noqa: F401 + OpenCodeEnvironment, ) @@ -79,14 +80,14 @@ def test_server_modules_import() -> None: def test_catalog_kinds() -> None: - from coding_agent_env.server.catalog import ENDPOINT_KINDS + from opencode_env.server.catalog import ENDPOINT_KINDS assert ENDPOINT_KINDS == ("vllm", "openai", "hf_router") def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) -> None: """Explicit args beat env vars beat catalog defaults.""" - from coding_agent_env.server.catalog import resolve_endpoint + from opencode_env.server.catalog import resolve_endpoint monkeypatch.setenv("OPENAI_API_KEY", "from-env") r = resolve_endpoint( @@ -105,7 +106,7 @@ def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) -> def test_resolve_endpoint_env_var_used_when_arg_missing( monkeypatch: pytest.MonkeyPatch, ) -> None: - from coding_agent_env.server.catalog import resolve_endpoint + from opencode_env.server.catalog import resolve_endpoint monkeypatch.setenv("OPENAI_API_KEY", "key-from-env") monkeypatch.setenv("OPENAI_MODEL", "gpt-4o") @@ -119,7 +120,7 @@ def test_resolve_endpoint_normalizes_v1_suffix( monkeypatch: pytest.MonkeyPatch, ) -> None: """Base URL gets ``/v1`` appended if missing, otherwise left alone.""" - from coding_agent_env.server.catalog import resolve_endpoint + from opencode_env.server.catalog import resolve_endpoint monkeypatch.setenv("VLLM_URL", "https://my-vllm.example/") monkeypatch.setenv("VLLM_API_KEY", "x") @@ -132,7 +133,7 @@ def test_resolve_endpoint_normalizes_v1_suffix( def test_resolve_endpoint_unknown_kind_raises() -> None: - from coding_agent_env.server.catalog import resolve_endpoint + from opencode_env.server.catalog import resolve_endpoint with pytest.raises(ValueError, match="unknown endpoint kind"): 
resolve_endpoint("bogus", base_url="x", api_key="y", model="z") @@ -141,7 +142,7 @@ def test_resolve_endpoint_unknown_kind_raises() -> None: def test_resolve_endpoint_missing_creds_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: - from coding_agent_env.server.catalog import resolve_endpoint + from opencode_env.server.catalog import resolve_endpoint # Strip any inherited env vars. for k in ("OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_MODEL"): @@ -151,7 +152,7 @@ def test_resolve_endpoint_missing_creds_raises( def test_catalog_summary_shape() -> None: - from coding_agent_env.server.catalog import catalog_summary + from opencode_env.server.catalog import catalog_summary summary = catalog_summary() assert {entry["kind"] for entry in summary} == {"vllm", "openai", "hf_router"} @@ -165,12 +166,10 @@ def test_catalog_summary_shape() -> None: def test_build_agent_config_opencode() -> None: - from coding_agent_env.server.coding_environment import CodingAgentEnvironment + from opencode_env.server.opencode_environment import OpenCodeEnvironment - env = CodingAgentEnvironment() + env = OpenCodeEnvironment() cfg = env._build_agent_config( - agent="opencode", - mode="black_box", base_url="https://api.openai.com/v1", api_key="sk-test", model="gpt-4o-mini", @@ -179,14 +178,15 @@ def test_build_agent_config_opencode() -> None: top_logprobs=7, max_tokens_cap=2048, ) - assert isinstance(cfg, env._CodingAgentConfig) + assert isinstance(cfg, env._OpenCodeConfig) assert cfg.model == "gpt-4o-mini" assert cfg.agent_timeout_s == 123.0 assert cfg.max_tokens_cap == 2048 + assert cfg.proxy_max_tokens_cap == 2048 + assert cfg.proxy_top_logprobs == 7 + assert cfg.proxy_disable_thinking is True cfg_4096 = env._build_agent_config( - agent="opencode", - mode="black_box", base_url="https://api.openai.com/v1", api_key="sk-test", model="gpt-4o-mini", @@ -198,8 +198,6 @@ def test_build_agent_config_opencode() -> None: assert cfg_4096.max_tokens_cap == 4096 cfg_uncapped = env._build_agent_config( 
- agent="opencode", - mode="black_box", base_url="https://api.openai.com/v1", api_key="sk-test", model="gpt-4o-mini", @@ -211,50 +209,15 @@ def test_build_agent_config_opencode() -> None: assert cfg_uncapped.max_tokens_cap is None -def test_build_agent_config_pi() -> None: - from coding_agent_env.server.coding_environment import CodingAgentEnvironment - - env = CodingAgentEnvironment() - cfg = env._build_agent_config( - agent="pi", - mode="black_box", - base_url="https://router.huggingface.co/v1", - api_key="hf_xxx", - model="zai-org/GLM-5.1", - agent_timeout_s=180.0, - disable_thinking=True, - top_logprobs=5, - max_tokens_cap=4096, - ) - assert cfg.provider == "huggingface" - assert cfg.thinking == "off" - assert cfg.model == "zai-org/GLM-5.1" - - cfg_gate = env._build_agent_config( - agent="pi", - mode="interception_gate", - base_url="https://router.huggingface.co/v1", - api_key="hf_xxx", - model="zai-org/GLM-5.1", - agent_timeout_s=180.0, - disable_thinking=False, - top_logprobs=5, - max_tokens_cap=4096, - ) - assert cfg_gate.provider == "huggingface" - - def test_build_session_factory_requires_e2b_dependency() -> None: - from coding_agent_env.server.coding_environment import CodingAgentEnvironment + from opencode_env.server.opencode_environment import OpenCodeEnvironment - env = CodingAgentEnvironment() + env = OpenCodeEnvironment() env._E2BSandboxBackend = None cfg = env._build_agent_config( - agent="pi", - mode="black_box", - base_url="https://router.huggingface.co/v1", - api_key="hf_xxx", - model="zai-org/GLM-5.1", + base_url="https://api.openai.com/v1", + api_key="sk-test", + model="gpt-4o-mini", agent_timeout_s=180.0, disable_thinking=False, top_logprobs=5, @@ -263,13 +226,9 @@ def test_build_session_factory_requires_e2b_dependency() -> None: with pytest.raises(RuntimeError, match="E2BSandboxBackend unavailable"): env._build_session_factory( - agent="pi", config=cfg, mode="black_box", template="", - disable_thinking=False, - top_logprobs=5, - 
max_tokens_cap=4096, ) @@ -279,7 +238,7 @@ def test_build_session_factory_requires_e2b_dependency() -> None: def test_rollout_result_serializes_round_trip() -> None: - from coding_agent_env import CommandResult, RolloutResult + from opencode_env import CommandResult, RolloutResult, RolloutTurn r = RolloutResult( task_id="t1", @@ -290,45 +249,54 @@ def test_rollout_result_serializes_round_trip() -> None: mode="black_box", setup_results=[CommandResult(cmd="pip install pandas", exit_code=0)], verify_results=[CommandResult(cmd="pytest", exit_code=1, stderr="boom")], + proxy_turns=[ + RolloutTurn( + turn=1, + completion_tokens=["ok"], + completion_token_ids=[123], + per_token_logps=[-0.1], + ) + ], files={"/home/user/workdir/x.py": "print('x')"}, ) blob = r.model_dump_json() rebuilt = RolloutResult.model_validate_json(blob) assert rebuilt.reward == 0.75 assert rebuilt.verify_results[0].exit_code == 1 + assert rebuilt.proxy_turns[0].per_token_logps == [-0.1] -def test_coding_agent_task_coerce_str() -> None: - from coding_agent_env import CodingAgentTask +def test_opencode_task_coerce_str() -> None: + from opencode_env import OpenCodeTask - t = CodingAgentTask.coerce("write fizzbuzz.py") + t = OpenCodeTask.coerce("write fizzbuzz.py") assert t.instruction == "write fizzbuzz.py" assert t.setup_shell is None assert t.upload_files == {} -def test_coding_agent_task_coerce_dict() -> None: - from coding_agent_env import CodingAgentTask +def test_opencode_task_coerce_dict() -> None: + from opencode_env import OpenCodeTask - t = CodingAgentTask.coerce( + t = OpenCodeTask.coerce( {"instruction": "x", "setup_shell": "pip install pandas"} ) assert t.instruction == "x" assert t.setup_shell == "pip install pandas" -def test_coding_agent_task_coerce_existing_passthrough() -> None: - from coding_agent_env import CodingAgentTask +def test_opencode_task_coerce_existing_passthrough() -> None: + from opencode_env import OpenCodeTask - src = CodingAgentTask(instruction="y") - assert 
CodingAgentTask.coerce(src) is src + src = OpenCodeTask(instruction="y") + assert OpenCodeTask.coerce(src) is src -def test_coding_agent_task_coerce_rejects_unknown_type() -> None: - from coding_agent_env import CodingAgentTask +def test_opencode_task_coerce_rejects_unknown_type() -> None: + from opencode_env import OpenCodeTask with pytest.raises(TypeError, match="Cannot coerce"): - CodingAgentTask.coerce(42) # type: ignore[arg-type] + OpenCodeTask.coerce(42) # type: ignore[arg-type] # --------------------------------------------------------------------------- @@ -354,16 +322,16 @@ def test_run_rollout_e2e_via_deployed_space() -> None: import asyncio - from coding_agent_env import CodingAgentEnv - from coding_agent_env.client import _extract_text - from coding_agent_env.models import RolloutResult + from opencode_env import OpenCodeEnv + from opencode_env.client import _extract_text + from opencode_env.models import RolloutResult SPACE = os.environ.get( - "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space" + "OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space" ) async def _go() -> RolloutResult: - async with CodingAgentEnv(base_url=SPACE) as env: + async with OpenCodeEnv(base_url=SPACE) as env: await env.reset() raw = await env.call_tool( "run_rollout", @@ -382,7 +350,7 @@ async def _go() -> RolloutResult: "import binary_search; " "assert binary_search.binary_search([1,2,3,4,5], 3) == 2; print('OK')\"", ], - template="coding-agent-rl", + template="opencode-rl", agent_timeout_s=600, ) return RolloutResult.model_validate_json(_extract_text(raw))