From 57f9b70665e3f8d928a9fe15008c2d46b0dffdc5 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Tue, 12 May 2026 22:13:26 +0530
Subject: [PATCH 01/35] refactor: core sandbox infra to use
 openenv.core.harness.sandbox

---
 envs/opencode_env/__init__.py                 |  7 ++-
 envs/opencode_env/harness.py                  |  7 ++-
 envs/opencode_env/sandbox/__init__.py         | 51 +++----------------
 envs/opencode_env/sandbox/build_template.py   | 10 ++--
 src/openenv/core/harness/sandbox/__init__.py  | 31 +++++++++++
 .../openenv/core/harness}/sandbox/base.py     | 10 ++--
 .../core/harness/sandbox/e2b_backend.py       | 11 +++-
 .../core/harness}/sandbox/interception.py     |  8 +--
 tests/envs/test_opencode_env.py               |  2 +-
 9 files changed, 74 insertions(+), 63 deletions(-)
 create mode 100644 src/openenv/core/harness/sandbox/__init__.py
 rename {envs/opencode_env => src/openenv/core/harness}/sandbox/base.py (87%)
 rename envs/opencode_env/sandbox/e2b.py => src/openenv/core/harness/sandbox/e2b_backend.py (94%)
 rename {envs/opencode_env => src/openenv/core/harness}/sandbox/interception.py (98%)

diff --git a/envs/opencode_env/__init__.py b/envs/opencode_env/__init__.py
index 223be6f7b..17cd145b3 100644
--- a/envs/opencode_env/__init__.py
+++ b/envs/opencode_env/__init__.py
@@ -30,7 +30,12 @@
     RolloutResult,
     RolloutTurn,
 )
-from .sandbox import E2BSandboxBackend, SandboxBackend, SandboxHandle
+from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
+
+try:
+    from openenv.core.harness.sandbox import E2BSandboxBackend
+except ImportError:  # e2b not installed
+    E2BSandboxBackend = None  # type: ignore[assignment,misc]
 from .task import OpenCodeTask
 
 __all__ = [
diff --git a/envs/opencode_env/harness.py b/envs/opencode_env/harness.py
index da4410dd4..dc0eb55be 100644
--- a/envs/opencode_env/harness.py
+++ b/envs/opencode_env/harness.py
@@ -52,7 +52,7 @@
     opencode_config_path,
     system_prompt_path,
 )
-from .sandbox.base import BgJob, SandboxBackend, SandboxHandle
+from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
 from .task import OpenCodeTask
 
 
@@ -64,7 +64,10 @@
 # Where the proxy source lives on disk (in this repo). Uploaded into the
 # sandbox at /home/user/proxy/interception.py before each rollout, unless
 # the sandbox was created from a template that already has it baked in.
-_PROXY_SOURCE_PATH = Path(__file__).parent / "sandbox" / "interception.py"
+_PROXY_SOURCE_PATH = (
+    Path(__file__).resolve().parents[2]
+    / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py"
+)
 
 
 Verifier = Callable[[SandboxHandle, OpenCodeTask], VerifyResult]
diff --git a/envs/opencode_env/sandbox/__init__.py b/envs/opencode_env/sandbox/__init__.py
index 321f81547..a3496a2b1 100644
--- a/envs/opencode_env/sandbox/__init__.py
+++ b/envs/opencode_env/sandbox/__init__.py
@@ -4,50 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Sandbox backends for the OpenCode harness.
+"""Sandbox backends — re-exported from ``openenv.core.harness.sandbox``.
 
-The primitive ships with :class:`E2BSandboxBackend` as the default; any backend
-that satisfies the :class:`SandboxBackend` / :class:`SandboxHandle` protocols
-can be swapped in.
-
-The ``e2b`` import is wrapped in ``try/except`` so this package can be loaded
-in environments where ``e2b`` isn't installed (CI smoke tests, lint runs).
-Instantiating ``E2BSandboxBackend`` without ``e2b`` raises a clear error.
+The canonical source for sandbox protocols and implementations now lives in
+``src/openenv/core/harness/sandbox/``.  This package re-exports everything
+so that ``from opencode_env.sandbox import ...`` keeps working, but all new
+code should import from ``openenv.core.harness.sandbox`` directly.
 """
 
-from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
-
-try:
-    from .e2b import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle  # noqa: F401
-except ImportError as _e2b_err:  # pragma: no cover
-
-    class _RequiresE2B:
-        """Stub raised when ``e2b`` is not installed.
-
-        Lets the package import cleanly so unit tests, ``openenv validate``,
-        and the docs build can run without the heavy ``e2b`` dependency.
-        Actually constructing one of these classes raises a clear ImportError.
-        """
-
-        _e2b_import_error = _e2b_err
-
-        def __init__(self, *_args, **_kwargs):
-            raise ImportError(
-                "e2b is not installed; install it via "
-                "`pip install 'openenv-opencode-env[dev]'` or "
-                "`pip install e2b` to use E2BSandboxBackend. "
-                f"Original import error: {self._e2b_import_error}"
-            )
-
-    E2BBgJob = E2BSandboxBackend = E2BSandboxHandle = _RequiresE2B  # type: ignore[assignment]
-
-
-__all__ = [
-    "BgJob",
-    "ExecResult",
-    "SandboxBackend",
-    "SandboxHandle",
-    "E2BBgJob",
-    "E2BSandboxBackend",
-    "E2BSandboxHandle",
-]
+from openenv.core.harness.sandbox import *  # noqa: F401,F403
+from openenv.core.harness.sandbox import __all__  # noqa: F401
diff --git a/envs/opencode_env/sandbox/build_template.py b/envs/opencode_env/sandbox/build_template.py
index 01c32d537..084a95e64 100644
--- a/envs/opencode_env/sandbox/build_template.py
+++ b/envs/opencode_env/sandbox/build_template.py
@@ -44,8 +44,10 @@
 from e2b import Template, default_build_logger
 
 
-_ENV_DIR = Path(__file__).resolve().parent
-_PROXY_SOURCE = _ENV_DIR / "interception.py"
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+_PROXY_SOURCE = (
+    _REPO_ROOT / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py"
+)
 
 
 def _load_env(path: Path) -> None:
@@ -91,7 +93,7 @@ def build_template(name: str, *, skip_cache: bool = False) -> str:
         .make_dir("/home/user/task")
         .make_dir("/home/user/workdir")
         .make_dir("/home/user/proxy")
-        .copy("interception.py", "/home/user/proxy/interception.py")
+        .copy(str(_PROXY_SOURCE), "/home/user/proxy/interception.py")
         .set_workdir("/home/user/workdir")
     )
     if skip_cache:
@@ -121,7 +123,7 @@ def main(argv: list[str] | None = None) -> int:
     )
     args = p.parse_args(argv)
 
-    _load_env(_ENV_DIR / ".env")
+    _load_env(_REPO_ROOT / "envs" / "opencode_env" / "sandbox" / ".env")
     if not os.environ.get("E2B_API_KEY"):
         print("ERROR: E2B_API_KEY required.", file=sys.stderr)
         return 2
diff --git a/src/openenv/core/harness/sandbox/__init__.py b/src/openenv/core/harness/sandbox/__init__.py
new file mode 100644
index 000000000..d0324a7d7
--- /dev/null
+++ b/src/openenv/core/harness/sandbox/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Sandbox backends for harness-driven rollouts.
+
+Provides the :class:`SandboxBackend` / :class:`SandboxHandle` protocols and
+concrete implementations. Any harness adapter can use any backend — the
+sandbox layer is orthogonal to the agent CLI choice.
+
+The ``e2b`` import is wrapped in ``try/except`` so this package loads cleanly
+in environments where ``e2b`` isn't installed (CI smoke tests, lint runs).
+"""
+
+from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
+
+__all__ = [
+    "BgJob",
+    "ExecResult",
+    "SandboxBackend",
+    "SandboxHandle",
+]
+
+try:
+    from .e2b_backend import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle
+
+    __all__.extend(["E2BBgJob", "E2BSandboxBackend", "E2BSandboxHandle"])
+except ImportError:
+    pass  # e2b not installed — the E2B* names are left undefined and unexported
diff --git a/envs/opencode_env/sandbox/base.py b/src/openenv/core/harness/sandbox/base.py
similarity index 87%
rename from envs/opencode_env/sandbox/base.py
rename to src/openenv/core/harness/sandbox/base.py
index 76869149a..4b2620799 100644
--- a/envs/opencode_env/sandbox/base.py
+++ b/src/openenv/core/harness/sandbox/base.py
@@ -6,12 +6,12 @@
 
 """Sandbox backend protocol.
 
-A ``SandboxBackend`` produces ``SandboxHandle`` instances that the harness uses
-to stage files, run the OpenCode install, launch the agent as a background
-process, and later tear the sandbox down.
+A ``SandboxBackend`` produces ``SandboxHandle`` instances that harnesses use
+to stage files, install agent CLIs, launch the agent as a background process,
+and later tear the sandbox down.
 
-Backends can be implemented against any provider (E2B, Docker, Modal, Prime)
-as long as they satisfy the Protocols defined here.
+Backends can be implemented against any provider (E2B, CubeSandbox, Docker,
+Modal) as long as they satisfy the Protocols defined here.
 """
 
 from __future__ import annotations
diff --git a/envs/opencode_env/sandbox/e2b.py b/src/openenv/core/harness/sandbox/e2b_backend.py
similarity index 94%
rename from envs/opencode_env/sandbox/e2b.py
rename to src/openenv/core/harness/sandbox/e2b_backend.py
index b567a9e65..f344346ba 100644
--- a/envs/opencode_env/sandbox/e2b.py
+++ b/src/openenv/core/harness/sandbox/e2b_backend.py
@@ -4,7 +4,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""E2B implementation of :class:`SandboxBackend`."""
+"""E2B implementation of :class:`SandboxBackend`.
+
+Works with both E2B cloud (api.e2b.dev) and self-hosted E2B-compatible
+backends like CubeSandbox. For CubeSandbox, set::
+
+    E2B_API_URL=http://your-cubesandbox:3000
+    E2B_API_KEY=dummy  # any non-empty string
+"""
 
 from __future__ import annotations
 
@@ -15,7 +22,7 @@
 from e2b import Sandbox
 from e2b.sandbox_sync.commands.command_handle import CommandHandle
 
-from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
+from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxBackend, SandboxHandle
 
 
 class E2BBgJob:
diff --git a/envs/opencode_env/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py
similarity index 98%
rename from envs/opencode_env/sandbox/interception.py
rename to src/openenv/core/harness/sandbox/interception.py
index 131d41024..dc3dbe5be 100644
--- a/envs/opencode_env/sandbox/interception.py
+++ b/src/openenv/core/harness/sandbox/interception.py
@@ -6,15 +6,15 @@
 
 """Transparent OpenAI-compatible forwarding proxy with logprob capture.
 
-The proxy is a small FastAPI app that OpenCode talks to instead of the upstream
-LLM endpoint. It:
+The proxy is a small FastAPI app that agent CLIs (OpenCode, Claude Code,
+Codex, Pi, etc.) talk to instead of the upstream LLM endpoint. It:
 
 1. Forwards every ``POST /v1/chat/completions`` request to the real upstream
    URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream
    returns per-token logprobs.
 2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines
    trace file.
-3. Returns the upstream response to OpenCode verbatim (minus the ``logprobs``
+3. Returns the upstream response to the agent verbatim (minus the ``logprobs``
    field, which we strip so the CLI never sees anything unexpected).
 
 The proxy is stateless beyond the trace file. One proxy instance runs per
@@ -22,7 +22,7 @@
 
 Run standalone::
 
-    OPENCODE_UPSTREAM_API_KEY=... python -m opencode_env.interception \\
+    UPSTREAM_API_KEY=... python -m openenv.core.harness.sandbox.interception \\
         --upstream-url https://vllm.example/v1 \\
         --trace /tmp/trace.jsonl \\
         --port 7000
diff --git a/tests/envs/test_opencode_env.py b/tests/envs/test_opencode_env.py
index 812ade194..6014c9199 100644
--- a/tests/envs/test_opencode_env.py
+++ b/tests/envs/test_opencode_env.py
@@ -309,7 +309,7 @@ def _exec_with_retry(self, *args, **kwargs):
 def test_interception_cli_reads_upstream_key_from_env(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    from opencode_env.sandbox import interception
+    from openenv.core.harness.sandbox import interception
 
     captured = {}
 

From 024e9042c3833fbd253f7c9981eb6909f29fb0f1 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Tue, 12 May 2026 23:00:13 +0530
Subject: [PATCH 02/35] feat: CLIAgentDriver Abstraction

---
 envs/opencode_env/__init__.py                 |  19 +-
 envs/opencode_env/client.py                   |   2 +-
 envs/opencode_env/config.py                   |   4 +-
 envs/opencode_env/harness.py                  | 422 +++--------
 envs/opencode_env/opencode_runtime.py         |   4 +-
 envs/opencode_env/sandbox/__init__.py         |  12 +-
 envs/opencode_env/sandbox/build_template.py   |   5 +-
 envs/opencode_env/server/app.py               |  10 +-
 envs/opencode_env/server/gradio_ui.py         |  95 ++-
 .../server/opencode_environment.py            |  19 +-
 src/openenv/core/harness/agents/__init__.py   | 107 +++
 src/openenv/core/harness/agents/base.py       | 251 ++++++
 src/openenv/core/harness/agents/cli_driver.py | 716 ++++++++++++++++++
 src/openenv/core/harness/agents/opencode.py   | 191 +++++
 14 files changed, 1467 insertions(+), 390 deletions(-)
 create mode 100644 src/openenv/core/harness/agents/__init__.py
 create mode 100644 src/openenv/core/harness/agents/base.py
 create mode 100644 src/openenv/core/harness/agents/cli_driver.py
 create mode 100644 src/openenv/core/harness/agents/opencode.py

diff --git a/envs/opencode_env/__init__.py b/envs/opencode_env/__init__.py
index 17cd145b3..dcd48a01c 100644
--- a/envs/opencode_env/__init__.py
+++ b/envs/opencode_env/__init__.py
@@ -8,35 +8,30 @@
 
 Two layers in this package:
 
-1. **Harness primitive** — :class:`OpenCodeSessionFactory` /
+1. **Harness primitive** -- :class:`OpenCodeSessionFactory` /
    :class:`OpenCodeSession` / :class:`OpenCodeConfig` /
-   :class:`E2BSandboxBackend`. Used in-process to drive one rollout
-   inside an E2B sandbox. See ``harness.py``.
+   :class:`E2BSandboxBackend`. Built on the generic
+   :class:`CLIAgentDriver` from ``openenv.core.harness.agents``.
 
-2. **Deployable env** — :class:`OpenCodeEnv` (MCP client) talks to the
+2. **Deployable env** -- :class:`OpenCodeEnv` (MCP client) talks to the
    FastAPI server at ``server/app.py`` over HTTP. Use this when the
    sandbox + agent live behind an HTTP boundary (e.g. an HF Space).
    See ``client.py`` and ``server/``.
 """
 
 from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
+from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
 
 from .client import OpenCodeEnv
 from .config import OpenCodeConfig, Provider
 from .harness import OpenCodeSession, OpenCodeSessionFactory
-from .models import (
-    CommandResult,
-    OpenCodeState,
-    RolloutResult,
-    RolloutTurn,
-)
-from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
+from .models import CommandResult, OpenCodeState, RolloutResult, RolloutTurn
+from .task import OpenCodeTask
 
 try:
     from openenv.core.harness.sandbox import E2BSandboxBackend
 except ImportError:  # e2b not installed
     E2BSandboxBackend = None  # type: ignore[assignment,misc]
-from .task import OpenCodeTask
 
 __all__ = [
     # Deployed-env client
diff --git a/envs/opencode_env/client.py b/envs/opencode_env/client.py
index a00afc4e1..52e76e2d5 100644
--- a/envs/opencode_env/client.py
+++ b/envs/opencode_env/client.py
@@ -51,7 +51,7 @@ def run_rollout(
         self,
         *,
         # Endpoint — pass either the shorthand selector OR explicit fields.
-        endpoint: str = "",                # "vllm" | "openai" | "hf_router"
+        endpoint: str = "",  # "vllm" | "openai" | "hf_router"
         base_url: str = "",
         api_key: str = "",
         model: str = "",
diff --git a/envs/opencode_env/config.py b/envs/opencode_env/config.py
index 57273b9eb..2b6bae0a2 100644
--- a/envs/opencode_env/config.py
+++ b/envs/opencode_env/config.py
@@ -34,9 +34,7 @@ class OpenCodeConfig(BaseModel):
 
     # --- OpenCode CLI ---------------------------------------------------------
     opencode_version: str = "latest"
-    disabled_tools: list[str] = Field(
-        default_factory=lambda: ["webfetch", "question"]
-    )
+    disabled_tools: list[str] = Field(default_factory=lambda: ["webfetch", "question"])
     enabled_tools: list[str] | None = None
     system_prompt: str | None = None
     extra_opencode_json: dict[str, Any] = Field(default_factory=dict)
diff --git a/envs/opencode_env/harness.py b/envs/opencode_env/harness.py
index dc0eb55be..600aafa82 100644
--- a/envs/opencode_env/harness.py
+++ b/envs/opencode_env/harness.py
@@ -4,42 +4,32 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""OpenCode session factory + session implementation.
+"""OpenCode session factory + session — backed by CLIAgentDriver.
 
-Implements the :class:`ResourceSessionFactory` / :class:`ResourceSession`
-contracts from ``openenv.core.harness`` (PR #471). The session wraps one
-sandbox running the ``opencode`` CLI agent.
+This module exposes :class:`OpenCodeSession` and
+:class:`OpenCodeSessionFactory` built on top of the generic
+:class:`CLIAgentDriver` / :class:`CLIAgentSession` /
+:class:`CLIAgentSessionFactory` from ``openenv.core.harness.agents``.
 
-Two operating modes:
-
-  - ``mode="black_box"`` — opencode talks directly to ``config.base_url``.
-    No proxy, no logprob capture. Use for smoke tests / SFT / eval.
-  - ``mode="transparent_proxy"`` (default) — an in-sandbox FastAPI proxy
-    sits between opencode and the upstream LLM. It injects ``logprobs=true``
-    on every request and writes per-turn ``(messages, completion_tokens,
-    per_token_logps)`` to ``proxy_trace.jsonl`` for GRPO consumption.
-
-Single driver path: opencode is started as a background subprocess via
-``opencode run --format json --dangerously-skip-permissions ...`` and we
-poll its exit code. The previous ``opencode serve`` driver was removed —
-opencode CLI is the only path now.
+OpenCode-specific configuration (``opencode.json`` generation, provider
+mapping, tool enable/disable) is handled by
+:mod:`opencode_env.opencode_runtime` builders wired into the
+:data:`OPENCODE_SPEC` via callable hooks.
 """
 
 from __future__ import annotations
 
-import json
-import shlex
 from pathlib import Path
-from typing import Any, Callable, Literal
-
-from openenv.core.env_server.mcp_types import Tool
-from openenv.core.harness import (
-    Message,
-    ResourceSession,
-    ResourceSessionFactory,
-    ToolResult,
-    VerifyResult,
+from typing import Any, Literal
+
+from openenv.core.harness import ResourceSessionFactory
+from openenv.core.harness.agents.cli_driver import (
+    CLIAgentDriver,
+    CLIAgentSession,
+    Verifier,
 )
+from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
 
 from .config import OpenCodeConfig
 from .opencode_runtime import (
@@ -52,7 +42,6 @@
     opencode_config_path,
     system_prompt_path,
 )
-from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
 from .task import OpenCodeTask
 
 
@@ -61,28 +50,24 @@
 _PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl"
 _PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log"
 
-# Where the proxy source lives on disk (in this repo). Uploaded into the
-# sandbox at /home/user/proxy/interception.py before each rollout, unless
-# the sandbox was created from a template that already has it baked in.
 _PROXY_SOURCE_PATH = (
     Path(__file__).resolve().parents[2]
-    / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py"
+    / "src"
+    / "openenv"
+    / "core"
+    / "harness"
+    / "sandbox"
+    / "interception.py"
 )
 
 
-Verifier = Callable[[SandboxHandle, OpenCodeTask], VerifyResult]
-
-
-class OpenCodeSession(ResourceSession):
+class OpenCodeSession(CLIAgentSession):
     """One live OpenCode rollout inside a sandbox.
 
-    The session is created already-running: :meth:`OpenCodeSessionFactory.create`
-    calls :meth:`start_agent` before returning. Typical usage::
-
-        session = factory.create(task)
-        session.wait_for_completion()
-        result = session.verify([])
-        session.close()
+    Extends :class:`CLIAgentSession` with OpenCode-specific convenience
+    methods (``fetch_trace``, ``wait_for_completion`` with config-aware
+    timeout). Fully backward-compatible with code that used the old
+    ``OpenCodeSession`` API.
     """
 
     def __init__(
@@ -95,100 +80,43 @@ def __init__(
         base_url_override: str | None = None,
         proxy_trace_path: str | None = None,
         proxy_bg_job: BgJob | None = None,
+        agent_bg_job: BgJob | None = None,
     ) -> None:
-        self.sandbox = sandbox
-        self.config = config
-        self.task = task
-        self._verifier = verifier
-        self._base_url_override = base_url_override
-        self._bg_job: BgJob | None = None
-        self._proxy_trace_path = proxy_trace_path
-        self._proxy_bg_job = proxy_bg_job
-
-    # ------------------------------------------------------------------
-    # ResourceSession contract (PR #471)
-    # ------------------------------------------------------------------
-    def initial_messages(self) -> list[Message]:
-        return [{"role": "user", "content": self.task.instruction}]
-
-    def list_tools(self) -> list[Tool]:
-        # OpenCode owns its own tool loop — none are exposed to the harness.
-        return []
-
-    def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult:
-        return ToolResult(
-            error=(
-                "OpenCodeSession does not expose external tool calls; the "
-                "CLI agent owns its own tool loop."
-            )
+        super().__init__(
+            spec=OPENCODE_SPEC,
+            sandbox=sandbox,
+            task=task,
+            config=config,
+            verifier=verifier,
+            base_url_override=base_url_override,
+            proxy_trace_path=proxy_trace_path,
+            proxy_bg_job=proxy_bg_job,
+            agent_bg_job=agent_bg_job,
         )
 
-    def verify(
-        self,
-        transcript: list[Message],
-        final_state: Any | None = None,
-    ) -> VerifyResult:
-        if self._verifier is None:
-            return VerifyResult(env_reward=None, done=True)
-        return self._verifier(self.sandbox, self.task)
-
-    def close(self) -> None:
-        if self._bg_job is not None:
-            try:
-                self._bg_job.kill()
-            except Exception:
-                pass
-            self._bg_job = None
-        if self._proxy_bg_job is not None:
-            try:
-                self._proxy_bg_job.kill()
-            except Exception:
-                pass
-            self._proxy_bg_job = None
-        self.sandbox.kill()
-
-    # ------------------------------------------------------------------
-    # OpenCode-specific session API
-    # ------------------------------------------------------------------
-    def start_agent(self) -> None:
-        """Launch ``opencode run`` as a background subprocess in the sandbox."""
-        if self._bg_job is not None:
-            return
-        cmd = build_run_cmd(self.config)
-        envs = build_env_vars(self.config, base_url_override=self._base_url_override)
-        self._bg_job = self.sandbox.start_bg(cmd, envs=envs)
+    def fetch_trace(self) -> str:
+        """Return the raw ``opencode run`` log (JSONL when ``run_format=json``)."""
+        return self.sandbox.read_text(agent_log_path(self.config))
 
     def wait_for_completion(self, timeout_s: float | None = None) -> int:
         """Block until the agent exits, returning its exit code."""
         budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s
-        if self._bg_job is None:
+        if self._agent_bg_job is None:
             raise RuntimeError("Agent not started; call start_agent() first.")
-        return self._bg_job.wait(timeout=budget)
+        return self._agent_bg_job.wait(timeout=budget)
 
-    def fetch_trace(self) -> str:
-        """Return the raw ``opencode run`` log (JSON-lines when ``run_format=json``)."""
-        return self.sandbox.read_text(agent_log_path(self.config))
-
-    def fetch_proxy_trace(self) -> list[dict[str, Any]]:
-        """Return per-turn proxy-captured records (Mode B only).
+    def start_agent(self) -> None:
+        """Launch ``opencode run`` as a background subprocess in the sandbox.
 
-        Each entry has ``request``, ``response``, ``completion_tokens``,
-        ``completion_token_ids``, ``per_token_logps``, ``finish_reason``,
-        and ``latency_s``. Returns ``[]`` in Mode A.
+        Provided for backward compatibility — the factory now starts the
+        agent during ``create()``, so calling this manually is a no-op
+        if the agent is already running.
         """
-        if self._proxy_trace_path is None:
-            return []
-        try:
-            content = self.sandbox.read_text(self._proxy_trace_path)
-        except Exception:
-            return []
-        records: list[dict[str, Any]] = []
-        for line in content.splitlines():
-            line = line.strip()
-            if not line:
-                continue
-            records.append(json.loads(line))
-        return records
+        if self._agent_bg_job is not None:
+            return
+        cmd = build_run_cmd(self.config)
+        envs = build_env_vars(self.config, base_url_override=self._base_url_override)
+        self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs)
 
 
 class OpenCodeSessionFactory(ResourceSessionFactory):
@@ -197,6 +125,10 @@ class OpenCodeSessionFactory(ResourceSessionFactory):
     The factory owns sandbox provisioning, opencode install, config injection,
     and (Mode B) proxy startup. Each :meth:`create` call returns a fresh
     sandbox with a running agent.
+
+    Internally delegates to :class:`CLIAgentDriver` for the generic
+    sandbox lifecycle (readiness probing, install retry, proxy startup).
+    OpenCode-specific config generation uses ``opencode_runtime`` builders.
     """
 
     def __init__(
@@ -218,6 +150,18 @@ def __init__(
         self._install_timeout_s = install_timeout_s
         self._setup_timeout_s = setup_timeout_s
 
+        # Build a CLIAgentDriver for the shared lifecycle.
+        self._driver = CLIAgentDriver(
+            spec=OPENCODE_SPEC,
+            sandbox_backend=sandbox_backend,
+            mode=mode,
+            install_timeout_s=install_timeout_s,
+            setup_timeout_s=setup_timeout_s,
+            proxy_top_logprobs=config.proxy_top_logprobs,
+            proxy_max_tokens_cap=config.proxy_max_tokens_cap,
+            proxy_disable_thinking=config.proxy_disable_thinking,
+        )
+
     def create(
         self,
         task: Any,
@@ -225,6 +169,7 @@ def create(
         episode_id: str | None = None,
     ) -> OpenCodeSession:
         import logging
+
         _log = logging.getLogger(__name__)
 
         oc_task = OpenCodeTask.coerce(task)
@@ -232,17 +177,16 @@ def create(
 
         _log.info(
             "factory.create: creating sandbox timeout=%ds mode=%s",
-            sandbox_timeout, self._mode,
+            sandbox_timeout,
+            self._mode,
         )
         sandbox = self._backend.create(
             timeout_s=sandbox_timeout,
             metadata={"episode_id": episode_id} if episode_id else None,
         )
-        sid = (
-            getattr(sandbox, "sandbox_id", None)
-            or getattr(getattr(sandbox, "raw", None), "sandbox_id", "?")
-        )
+        sid = getattr(sandbox, "sandbox_id", "?")
         _log.info("factory.create: sandbox=%s — bootstrapping…", sid)
+
         try:
             self._bootstrap_sandbox(sandbox, oc_task)
         except Exception as exc:
@@ -256,18 +200,20 @@ def create(
         if self._mode == "transparent_proxy":
             _log.info(
                 "factory.create: starting interception proxy on :%d → %s",
-                _PROXY_PORT, self._config.base_url,
+                _PROXY_PORT,
+                self._config.base_url,
             )
-            proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy(
-                sandbox
+            proxy_bg_job, base_url_override, proxy_trace_path = (
+                self._driver._start_proxy(
+                    sandbox,
+                    base_url=self._config.base_url,
+                    api_key=self._config.api_key,
+                    model=self._config.model,
+                )
             )
             _log.info("factory.create: proxy up at %s", base_url_override)
-            # Rewrite opencode.json so opencode points at the proxy. Force
-            # ``openai_compatible`` so opencode hits ``/v1/chat/completions``
-            # (which the proxy serves) rather than provider-specific paths.
-            from .config import OpenCodeConfig as _OCC
-
-            proxy_cfg = _OCC(
+            # Rewrite opencode.json so opencode points at the proxy.
+            proxy_cfg = OpenCodeConfig(
                 **{
                     **self._config.model_dump(),
                     "provider": "openai_compatible",
@@ -292,92 +238,8 @@ def create(
         return session
 
     # ------------------------------------------------------------------
-    def _wait_for_sandbox_ready(
-        self,
-        sandbox: SandboxHandle,
-        *,
-        attempts: int = 15,
-        delay_s: float = 1.0,
-    ) -> None:
-        """Probe the sandbox until ``echo ok`` succeeds.
-
-        E2B (and other backends) sometimes return the handle before the
-        guest is fully ready. Issue ``echo ok`` with short timeouts until
-        it succeeds. Returns silently on success; raises ``RuntimeError``
-        on prolonged failure.
-        """
-        import time
-
-        last_err = ""
-        for _ in range(attempts):
-            try:
-                r = sandbox.exec("echo ok", timeout=5)
-                if r.exit_code == 0 and "ok" in (r.stdout or ""):
-                    return
-                last_err = (r.stderr or r.stdout or "").strip() or f"exit={r.exit_code}"
-            except Exception as exc:  # noqa: BLE001
-                last_err = f"{type(exc).__name__}: {exc}"
-            time.sleep(delay_s)
-        raise RuntimeError(
-            f"sandbox did not become ready within {attempts * delay_s:.0f}s "
-            f"(last error: {last_err})"
-        )
-
-    def _exec_with_retry(
-        self,
-        sandbox: SandboxHandle,
-        cmd: str,
-        *,
-        timeout: float,
-        attempts: int = 3,
-        backoff_s: float = 3.0,
-        label: str = "cmd",
-    ):
-        """Run ``sandbox.exec`` with exponential backoff on transient failure.
-
-        Transient = ``exit_code != 0`` AND empty stderr (SIGKILL / network
-        blip signature) OR an exception during exec. Final failure is raised
-        as ``RuntimeError`` carrying the last exit code + stderr.
-        """
-        import time
-
-        last_stdout = ""
-        last_stderr = ""
-        last_exit = 0
-        for i in range(attempts):
-            try:
-                r = sandbox.exec(cmd, timeout=timeout)
-                if r.exit_code == 0:
-                    return r
-                last_stdout = r.stdout or ""
-                last_stderr = r.stderr or ""
-                last_exit = r.exit_code
-                if last_stderr.strip():
-                    break
-            except Exception as exc:  # noqa: BLE001
-                last_stderr = f"{type(exc).__name__}: {exc}"
-                last_exit = -1
-            if i + 1 < attempts:
-                time.sleep(backoff_s * (2**i))
-        raise RuntimeError(
-            f"{label} failed after {attempts} attempts "
-            f"(exit={last_exit}, stderr={last_stderr!r}, stdout_tail={last_stdout[-400:]!r})"
-        )
-
-    def _opencode_already_installed(self, sandbox: SandboxHandle) -> bool:
-        """Cheap probe — returns True if opencode is on disk in the sandbox.
-
-        Used to skip the slow ``curl install`` step when running against a
-        prebaked template that already ships opencode.
-        """
-        try:
-            r = sandbox.exec(
-                "/home/user/.opencode/bin/opencode --version",
-                timeout=10,
-            )
-            return r.exit_code == 0
-        except Exception:
-            return False
+    # Bootstrap — delegates to CLIAgentDriver utilities
+    # ------------------------------------------------------------------
 
     def _bootstrap_sandbox(
         self,
@@ -387,12 +249,11 @@ def _bootstrap_sandbox(
         """Install opencode, write config + task files, run optional setup."""
 
         # Stage 1: wait for the sandbox to be responsive.
-        self._wait_for_sandbox_ready(sandbox)
+        self._driver._wait_for_sandbox_ready(sandbox)
 
-        # Stage 2: install opencode (skipped if a prebaked template already
-        # has it). curl|bash is flaky — retry with backoff.
-        if not self._opencode_already_installed(sandbox):
-            self._exec_with_retry(
+        # Stage 2: install opencode (skipped if pre-baked).
+        if not self._driver._agent_already_installed(sandbox):
+            self._driver._exec_with_retry(
                 sandbox,
                 build_install_cmd(self._config),
                 timeout=self._install_timeout_s,
@@ -401,6 +262,7 @@ def _bootstrap_sandbox(
                 label="opencode install",
             )
 
+        # Stage 3: write opencode.json + task files.
         sandbox.write_text(
             opencode_config_path(self._config),
             build_opencode_json(self._config),
@@ -416,8 +278,9 @@ def _bootstrap_sandbox(
         for remote_path, content in task.upload_files.items():
             sandbox.write_text(remote_path, content)
 
+        # Stage 4: extra setup
         if self._config.extra_setup_shell:
-            self._exec_with_retry(
+            self._driver._exec_with_retry(
                 sandbox,
                 self._config.extra_setup_shell,
                 timeout=self._setup_timeout_s,
@@ -437,95 +300,14 @@ def _start_proxy(
         self,
         sandbox: SandboxHandle,
     ) -> tuple[BgJob, str, str]:
-        """Install proxy deps + start the proxy as a bg job inside the sandbox.
-
-        Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``.
-        Skips the pip install + source-upload steps when the prebaked
-        template already has them in place.
-        """
-        proxy_already_present = sandbox.exists(
-            "/home/user/proxy/interception.py"
+        """Start proxy — delegates to driver."""
+        return self._driver._start_proxy(
+            sandbox,
+            base_url=self._config.base_url,
+            api_key=self._config.api_key,
+            model=self._config.model,
         )
 
-        if not proxy_already_present:
-            # Install proxy deps (idempotent on retries).
-            self._exec_with_retry(
-                sandbox,
-                "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
-                "'httpx>=0.27' 2>&1 | tail -20",
-                timeout=180,
-                attempts=3,
-                backoff_s=2.0,
-                label="proxy deps install",
-            )
-            # Upload the proxy module into the sandbox.
-            sandbox.write_text(
-                "/home/user/proxy/interception.py",
-                _PROXY_SOURCE_PATH.read_text(),
-            )
-            sandbox.write_text("/home/user/proxy/__init__.py", "")
-
-        proxy_args = [
-            "python",
-            "interception.py",
-            "--upstream-url",
-            self._config.base_url,
-            "--trace",
-            _PROXY_TRACE_PATH,
-            "--port",
-            str(_PROXY_PORT),
-            "--top-logprobs",
-            str(self._config.proxy_top_logprobs),
-        ]
-        if self._config.proxy_max_tokens_cap is not None:
-            proxy_args.extend(
-                ["--max-tokens-cap", str(self._config.proxy_max_tokens_cap)]
-            )
-        if self._config.proxy_disable_thinking:
-            proxy_args.append("--disable-thinking")
-        # Force the upstream model id on every forwarded request — opencode's
-        # internal title-gen call sometimes strips the provider prefix.
-        if self._config.model:
-            proxy_args.extend(["--model-override", self._config.model])
-
-        quoted_proxy_args = " ".join(shlex.quote(arg) for arg in proxy_args)
-        proxy_cmd = (
-            "cd /home/user/proxy && "
-            f"{quoted_proxy_args} "
-            f"> {shlex.quote(_PROXY_LOG_PATH)} 2>&1"
-        )
-        proxy_env = {"OPENCODE_UPSTREAM_API_KEY": self._config.api_key}
-        proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env)
-
-        # Wait for the proxy to start listening. Cold uvicorn boot inside
-        # E2B can take anywhere from <1s to ~30s depending on cache state.
-        import time
-
-        attempts = 120
-        interval_s = 0.5
-        for _ in range(attempts):
-            r = sandbox.exec(
-                f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz",
-                timeout=5,
-            )
-            if r.exit_code == 0:
-                break
-            time.sleep(interval_s)
-        else:
-            log = ""
-            try:
-                log = sandbox.read_text(_PROXY_LOG_PATH)
-            except Exception:
-                pass
-            proxy_job.kill()
-            raise RuntimeError(
-                f"proxy did not start within {attempts * interval_s:.0f}s. "
-                f"log:\n{log[-2000:]}"
-            )
-
-        base_url_override = f"http://127.0.0.1:{_PROXY_PORT}/v1"
-        return proxy_job, base_url_override, _PROXY_TRACE_PATH
-
 
 __all__ = [
     "OpenCodeSession",
diff --git a/envs/opencode_env/opencode_runtime.py b/envs/opencode_env/opencode_runtime.py
index 07fd5322d..75fed41e3 100644
--- a/envs/opencode_env/opencode_runtime.py
+++ b/envs/opencode_env/opencode_runtime.py
@@ -111,7 +111,9 @@ def build_run_cmd(config: OpenCodeConfig) -> str:
     ).strip()
 
 
-def build_env_vars(config: OpenCodeConfig, *, base_url_override: str | None = None) -> dict[str, str]:
+def build_env_vars(
+    config: OpenCodeConfig, *, base_url_override: str | None = None
+) -> dict[str, str]:
     """Return env vars to set on the OpenCode process.
 
     When a proxy is wrapping ``config.base_url`` the factory passes the proxy's
diff --git a/envs/opencode_env/sandbox/__init__.py b/envs/opencode_env/sandbox/__init__.py
index a3496a2b1..8a2477104 100644
--- a/envs/opencode_env/sandbox/__init__.py
+++ b/envs/opencode_env/sandbox/__init__.py
@@ -4,13 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Sandbox backends — re-exported from ``openenv.core.harness.sandbox``.
+"""Sandbox backends live in ``openenv.core.harness.sandbox``.
 
-The canonical source for sandbox protocols and implementations now lives in
-``src/openenv/core/harness/sandbox/``.  This package re-exports everything
-so that ``from opencode_env.sandbox import ...`` keeps working, but all new
-code should import from ``openenv.core.harness.sandbox`` directly.
+This package exists only for the ``build_template`` helper used by E2B
+template builds. Import sandbox protocols and backends from
+``openenv.core.harness.sandbox`` directly.
 """
-
-from openenv.core.harness.sandbox import *  # noqa: F401,F403
-from openenv.core.harness.sandbox import __all__  # noqa: F401
diff --git a/envs/opencode_env/sandbox/build_template.py b/envs/opencode_env/sandbox/build_template.py
index 084a95e64..6e0ba4f75 100644
--- a/envs/opencode_env/sandbox/build_template.py
+++ b/envs/opencode_env/sandbox/build_template.py
@@ -41,7 +41,7 @@
 import sys
 from pathlib import Path
 
-from e2b import Template, default_build_logger
+from e2b import default_build_logger, Template
 
 
 _REPO_ROOT = Path(__file__).resolve().parents[3]
@@ -128,8 +128,7 @@ def main(argv: list[str] | None = None) -> int:
         print("ERROR: E2B_API_KEY required.", file=sys.stderr)
         return 2
 
-    print(f"Building template '{args.name}' "
-          f"(proxy source: {_PROXY_SOURCE})")
+    print(f"Building template '{args.name}' (proxy source: {_PROXY_SOURCE})")
     print(f"Skip cache: {args.skip_cache}")
     print()
 
diff --git a/envs/opencode_env/server/app.py b/envs/opencode_env/server/app.py
index 200c7f2d7..0757ef229 100644
--- a/envs/opencode_env/server/app.py
+++ b/envs/opencode_env/server/app.py
@@ -56,19 +56,13 @@ def _load_env_file() -> None:
 
 try:
     from openenv.core.env_server.http_server import create_app
-    from openenv.core.env_server.mcp_types import (
-        CallToolAction,
-        CallToolObservation,
-    )
+    from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
 
     from .gradio_ui import opencode_gradio_builder
     from .opencode_environment import OpenCodeEnvironment
 except ImportError:  # pragma: no cover
     from openenv.core.env_server.http_server import create_app
-    from openenv.core.env_server.mcp_types import (
-        CallToolAction,
-        CallToolObservation,
-    )
+    from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
     from server.gradio_ui import opencode_gradio_builder  # type: ignore
     from server.opencode_environment import OpenCodeEnvironment  # type: ignore
 
diff --git a/envs/opencode_env/server/gradio_ui.py b/envs/opencode_env/server/gradio_ui.py
index 79a696d75..d1ee6e403 100644
--- a/envs/opencode_env/server/gradio_ui.py
+++ b/envs/opencode_env/server/gradio_ui.py
@@ -31,10 +31,14 @@
 import gradio as gr
 
 try:
-    from .catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint
+    from .catalog import catalog_summary, ENDPOINT_KINDS, resolve_endpoint
     from .opencode_environment import OpenCodeEnvironment
 except ImportError:  # pragma: no cover
-    from server.catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint  # type: ignore
+    from server.catalog import (  # type: ignore
+        catalog_summary,
+        ENDPOINT_KINDS,
+        resolve_endpoint,
+    )
     from server.opencode_environment import OpenCodeEnvironment  # type: ignore
 
 
@@ -144,7 +148,9 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]:
                 cmd if len(cmd) <= 80 else cmd[:77] + "...",
                 str(it.get("exit_code", "")),
                 f"{it.get('duration_s', 0):.2f}s",
-                (it.get("stderr") or "").splitlines()[-1][:80] if it.get("exit_code") else "",
+                (it.get("stderr") or "").splitlines()[-1][:80]
+                if it.get("exit_code")
+                else "",
             ]
         )
     return rows
@@ -175,7 +181,8 @@ def _logprobs_md(turns: list[dict[str, Any]]) -> str:
         finishes[f] = finishes.get(f, 0) + 1
     if finishes:
         lines.append(
-            "**finish_reasons**: " + "  ".join(f"`{k}={v}`" for k, v in finishes.items())
+            "**finish_reasons**: "
+            + "  ".join(f"`{k}={v}`" for k, v in finishes.items())
         )
     productive_rows = [t for t in turns if t.get("completion_tokens")]
     if productive_rows:
@@ -249,12 +256,12 @@ def _catalog_banner() -> str:
 
 
 def opencode_gradio_builder(
-    web_manager,        # noqa: ARG001 (unused: we instantiate the env directly)
-    action_fields,      # noqa: ARG001
-    metadata,           # noqa: ARG001
-    is_chat_env,        # noqa: ARG001
+    web_manager,  # noqa: ARG001 (unused: we instantiate the env directly)
+    action_fields,  # noqa: ARG001
+    metadata,  # noqa: ARG001
+    is_chat_env,  # noqa: ARG001
     title,
-    quick_start_md,     # noqa: ARG001
+    quick_start_md,  # noqa: ARG001
 ) -> gr.Blocks:
     """Build the opencode_env console.
 
@@ -355,7 +362,12 @@ def _worker():
         # First yield: announce we've started. Empty result panels.
         yield (
             f"### running…\n\n_endpoint=`{resolved.kind}`  model=`{resolved.model}`  mode=`{mode}`_",
-            [], [], "", "", "", {},
+            [],
+            [],
+            "",
+            "",
+            "",
+            {},
         )
 
         status_lines: list[tuple[float, str]] = []
@@ -374,7 +386,9 @@ def _worker():
 
             # Render the live status pane.
             elapsed = time.time() - t_start
-            md = _live_status_md(resolved.kind, resolved.model, mode, elapsed, status_lines)
+            md = _live_status_md(
+                resolved.kind, resolved.model, mode, elapsed, status_lines
+            )
             yield (md, [], [], "", "", "", {})
 
         # Drain any final messages still in the queue.
@@ -390,9 +404,17 @@ def _worker():
             err = result_holder.get("error", "unknown error")
             yield (
                 f"### error\n\n```\n{err}\n```",
-                [], [], "", "",
-                _live_status_md(resolved.kind, resolved.model, mode,
-                                time.time() - t_start, status_lines),
+                [],
+                [],
+                "",
+                "",
+                _live_status_md(
+                    resolved.kind,
+                    resolved.model,
+                    mode,
+                    time.time() - t_start,
+                    status_lines,
+                ),
                 {"error": err},
             )
             return
@@ -406,8 +428,13 @@ def _worker():
             _logprobs_md(result.get("proxy_turns") or []),
             (
                 f"### live phase log\n\n"
-                + _live_status_md(resolved.kind, resolved.model, mode,
-                                  time.time() - t_start, status_lines)
+                + _live_status_md(
+                    resolved.kind,
+                    resolved.model,
+                    mode,
+                    time.time() - t_start,
+                    status_lines,
+                )
                 + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n"
                 f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```"
             ),
@@ -436,17 +463,21 @@ def apply_preset(name: str) -> tuple[str, str, str]:
                 scale=1,
             )
             model = gr.Textbox(
-                label="Model (blank → catalog default)", placeholder="gpt-4o-mini",
+                label="Model (blank → catalog default)",
+                placeholder="gpt-4o-mini",
                 scale=2,
             )
         with gr.Row():
             base_url = gr.Textbox(
                 label="Base URL (blank → env / catalog default)",
-                placeholder="https://api.openai.com/v1", scale=2,
+                placeholder="https://api.openai.com/v1",
+                scale=2,
             )
             api_key = gr.Textbox(
                 label="API key (blank → server env var)",
-                placeholder="(server env)", type="password", scale=1,
+                placeholder="(server env)",
+                type="password",
+                scale=1,
             )
 
         instruction = gr.Textbox(
@@ -536,14 +567,28 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         run_btn.click(
             fn=run,
             inputs=[
-                endpoint, model, base_url, api_key,
-                instruction, setup_text, verify_text,
-                mode, disable_thinking, template,
-                max_tokens_cap, top_logprobs, agent_timeout_s,
+                endpoint,
+                model,
+                base_url,
+                api_key,
+                instruction,
+                setup_text,
+                verify_text,
+                mode,
+                disable_thinking,
+                template,
+                max_tokens_cap,
+                top_logprobs,
+                agent_timeout_s,
             ],
             outputs=[
-                summary_md, setup_table, verify_table,
-                files_md, logprobs_md, logs_md, raw_json,
+                summary_md,
+                setup_table,
+                verify_table,
+                files_md,
+                logprobs_md,
+                logs_md,
+                raw_json,
             ],
         )
 
diff --git a/envs/opencode_env/server/opencode_environment.py b/envs/opencode_env/server/opencode_environment.py
index 07f0d69ed..638dd5473 100644
--- a/envs/opencode_env/server/opencode_environment.py
+++ b/envs/opencode_env/server/opencode_environment.py
@@ -189,9 +189,7 @@ def reset(
             reward=None,
             metadata={
                 "status": "ready",
-                "message": (
-                    "opencode_env ready. Call run_rollout(...) with a task."
-                ),
+                "message": ("opencode_env ready. Call run_rollout(...) with a task."),
             },
         )
 
@@ -399,8 +397,12 @@ def _emit(msg: str) -> None:
             result.error = f"{type(exc).__name__}: {exc}"
             _emit(f"ERROR: {result.error}")
             if session is not None:
-                result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
-                result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
+                result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[
+                    -2000:
+                ]
+                result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[
+                    -2000:
+                ]
         finally:
             if session is not None:
                 try:
@@ -450,9 +452,7 @@ def _read_reward(self, sandbox: Any) -> float | None:
         except ValueError:
             return None
 
-    def _collect_files(
-        self, sandbox: Any
-    ) -> tuple[dict[str, str], list[str]]:
+    def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]:
         listing = sandbox.exec(
             f"find {WORKDIR} -maxdepth 2 -type f -size -64k 2>/dev/null | head -32",
             timeout=10,
@@ -491,7 +491,8 @@ def _collect_proxy_turns(self, session: Any) -> list[Any]:
                     completion_tokens=list(rec.get("completion_tokens") or []),
                     completion_token_ids=list(rec.get("completion_token_ids") or []),
                     per_token_logps=[
-                        float(x) for x in (rec.get("per_token_logps") or [])
+                        float(x)
+                        for x in (rec.get("per_token_logps") or [])
                         if x is not None
                     ],
                     latency_s=float(rec.get("latency_s") or 0.0),
diff --git a/src/openenv/core/harness/agents/__init__.py b/src/openenv/core/harness/agents/__init__.py
new file mode 100644
index 000000000..8ef31976b
--- /dev/null
+++ b/src/openenv/core/harness/agents/__init__.py
@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Agent registry and public API for CLI-based agentic harnesses.
+
+The registry maps agent names (``"opencode"``, ``"claude-code"``, etc.) to
+their :class:`CLIAgentSpec` declarations. Each agent module registers itself
+via :func:`register_agent` at import time.
+
+Usage::
+
+    from openenv.core.harness.agents import get_agent_spec, list_agents
+
+    spec = get_agent_spec("opencode")
+    print(list_agents())  # ["opencode"]
+"""
+
+from __future__ import annotations
+
+from .base import (
+    AgentConfig,
+    AgentEvent,
+    AgentTask,
+    ArtifactSpec,
+    CLIAgentSpec,
+    MCPConfigSpec,
+)
+
+# Registry
+
+_REGISTRY: dict[str, CLIAgentSpec] = {}
+
+
+def register_agent(spec: CLIAgentSpec) -> None:
+    """Register a :class:`CLIAgentSpec` under ``spec.name``.
+
+    Raises :class:`ValueError` if the name is already registered with a
+    *different* spec object. Re-registering the same object is a no-op; code
+    that rebuilds a spec (e.g. on module reload) must unregister it first.
+    """
+    existing = _REGISTRY.get(spec.name)
+    if existing is not None and existing is not spec:
+        raise ValueError(
+            f"Agent {spec.name!r} is already registered. "
+            "Use a unique name or call unregister_agent() first."
+        )
+    _REGISTRY[spec.name] = spec
+
+
+def unregister_agent(name: str) -> CLIAgentSpec | None:
+    """Remove a registered agent spec, returning it (or ``None``)."""
+    return _REGISTRY.pop(name, None)
+
+
+def get_agent_spec(name: str) -> CLIAgentSpec:
+    """Look up a registered agent spec by name.
+
+    Raises :class:`KeyError` if not found. On a registry miss the matching
+    built-in module is imported first to trigger its registration (e.g.
+    ``openenv.core.harness.agents.opencode`` for ``"opencode"``).
+    """
+    if name not in _REGISTRY:
+        # Auto-import built-in agent modules to trigger registration.
+        _auto_import(name)
+    try:
+        return _REGISTRY[name]
+    except KeyError:
+        available = ", ".join(sorted(_REGISTRY)) or "(none)"
+        raise KeyError(
+            f"Unknown agent {name!r}. Registered agents: {available}"
+        ) from None
+
+
+def list_agents() -> list[str]:
+    """Return sorted names of all registered agents."""
+    return sorted(_REGISTRY)
+
+
+def _auto_import(name: str) -> None:
+    """Try to import the built-in module for ``name`` to trigger registration."""
+    # Map agent names to module names (handles hyphens).
+    module_name = name.replace("-", "_")
+    try:
+        __import__(f"openenv.core.harness.agents.{module_name}", fromlist=["_"])
+    except ImportError:
+        pass
+
+
+# Convenience re-exports
+
+__all__ = [
+    # Registry
+    "get_agent_spec",
+    "list_agents",
+    "register_agent",
+    "unregister_agent",
+    # Base types
+    "AgentConfig",
+    "AgentEvent",
+    "AgentTask",
+    "ArtifactSpec",
+    "CLIAgentSpec",
+    "MCPConfigSpec",
+]
diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py
new file mode 100644
index 000000000..145d3001e
--- /dev/null
+++ b/src/openenv/core/harness/agents/base.py
@@ -0,0 +1,251 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Agent spec and event protocols for CLI-based agentic harnesses.
+
+Defines the declarative :class:`CLIAgentSpec` data model that captures
+*everything* a CLI harness needs — install commands, file uploads, MCP
+config format, environment variables, artifacts to collect, and three
+small callables (command builder, MCP config builder, event parser).
+
+The :class:`CLIAgentDriver` reads these fields mechanically without knowing
+anything about the specific agent. Adding a new agent is filling in a
+dataclass, not writing driver code.
+
+Pattern borrowed from `verifiers <https://github.com/PrimeIntellect-ai/verifiers>`_
+(Prime Intellect), where OpenCode, MiniSWEAgent, Pi, and RLM all express
+their differences through constructor data passed to ``CLIHarness.__init__()``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Callable, Literal, Protocol
+
+
+# MCP config injection
+
+
+@dataclass(frozen=True)
+class MCPConfigSpec:
+    """How a harness discovers MCP tools.
+
+    ``method`` controls how the driver injects MCP server configuration:
+
+    - ``"config_file"`` — write a JSON file at ``path_template`` (e.g.
+      ``"{workdir}/mcp.json"``).  The template receives ``{workdir}``
+      and ``{home}`` substitutions at runtime.
+    - ``"cli_flags"`` — the driver passes MCP configuration via CLI
+      flags built by :attr:`CLIAgentSpec.build_command`.
+    - ``"settings_file"`` — write into a global settings file (e.g.
+      Gemini's ``~/.gemini/settings.json``).
+    """
+
+    method: Literal["config_file", "cli_flags", "settings_file"]
+    path_template: str | None = None
+
+
+# Artifacts
+
+
+@dataclass(frozen=True)
+class ArtifactSpec:
+    """Declares a file to collect from the sandbox after the agent exits.
+
+    The driver iterates :attr:`CLIAgentSpec.artifacts` and calls
+    ``sandbox.read_text(spec.path)`` for each entry. No per-agent collection
+    methods needed — the spec declares *what* to collect, the driver collects
+    it.
+    """
+
+    path: str
+    format: Literal["text", "json", "jsonl"] = "text"
+    optional: bool = True
+
+
+# Agent events (normalized across harnesses)
+
+
+@dataclass
+class AgentEvent:
+    """Normalized event from any CLI harness's stdout.
+
+    The :attr:`CLIAgentSpec.parse_events` callable converts raw JSONL lines
+    into these events so the driver can log and observe the agent's progress
+    without knowing which agent is running.
+    """
+
+    type: Literal[
+        "assistant",
+        "tool_call",
+        "tool_result",
+        "reasoning",
+        "error",
+        "done",
+    ]
+    data: dict[str, Any] = field(default_factory=dict)
+    raw: str = ""
+
+
+# Task protocol
+
+
+class AgentTask(Protocol):
+    """Minimal interface a task must satisfy for the CLI agent driver."""
+
+    @property
+    def instruction(self) -> str: ...
+
+    @property
+    def setup_shell(self) -> str | None: ...
+
+    @property
+    def upload_files(self) -> dict[str, str]: ...
+
+    @property
+    def metadata(self) -> dict[str, Any]: ...
+
+
+# Agent config protocol
+
+
+class AgentConfig(Protocol):
+    """Minimal interface a config must satisfy for the CLI agent driver.
+
+    This is intentionally thin — concrete configs like :class:`OpenCodeConfig`
+    carry much more, but the generic driver only accesses these.
+    """
+
+    @property
+    def base_url(self) -> str: ...
+
+    @property
+    def api_key(self) -> str: ...
+
+    @property
+    def model(self) -> str: ...
+
+    @property
+    def agent_timeout_s(self) -> float: ...
+
+
+# CLIAgentSpec — the core declarative data model
+
+
+@dataclass
+class CLIAgentSpec:
+    """Declarative specification for a CLI-based agentic harness.
+
+    Following the pattern established by verifiers' ``CLIHarness`` (Prime
+    Intellect), as much per-agent knowledge as possible is expressed as
+    *data* rather than imperative code. The :class:`CLIAgentDriver`
+    iterates these fields mechanically — it never needs to know what
+    ``"pi"`` or ``"claude-code"`` means.
+
+    Three callables cover the remaining agent-specific logic that can't
+    be expressed as pure data:
+
+    - :attr:`build_command` — constructs the shell command line
+    - :attr:`build_mcp_config` — serializes MCP server configuration
+    - :attr:`parse_events` — converts raw stdout lines to :class:`AgentEvent`
+
+    Everything else — file uploads, env vars, install scripts, artifact
+    collection — is pure data.
+    """
+
+    name: str
+    """Unique identifier: ``"opencode"``, ``"claude-code"``, ``"codex"``, etc."""
+
+    install_check_cmd: list[str]
+    """Command to probe whether the agent is already installed.
+
+    Example: ``["claude", "--version"]``
+    """
+
+    base_command: list[str]
+    """Base CLI invocation (before task-specific flags).
+
+    Example: ``["claude", "--print", "--output-format", "stream-json"]``
+    """
+
+    mcp_config: MCPConfigSpec
+    """How MCP tool configuration is injected."""
+
+    supports_logprob_proxy: bool = True
+    """Whether this agent can be routed through the interception proxy."""
+
+    default_timeout_s: float = 600.0
+    """Default per-rollout timeout in seconds."""
+
+    setup: str | list[str] | None = None
+    """Shell command(s) to install the agent CLI inside the sandbox.
+
+    Run once after the sandbox is created, before any files are written.
+    Skipped when ``install_check_cmd`` succeeds (pre-baked template).
+    Can be a single string or a list of strings executed in order.
+    """
+
+    files: dict[str, str | Callable] | None = None
+    """Files to upload into the sandbox before the agent starts.
+
+    Keys are absolute sandbox paths. Values are either literal strings or
+    callables ``(task, config) -> str`` resolved at rollout time.
+    """
+
+    artifacts: dict[str, ArtifactSpec] | None = None
+    """Files to collect from the sandbox after the agent exits.
+
+    The driver iterates this dict and calls ``sandbox.read_text(spec.path)``
+    for each entry.
+    """
+
+    env: dict[str, str] | None = None
+    """Environment variables for the agent process.
+
+    Values can contain ``{model}``, ``{base_url}``, ``{api_key}`` placeholders
+    resolved from the rollout config at runtime.
+    """
+
+    build_command: Callable[..., str] | None = None
+    """``(spec, config, task, mcp_config_path) -> str``
+
+    Build the full shell command line for launching the agent. Returns a
+    string (not a list) because sandbox ``start_bg`` / ``exec`` take shell
+    strings.
+    """
+
+    build_mcp_config: Callable[..., str] | None = None
+    """``(spec, tools, workdir) -> str``
+
+    Serialize MCP server configuration in the format the agent expects.
+    Returns the file content (for ``config_file``/``settings_file`` methods)
+    or empty string (for ``cli_flags``, where the command builder handles it).
+    """
+
+    parse_events: Callable[[str], AgentEvent | None] | None = None
+    """``(line: str) -> AgentEvent | None``
+
+    Parse one line of the agent's stdout into a normalized event.
+    Return ``None`` for lines that are not parseable events.
+    """
+
+    build_env_vars: Callable[..., dict[str, str]] | None = None
+    """``(spec, config) -> dict[str, str]``
+
+    Optional override for env var construction. When provided, this is
+    called *instead of* resolving placeholders in :attr:`env`. Prefer
+    the declarative :attr:`env` dict for new agents.
+    """
+
+
+__all__ = [
+    "AgentConfig",
+    "AgentEvent",
+    "AgentTask",
+    "ArtifactSpec",
+    "CLIAgentSpec",
+    "MCPConfigSpec",
+]
diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
new file mode 100644
index 000000000..8e8179889
--- /dev/null
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -0,0 +1,716 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Shared CLI agent driver, session, and session factory.
+
+The :class:`CLIAgentDriver` factors out the common 70% of CLI harness
+lifecycle — sandbox creation, MCP config injection, interception proxy
+setup, subprocess management, and result collection.
+
+It is **fully generic**: it reads the :class:`CLIAgentSpec`'s declarative
+data fields and executes them mechanically. No per-agent code lives here.
+
+The :class:`CLIAgentSession` implements :class:`ResourceSession` and
+the :class:`CLIAgentSessionFactory` implements :class:`ResourceSessionFactory`,
+so the CLI agent driver integrates seamlessly with the existing harness
+runtime from PR #603.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import shlex
+import time
+from pathlib import Path
+from typing import Any, Callable, Literal
+
+from openenv.core.env_server.mcp_types import Tool
+from openenv.core.harness import (
+    Message,
+    ResourceSession,
+    ResourceSessionFactory,
+    ToolResult,
+    VerifyResult,
+)
+from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
+
+from .base import CLIAgentSpec
+
+
+_log = logging.getLogger(__name__)
+
+# Interception proxy defaults
+_PROXY_PORT = 7000
+_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl"
+_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log"
+
+# Where the proxy source lives on disk. Uploaded into sandboxes that don't
+# already have it baked in.
+_PROXY_SOURCE_PATH = Path(__file__).resolve().parents[1] / "sandbox" / "interception.py"
+
+# Verifier type — same as opencode_env's Verifier alias
+Verifier = Callable[..., VerifyResult]
+
+
+# CLIAgentSession
+
+
+class CLIAgentSession(ResourceSession):
+    """Per-rollout session wrapping one sandbox with one running agent CLI.
+
+    The session is created already-running: :meth:`CLIAgentSessionFactory.create`
+    launches the agent before returning. Typical usage::
+
+        session = factory.create(task)
+        session.wait_for_completion()
+        result = session.verify([])
+        session.close()
+    """
+
+    def __init__(
+        self,
+        *,
+        spec: CLIAgentSpec,
+        sandbox: SandboxHandle,
+        task: Any,
+        config: Any,
+        verifier: Verifier | None = None,
+        base_url_override: str | None = None,
+        proxy_trace_path: str | None = None,
+        proxy_bg_job: BgJob | None = None,
+        agent_bg_job: BgJob | None = None,
+    ) -> None:
+        self.spec = spec
+        self.sandbox = sandbox
+        self.task = task
+        self.config = config
+        self._verifier = verifier
+        self._base_url_override = base_url_override
+        self._proxy_trace_path = proxy_trace_path
+        self._proxy_bg_job = proxy_bg_job
+        self._agent_bg_job = agent_bg_job
+
+    # ResourceSession contract
+
+    def initial_messages(self) -> list[Message]:
+        instruction = (
+            self.task.instruction
+            if hasattr(self.task, "instruction")
+            else str(self.task)
+        )
+        return [{"role": "user", "content": instruction}]
+
+    def list_tools(self) -> list[Tool]:
+        # CLI agents own their own tool loop — none are exposed to the harness.
+        return []
+
+    def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult:
+        return ToolResult(
+            error=(
+                f"{self.spec.name} session does not expose external tool calls; "
+                "the CLI agent owns its own tool loop."
+            )
+        )
+
+    def verify(
+        self,
+        transcript: list[Message],
+        final_state: Any | None = None,
+    ) -> VerifyResult:
+        if self._verifier is None:
+            return VerifyResult(env_reward=None, done=True)
+        return self._verifier(self.sandbox, self.task)
+
+    def close(self) -> None:
+        """Kill the agent and proxy jobs (best effort), then the sandbox."""
+        jobs = (("agent", self._agent_bg_job), ("proxy", self._proxy_bg_job))
+        self._agent_bg_job = None
+        self._proxy_bg_job = None
+        for label, job in jobs:
+            if job is None:
+                continue
+            try:
+                job.kill()
+            except Exception:
+                # Best effort: record the failure, still kill the sandbox below.
+                _log.debug("%s job kill failed", label, exc_info=True)
+        self.sandbox.kill()
+
+    # CLI-agent-specific API
+
+    def wait_for_completion(self, timeout_s: float | None = None) -> int:
+        """Wait for the agent process to exit and return its exit code."""
+        # Precedence: explicit argument, then config.agent_timeout_s, then spec.
+        fallback = getattr(self.config, "agent_timeout_s", self.spec.default_timeout_s)
+        budget = fallback if timeout_s is None else timeout_s
+        if self._agent_bg_job is None:
+            raise RuntimeError("Agent not started.")
+        return self._agent_bg_job.wait(timeout=budget)
+
+    def collect_artifacts(self) -> dict[str, Any]:
+        """Collect all artifacts declared in ``spec.artifacts`` from the sandbox.
+
+        Returns a dict keyed by artifact name. Missing optional artifacts are
+        silently skipped.
+        """
+        result: dict[str, Any] = {}
+        if not self.spec.artifacts:
+            return result
+        for name, artifact_spec in self.spec.artifacts.items():
+            try:
+                content = self.sandbox.read_text(artifact_spec.path)
+                if artifact_spec.format == "json":
+                    result[name] = json.loads(content)
+                elif artifact_spec.format == "jsonl":
+                    result[name] = [
+                        json.loads(line)
+                        for line in content.splitlines()
+                        if line.strip()
+                    ]
+                else:
+                    result[name] = content
+            except Exception:
+                if not artifact_spec.optional:
+                    raise
+                _log.debug(
+                    "Optional artifact %r (%s) not found, skipping",
+                    name,
+                    artifact_spec.path,
+                )
+        return result
+
+    def fetch_proxy_trace(self) -> list[dict[str, Any]]:
+        """Load the proxy's per-turn JSONL trace from the sandbox.
+
+        Only populated in transparent_proxy mode; yields ``[]`` when no trace
+        path was configured (black_box mode) or the file cannot be read.
+        Blank lines are ignored and every other line is decoded as JSON.
+        Records carry ``request``, ``response``, ``completion_tokens``,
+        ``completion_token_ids``, ``per_token_logps``, ``finish_reason``,
+        and ``latency_s``.
+        """
+        trace_path = self._proxy_trace_path
+        if trace_path is None:
+            return []
+        try:
+            raw = self.sandbox.read_text(trace_path)
+        except Exception:
+            return []
+        stripped = (entry.strip() for entry in raw.splitlines())
+        return [json.loads(entry) for entry in stripped if entry]
+
+
+# CLIAgentDriver — shared lifecycle
+
+
+class CLIAgentDriver:
+    """Shared driver for all CLI-based agentic harnesses.
+
+    Implements the common lifecycle:
+
+    1. Create sandbox (via :class:`SandboxBackend`)
+    2. Wait for sandbox ready (``echo ok`` probe)
+    3. Install agent CLI — run ``spec.setup`` commands (skipped if
+       ``spec.install_check_cmd`` succeeds, i.e. pre-baked template)
+    4. Upload ``spec.files`` into the sandbox
+    5. Write MCP config (via ``spec.build_mcp_config``)
+    6. Set environment variables from ``spec.env`` (with placeholder
+       resolution)
+    7. Optionally start interception proxy (transparent_proxy mode)
+    8. Build CLI command (via ``spec.build_command``)
+    9. Launch agent as bg process
+    10. Return a :class:`CLIAgentSession`
+    """
+
+    def __init__(
+        self,
+        spec: CLIAgentSpec,
+        sandbox_backend: SandboxBackend,
+        mode: Literal["black_box", "transparent_proxy"] = "black_box",
+        *,
+        install_timeout_s: int = 240,
+        setup_timeout_s: int = 300,
+        proxy_top_logprobs: int = 5,
+        proxy_max_tokens_cap: int | None = 16384,
+        proxy_disable_thinking: bool = False,
+    ) -> None:
+        if mode not in {"black_box", "transparent_proxy"}:
+            raise ValueError(f"Unknown mode: {mode!r}")
+        self.spec = spec
+        self.sandbox_backend = sandbox_backend
+        self.mode = mode
+        self._install_timeout_s = install_timeout_s
+        self._setup_timeout_s = setup_timeout_s
+        self._proxy_top_logprobs = proxy_top_logprobs
+        self._proxy_max_tokens_cap = proxy_max_tokens_cap
+        self._proxy_disable_thinking = proxy_disable_thinking
+
+    def create_session(
+        self,
+        task: Any,
+        config: Any,
+        *,
+        verifier: Verifier | None = None,
+        seed: int | None = None,
+        episode_id: str | None = None,
+    ) -> CLIAgentSession:
+        """Create a fully bootstrapped session with a running agent.
+
+        Steps: create a sandbox, bootstrap it (install agent, upload files,
+        write MCP config), optionally start the interception proxy, launch
+        the agent subprocess, and return a :class:`CLIAgentSession`.
+
+        If any step after sandbox creation fails, the proxy (if started) and
+        the sandbox are killed before the exception propagates.
+        """
+        timeout_s = getattr(config, "agent_timeout_s", self.spec.default_timeout_s)
+        sandbox_timeout = int(timeout_s) + 300
+
+        _log.info(
+            "%s driver: creating sandbox timeout=%ds mode=%s",
+            self.spec.name,
+            sandbox_timeout,
+            self.mode,
+        )
+        sandbox = self.sandbox_backend.create(
+            timeout_s=sandbox_timeout,
+            metadata={"episode_id": episode_id} if episode_id else None,
+        )
+        sid = getattr(sandbox, "sandbox_id", "?")
+        _log.info("%s driver: sandbox=%s — bootstrapping…", self.spec.name, sid)
+
+        base_url_override: str | None = None
+        proxy_trace_path: str | None = None
+        proxy_bg_job: BgJob | None = None
+        try:
+            self._bootstrap_sandbox(sandbox, task, config)
+
+            if self.mode == "transparent_proxy":
+                base_url = getattr(config, "base_url", "")
+                _log.info(
+                    "%s driver: starting interception proxy on :%d → %s",
+                    self.spec.name,
+                    _PROXY_PORT,
+                    base_url,
+                )
+                proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy(
+                    sandbox,
+                    base_url=base_url,
+                    api_key=getattr(config, "api_key", "intercepted"),
+                    model=getattr(config, "model", ""),
+                )
+                _log.info(
+                    "%s driver: proxy up at %s", self.spec.name, base_url_override
+                )
+
+            agent_bg_job = self._start_agent(
+                sandbox,
+                task,
+                config,
+                base_url_override=base_url_override,
+            )
+        except Exception as exc:
+            # Don't leak the sandbox (or a half-started proxy) on setup failure.
+            _log.error("%s driver: session setup failed: %r", self.spec.name, exc)
+            if proxy_bg_job is not None:
+                try:
+                    proxy_bg_job.kill()
+                except Exception:
+                    _log.debug("proxy kill failed during cleanup", exc_info=True)
+            sandbox.kill()
+            raise
+
+        return CLIAgentSession(
+            spec=self.spec,
+            sandbox=sandbox,
+            task=task,
+            config=config,
+            verifier=verifier,
+            base_url_override=base_url_override,
+            proxy_trace_path=proxy_trace_path,
+            proxy_bg_job=proxy_bg_job,
+            agent_bg_job=agent_bg_job,
+        )
+
+    # Bootstrap stages
+
+    def _bootstrap_sandbox(
+        self,
+        sandbox: SandboxHandle,
+        task: Any,
+        config: Any,
+    ) -> None:
+        """Install agent, upload files, write MCP config."""
+
+        # Stage 1: wait for sandbox readiness
+        self._wait_for_sandbox_ready(sandbox)
+
+        # Stage 2: install agent CLI (skip if pre-baked)
+        if not self._agent_already_installed(sandbox):
+            self._install_agent(sandbox)
+
+        # Stage 3: upload spec.files
+        self._upload_files(sandbox, task, config)
+
+        # Stage 4: write MCP config (if the spec provides a builder)
+        self._write_mcp_config(sandbox, config)
+
+        # Stage 5: run task.setup_shell if present
+        setup_shell = task.setup_shell if hasattr(task, "setup_shell") else None
+        if setup_shell:
+            r = sandbox.exec(setup_shell, timeout=self._setup_timeout_s)
+            if r.exit_code != 0:
+                raise RuntimeError(
+                    f"task.setup_shell failed ({r.exit_code}): {r.stderr}"
+                )
+
+    def _wait_for_sandbox_ready(
+        self,
+        sandbox: SandboxHandle,
+        *,
+        attempts: int = 15,
+        delay_s: float = 1.0,
+    ) -> None:
+        """Probe sandbox until ``echo ok`` succeeds."""
+        last_err = ""
+        for _ in range(attempts):
+            try:
+                r = sandbox.exec("echo ok", timeout=5)
+                if r.exit_code == 0 and "ok" in (r.stdout or ""):
+                    return
+                last_err = (r.stderr or r.stdout or "").strip() or f"exit={r.exit_code}"
+            except Exception as exc:
+                last_err = f"{type(exc).__name__}: {exc}"
+            time.sleep(delay_s)
+        raise RuntimeError(
+            f"sandbox did not become ready within {attempts * delay_s:.0f}s "
+            f"(last error: {last_err})"
+        )
+
+    def _agent_already_installed(self, sandbox: SandboxHandle) -> bool:
+        """Check if the agent CLI is already available in the sandbox."""
+        cmd = " ".join(shlex.quote(c) for c in self.spec.install_check_cmd)
+        try:
+            r = sandbox.exec(cmd, timeout=10)
+            return r.exit_code == 0
+        except Exception:
+            return False
+
+    def _install_agent(self, sandbox: SandboxHandle) -> None:
+        """Run ``spec.setup`` commands to install the agent CLI."""
+        if self.spec.setup is None:
+            raise RuntimeError(
+                f"Agent {self.spec.name!r} is not installed in the sandbox "
+                "and no setup commands are provided in the spec."
+            )
+        commands = (
+            [self.spec.setup] if isinstance(self.spec.setup, str) else self.spec.setup
+        )
+        for cmd in commands:
+            self._exec_with_retry(
+                sandbox,
+                cmd,
+                timeout=self._install_timeout_s,
+                attempts=3,
+                backoff_s=3.0,
+                label=f"{self.spec.name} install",
+            )
+
+    def _upload_files(
+        self,
+        sandbox: SandboxHandle,
+        task: Any,
+        config: Any,
+    ) -> None:
+        """Upload ``spec.files`` and ``task.upload_files`` into the sandbox.
+
+        Callable ``spec.files`` values are called as ``fn(task, config)``;
+        ``None`` content is skipped. Task uploads never depend on the spec.
+        """
+        for path, content_or_fn in (self.spec.files or {}).items():
+            if callable(content_or_fn):
+                content = content_or_fn(task, config)
+            else:
+                content = content_or_fn
+            if content is not None:
+                sandbox.write_text(path, content)
+        upload_files = getattr(task, "upload_files", None) or {}
+        for path, content in upload_files.items():
+            sandbox.write_text(path, content)
+
+    def _write_mcp_config(
+        self,
+        sandbox: SandboxHandle,
+        config: Any,
+    ) -> None:
+        """Write MCP configuration using the spec's builder."""
+        if self.spec.build_mcp_config is None:
+            return
+        if (
+            self.spec.mcp_config.method == "config_file"
+            and self.spec.mcp_config.path_template
+        ):
+            workdir = (
+                config.sandbox_home + "/workdir"
+                if hasattr(config, "sandbox_home")
+                else "/home/user/workdir"
+            )
+            home = (
+                config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
+            )
+            mcp_path = self.spec.mcp_config.path_template.format(
+                workdir=workdir,
+                home=home,
+            )
+            mcp_content = self.spec.build_mcp_config(self.spec, [], workdir)
+            sandbox.write_text(mcp_path, mcp_content)
+
+    # Agent launch
+
+    def _start_agent(
+        self,
+        sandbox: SandboxHandle,
+        task: Any,
+        config: Any,
+        *,
+        base_url_override: str | None = None,
+    ) -> BgJob:
+        """Build CLI command, resolve env vars, and launch as bg process."""
+        # Build command via spec hook
+        if self.spec.build_command is not None:
+            cmd = self.spec.build_command(self.spec, config, task, None)
+        else:
+            cmd = " ".join(shlex.quote(c) for c in self.spec.base_command)
+
+        # Resolve environment variables
+        envs = self._resolve_env_vars(config, base_url_override=base_url_override)
+
+        _log.info("%s driver: launching agent", self.spec.name)
+        return sandbox.start_bg(cmd, envs=envs)
+
+    def _resolve_env_vars(
+        self,
+        config: Any,
+        *,
+        base_url_override: str | None = None,
+    ) -> dict[str, str]:
+        """Build the env var dict for the agent process.
+
+        If ``spec.build_env_vars`` is provided, delegate to it (note that
+        ``base_url_override`` is not passed to that hook).
+
+        Otherwise resolve ``{base_url}``, ``{api_key}`` and ``{model}``
+        placeholders in ``spec.env``. Values that are not valid format
+        templates are passed through unchanged.
+        """
+        if self.spec.build_env_vars is not None:
+            return self.spec.build_env_vars(self.spec, config)
+
+        if not self.spec.env:
+            return {}
+
+        substitutions = {
+            "base_url": base_url_override or getattr(config, "base_url", ""),
+            "api_key": getattr(config, "api_key", "intercepted"),
+            "model": getattr(config, "model", ""),
+        }
+
+        resolved: dict[str, str] = {}
+        for key, value in self.spec.env.items():
+            try:
+                resolved[key] = value.format(**substitutions)
+            except (KeyError, IndexError, ValueError):
+                # Unknown ``{name}``, positional ``{0}``/``{}``, or unbalanced
+                # braces (e.g. shell ``${VAR``): keep the raw value instead of
+                # aborting the launch.
+                resolved[key] = value
+        return resolved
+
+    # Interception proxy
+
+    def _start_proxy(
+        self,
+        sandbox: SandboxHandle,
+        *,
+        base_url: str,
+        api_key: str,
+        model: str,
+    ) -> tuple[BgJob, str, str]:
+        """Install deps, start proxy as bg job, wait for healthz.
+
+        Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``.
+        """
+        proxy_already_present = sandbox.exists("/home/user/proxy/interception.py")
+
+        if not proxy_already_present:
+            self._exec_with_retry(
+                sandbox,
+                "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
+                "'httpx>=0.27' 2>&1 | tail -20",
+                timeout=180,
+                attempts=3,
+                backoff_s=2.0,
+                label="proxy deps install",
+            )
+            sandbox.write_text(
+                "/home/user/proxy/interception.py",
+                _PROXY_SOURCE_PATH.read_text(),
+            )
+            sandbox.write_text("/home/user/proxy/__init__.py", "")
+
+        proxy_args = [
+            "python",
+            "interception.py",
+            "--upstream-url",
+            base_url,
+            "--trace",
+            _PROXY_TRACE_PATH,
+            "--port",
+            str(_PROXY_PORT),
+            "--top-logprobs",
+            str(self._proxy_top_logprobs),
+        ]
+        if self._proxy_max_tokens_cap is not None:
+            proxy_args.extend(["--max-tokens-cap", str(self._proxy_max_tokens_cap)])
+        if self._proxy_disable_thinking:
+            proxy_args.append("--disable-thinking")
+        if model:
+            proxy_args.extend(["--model-override", model])
+
+        quoted = " ".join(shlex.quote(a) for a in proxy_args)
+        proxy_cmd = (
+            f"cd /home/user/proxy && {quoted} > {shlex.quote(_PROXY_LOG_PATH)} 2>&1"
+        )
+        proxy_env = {"OPENCODE_UPSTREAM_API_KEY": api_key}
+        proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env)
+
+        # Wait for proxy healthz
+        attempts = 120
+        interval_s = 0.5
+        for _ in range(attempts):
+            r = sandbox.exec(
+                f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz",
+                timeout=5,
+            )
+            if r.exit_code == 0:
+                break
+            time.sleep(interval_s)
+        else:
+            log_content = ""
+            try:
+                log_content = sandbox.read_text(_PROXY_LOG_PATH)
+            except Exception:
+                pass
+            proxy_job.kill()
+            raise RuntimeError(
+                f"proxy did not start within {attempts * interval_s:.0f}s. "
+                f"log:\n{log_content[-2000:]}"
+            )
+
+        override_url = f"http://127.0.0.1:{_PROXY_PORT}/v1"
+        return proxy_job, override_url, _PROXY_TRACE_PATH
+
+    # Utilities
+
+    def _exec_with_retry(
+        self,
+        sandbox: SandboxHandle,
+        cmd: str,
+        *,
+        timeout: float,
+        attempts: int = 3,
+        backoff_s: float = 3.0,
+        label: str = "cmd",
+    ) -> Any:
+        """Run ``sandbox.exec`` with exponential backoff on transient failure."""
+        last_stdout = ""
+        last_stderr = ""
+        last_exit = 0
+        for i in range(attempts):
+            try:
+                r = sandbox.exec(cmd, timeout=timeout)
+                if r.exit_code == 0:
+                    return r
+                last_stdout = r.stdout or ""
+                last_stderr = r.stderr or ""
+                last_exit = r.exit_code
+                if last_stderr.strip():
+                    break
+            except Exception as exc:
+                last_stderr = f"{type(exc).__name__}: {exc}"
+                last_exit = -1
+            if i + 1 < attempts:
+                time.sleep(backoff_s * (2**i))
+        raise RuntimeError(
+            f"{label} failed after {attempts} attempts "
+            f"(exit={last_exit}, stderr={last_stderr!r}, "
+            f"stdout_tail={last_stdout[-400:]!r})"
+        )
+
+
+# CLIAgentSessionFactory
+
+
+class CLIAgentSessionFactory(ResourceSessionFactory):
+    """Factory that produces :class:`CLIAgentSession` instances for any
+    registered agent.
+
+    Wraps :class:`CLIAgentDriver` to satisfy the
+    :class:`ResourceSessionFactory` contract from PR #603.
+    """
+
+    def __init__(
+        self,
+        *,
+        spec: CLIAgentSpec,
+        config: Any,
+        sandbox_backend: SandboxBackend,
+        mode: Literal["black_box", "transparent_proxy"] = "black_box",
+        verifier: Verifier | None = None,
+        install_timeout_s: int = 240,
+        setup_timeout_s: int = 300,
+        proxy_top_logprobs: int = 5,
+        proxy_max_tokens_cap: int | None = 16384,
+        proxy_disable_thinking: bool = False,
+    ) -> None:
+        self._spec = spec
+        self._config = config
+        self._verifier = verifier
+        self._driver = CLIAgentDriver(
+            spec=spec,
+            sandbox_backend=sandbox_backend,
+            mode=mode,
+            install_timeout_s=install_timeout_s,
+            setup_timeout_s=setup_timeout_s,
+            proxy_top_logprobs=proxy_top_logprobs,
+            proxy_max_tokens_cap=proxy_max_tokens_cap,
+            proxy_disable_thinking=proxy_disable_thinking,
+        )
+
+    def create(
+        self,
+        task: Any,
+        seed: int | None = None,
+        episode_id: str | None = None,
+    ) -> CLIAgentSession:
+        """Create one isolated session for a rollout."""
+        return self._driver.create_session(
+            task=task,
+            config=self._config,
+            verifier=self._verifier,
+            seed=seed,
+            episode_id=episode_id,
+        )
+
+
+__all__ = [
+    "CLIAgentDriver",
+    "CLIAgentSession",
+    "CLIAgentSessionFactory",
+    "Verifier",
+]
diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py
new file mode 100644
index 000000000..b179e9c9f
--- /dev/null
+++ b/src/openenv/core/harness/agents/opencode.py
@@ -0,0 +1,191 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenCode agent adapter.
+
+Expresses the OpenCode harness as a purely declarative :class:`CLIAgentSpec`.
+All builders (command construction, config generation, env var resolution)
+are self-contained with no imports from ``envs/opencode_env/``.
+
+Registered on import::
+
+    import openenv.core.harness.agents.opencode
+    # OPENCODE_SPEC is now in the registry
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from . import register_agent
+from .base import AgentEvent, ArtifactSpec, CLIAgentSpec, MCPConfigSpec
+
+
+# Command / config / env builders
+
+
+def _build_opencode_command(
+    spec: CLIAgentSpec,
+    config: Any,
+    task: Any,
+    mcp_config_path: str | None,
+) -> str:
+    """Build the ``opencode run`` shell command."""
+    home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
+    run_format = config.run_format if hasattr(config, "run_format") else "json"
+    format_flag = "--format json" if run_format == "json" else ""
+    instruction_file = f"{home}/task/instruction.md"
+    log_file = f"{home}/logs/agent/opencode.jsonl"
+    workdir = f"{home}/workdir"
+
+    return (
+        f'export PATH="$HOME/.opencode/bin:$PATH" && '
+        f"cd {workdir} && "
+        f'opencode run {format_flag} "$(cat {instruction_file})" '
+        f"2>&1 | tee {log_file}"
+    ).strip()
+
+
+def _build_opencode_mcp_config(
+    spec: CLIAgentSpec,
+    tools: list[Any],
+    workdir: str,
+) -> str:
+    """Build the ``opencode.json`` content for the MCP config file."""
+    return json.dumps(
+        {
+            "$schema": "https://opencode.ai/config.json",
+            "model": "intercepted/model",
+            "provider": {
+                "intercepted": {
+                    "npm": "@ai-sdk/openai-compatible",
+                    "name": "Intercepted",
+                    "options": {
+                        "baseURL": "http://127.0.0.1:7000/v1",
+                        "apiKey": "intercepted",
+                        "timeout": 600000,
+                    },
+                    "models": {
+                        "model": {"name": "Intercepted Model"},
+                    },
+                }
+            },
+        },
+        indent=2,
+    )
+
+
+def _build_opencode_env_vars(
+    spec: CLIAgentSpec,
+    config: Any,
+) -> dict[str, str]:
+    """Build env vars for the OpenCode process."""
+    home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
+    base_url = config.base_url if hasattr(config, "base_url") else ""
+    api_key = config.api_key if hasattr(config, "api_key") else "intercepted"
+    extra_env = config.extra_env if hasattr(config, "extra_env") else {}
+
+    env = dict(extra_env)
+    env["OPENAI_BASE_URL"] = base_url
+    env["OPENAI_API_KEY"] = api_key
+    env["OPENCODE_CONFIG"] = f"{home}/.config/opencode/opencode.json"
+    return env
+
+
+def _parse_opencode_event(line: str) -> AgentEvent | None:
+    """Parse one OpenCode JSONL stdout line; ``None`` unless a JSON object."""
+    line = line.strip()
+    if not line:
+        return None
+    try:
+        data = json.loads(line)
+    except json.JSONDecodeError:
+        return None
+    if not isinstance(data, dict):  # bare scalar/array: no ``type`` to read
+        return None
+    event_type = data.get("type", "")
+    for kind, names in (
+        ("tool_call", ("tool_call", "tool_use")),
+        ("tool_result", ("tool_result", "tool_response")),
+        ("error", ("error",)),
+        ("done", ("done", "complete", "end")),
+    ):
+        if event_type in names:
+            return AgentEvent(type=kind, data=data, raw=line)
+    # "assistant", "message" and any unrecognized type map to "assistant".
+    return AgentEvent(type="assistant", data=data, raw=line)
+
+
+# File resolvers
+
+
+def _instruction_file_content(task: Any, config: Any) -> str:
+    return task.instruction if hasattr(task, "instruction") else str(task)
+
+
+def _system_prompt_content(task: Any, config: Any) -> str | None:
+    if hasattr(config, "system_prompt") and config.system_prompt:
+        return config.system_prompt
+    return None
+
+
+# Spec definition
+
+
+OPENCODE_SPEC = CLIAgentSpec(
+    name="opencode",
+    install_check_cmd=["/home/user/.opencode/bin/opencode", "--version"],
+    base_command=[
+        "opencode",
+        "run",
+        "--format",
+        "json",
+        "--dangerously-skip-permissions",
+    ],
+    mcp_config=MCPConfigSpec(
+        method="config_file",
+        path_template="{home}/.config/opencode/opencode.json",
+    ),
+    supports_logprob_proxy=True,
+    default_timeout_s=900.0,
+    setup=(
+        "set -e && "
+        "mkdir -p /home/user/.config/opencode /home/user/logs/agent "
+        "/home/user/logs/verifier /home/user/task /home/user/workdir && "
+        "curl -fsSL https://opencode.ai/install | bash && "
+        'export PATH="$HOME/.opencode/bin:$PATH" && '
+        "opencode --version"
+    ),
+    files={
+        "/home/user/task/instruction.md": _instruction_file_content,
+        "/home/user/task/system.md": _system_prompt_content,
+    },
+    artifacts={
+        "agent_log": ArtifactSpec(
+            path="/home/user/logs/agent/opencode.jsonl",
+            format="jsonl",
+        ),
+    },
+    env={
+        "PATH": "/home/user/.opencode/bin:$PATH",
+        "OPENAI_BASE_URL": "{base_url}",
+        "OPENAI_API_KEY": "{api_key}",
+    },
+    build_command=_build_opencode_command,
+    build_mcp_config=_build_opencode_mcp_config,
+    parse_events=_parse_opencode_event,
+    build_env_vars=_build_opencode_env_vars,
+)
+
+
+# Auto-register on import
+register_agent(OPENCODE_SPEC)
+
+
+__all__ = [
+    "OPENCODE_SPEC",
+]

From 455b0e9e46b266655ec05558a321814593ee5cfd Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Tue, 12 May 2026 23:02:10 +0530
Subject: [PATCH 03/35] feat: add tests

---
 tests/core/test_cli_agent_driver.py | 1064 +++++++++++++++++++++++++++
 tests/envs/test_opencode_env.py     |    8 +-
 2 files changed, 1067 insertions(+), 5 deletions(-)
 create mode 100644 tests/core/test_cli_agent_driver.py

diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
new file mode 100644
index 000000000..b26f01d67
--- /dev/null
+++ b/tests/core/test_cli_agent_driver.py
@@ -0,0 +1,1064 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for the CLI agent driver abstraction (Phase 2).
+
+Covers:
+  - Agent spec + event protocols (base.py)
+  - Agent registry (__init__.py)
+  - CLIAgentDriver / CLIAgentSession / CLIAgentSessionFactory (cli_driver.py)
+  - OpenCode adapter spec (opencode.py)
+
+All tests run without external dependencies (no E2B, no LLM, no network).
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+
+# Fake sandbox infrastructure (mirrors test_opencode_env.py pattern)
+
+
+@dataclass
+class FakeExecResult:
+    exit_code: int = 0  # defaults describe a successful command
+    stdout: str = "ok"
+    stderr: str = ""
+
+
+@dataclass
+class FakeBgJob:
+    cmd: str = ""  # command string recorded for the job
+    envs: dict[str, str] | None = None  # env mapping recorded for the job
+    _exit_code: int = 0  # value reported by wait()
+
+    @property
+    def pid(self) -> int:
+        return 12345  # fixed placeholder; no real process exists
+
+    def wait(self, timeout: float | None = None) -> int:
+        return self._exit_code  # returns immediately; timeout is unused
+
+    def kill(self) -> None:
+        pass  # nothing to terminate
+
+
+class FakeSandbox:
+    """In-memory sandbox fake that records commands and file writes for tests."""
+
+    def __init__(
+        self,
+        *,
+        install_check_succeeds: bool = False,
+        healthz_succeeds: bool = True,
+    ) -> None:
+        self.sandbox_id = "fake-sandbox-001"
+        self.written: dict[str, str] = {}  # path -> content
+        self.executed: list[str] = []  # every cmd passed to exec(), in order
+        self.bg_commands: list[tuple[str, dict[str, str] | None]] = []  # (cmd, envs)
+        self._install_check_succeeds = install_check_succeeds
+        self._healthz_succeeds = healthz_succeeds
+        self._killed = False
+
+    def exec(
+        self,
+        cmd: str,
+        *,
+        envs: dict[str, str] | None = None,
+        cwd: str | None = None,
+        timeout: float | None = 60,
+    ) -> FakeExecResult:
+        self.executed.append(cmd)
+        if cmd == "echo ok":
+            return FakeExecResult(exit_code=0, stdout="ok")
+        # Install probe: only a short (< 80 chars) command that contains
+        # --version and no "&&" counts. Chained setup scripts that merely
+        # mention --version fall through to the default success result below.
+        if "--version" in cmd and len(cmd) < 80 and "&&" not in cmd:
+            if self._install_check_succeeds:
+                return FakeExecResult(exit_code=0, stdout="1.0.0")
+            return FakeExecResult(exit_code=127, stderr="not found")
+        # Health probe: any command containing "healthz"
+        if "healthz" in cmd:
+            if self._healthz_succeeds:
+                return FakeExecResult(exit_code=0, stdout='{"status":"ok"}')
+            return FakeExecResult(exit_code=7, stderr="connection refused")
+        # Every other command succeeds with empty stdout
+        return FakeExecResult(exit_code=0, stdout="")
+
+    def start_bg(
+        self,
+        cmd: str,
+        *,
+        envs: dict[str, str] | None = None,
+        cwd: str | None = None,
+    ) -> FakeBgJob:
+        self.bg_commands.append((cmd, envs))  # cwd is not recorded
+        return FakeBgJob(cmd=cmd, envs=envs)
+
+    def write_text(self, path: str, content: str) -> None:
+        self.written[path] = content
+
+    def read_text(self, path: str) -> str:
+        if path not in self.written:
+            raise FileNotFoundError(f"No such file: {path}")
+        return self.written[path]
+
+    def exists(self, path: str) -> bool:
+        return path in self.written  # only paths stored in self.written exist
+
+    def kill(self) -> None:
+        self._killed = True  # flag inspected by tests
+
+
+class FakeSandboxBackend:
+    """Backend fake that hands out FakeSandbox objects and remembers each one."""
+
+    def __init__(
+        self,
+        *,
+        install_check_succeeds: bool = False,
+        healthz_succeeds: bool = True,
+    ) -> None:
+        self._install_check_succeeds = install_check_succeeds
+        self._healthz_succeeds = healthz_succeeds
+        self.created: list[FakeSandbox] = []  # sandboxes in creation order
+
+    def create(
+        self,
+        *,
+        timeout_s: int = 900,
+        envs: dict[str, str] | None = None,
+        metadata: dict[str, str] | None = None,
+    ) -> FakeSandbox:  # timeout_s, envs and metadata are ignored
+        sbx = FakeSandbox(
+            install_check_succeeds=self._install_check_succeeds,
+            healthz_succeeds=self._healthz_succeeds,
+        )
+        self.created.append(sbx)
+        return sbx
+
+
+@dataclass
+class FakeTask:
+    instruction: str = "Write hello.py"  # prompt text for the agent
+    setup_shell: str | None = None  # optional shell command for task setup
+    upload_files: dict[str, str] = field(default_factory=dict)  # path -> content
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class FakeConfig:
+    base_url: str = "https://api.example.com/v1"  # fills "{base_url}"
+    api_key: str = "sk-test-key"  # fills "{api_key}"
+    model: str = "test-model"  # fills "{model}"
+    agent_timeout_s: float = 300.0
+    sandbox_home: str = "/home/user"
+    extra_env: dict[str, str] = field(default_factory=dict)
+
+
+# PR 2.1: Agent Spec and Event Parser Protocols
+
+
+class TestAgentSpecProtocols:
+    """Tests for the data models in openenv.core.harness.agents.base."""
+
+    def test_mcp_config_spec_frozen(self):
+        from openenv.core.harness.agents.base import MCPConfigSpec
+
+        spec = MCPConfigSpec(method="config_file", path_template="{workdir}/mcp.json")
+        assert spec.method == "config_file"
+        assert spec.path_template == "{workdir}/mcp.json"
+        with pytest.raises(AttributeError):  # attribute assignment must fail
+            spec.method = "cli_flags"  # type: ignore[misc]
+
+    def test_artifact_spec_defaults(self):
+        from openenv.core.harness.agents.base import ArtifactSpec
+
+        a = ArtifactSpec(path="/logs/agent/out.log")
+        assert a.format == "text"
+        assert a.optional is True
+
+    def test_artifact_spec_json(self):
+        from openenv.core.harness.agents.base import ArtifactSpec
+
+        a = ArtifactSpec(path="/data/traj.json", format="json", optional=False)
+        assert a.format == "json"
+        assert a.optional is False
+
+    def test_agent_event_creation(self):
+        from openenv.core.harness.agents.base import AgentEvent
+
+        e = AgentEvent(
+            type="tool_call", data={"name": "bash"}, raw='{"type":"tool_call"}'
+        )
+        assert e.type == "tool_call"
+        assert e.data["name"] == "bash"
+
+    def test_cli_agent_spec_minimal(self):
+        from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec
+
+        spec = CLIAgentSpec(
+            name="test-agent",
+            install_check_cmd=["test-agent", "--version"],
+            base_command=["test-agent", "run"],
+            mcp_config=MCPConfigSpec(method="cli_flags"),
+        )
+        assert spec.name == "test-agent"
+        assert spec.supports_logprob_proxy is True  # defaults for omitted fields
+        assert spec.default_timeout_s == 600.0
+        assert spec.setup is None
+        assert spec.files is None
+        assert spec.artifacts is None
+        assert spec.env is None
+        assert spec.build_command is None
+
+    def test_cli_agent_spec_full(self):
+        from openenv.core.harness.agents.base import (
+            ArtifactSpec,
+            CLIAgentSpec,
+            MCPConfigSpec,
+        )
+
+        spec = CLIAgentSpec(
+            name="full-agent",
+            install_check_cmd=["full-agent", "--version"],
+            base_command=["full-agent", "exec"],
+            mcp_config=MCPConfigSpec(
+                method="config_file", path_template="{workdir}/mcp.json"
+            ),
+            supports_logprob_proxy=True,
+            default_timeout_s=900.0,
+            setup="npm install -g full-agent",
+            files={
+                "/task.txt": "hello",
+                "/dynamic.txt": lambda task, config: task.instruction,
+            },
+            artifacts={
+                "log": ArtifactSpec(path="/logs/out.log"),
+                "traj": ArtifactSpec(path="/logs/traj.json", format="json"),
+            },
+            env={"API_KEY": "{api_key}", "MODEL": "{model}"},
+            build_command=lambda spec, config, task, mcp: "full-agent exec",
+            build_mcp_config=lambda spec, tools, workdir: "{}",
+            parse_events=lambda line: None,
+        )
+        assert spec.name == "full-agent"
+        assert len(spec.artifacts) == 2
+        assert callable(spec.files["/dynamic.txt"])
+
+
+# PR 2.2: Agent Registry
+
+
+class TestAgentRegistry:
+    """Tests for register_agent / unregister_agent / get_agent_spec / list_agents."""
+
+    def test_register_and_lookup(self):
+        from openenv.core.harness.agents import (
+            get_agent_spec,
+            list_agents,
+            register_agent,
+            unregister_agent,
+        )
+        from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec
+
+        spec = CLIAgentSpec(
+            name="test-registry-agent",
+            install_check_cmd=["tra", "--version"],
+            base_command=["tra", "run"],
+            mcp_config=MCPConfigSpec(method="cli_flags"),
+        )
+        try:
+            register_agent(spec)
+            assert "test-registry-agent" in list_agents()
+            assert get_agent_spec("test-registry-agent") is spec
+        finally:
+            unregister_agent("test-registry-agent")
+
+    def test_duplicate_registration_same_object_ok(self):
+        from openenv.core.harness.agents import register_agent, unregister_agent
+        from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec
+
+        spec = CLIAgentSpec(
+            name="test-dup-ok",
+            install_check_cmd=["x"],
+            base_command=["x"],
+            mcp_config=MCPConfigSpec(method="cli_flags"),
+        )
+        try:
+            register_agent(spec)
+            register_agent(spec)  # same object again: must not raise
+        finally:
+            unregister_agent("test-dup-ok")
+
+    def test_duplicate_registration_different_object_raises(self):
+        from openenv.core.harness.agents import register_agent, unregister_agent
+        from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec
+
+        spec1 = CLIAgentSpec(
+            name="test-dup-fail",
+            install_check_cmd=["x"],
+            base_command=["x"],
+            mcp_config=MCPConfigSpec(method="cli_flags"),
+        )
+        spec2 = CLIAgentSpec(
+            name="test-dup-fail",
+            install_check_cmd=["y"],
+            base_command=["y"],
+            mcp_config=MCPConfigSpec(method="cli_flags"),
+        )
+        try:
+            register_agent(spec1)
+            with pytest.raises(ValueError, match="already registered"):
+                register_agent(spec2)
+        finally:
+            unregister_agent("test-dup-fail")
+
+    def test_unknown_agent_raises_keyerror(self):
+        from openenv.core.harness.agents import get_agent_spec
+
+        with pytest.raises(KeyError, match="Unknown agent"):
+            get_agent_spec("nonexistent-agent-xyz")
+
+    def test_unregister_returns_spec(self):
+        from openenv.core.harness.agents import register_agent, unregister_agent
+        from openenv.core.harness.agents.base import CLIAgentSpec, MCPConfigSpec
+
+        spec = CLIAgentSpec(
+            name="test-unreg",
+            install_check_cmd=["x"],
+            base_command=["x"],
+            mcp_config=MCPConfigSpec(method="cli_flags"),
+        )
+        register_agent(spec)
+        removed = unregister_agent("test-unreg")
+        assert removed is spec
+        assert unregister_agent("test-unreg") is None  # already removed
+
+    def test_auto_import_opencode(self):
+        """The built-in "opencode" spec is resolvable without explicit registration."""
+        from openenv.core.harness.agents import get_agent_spec
+
+        spec = get_agent_spec("opencode")
+        assert spec.name == "opencode"
+
+
+# PR 2.3: CLIAgentDriver / CLIAgentSession / CLIAgentSessionFactory
+
+
+def _make_test_spec(**overrides: Any) -> Any:  # CLIAgentSpec with test defaults
+    from openenv.core.harness.agents.base import (
+        ArtifactSpec,
+        CLIAgentSpec,
+        MCPConfigSpec,
+    )
+
+    defaults = dict(
+        name="test-agent",
+        install_check_cmd=["test-agent", "--version"],
+        base_command=["test-agent", "run", "--json"],
+        mcp_config=MCPConfigSpec(
+            method="config_file", path_template="{workdir}/mcp.json"
+        ),
+        setup="apt-get install -y test-agent",
+        files={
+            "/home/user/task/instruction.txt": lambda task, config: task.instruction,
+        },
+        artifacts={
+            "agent_log": ArtifactSpec(path="/home/user/logs/agent.log"),
+        },
+        env={
+            "API_KEY": "{api_key}",
+            "BASE_URL": "{base_url}",
+            "MODEL": "{model}",
+        },
+        build_command=lambda spec, config, task, mcp: (
+            f"test-agent run --json '{task.instruction}' 2>&1 | tee /home/user/logs/agent.log"
+        ),
+        build_mcp_config=lambda spec, tools, workdir: json.dumps({"tools": []}),
+        parse_events=lambda line: None,
+    )
+    defaults.update(overrides)  # caller-supplied fields win over the defaults
+    return CLIAgentSpec(**defaults)
+
+
+class TestCLIAgentDriver:
+    """Tests for CLIAgentDriver.create_session and driver construction."""
+
+    def test_create_session_full_lifecycle(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        task = FakeTask(instruction="Write hello.py")
+        config = FakeConfig()
+        session = driver.create_session(task=task, config=config)
+
+        # Exactly one sandbox was created
+        assert len(backend.created) == 1
+        sbx = backend.created[0]
+
+        # Sandbox readiness was probed with "echo ok"
+        assert "echo ok" in sbx.executed
+
+        # Setup ran because the fake reports the agent as not installed
+        assert any("apt-get install" in cmd for cmd in sbx.executed)
+
+        # Spec files were uploaded with resolved content
+        assert "/home/user/task/instruction.txt" in sbx.written
+        assert sbx.written["/home/user/task/instruction.txt"] == "Write hello.py"
+
+        # MCP config was written to the resolved path_template
+        assert "/home/user/workdir/mcp.json" in sbx.written
+
+        # Agent was launched as the only background process
+        assert len(sbx.bg_commands) == 1
+        bg_cmd, bg_envs = sbx.bg_commands[0]
+        assert "test-agent run" in bg_cmd
+
+        # Env placeholders were resolved from the config
+        assert bg_envs["API_KEY"] == "sk-test-key"
+        assert bg_envs["BASE_URL"] == "https://api.example.com/v1"
+        assert bg_envs["MODEL"] == "test-model"
+
+        # Session API
+        assert session.initial_messages() == [
+            {"role": "user", "content": "Write hello.py"}
+        ]
+        assert session.list_tools() == []
+        assert session.call_tool("x", {}).error is not None
+        assert session.wait_for_completion() == 0
+
+        session.close()
+        assert sbx._killed
+
+    def test_create_session_skips_install_when_prebaked(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend(install_check_succeeds=True)
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        session = driver.create_session(
+            task=FakeTask(),
+            config=FakeConfig(),
+        )
+
+        sbx = backend.created[0]
+        # The install probe succeeded, so the setup command must not have run
+        assert not any("apt-get install" in cmd for cmd in sbx.executed)
+        session.close()
+
+    def test_create_session_with_proxy(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(
+            spec=spec,
+            sandbox_backend=backend,
+            mode="transparent_proxy",
+        )
+
+        session = driver.create_session(
+            task=FakeTask(),
+            config=FakeConfig(),
+        )
+
+        sbx = backend.created[0]
+
+        # Proxy source should have been uploaded
+        assert "/home/user/proxy/interception.py" in sbx.written
+        assert "/home/user/proxy/__init__.py" in sbx.written
+
+        # Two background jobs are expected: the proxy is started first,
+        # then the agent.
+        assert len(sbx.bg_commands) == 2
+        proxy_cmd, proxy_envs = sbx.bg_commands[0]
+        assert "interception.py" in proxy_cmd
+        assert proxy_envs == {"OPENCODE_UPSTREAM_API_KEY": "sk-test-key"}
+
+        # Agent env should point at the in-sandbox proxy, not the upstream URL
+        agent_cmd, agent_envs = sbx.bg_commands[1]
+        assert agent_envs["BASE_URL"] == "http://127.0.0.1:7000/v1"
+
+        session.close()
+
+    def test_create_session_uploads_task_files(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        task = FakeTask(
+            instruction="Write code",
+            upload_files={"/extra/data.json": '{"key": "value"}'},
+        )
+        session = driver.create_session(task=task, config=FakeConfig())
+
+        sbx = backend.created[0]
+        assert sbx.written["/extra/data.json"] == '{"key": "value"}'
+        session.close()
+
+    def test_create_session_runs_task_setup_shell(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        task = FakeTask(
+            instruction="Write code",
+            setup_shell="pip install pandas",
+        )
+        session = driver.create_session(task=task, config=FakeConfig())
+
+        sbx = backend.created[0]
+        assert "pip install pandas" in sbx.executed  # run verbatim via exec()
+        session.close()
+
+    def test_create_session_with_verifier(self):
+        from openenv.core.harness import VerifyResult
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        def verifier(sandbox, task):
+            return VerifyResult(env_reward=1.0, done=True, metrics={"correct": True})
+
+        session = driver.create_session(
+            task=FakeTask(),
+            config=FakeConfig(),
+            verifier=verifier,
+        )
+
+        result = session.verify([])
+        assert result.env_reward == 1.0
+        assert result.metrics["correct"] is True
+        session.close()
+
+    def test_session_verify_without_verifier(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        session = driver.create_session(task=FakeTask(), config=FakeConfig())
+
+        result = session.verify([])
+        assert result.env_reward is None  # no verifier -> no reward
+        assert result.done is True
+        session.close()
+
+    def test_invalid_mode_raises(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        with pytest.raises(ValueError, match="Unknown mode"):
+            CLIAgentDriver(
+                spec=spec,
+                sandbox_backend=FakeSandboxBackend(),
+                mode="invalid",  # type: ignore[arg-type]
+            )
+
+
+class TestCLIAgentSession:
+    """Tests for CLIAgentSession artifact collection, proxy trace and close()."""
+
+    def test_collect_artifacts_text(self):
+        from openenv.core.harness.agents.base import ArtifactSpec
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec(
+            artifacts={
+                "log": ArtifactSpec(path="/logs/out.log"),
+            },
+        )
+        sbx = FakeSandbox()
+        sbx.written["/logs/out.log"] = "line1\nline2\n"
+
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+        )
+        arts = session.collect_artifacts()
+        assert arts["log"] == "line1\nline2\n"  # text is returned unmodified
+
+    def test_collect_artifacts_json(self):
+        from openenv.core.harness.agents.base import ArtifactSpec
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec(
+            artifacts={
+                "traj": ArtifactSpec(path="/logs/traj.json", format="json"),
+            },
+        )
+        sbx = FakeSandbox()
+        sbx.written["/logs/traj.json"] = json.dumps({"steps": [1, 2, 3]})
+
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+        )
+        arts = session.collect_artifacts()
+        assert arts["traj"] == {"steps": [1, 2, 3]}  # parsed, not raw text
+
+    def test_collect_artifacts_jsonl(self):
+        from openenv.core.harness.agents.base import ArtifactSpec
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec(
+            artifacts={
+                "events": ArtifactSpec(path="/logs/events.jsonl", format="jsonl"),
+            },
+        )
+        sbx = FakeSandbox()
+        sbx.written["/logs/events.jsonl"] = '{"a":1}\n{"b":2}\n'
+
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+        )
+        arts = session.collect_artifacts()
+        assert arts["events"] == [{"a": 1}, {"b": 2}]  # one object per line
+
+    def test_collect_artifacts_missing_optional(self):
+        from openenv.core.harness.agents.base import ArtifactSpec
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec(
+            artifacts={
+                "log": ArtifactSpec(path="/missing/file.log", optional=True),
+            },
+        )
+        sbx = FakeSandbox()
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+        )
+        arts = session.collect_artifacts()
+        assert "log" not in arts  # missing optional artifact is omitted
+
+    def test_collect_artifacts_missing_required_raises(self):
+        from openenv.core.harness.agents.base import ArtifactSpec
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec(
+            artifacts={
+                "log": ArtifactSpec(path="/missing/file.log", optional=False),
+            },
+        )
+        sbx = FakeSandbox()
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+        )
+        with pytest.raises(FileNotFoundError):
+            session.collect_artifacts()
+
+    def test_fetch_proxy_trace_black_box(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec()
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=FakeSandbox(),
+            task=FakeTask(),
+            config=FakeConfig(),
+            proxy_trace_path=None,
+        )
+        assert session.fetch_proxy_trace() == []  # no trace path -> empty trace
+
+    def test_fetch_proxy_trace_with_data(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec()
+        sbx = FakeSandbox()
+        trace_path = "/logs/proxy_trace.jsonl"
+        sbx.written[trace_path] = (
+            json.dumps({"turn": 1, "latency_s": 0.5})
+            + "\n"
+            + json.dumps({"turn": 2, "latency_s": 0.3})
+            + "\n"
+        )
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+            proxy_trace_path=trace_path,
+        )
+        trace = session.fetch_proxy_trace()
+        assert len(trace) == 2
+        assert trace[0]["turn"] == 1
+
+    def test_close_kills_sandbox_and_jobs(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        spec = _make_test_spec()
+        sbx = FakeSandbox()
+        agent_job = FakeBgJob()
+        proxy_job = FakeBgJob()
+
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+            agent_bg_job=agent_job,
+            proxy_bg_job=proxy_job,
+        )
+        session.close()
+        assert sbx._killed
+        assert session._agent_bg_job is None  # job references are cleared
+        assert session._proxy_bg_job is None
+
+
+class TestCLIAgentSessionFactory:
+    """Tests for CLIAgentSessionFactory, the ResourceSessionFactory implementation."""
+
+    def test_factory_creates_sessions(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+
+        factory = CLIAgentSessionFactory(
+            spec=spec,
+            config=FakeConfig(),
+            sandbox_backend=backend,
+            mode="black_box",
+        )
+
+        session = factory.create(task=FakeTask())
+        assert len(backend.created) == 1  # one sandbox per created session
+        assert session.initial_messages()[0]["content"] == "Write hello.py"
+        session.close()
+
+    def test_factory_with_verifier(self):
+        from openenv.core.harness import VerifyResult
+        from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+
+        def verifier(sandbox, task):
+            return VerifyResult(env_reward=0.5, done=True)
+
+        factory = CLIAgentSessionFactory(
+            spec=spec,
+            config=FakeConfig(),
+            sandbox_backend=backend,
+            mode="black_box",
+            verifier=verifier,
+        )
+
+        session = factory.create(task=FakeTask())
+        result = session.verify([])
+        assert result.env_reward == 0.5  # value produced by the verifier above
+        session.close()
+
+    def test_factory_implements_resource_session_factory(self):
+        from openenv.core.harness import ResourceSessionFactory
+        from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
+
+        assert issubclass(CLIAgentSessionFactory, ResourceSessionFactory)
+
+    def test_session_implements_resource_session(self):
+        from openenv.core.harness import ResourceSession
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+
+        assert issubclass(CLIAgentSession, ResourceSession)
+
+
+# PR 2.4: OpenCode Adapter Spec
+
+
+class TestOpenCodeSpec:
+    """Tests for OPENCODE_SPEC: its fields, callbacks, and driver integration."""
+
+    def test_spec_is_registered(self):
+        from openenv.core.harness.agents import get_agent_spec
+
+        spec = get_agent_spec("opencode")
+        assert spec.name == "opencode"
+
+    def test_spec_fields(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        assert OPENCODE_SPEC.name == "opencode"
+        assert OPENCODE_SPEC.install_check_cmd == [
+            "/home/user/.opencode/bin/opencode",
+            "--version",
+        ]
+        assert OPENCODE_SPEC.supports_logprob_proxy is True
+        assert OPENCODE_SPEC.default_timeout_s == 900.0
+        assert OPENCODE_SPEC.mcp_config.method == "config_file"
+        assert "{home}" in OPENCODE_SPEC.mcp_config.path_template
+        assert OPENCODE_SPEC.artifacts is not None
+        assert "agent_log" in OPENCODE_SPEC.artifacts
+        assert OPENCODE_SPEC.artifacts["agent_log"].format == "jsonl"
+
+    def test_build_command(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        @dataclass
+        class OcConfig:
+            sandbox_home: str = "/home/user"
+            run_format: str = "json"
+
+        cmd = OPENCODE_SPEC.build_command(
+            OPENCODE_SPEC,
+            OcConfig(),
+            FakeTask(instruction="Write hello.py"),
+            None,
+        )
+        assert "opencode run" in cmd
+        assert "--format json" in cmd
+        assert "/home/user/task/instruction.md" in cmd
+
+    def test_build_mcp_config(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        config_str = OPENCODE_SPEC.build_mcp_config(
+            OPENCODE_SPEC,
+            [],
+            "/home/user/workdir",
+        )
+        config = json.loads(config_str)  # output must be valid JSON
+        assert "$schema" in config
+        assert "provider" in config
+
+    def test_parse_events_assistant(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        line = json.dumps({"type": "assistant", "content": "hello"})
+        event = OPENCODE_SPEC.parse_events(line)
+        assert event is not None
+        assert event.type == "assistant"
+
+    def test_parse_events_tool_call(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        line = json.dumps({"type": "tool_call", "name": "bash", "args": {}})
+        event = OPENCODE_SPEC.parse_events(line)
+        assert event is not None
+        assert event.type == "tool_call"
+
+    def test_parse_events_error(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        line = json.dumps({"type": "error", "message": "boom"})
+        event = OPENCODE_SPEC.parse_events(line)
+        assert event is not None
+        assert event.type == "error"
+
+    def test_parse_events_done(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        line = json.dumps({"type": "done"})
+        event = OPENCODE_SPEC.parse_events(line)
+        assert event is not None
+        assert event.type == "done"
+
+    def test_parse_events_invalid_json(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        assert OPENCODE_SPEC.parse_events("not json") is None
+        assert OPENCODE_SPEC.parse_events("") is None  # empty line is skipped too
+
+    def test_build_env_vars(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        config = FakeConfig()
+        config.extra_env = {"EXTRA": "val"}
+        envs = OPENCODE_SPEC.build_env_vars(OPENCODE_SPEC, config)
+        assert envs["OPENAI_BASE_URL"] == "https://api.example.com/v1"
+        assert envs["OPENAI_API_KEY"] == "sk-test-key"
+        assert envs["OPENCODE_CONFIG"] == "/home/user/.config/opencode/opencode.json"
+        assert envs["EXTRA"] == "val"  # extra_env entries are passed through
+
+    def test_files_instruction_resolver(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        task = FakeTask(instruction="Build a REST API")
+        config = FakeConfig()
+        instruction_fn = OPENCODE_SPEC.files["/home/user/task/instruction.md"]
+        assert callable(instruction_fn)
+        assert instruction_fn(task, config) == "Build a REST API"
+
+    def test_files_system_prompt_resolver(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        task = FakeTask()
+        config = FakeConfig()
+        system_fn = OPENCODE_SPEC.files["/home/user/task/system.md"]
+        assert callable(system_fn)
+        # FakeConfig defines no system prompt, so the resolver returns None
+        assert system_fn(task, config) is None
+
+    def test_opencode_driver_integration(self):
+        """End-to-end: create a session using the OpenCode spec via the driver."""
+        from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        backend = FakeSandboxBackend()
+        factory = CLIAgentSessionFactory(
+            spec=OPENCODE_SPEC,
+            config=FakeConfig(),
+            sandbox_backend=backend,
+            mode="black_box",
+        )
+
+        session = factory.create(task=FakeTask(instruction="Hello"))
+        assert session.spec.name == "opencode"
+        assert session.initial_messages()[0]["content"] == "Hello"
+
+        sbx = backend.created[0]
+        # The instruction file must contain the task's instruction text
+        assert sbx.written.get("/home/user/task/instruction.md") == "Hello"
+
+        session.close()
+
+
+# Env var resolution
+
+
+class TestEnvVarResolution:
+    """How the CLI driver turns a spec's ``env`` settings into real variables."""
+
+    def test_resolve_placeholders(self):
+        """Placeholders in ``env`` values are filled from the config."""
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        template = {
+            "KEY": "{api_key}",
+            "URL": "{base_url}",
+            "MDL": "{model}",
+            "STATIC": "fixed_value",
+        }
+        # build_env_vars=None selects placeholder resolution.
+        cli_driver = CLIAgentDriver(
+            spec=_make_test_spec(env=template, build_env_vars=None),
+            sandbox_backend=FakeSandboxBackend(),
+            mode="black_box",
+        )
+
+        resolved = cli_driver._resolve_env_vars(FakeConfig())
+        assert resolved["KEY"] == "sk-test-key"
+        assert resolved["URL"] == "https://api.example.com/v1"
+        assert resolved["MDL"] == "test-model"
+        assert resolved["STATIC"] == "fixed_value"
+
+    def test_resolve_with_proxy_override(self):
+        """``base_url_override`` replaces the config's base URL."""
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        cli_driver = CLIAgentDriver(
+            spec=_make_test_spec(env={"URL": "{base_url}"}, build_env_vars=None),
+            sandbox_backend=FakeSandboxBackend(),
+            mode="black_box",
+        )
+
+        # The override must win over the base URL carried by FakeConfig.
+        proxy_url = "http://127.0.0.1:7000/v1"
+        resolved = cli_driver._resolve_env_vars(
+            FakeConfig(),
+            base_url_override=proxy_url,
+        )
+        assert resolved["URL"] == proxy_url
+
+    def test_build_env_vars_hook_takes_precedence(self):
+        """A ``build_env_vars`` hook fully replaces the static ``env`` mapping."""
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        # The hook ignores the spec and derives everything from the config.
+        def hook(spec, config):
+            return {"CUSTOM": "yes", "MODEL": config.model}
+
+        cli_driver = CLIAgentDriver(
+            spec=_make_test_spec(env={"SHOULD_NOT": "appear"}, build_env_vars=hook),
+            sandbox_backend=FakeSandboxBackend(),
+            mode="black_box",
+        )
+
+        resolved = cli_driver._resolve_env_vars(FakeConfig())
+        # Static env entries must not leak into the hook's result.
+        assert "SHOULD_NOT" not in resolved
+        assert resolved == {"CUSTOM": "yes", "MODEL": "test-model"}
+
+    def test_empty_env_dict(self):
+        """With neither ``env`` nor a hook, resolution yields an empty dict."""
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        cli_driver = CLIAgentDriver(
+            spec=_make_test_spec(env=None, build_env_vars=None),
+            sandbox_backend=FakeSandboxBackend(),
+            mode="black_box",
+        )
+        resolved = cli_driver._resolve_env_vars(FakeConfig())
+        assert resolved == {}
+
+
+# Multiple setup commands
+
+
+class TestMultiStepSetup:
+    """Setup handling: multi-command lists and the not-installed error path."""
+
+    def test_list_of_setup_commands(self):
+        """Every command in a list-valued ``setup`` reaches the sandbox."""
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        steps = [
+            "apt-get update",
+            "apt-get install -y nodejs",
+            "npm install -g test-agent",
+        ]
+        fake_backend = FakeSandboxBackend()
+        cli_driver = CLIAgentDriver(
+            spec=_make_test_spec(setup=steps),
+            sandbox_backend=fake_backend,
+            mode="black_box",
+        )
+        session = cli_driver.create_session(task=FakeTask(), config=FakeConfig())
+        ran = fake_backend.created[0].executed
+        # A distinctive fragment of each step must show up in some command.
+        for fragment in ("apt-get update", "apt-get install", "npm install"):
+            assert any(fragment in cmd for cmd in ran)
+        session.close()
+
+    def test_no_setup_and_not_installed_raises(self):
+        """No setup commands plus a failing install check makes creation raise."""
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        bare = FakeSandboxBackend(install_check_succeeds=False)
+        spec = _make_test_spec(setup=None)
+        cli_driver = CLIAgentDriver(spec=spec, sandbox_backend=bare, mode="black_box")
+        with pytest.raises(RuntimeError, match="not installed"):
+            cli_driver.create_session(task=FakeTask(), config=FakeConfig())
diff --git a/tests/envs/test_opencode_env.py b/tests/envs/test_opencode_env.py
index 6014c9199..5e930b8bc 100644
--- a/tests/envs/test_opencode_env.py
+++ b/tests/envs/test_opencode_env.py
@@ -276,10 +276,6 @@ def exists(self, path: str) -> bool:
         def kill(self) -> None:
             pass
 
-    class NoopInstallFactory(OpenCodeSessionFactory):
-        def _exec_with_retry(self, *args, **kwargs):
-            return FakeExecResult()
-
     secret = "sk-test '$(leak)"
     model = "provider/model'; touch /tmp/pwn #"
     config = OpenCodeConfig(
@@ -288,12 +284,14 @@ def _exec_with_retry(self, *args, **kwargs):
         model=model,
     )
     sandbox = FakeSandbox()
-    factory = NoopInstallFactory(
+    factory = OpenCodeSessionFactory(
         config=config,
         sandbox_backend=object(),  # unused by this protected-method test
         mode="transparent_proxy",
     )
 
+    # _start_proxy delegates to CLIAgentDriver._start_proxy which runs the
+    # proxy inside the sandbox. The driver handles dep install + source upload.
     factory._start_proxy(sandbox)
 
     assert sandbox.started_cmd is not None

From e97fda0e0144cc566b17f262dc5c304749567b11 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Wed, 13 May 2026 11:22:09 +0530
Subject: [PATCH 04/35] feat: impl Docker sandbox backend - related tests

---
 src/openenv/core/harness/sandbox/__init__.py  |  37 +-
 src/openenv/core/harness/sandbox/base.py      |   2 +-
 .../core/harness/sandbox/docker_backend.py    | 328 +++++++++++++++++
 .../core/harness/sandbox/e2b_backend.py       |   7 +-
 .../core/harness/sandbox/interception.py      |  19 +-
 tests/core/test_docker_sandbox_backend.py     | 335 ++++++++++++++++++
 6 files changed, 707 insertions(+), 21 deletions(-)
 create mode 100644 src/openenv/core/harness/sandbox/docker_backend.py
 create mode 100644 tests/core/test_docker_sandbox_backend.py

diff --git a/src/openenv/core/harness/sandbox/__init__.py b/src/openenv/core/harness/sandbox/__init__.py
index d0324a7d7..83d37fb48 100644
--- a/src/openenv/core/harness/sandbox/__init__.py
+++ b/src/openenv/core/harness/sandbox/__init__.py
@@ -7,25 +7,52 @@
 """Sandbox backends for harness-driven rollouts.
 
 Provides the :class:`SandboxBackend` / :class:`SandboxHandle` protocols and
-concrete implementations. Any harness adapter can use any backend — the
+concrete implementations. Any harness adapter can use any backend -- the
 sandbox layer is orthogonal to the agent CLI choice.
 
-The ``e2b`` import is wrapped in ``try/except`` so this package loads cleanly
-in environments where ``e2b`` isn't installed (CI smoke tests, lint runs).
+Optional backend imports are wrapped in ``try/except`` so this package
+loads cleanly when dependencies aren't installed (CI smoke tests, lint).
 """
 
+from typing import Any, Literal
+
 from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
+from .docker_backend import DockerBgJob, DockerSandboxBackend, DockerSandboxHandle
 
 __all__ = [
     "BgJob",
+    "DockerBgJob",
+    "DockerSandboxBackend",
+    "DockerSandboxHandle",
     "ExecResult",
     "SandboxBackend",
     "SandboxHandle",
+    "create_sandbox_backend",
 ]
 
 try:
-    from .e2b_backend import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle
+    from .e2b_backend import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle  # noqa: F401
 
     __all__.extend(["E2BBgJob", "E2BSandboxBackend", "E2BSandboxHandle"])
 except ImportError:
-    pass  # e2b not installed — stubs live in envs/opencode_env/sandbox/__init__.py
+    pass  # e2b not installed
+
+
+def create_sandbox_backend(
+    backend: Literal["e2b", "docker"] = "e2b",
+    **kwargs: Any,
+) -> SandboxBackend:
+    """Instantiate the sandbox backend selected by *backend*.
+
+    ``"e2b"`` covers E2B cloud and CubeSandbox (set ``E2B_API_URL``);
+    ``"docker"`` uses local Docker. *kwargs* go to the backend constructor.
+    """
+    if backend == "docker":
+        return DockerSandboxBackend(**kwargs)
+    if backend != "e2b":
+        raise ValueError(
+            f"Unknown sandbox backend: {backend!r}. Use 'e2b' or 'docker'."
+        )
+    from .e2b_backend import E2BSandboxBackend
+
+    return E2BSandboxBackend(**kwargs)
diff --git a/src/openenv/core/harness/sandbox/base.py b/src/openenv/core/harness/sandbox/base.py
index 4b2620799..d84e267e1 100644
--- a/src/openenv/core/harness/sandbox/base.py
+++ b/src/openenv/core/harness/sandbox/base.py
@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Any, Protocol, runtime_checkable
+from typing import Protocol, runtime_checkable
 
 
 @dataclass
diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py
new file mode 100644
index 000000000..aeaacad7c
--- /dev/null
+++ b/src/openenv/core/harness/sandbox/docker_backend.py
@@ -0,0 +1,328 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Docker implementation of :class:`SandboxBackend`.
+
+Runs each sandbox as a ``docker run -d`` container on the local machine.
+Commands execute via ``docker exec``, files transfer via ``docker exec``
+with stdin piping. Suitable for CI, local dev, and environments without
+KVM or cloud sandbox credentials.
+
+Usage::
+
+    from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+    backend = DockerSandboxBackend(image="ubuntu:22.04")
+    sandbox = backend.create()
+    result = sandbox.exec("echo hello")
+    print(result.stdout)  # "hello"
+    sandbox.kill()
+"""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+import threading
+import time
+import uuid
+from pathlib import PurePosixPath
+
+from openenv.core.harness.sandbox.base import BgJob, ExecResult
+
+_log = logging.getLogger(__name__)
+
+
+class DockerBgJob:
+    """Handle to a background process running inside a Docker container.
+
+    The handle records the PID of the wrapper shell that runs the command.
+    A poller owned by the sandbox handle fills in ``_exit_code`` and sets
+    ``_done`` once the command finishes; :meth:`kill` sets ``_done`` too.
+    """
+
+    def __init__(
+        self, container_id: str, pid: int, poll_thread: threading.Thread
+    ) -> None:
+        self._container_id = container_id
+        self._pid = pid
+        self._exit_code: int | None = None
+        self._error: BaseException | None = None
+        self._done = threading.Event()
+        self._poll_thread = poll_thread
+
+    @property
+    def pid(self) -> int:
+        return self._pid
+
+    def wait(self, timeout: float | None = None) -> int:
+        if not self._done.wait(timeout=timeout):
+            raise TimeoutError(
+                f"Background command (pid={self._pid}) did not exit within {timeout}s"
+            )
+        if self._error is not None:
+            raise self._error
+        return self._exit_code if self._exit_code is not None else 0
+
+    def kill(self) -> None:
+        """SIGKILL the wrapper (best effort) and release any waiter."""
+        cmd = ["docker", "exec", self._container_id, "kill", "-9", str(self._pid)]
+        try:
+            subprocess.run(cmd, capture_output=True, timeout=5)
+        except Exception:
+            pass  # best effort: the kill command itself may fail to run
+        if self._exit_code is None:
+            self._exit_code = 137  # 128 + SIGKILL; a killed job must not read as 0
+        self._done.set()
+
+
+class DockerSandboxHandle:
+    """Wraps a running Docker container to satisfy :class:`SandboxHandle`."""
+
+    def __init__(self, container_id: str, *, user: str | None = None) -> None:
+        self._container_id = container_id
+        self._user = user  # when set, _build_exec_cmd passes it as ``-u``
+        self._bg_jobs: list[DockerBgJob] = []  # jobs from start_bg(); killed in kill()
+
+    @property
+    def sandbox_id(self) -> str:
+        return self._container_id[:12]  # first 12 characters of the container ID
+
+    def exec(
+        self,
+        cmd: str,
+        *,
+        envs: dict[str, str] | None = None,
+        cwd: str | None = None,
+        timeout: float | None = 60,
+    ) -> ExecResult:
+        """Run *cmd* under ``bash -c`` in the container and collect its output.
+
+        A failure of the ``docker exec`` call itself is not raised: a timeout
+        or any other exception is reported as ``exit_code=-1`` with the
+        reason in ``stderr`` and an empty ``stdout``.
+        """
+        argv = [*self._build_exec_cmd(envs=envs, cwd=cwd), "bash", "-c", cmd]
+        try:
+            proc = subprocess.run(
+                argv, capture_output=True, text=True, timeout=timeout
+            )
+        except subprocess.TimeoutExpired:
+            failure = f"Command timed out after {timeout}s"
+        except Exception as exc:
+            failure = str(exc)
+        else:
+            return ExecResult(
+                exit_code=proc.returncode, stdout=proc.stdout, stderr=proc.stderr
+            )
+        return ExecResult(exit_code=-1, stdout="", stderr=failure)
+
+    def start_bg(
+        self,
+        cmd: str,
+        *,
+        envs: dict[str, str] | None = None,
+        cwd: str | None = None,
+    ) -> BgJob:
+        """Launch *cmd* in the background and return a handle that tracks it."""
+        marker = f"/tmp/.bg_{uuid.uuid4().hex[:8]}"
+        # The marker write goes on its own line rather than after ``;`` so a
+        # *cmd* ending in ``&``, a ``#`` comment or a heredoc cannot break it.
+        script = f"{cmd}\necho $? > {marker}"
+        wrapped = f"bash -c {_shell_quote(script)} &\necho $!"
+        docker_cmd = [*self._build_exec_cmd(envs=envs, cwd=cwd), "bash", "-c", wrapped]
+        result = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=10)
+        if result.returncode != 0:
+            raise RuntimeError(f"Failed to start background command: {result.stderr}")
+        pid = int(result.stdout.strip().splitlines()[-1])
+        job = DockerBgJob(self._container_id, pid, poll_thread=None)  # type: ignore[arg-type]
+        watcher = threading.Thread(
+            target=self._poll_bg_job, args=(job, marker), daemon=True
+        )
+        job._poll_thread = watcher
+        self._bg_jobs.append(job)
+        watcher.start()
+        return job
+
+    def write_text(self, path: str, content: str) -> None:
+        """Write *content* to *path* inside the container.
+
+        Parent directories are created first. The content is piped through
+        stdin, so it needs no shell escaping.
+
+        Raises:
+            OSError: If the directory or the file could not be written.
+        """
+        parent = _shell_quote(str(PurePosixPath(path).parent))
+        # One exec for both steps; ``&&`` stops a failed mkdir from being masked.
+        script = f"mkdir -p {parent} && cat > {_shell_quote(path)}"
+        result = subprocess.run(
+            ["docker", "exec", "-i", self._container_id, "bash", "-c", script],
+            input=content.encode(),
+            capture_output=True,
+            timeout=30,
+        )
+        # Surface failures instead of silently dropping the write.
+        if result.returncode != 0:
+            detail = result.stderr.decode(errors="replace").strip()
+            raise OSError(f"Failed to write {path!r} in container: {detail}")
+
+    def read_text(self, path: str) -> str:
+        """Return the text of *path* as printed by ``cat`` in the container.
+
+        Raises ``FileNotFoundError`` whenever ``cat`` exits non-zero.
+        """
+        argv = ["docker", "exec", self._container_id, "cat", path]
+        proc = subprocess.run(argv, capture_output=True, text=True, timeout=30)
+        if proc.returncode == 0:
+            return proc.stdout
+        raise FileNotFoundError(f"No such file in container: {path}")
+
+    def exists(self, path: str) -> bool:
+        """Return whether *path* exists in the container.
+
+        Decided by the exit status of ``test -e``.
+        """
+        probe = ["docker", "exec", self._container_id, "test", "-e", path]
+        return subprocess.run(probe, capture_output=True, timeout=10).returncode == 0
+
+    def kill(self) -> None:
+        """Best-effort teardown: kill tracked jobs, then ``docker rm -f``."""
+        # Jobs are killed one by one so a failing job cannot block the rest.
+        for tracked in self._bg_jobs:
+            try:
+                tracked.kill()
+            except Exception:
+                pass
+        self._bg_jobs.clear()
+        # Removal errors are swallowed too; ordinary failures never propagate.
+        remove = ["docker", "rm", "-f", self._container_id]
+        try:
+            subprocess.run(remove, capture_output=True, timeout=15)
+        except Exception:
+            pass
+
+    def _build_exec_cmd(
+        self,
+        *,
+        envs: dict[str, str] | None = None,
+        cwd: str | None = None,
+    ) -> list[str]:
+        """Return the ``docker exec [-u] [-w] [-e ...] <container>`` argv prefix."""
+        # Options are emitted in a fixed order: user, working directory, env.
+        user_opt = ["-u", self._user] if self._user else []
+        cwd_opt = ["-w", cwd] if cwd else []
+        env_opts: list[str] = []
+        for name, value in (envs or {}).items():
+            env_opts += ["-e", f"{name}={value}"]
+        # Callers append the program and its arguments after the container ID.
+        return ["docker", "exec", *user_opt, *cwd_opt, *env_opts, self._container_id]
+
+    def _poll_bg_job(self, job: DockerBgJob, marker: str) -> None:
+        """Poll the container until *job* finishes and record its exit code.
+
+        Each tick runs a single in-container probe that samples PID liveness
+        *before* reading the marker file. The wrapper writes the marker as
+        its last action, so "PID gone and no marker" can only mean the job
+        died without reporting; checking in the opposite order could
+        misreport a job that finished between the two checks as failed.
+        """
+        # Bash's builtin ``kill`` avoids needing a standalone binary in the image.
+        probe = (
+            f"kill -0 {job._pid} 2>/dev/null; alive=$?; "
+            f"cat {_shell_quote(marker)} 2>/dev/null; exit $alive"
+        )
+        while not job._done.is_set():
+            try:
+                result = subprocess.run(
+                    ["docker", "exec", self._container_id, "bash", "-c", probe],
+                    capture_output=True,
+                    text=True,
+                    timeout=5,
+                )
+                recorded = result.stdout.strip()
+                if recorded or result.returncode != 0:
+                    # No marker + dead PID: crashed or killed before reporting.
+                    job._exit_code = int(recorded) if recorded else 1
+                    job._done.set()
+                    return
+            except Exception:
+                pass  # transient docker/parse failure; retry on the next tick
+            time.sleep(0.5)
+
+
+class DockerSandboxBackend:
+    """Creates Docker container sandboxes.
+
+    Each :meth:`create` call spawns a fresh ``docker run -d --rm`` container
+    that stays alive until :meth:`SandboxHandle.kill` is called or the
+    container's ``timeout_s`` sleep expires, after which ``--rm`` removes it.
+    """
+
+    def __init__(
+        self,
+        *,
+        image: str = "ubuntu:22.04",
+        docker_args: list[str] | None = None,
+        user: str | None = None,
+    ) -> None:
+        """Store the settings and check that the Docker CLI reaches a daemon.
+
+        Args:
+            image: Image each sandbox container is started from.
+            docker_args: Extra ``docker run`` arguments, placed before the image.
+            user: Passed on to every handle this backend creates.
+
+        Raises:
+            RuntimeError: If ``docker version`` fails, times out or cannot run.
+        """
+        self._image = image
+        self._docker_args = docker_args or []
+        self._user = user
+        probe = ["docker", "version"]
+        try:
+            subprocess.run(probe, capture_output=True, check=True, timeout=5)
+        except (subprocess.SubprocessError, OSError) as exc:
+            raise RuntimeError(
+                "DockerSandboxBackend requires a running Docker daemon."
+            ) from exc
+
+    def create(
+        self,
+        *,
+        timeout_s: int = 900,
+        envs: dict[str, str] | None = None,
+        metadata: dict[str, str] | None = None,
+    ) -> DockerSandboxHandle:
+        """Start a ``sleep timeout_s`` container and return a handle to it.
+
+        *metadata* entries become ``openenv.<key>`` labels and *envs* the
+        container environment. Raises ``RuntimeError`` if ``docker run`` fails.
+        """
+        # ``--rm`` keeps expired or unkilled sandboxes from piling up when exited.
+        cmd = ["docker", "run", "-d", "--rm", "--label", "openenv.sandbox=true"]
+        for k, v in (metadata or {}).items():
+            cmd.extend(["--label", f"openenv.{k}={v}"])
+        for k, v in (envs or {}).items():
+            cmd.extend(["-e", f"{k}={v}"])
+        cmd.extend(self._docker_args)
+        cmd.extend([self._image, "sleep", str(timeout_s)])
+
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"Failed to create Docker sandbox: {result.stderr.strip()}"
+            )
+        container_id = result.stdout.strip()
+        _log.info(
+            "Docker sandbox created: %s (image=%s)", container_id[:12], self._image
+        )
+        return DockerSandboxHandle(container_id, user=self._user)
+
+
+def _shell_quote(s: str) -> str:
+    """Return *s* single-quoted for the shell, with embedded quotes escaped."""
+    return "'" + "'\\''".join(s.split("'")) + "'"
diff --git a/src/openenv/core/harness/sandbox/e2b_backend.py b/src/openenv/core/harness/sandbox/e2b_backend.py
index f344346ba..29c9d952d 100644
--- a/src/openenv/core/harness/sandbox/e2b_backend.py
+++ b/src/openenv/core/harness/sandbox/e2b_backend.py
@@ -21,8 +21,7 @@
 
 from e2b import Sandbox
 from e2b.sandbox_sync.commands.command_handle import CommandHandle
-
-from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxBackend, SandboxHandle
+from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxHandle
 
 
 class E2BBgJob:
@@ -53,9 +52,7 @@ def pid(self) -> int:
     def wait(self, timeout: float | None = None) -> int:
         self._thread.join(timeout)
         if self._thread.is_alive():
-            raise TimeoutError(
-                f"Background command did not exit within {timeout}s"
-            )
+            raise TimeoutError(f"Background command did not exit within {timeout}s")
         if self._error is not None:
             # E2B raises CommandExitException on non-zero; treat as exit code.
             code = getattr(self._error, "exit_code", None)
diff --git a/src/openenv/core/harness/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py
index dc3dbe5be..d943fd755 100644
--- a/src/openenv/core/harness/sandbox/interception.py
+++ b/src/openenv/core/harness/sandbox/interception.py
@@ -130,9 +130,7 @@ async def chat_completions(request: Request) -> Response:
         try:
             body = json.loads(raw_body)
         except json.JSONDecodeError:
-            return JSONResponse(
-                status_code=400, content={"error": "invalid json body"}
-            )
+            return JSONResponse(status_code=400, content={"error": "invalid json body"})
 
         forwarded_body = _prepare_forwarded_body(body, cfg)
         headers = {
@@ -338,7 +336,7 @@ async def _stream() -> Any:
                 yield line + "\n"
                 if not line.startswith("data:"):
                     continue
-                data = line[len("data:"):].strip()
+                data = line[len("data:") :].strip()
                 if data == "[DONE]":
                     continue
                 try:
@@ -381,7 +379,11 @@ def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None
             tc_idx = tc.get("index", 0)
             bucket = acc["tool_calls_by_idx"].setdefault(
                 (idx, tc_idx),
-                {"id": None, "type": "function", "function": {"name": "", "arguments": ""}},
+                {
+                    "id": None,
+                    "type": "function",
+                    "function": {"name": "", "arguments": ""},
+                },
             )
             if tc.get("id"):
                 bucket["id"] = tc["id"]
@@ -487,8 +489,7 @@ def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]:
     choices = out.get("choices")
     if isinstance(choices, list):
         out["choices"] = [
-            {k: v for k, v in (ch or {}).items() if k != "logprobs"}
-            for ch in choices
+            {k: v for k, v in (ch or {}).items() if k != "logprobs"} for ch in choices
         ]
     return out
 
@@ -537,9 +538,7 @@ def start(self) -> None:
             lifespan="on",
         )
         self._server = uvicorn.Server(config)
-        self._thread = threading.Thread(
-            target=self._run_server, daemon=True
-        )
+        self._thread = threading.Thread(target=self._run_server, daemon=True)
         self._thread.start()
         # Wait for the server to accept connections.
         deadline = time.time() + 10
diff --git a/tests/core/test_docker_sandbox_backend.py b/tests/core/test_docker_sandbox_backend.py
new file mode 100644
index 000000000..b47f6bd4e
--- /dev/null
+++ b/tests/core/test_docker_sandbox_backend.py
@@ -0,0 +1,335 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for the Docker sandbox backend.
+
+Tests decorated with ``@docker`` (a ``skipif`` marker defined below) require
+a running Docker daemon and are skipped when Docker is unavailable. They
+exercise the real ``docker run`` / ``docker exec`` / ``docker rm`` lifecycle.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import time
+
+import pytest
+
+# Probe the Docker CLI once at import time. Any failure (missing binary,
+# non-zero exit, timeout) leaves the integration tests skipped.
+try:
+    subprocess.run(
+        ["docker", "version"], capture_output=True, check=True, timeout=5
+    )
+except Exception:
+    _DOCKER_AVAILABLE = False
+else:
+    _DOCKER_AVAILABLE = True
+
+# Decorator that skips a test (or test class) when the probe above failed.
+docker = pytest.mark.skipif(not _DOCKER_AVAILABLE, reason="Docker not available")
+
+
+class TestDockerSandboxBackendUnit:
+    """Checks that need no Docker daemon."""
+
+    def test_import(self):
+        """The backend module defines its three public classes."""
+        import openenv.core.harness.sandbox.docker_backend as docker_backend
+        expected = (
+            "DockerBgJob",
+            "DockerSandboxBackend",
+            "DockerSandboxHandle",
+        )
+        for name in expected:
+            assert getattr(docker_backend, name) is not None
+
+    def test_exported_from_package(self):
+        """The same three classes are re-exported by the sandbox package."""
+        import openenv.core.harness.sandbox as sandbox_pkg
+        expected = (
+            "DockerBgJob",
+            "DockerSandboxBackend",
+            "DockerSandboxHandle",
+        )
+        for name in expected:
+            assert getattr(sandbox_pkg, name) is not None
+
+    def test_create_sandbox_backend_factory(self):
+        """The factory function is importable and callable."""
+        from openenv.core.harness.sandbox import create_sandbox_backend
+        assert callable(create_sandbox_backend)
+
+    def test_create_sandbox_backend_unknown_raises(self):
+        """An unrecognised backend name is rejected with ``ValueError``."""
+        from openenv.core.harness.sandbox import create_sandbox_backend
+        with pytest.raises(ValueError, match="Unknown sandbox backend"):
+            create_sandbox_backend("bogus")  # type: ignore[arg-type]
+
+    @pytest.mark.skipif(_DOCKER_AVAILABLE, reason="Only test error when Docker missing")
+    def test_backend_raises_without_docker(self):
+        """Constructing the backend without Docker raises ``RuntimeError``."""
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+        with pytest.raises(RuntimeError, match="Docker daemon"):
+            DockerSandboxBackend()
+
+
+@docker
+class TestDockerSandboxBackendIntegration:
+    """Integration tests against a real Docker daemon."""
+
+    def test_create_and_kill(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            assert sandbox.sandbox_id
+            assert len(sandbox.sandbox_id) == 12
+        finally:
+            sandbox.kill()
+
+    def test_exec_echo(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            result = sandbox.exec("echo hello world")
+            assert result.exit_code == 0
+            assert "hello world" in result.stdout
+        finally:
+            sandbox.kill()
+
+    def test_exec_nonzero_exit(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            result = sandbox.exec("exit 42")
+            assert result.exit_code == 42
+        finally:
+            sandbox.kill()
+
+    def test_exec_with_env(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            result = sandbox.exec("echo $MY_VAR", envs={"MY_VAR": "test123"})
+            assert result.exit_code == 0
+            assert "test123" in result.stdout
+        finally:
+            sandbox.kill()
+
+    def test_exec_with_cwd(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            result = sandbox.exec("pwd", cwd="/tmp")
+            assert result.exit_code == 0
+            assert "/tmp" in result.stdout
+        finally:
+            sandbox.kill()
+
+    def test_write_and_read_text(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            sandbox.write_text("/tmp/test.txt", "hello from test")
+            content = sandbox.read_text("/tmp/test.txt")
+            assert content == "hello from test"
+        finally:
+            sandbox.kill()
+
+    def test_write_creates_parent_dirs(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            sandbox.write_text("/home/user/deep/nested/file.txt", "nested content")
+            content = sandbox.read_text("/home/user/deep/nested/file.txt")
+            assert content == "nested content"
+        finally:
+            sandbox.kill()
+
+    def test_write_special_chars(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            text = "line1\nline2\n'quotes' and \"doubles\" and $vars"
+            sandbox.write_text("/tmp/special.txt", text)
+            content = sandbox.read_text("/tmp/special.txt")
+            assert content == text
+        finally:
+            sandbox.kill()
+
+    def test_read_missing_file_raises(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            with pytest.raises(FileNotFoundError):
+                sandbox.read_text("/nonexistent/path.txt")
+        finally:
+            sandbox.kill()
+
+    def test_exists(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            assert not sandbox.exists("/tmp/check_me.txt")
+            sandbox.write_text("/tmp/check_me.txt", "exists")
+            assert sandbox.exists("/tmp/check_me.txt")
+        finally:
+            sandbox.kill()
+
+    def test_start_bg_and_wait(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            job = sandbox.start_bg("sleep 1 && echo done > /tmp/bg_out.txt")
+            exit_code = job.wait(timeout=10)
+            assert exit_code == 0
+            content = sandbox.read_text("/tmp/bg_out.txt")
+            assert "done" in content
+        finally:
+            sandbox.kill()
+
+    def test_start_bg_kill(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            job = sandbox.start_bg("sleep 300")
+            time.sleep(0.5)
+            job.kill()
+            # Should be able to wait without hanging
+            exit_code = job.wait(timeout=5)
+            # Exit code after kill is implementation-defined
+            assert isinstance(exit_code, int)
+        finally:
+            sandbox.kill()
+
+    def test_start_bg_timeout(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            job = sandbox.start_bg("sleep 300")
+            with pytest.raises(TimeoutError):
+                job.wait(timeout=1)
+            job.kill()
+        finally:
+            sandbox.kill()
+
+    def test_create_with_envs(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60, envs={"INIT_VAR": "from_create"})
+        try:
+            result = sandbox.exec("echo $INIT_VAR")
+            assert "from_create" in result.stdout
+        finally:
+            sandbox.kill()
+
+    def test_create_with_metadata(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(
+            timeout_s=60,
+            metadata={"episode_id": "ep-123"},
+        )
+        try:
+            result = subprocess.run(
+                [
+                    "docker",
+                    "inspect",
+                    "--format",
+                    '{{index .Config.Labels "openenv.episode_id"}}',
+                    sandbox._container_id,
+                ],
+                capture_output=True,
+                text=True,
+            )
+            assert "ep-123" in result.stdout
+        finally:
+            sandbox.kill()
+
+    def test_factory_creates_docker_backend(self):
+        from openenv.core.harness.sandbox import create_sandbox_backend
+
+        backend = create_sandbox_backend("docker", image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            result = sandbox.exec("echo ok")
+            assert result.exit_code == 0
+        finally:
+            sandbox.kill()
+
+    def test_satisfies_sandbox_handle_protocol(self):
+        from openenv.core.harness.sandbox import SandboxBackend
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            assert isinstance(sandbox, SandboxBackend) or hasattr(sandbox, "exec")
+            assert hasattr(sandbox, "sandbox_id")
+            assert hasattr(sandbox, "exec")
+            assert hasattr(sandbox, "start_bg")
+            assert hasattr(sandbox, "write_text")
+            assert hasattr(sandbox, "read_text")
+            assert hasattr(sandbox, "exists")
+            assert hasattr(sandbox, "kill")
+        finally:
+            sandbox.kill()
+
+    def test_satisfies_sandbox_backend_protocol(self):
+        from openenv.core.harness.sandbox import SandboxBackend
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        assert issubclass(DockerSandboxBackend, SandboxBackend)
+
+    def test_satisfies_bg_job_protocol(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        try:
+            job = sandbox.start_bg("sleep 1")
+            assert hasattr(job, "pid")
+            assert hasattr(job, "wait")
+            assert hasattr(job, "kill")
+            job.kill()
+        finally:
+            sandbox.kill()
+
+    def test_kill_is_idempotent(self):
+        from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
+
+        backend = DockerSandboxBackend(image="ubuntu:22.04")
+        sandbox = backend.create(timeout_s=60)
+        sandbox.kill()
+        sandbox.kill()  # should not raise

From 9a350062eabbcf903e1a938939ee9d1cefb79cc8 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Wed, 13 May 2026 23:38:58 +0530
Subject: [PATCH 05/35] feat: pi agent adapter

- fix agent handling
---
 src/openenv/core/harness/agents/base.py       |   2 +-
 src/openenv/core/harness/agents/cli_driver.py |  25 ++-
 src/openenv/core/harness/agents/opencode.py   |  71 ++++++---
 src/openenv/core/harness/agents/pi.py         | 145 ++++++++++++++++++
 .../core/harness/sandbox/docker_backend.py    |  13 +-
 5 files changed, 225 insertions(+), 31 deletions(-)
 create mode 100644 src/openenv/core/harness/agents/pi.py

diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py
index 145d3001e..72cc9a6cf 100644
--- a/src/openenv/core/harness/agents/base.py
+++ b/src/openenv/core/harness/agents/base.py
@@ -41,7 +41,7 @@ class MCPConfigSpec:
     - ``"cli_flags"`` — the driver passes MCP configuration via CLI
       flags built by :attr:`CLIAgentSpec.build_command`.
     - ``"settings_file"`` — write into a global settings file (e.g.
-      Gemini's ``~/.gemini/settings.json``).
+      ``~/.config/agent/settings.json``).
     """
 
     method: Literal["config_file", "cli_flags", "settings_file"]
diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 8e8179889..760218687 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -166,11 +166,23 @@ def collect_artifacts(self) -> dict[str, Any]:
                 if artifact_spec.format == "json":
                     result[name] = json.loads(content)
                 elif artifact_spec.format == "jsonl":
-                    result[name] = [
-                        json.loads(line)
-                        for line in content.splitlines()
-                        if line.strip()
-                    ]
+                    # Parse valid JSON lines, skip non-JSON preamble
+                    # (e.g. opencode emits database migration messages
+                    # before the first JSON event).
+                    records = []
+                    for line in content.splitlines():
+                        line = line.strip()
+                        if not line:
+                            continue
+                        try:
+                            records.append(json.loads(line))
+                        except json.JSONDecodeError:
+                            _log.debug(
+                                "Skipping non-JSON line in %s: %s",
+                                artifact_spec.path,
+                                line[:120],
+                            )
+                    result[name] = records
                 else:
                     result[name] = content
             except Exception:
@@ -468,7 +480,8 @@ def _write_mcp_config(
                 home=home,
             )
             mcp_content = self.spec.build_mcp_config(self.spec, [], workdir)
-            sandbox.write_text(mcp_path, mcp_content)
+            if mcp_content:
+                sandbox.write_text(mcp_path, mcp_content)
 
     # Agent launch
 
diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py
index b179e9c9f..875882f27 100644
--- a/src/openenv/core/harness/agents/opencode.py
+++ b/src/openenv/core/harness/agents/opencode.py
@@ -25,9 +25,6 @@
 from .base import AgentEvent, ArtifactSpec, CLIAgentSpec, MCPConfigSpec
 
 
-# Command / config / env builders
-
-
 def _build_opencode_command(
     spec: CLIAgentSpec,
     config: Any,
@@ -44,7 +41,7 @@ def _build_opencode_command(
 
     return (
         f'export PATH="$HOME/.opencode/bin:$PATH" && '
-        f"cd {workdir} && "
+        f"cd {workdir} && git init -q 2>/dev/null; "
         f'opencode run {format_flag} "$(cat {instruction_file})" '
         f"2>&1 | tee {log_file}"
     ).strip()
@@ -55,22 +52,54 @@ def _build_opencode_mcp_config(
     tools: list[Any],
     workdir: str,
 ) -> str:
-    """Build the ``opencode.json`` content for the MCP config file."""
+    """Build ``opencode.json`` content.
+
+    Returns an empty string so the driver skips writing this file.
+    The actual config is written via ``spec.files`` using
+    ``_build_opencode_config_file`` which has access to the rollout
+    config (base_url, api_key, model).
+    """
+    return ""
+
+
+def _build_opencode_config_file(task: Any, config: Any) -> str:
+    """Build the full ``opencode.json`` dynamically from config fields."""
+    base_url = (
+        config.base_url if hasattr(config, "base_url") else "http://127.0.0.1:7000/v1"
+    )
+    api_key = config.api_key if hasattr(config, "api_key") else "intercepted"
+    model = config.model if hasattr(config, "model") else "model"
+    timeout = (
+        int(config.agent_timeout_s * 1000)
+        if hasattr(config, "agent_timeout_s")
+        else 600000
+    )
+
+    # opencode addresses models as "<provider_name>/<model_id>". The model string
+    # is used unchanged as model_id; provider_name is taken from config or "default".
+    provider_name = "default"
+    model_id = model
+    if hasattr(config, "provider_name") and config.provider_name:
+        provider_name = config.provider_name
+
     return json.dumps(
         {
             "$schema": "https://opencode.ai/config.json",
-            "model": "intercepted/model",
+            "model": f"{provider_name}/{model_id}",
             "provider": {
-                "intercepted": {
+                provider_name: {
                     "npm": "@ai-sdk/openai-compatible",
-                    "name": "Intercepted",
+                    "name": provider_name.title(),
                     "options": {
-                        "baseURL": "http://127.0.0.1:7000/v1",
-                        "apiKey": "intercepted",
-                        "timeout": 600000,
+                        "baseURL": base_url,
+                        "apiKey": api_key,
+                        "timeout": timeout,
                     },
                     "models": {
-                        "model": {"name": "Intercepted Model"},
+                        model_id: {
+                            "name": model_id,
+                            "id": model_id,
+                        },
                     },
                 }
             },
@@ -107,12 +136,16 @@ def _parse_opencode_event(line: str) -> AgentEvent | None:
         return None
 
     event_type = data.get("type", "")
-    if event_type in ("assistant", "message"):
+    if event_type in ("assistant", "message", "text"):
         return AgentEvent(type="assistant", data=data, raw=line)
     elif event_type in ("tool_call", "tool_use"):
         return AgentEvent(type="tool_call", data=data, raw=line)
     elif event_type in ("tool_result", "tool_response"):
         return AgentEvent(type="tool_result", data=data, raw=line)
+    elif event_type in ("step_start",):
+        return AgentEvent(type="assistant", data=data, raw=line)
+    elif event_type in ("step_finish",):
+        return AgentEvent(type="done", data=data, raw=line)
     elif event_type == "error":
         return AgentEvent(type="error", data=data, raw=line)
     elif event_type in ("done", "complete", "end"):
@@ -120,9 +153,6 @@ def _parse_opencode_event(line: str) -> AgentEvent | None:
     return AgentEvent(type="assistant", data=data, raw=line)
 
 
-# File resolvers
-
-
 def _instruction_file_content(task: Any, config: Any) -> str:
     return task.instruction if hasattr(task, "instruction") else str(task)
 
@@ -133,9 +163,6 @@ def _system_prompt_content(task: Any, config: Any) -> str | None:
     return None
 
 
-# Spec definition
-
-
 OPENCODE_SPEC = CLIAgentSpec(
     name="opencode",
     install_check_cmd=["/home/user/.opencode/bin/opencode", "--version"],
@@ -154,15 +181,16 @@ def _system_prompt_content(task: Any, config: Any) -> str | None:
     default_timeout_s=900.0,
     setup=(
         "set -e && "
+        "curl -fsSL https://opencode.ai/install | bash && "
         "mkdir -p /home/user/.config/opencode /home/user/logs/agent "
         "/home/user/logs/verifier /home/user/task /home/user/workdir && "
-        "curl -fsSL https://opencode.ai/install | bash && "
         'export PATH="$HOME/.opencode/bin:$PATH" && '
         "opencode --version"
     ),
     files={
         "/home/user/task/instruction.md": _instruction_file_content,
         "/home/user/task/system.md": _system_prompt_content,
+        "/home/user/.config/opencode/opencode.json": _build_opencode_config_file,
     },
     artifacts={
         "agent_log": ArtifactSpec(
@@ -181,11 +209,8 @@ def _system_prompt_content(task: Any, config: Any) -> str | None:
     build_env_vars=_build_opencode_env_vars,
 )
 
-
-# Auto-register on import
 register_agent(OPENCODE_SPEC)
 
-
 __all__ = [
     "OPENCODE_SPEC",
 ]
diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
new file mode 100644
index 000000000..7e0fa29c1
--- /dev/null
+++ b/src/openenv/core/harness/agents/pi.py
@@ -0,0 +1,145 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Pi coding agent adapter.
+
+Pi runs in print mode for non-interactive harness usage::
+
+    pi --no-session --no-context-files --provider <p> --model <m> --thinking off \\
+       -p @/home/user/task/instruction.txt 2>&1 | tee /home/user/logs/agent/pi.txt
+
+The provider and model are passed as CLI flags so the spec's ``env`` dict
+only needs auth credentials (``HF_TOKEN``, ``OPENAI_API_KEY``, etc.).
+
+Registered on import::
+
+    import openenv.core.harness.agents.pi
+    # PI_SPEC is now in the registry
+"""
+
+from __future__ import annotations
+
+import json
+import shlex
+from typing import Any
+
+from . import register_agent
+from .base import AgentEvent, ArtifactSpec, CLIAgentSpec, MCPConfigSpec
+
+
+def _instruction(task: Any, config: Any) -> str:
+    return task.instruction if hasattr(task, "instruction") else str(task)
+
+
+def _system_prompt(task: Any, config: Any) -> str | None:
+    if hasattr(config, "system_prompt") and config.system_prompt:
+        return config.system_prompt
+    return None
+
+
+def _build_command(
+    spec: CLIAgentSpec,
+    config: Any,
+    task: Any,
+    mcp_config_path: str | None,
+) -> str:
+    home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
+    instruction_file = f"{home}/task/instruction.txt"
+    log_file = f"{home}/logs/agent/pi.txt"
+    workdir = f"{home}/workdir"
+
+    provider = ""
+    if hasattr(config, "provider") and config.provider:
+        provider = f" --provider {shlex.quote(config.provider)}"
+    model = ""
+    if hasattr(config, "model") and config.model:
+        model = f" --model {shlex.quote(config.model)}"
+    thinking = " --thinking off"
+    if hasattr(config, "thinking") and config.thinking:
+        thinking = f" --thinking {shlex.quote(config.thinking)}"
+
+    return (
+        f"cd {workdir} && git init -q 2>/dev/null; "
+        f"pi --no-session --no-context-files"
+        f"{provider}{model}{thinking}"
+        f" -p @{instruction_file}"
+        f" 2>&1 | tee {log_file}"
+    )
+
+
+def _build_mcp_config(
+    spec: CLIAgentSpec,
+    tools: list[Any],
+    workdir: str,
+) -> str:
+    return json.dumps({"mcpServers": {}}, indent=2)
+
+
+def _parse_events(line: str) -> AgentEvent | None:
+    line = line.strip()
+    if not line:
+        return None
+    try:
+        data = json.loads(line)
+    except json.JSONDecodeError:
+        return AgentEvent(type="assistant", data={"text": line}, raw=line)
+
+    event_type = data.get("type", "")
+    if event_type in ("assistant", "message", "response"):
+        return AgentEvent(type="assistant", data=data, raw=line)
+    if event_type in ("tool_call", "tool_use", "function_call"):
+        return AgentEvent(type="tool_call", data=data, raw=line)
+    if event_type in ("tool_result", "tool_response"):
+        return AgentEvent(type="tool_result", data=data, raw=line)
+    if event_type in ("thinking", "reasoning"):
+        return AgentEvent(type="reasoning", data=data, raw=line)
+    if event_type == "error":
+        return AgentEvent(type="error", data=data, raw=line)
+    if event_type in ("done", "complete", "end"):
+        return AgentEvent(type="done", data=data, raw=line)
+    return AgentEvent(type="assistant", data=data, raw=line)
+
+
+PI_SPEC = CLIAgentSpec(
+    name="pi",
+    install_check_cmd=["pi", "--version"],
+    base_command=["pi", "--no-session", "--no-context-files"],
+    mcp_config=MCPConfigSpec(
+        method="config_file",
+        path_template="{workdir}/.mcp.json",
+    ),
+    supports_logprob_proxy=True,
+    default_timeout_s=600.0,
+    setup=(
+        "set -e && "
+        "apt-get update -qq && apt-get install -y -qq curl ca-certificates gnupg && "
+        "curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && "
+        "apt-get install -y -qq nodejs && "
+        "curl -fsSL https://pi.dev/install.sh | sh && "
+        "mkdir -p /home/user/logs/agent /home/user/task /home/user/workdir && "
+        'export PATH="$HOME/.local/bin:$HOME/.pi/bin:$PATH" && '
+        "pi --version"
+    ),
+    files={
+        "/home/user/task/instruction.txt": _instruction,
+        "/home/user/task/system.txt": _system_prompt,
+    },
+    artifacts={
+        "agent_log": ArtifactSpec(path="/home/user/logs/agent/pi.txt"),
+    },
+    env={
+        "HF_TOKEN": "{api_key}",
+        "PI_SKIP_VERSION_CHECK": "1",
+        "PI_TELEMETRY": "0",
+    },
+    build_command=_build_command,
+    build_mcp_config=_build_mcp_config,
+    parse_events=_parse_events,
+)
+
+register_agent(PI_SPEC)
+
+__all__ = ["PI_SPEC"]
diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py
index aeaacad7c..559817d1b 100644
--- a/src/openenv/core/harness/sandbox/docker_backend.py
+++ b/src/openenv/core/harness/sandbox/docker_backend.py
@@ -134,7 +134,18 @@ def start_bg(
         result = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=10)
         if result.returncode != 0:
             raise RuntimeError(f"Failed to start background command: {result.stderr}")
-        pid = int(result.stdout.strip().splitlines()[-1])
+        # Extract PID from the last numeric-only line (commands may print
+        # banners like "Database migration complete." before the PID).
+        pid_line = None
+        for line in reversed(result.stdout.strip().splitlines()):
+            if line.strip().isdigit():
+                pid_line = line.strip()
+                break
+        if pid_line is None:
+            raise RuntimeError(
+                f"Could not extract PID from start_bg output: {result.stdout!r}"
+            )
+        pid = int(pid_line)
 
         job = DockerBgJob(self._container_id, pid, poll_thread=None)  # type: ignore[arg-type]
         poll_thread = threading.Thread(

From 06df791958922f7d08e35a36a2bab873e7c6f258 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Thu, 14 May 2026 10:44:34 +0530
Subject: [PATCH 06/35] chore: agent specifications and improve interception
 module ref

---
 src/openenv/core/harness/agents/cli_driver.py    | 2 +-
 src/openenv/core/harness/agents/opencode.py      | 2 +-
 src/openenv/core/harness/agents/pi.py            | 2 ++
 src/openenv/core/harness/sandbox/interception.py | 4 ++--
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 760218687..42ac460f1 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -52,7 +52,7 @@
 # already have it baked in.
 _PROXY_SOURCE_PATH = Path(__file__).resolve().parents[1] / "sandbox" / "interception.py"
 
-# Verifier type — same as opencode_env's Verifier alias
+# Verifier type — callable that checks the agent's work and returns a result
 Verifier = Callable[..., VerifyResult]
 
 
diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py
index 875882f27..d0146b008 100644
--- a/src/openenv/core/harness/agents/opencode.py
+++ b/src/openenv/core/harness/agents/opencode.py
@@ -8,7 +8,7 @@
 
 Expresses the OpenCode harness as a purely declarative :class:`CLIAgentSpec`.
 All builders (command construction, config generation, env var resolution)
-are self-contained with no imports from ``envs/opencode_env/``.
+are self-contained with no imports from any environment package.
 
 Registered on import::
 
diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
index 7e0fa29c1..63e2eb0c3 100644
--- a/src/openenv/core/harness/agents/pi.py
+++ b/src/openenv/core/harness/agents/pi.py
@@ -132,6 +132,8 @@ def _parse_events(line: str) -> AgentEvent | None:
     },
     env={
         "HF_TOKEN": "{api_key}",
+        "OPENAI_API_KEY": "{api_key}",
+        "OPENAI_BASE_URL": "{base_url}",
         "PI_SKIP_VERSION_CHECK": "1",
         "PI_TELEMETRY": "0",
     },
diff --git a/src/openenv/core/harness/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py
index d943fd755..4e7c857ac 100644
--- a/src/openenv/core/harness/sandbox/interception.py
+++ b/src/openenv/core/harness/sandbox/interception.py
@@ -511,7 +511,7 @@ class InterceptionProxy:
 
     Used by unit tests and by any in-process driver that wants a short-lived
     proxy on the local machine. Inside a sandbox we invoke :func:`serve`
-    directly via ``python -m opencode_env.interception``.
+    directly via ``python -m openenv.core.harness.sandbox.interception``.
     """
 
     def __init__(self, cfg: ProxyConfig) -> None:
@@ -602,7 +602,7 @@ def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]:
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(prog="opencode_env.interception")
+    parser = argparse.ArgumentParser(prog="openenv.core.harness.sandbox.interception")
     parser.add_argument("--upstream-url", required=True)
     parser.add_argument(
         "--upstream-api-key",

From a3c4a3d487dfd4bd9047e6ddadf5bdce8c0e4af3 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Thu, 14 May 2026 11:02:13 +0530
Subject: [PATCH 07/35] feat: add tests for opencode + pi harness adapters

---
 tests/core/test_cli_agent_driver.py | 51 +++++++++-------
 tests/core/test_harness_adapters.py | 93 +++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 21 deletions(-)
 create mode 100644 tests/core/test_harness_adapters.py

diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index b26f01d67..29bf06caa 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -22,16 +22,10 @@
 from typing import Any
 
 import pytest
+from openenv.core.harness.sandbox.base import ExecResult, SandboxHandle
 
 
-# Fake sandbox infrastructure (mirrors test_opencode_env.py pattern)
-
-
-@dataclass
-class FakeExecResult:
-    exit_code: int = 0
-    stdout: str = "ok"
-    stderr: str = ""
+# Fake sandbox infrastructure (mirrors test_coding_agent_env.py pattern)
 
 
 @dataclass
@@ -75,24 +69,24 @@ def exec(
         envs: dict[str, str] | None = None,
         cwd: str | None = None,
         timeout: float | None = 60,
-    ) -> FakeExecResult:
+    ) -> ExecResult:
         self.executed.append(cmd)
         if cmd == "echo ok":
-            return FakeExecResult(exit_code=0, stdout="ok")
+            return ExecResult(exit_code=0, stdout="ok", stderr="")
         # install check — only standalone version-check commands (short, just
         # binary + --version) should be treated as install probes. Multi-part
         # setup scripts that happen to end with --version should succeed.
         if "--version" in cmd and len(cmd) < 80 and "&&" not in cmd:
             if self._install_check_succeeds:
-                return FakeExecResult(exit_code=0, stdout="1.0.0")
-            return FakeExecResult(exit_code=127, stderr="not found")
+                return ExecResult(exit_code=0, stdout="1.0.0", stderr="")
+            return ExecResult(exit_code=127, stdout="", stderr="not found")
         # healthz check
         if "healthz" in cmd:
             if self._healthz_succeeds:
-                return FakeExecResult(exit_code=0, stdout='{"status":"ok"}')
-            return FakeExecResult(exit_code=7, stderr="connection refused")
+                return ExecResult(exit_code=0, stdout='{"status":"ok"}', stderr="")
+            return ExecResult(exit_code=7, stdout="", stderr="connection refused")
         # All other commands succeed
-        return FakeExecResult(exit_code=0, stdout="")
+        return ExecResult(exit_code=0, stdout="", stderr="")
 
     def start_bg(
         self,
@@ -138,7 +132,7 @@ def create(
         timeout_s: int = 900,
         envs: dict[str, str] | None = None,
         metadata: dict[str, str] | None = None,
-    ) -> FakeSandbox:
+    ) -> SandboxHandle:
         sbx = FakeSandbox(
             install_check_succeeds=self._install_check_succeeds,
             healthz_succeeds=self._healthz_succeeds,
@@ -252,7 +246,9 @@ def test_cli_agent_spec_full(self):
             parse_events=lambda line: None,
         )
         assert spec.name == "full-agent"
+        assert spec.artifacts is not None
         assert len(spec.artifacts) == 2
+        assert spec.files is not None
         assert callable(spec.files["/dynamic.txt"])
 
 
@@ -355,14 +351,14 @@ def test_auto_import_opencode(self):
 # PR 2.3: CLIAgentDriver / CLIAgentSession / CLIAgentSessionFactory
 
 
-def _make_test_spec(**overrides: Any) -> Any:
+def _make_test_spec(**overrides: Any):
     from openenv.core.harness.agents.base import (
         ArtifactSpec,
         CLIAgentSpec,
         MCPConfigSpec,
     )
 
-    defaults = dict(
+    defaults: dict[str, Any] = dict(
         name="test-agent",
         install_check_cmd=["test-agent", "--version"],
         base_command=["test-agent", "run", "--json"],
@@ -428,6 +424,7 @@ def test_create_session_full_lifecycle(self):
         assert "test-agent run" in bg_cmd
 
         # Verify env vars were resolved
+        assert bg_envs is not None
         assert bg_envs["API_KEY"] == "sk-test-key"
         assert bg_envs["BASE_URL"] == "https://api.example.com/v1"
         assert bg_envs["MODEL"] == "test-model"
@@ -491,6 +488,7 @@ def test_create_session_with_proxy(self):
 
         # Agent env should point at proxy
         agent_cmd, agent_envs = sbx.bg_commands[1]
+        assert agent_envs is not None
         assert agent_envs["BASE_URL"] == "http://127.0.0.1:7000/v1"
 
         session.close()
@@ -819,6 +817,7 @@ def test_spec_fields(self):
         assert OPENCODE_SPEC.supports_logprob_proxy is True
         assert OPENCODE_SPEC.default_timeout_s == 900.0
         assert OPENCODE_SPEC.mcp_config.method == "config_file"
+        assert OPENCODE_SPEC.mcp_config.path_template is not None
         assert "{home}" in OPENCODE_SPEC.mcp_config.path_template
         assert OPENCODE_SPEC.artifacts is not None
         assert "agent_log" in OPENCODE_SPEC.artifacts
@@ -832,6 +831,7 @@ class OcConfig:
             sandbox_home: str = "/home/user"
             run_format: str = "json"
 
+        assert OPENCODE_SPEC.build_command is not None
         cmd = OPENCODE_SPEC.build_command(
             OPENCODE_SPEC,
             OcConfig(),
@@ -845,18 +845,20 @@ class OcConfig:
     def test_build_mcp_config(self):
         from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 
+        assert OPENCODE_SPEC.build_mcp_config is not None
         config_str = OPENCODE_SPEC.build_mcp_config(
             OPENCODE_SPEC,
             [],
             "/home/user/workdir",
         )
-        config = json.loads(config_str)
-        assert "$schema" in config
-        assert "provider" in config
+        # OpenCode returns empty string because the config is written
+        # via spec.files using _build_opencode_config_file instead.
+        assert config_str == ""
 
     def test_parse_events_assistant(self):
         from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 
+        assert OPENCODE_SPEC.parse_events is not None
         line = json.dumps({"type": "assistant", "content": "hello"})
         event = OPENCODE_SPEC.parse_events(line)
         assert event is not None
@@ -865,6 +867,7 @@ def test_parse_events_assistant(self):
     def test_parse_events_tool_call(self):
         from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 
+        assert OPENCODE_SPEC.parse_events is not None
         line = json.dumps({"type": "tool_call", "name": "bash", "args": {}})
         event = OPENCODE_SPEC.parse_events(line)
         assert event is not None
@@ -873,6 +876,7 @@ def test_parse_events_tool_call(self):
     def test_parse_events_error(self):
         from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 
+        assert OPENCODE_SPEC.parse_events is not None
         line = json.dumps({"type": "error", "message": "boom"})
         event = OPENCODE_SPEC.parse_events(line)
         assert event is not None
@@ -881,6 +885,7 @@ def test_parse_events_error(self):
     def test_parse_events_done(self):
         from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 
+        assert OPENCODE_SPEC.parse_events is not None
         line = json.dumps({"type": "done"})
         event = OPENCODE_SPEC.parse_events(line)
         assert event is not None
@@ -889,6 +894,7 @@ def test_parse_events_done(self):
     def test_parse_events_invalid_json(self):
         from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 
+        assert OPENCODE_SPEC.parse_events is not None
         assert OPENCODE_SPEC.parse_events("not json") is None
         assert OPENCODE_SPEC.parse_events("") is None
 
@@ -897,6 +903,7 @@ def test_build_env_vars(self):
 
         config = FakeConfig()
         config.extra_env = {"EXTRA": "val"}
+        assert OPENCODE_SPEC.build_env_vars is not None
         envs = OPENCODE_SPEC.build_env_vars(OPENCODE_SPEC, config)
         assert envs["OPENAI_BASE_URL"] == "https://api.example.com/v1"
         assert envs["OPENAI_API_KEY"] == "sk-test-key"
@@ -908,6 +915,7 @@ def test_files_instruction_resolver(self):
 
         task = FakeTask(instruction="Build a REST API")
         config = FakeConfig()
+        assert OPENCODE_SPEC.files is not None
         instruction_fn = OPENCODE_SPEC.files["/home/user/task/instruction.md"]
         assert callable(instruction_fn)
         assert instruction_fn(task, config) == "Build a REST API"
@@ -917,6 +925,7 @@ def test_files_system_prompt_resolver(self):
 
         task = FakeTask()
         config = FakeConfig()
+        assert OPENCODE_SPEC.files is not None
         system_fn = OPENCODE_SPEC.files["/home/user/task/system.md"]
         assert callable(system_fn)
         # No system prompt on FakeConfig → returns None
diff --git a/tests/core/test_harness_adapters.py b/tests/core/test_harness_adapters.py
new file mode 100644
index 000000000..f5e1dc260
--- /dev/null
+++ b/tests/core/test_harness_adapters.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for currently implemented harness adapters (OpenCode + Pi)."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+
+@dataclass
+class FakeTask:
+    instruction: str = "Write hello.py"
+    setup_shell: str | None = None
+    upload_files: dict[str, str] = field(default_factory=dict)
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class FakeConfig:
+    base_url: str = "https://api.example.com/v1"
+    api_key: str = "sk-test"
+    model: str = "test-model"
+    agent_timeout_s: float = 300.0
+    sandbox_home: str = "/home/user"
+    system_prompt: str | None = None
+
+
+class TestPiSpec:
+    def test_registered(self):
+        from openenv.core.harness.agents import get_agent_spec
+
+        spec = get_agent_spec("pi")
+        assert spec.name == "pi"
+
+    def test_fields(self):
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        assert PI_SPEC.install_check_cmd == ["pi", "--version"]
+        assert PI_SPEC.mcp_config.method == "config_file"
+        assert PI_SPEC.mcp_config.path_template is not None
+        assert ".mcp.json" in PI_SPEC.mcp_config.path_template
+        assert PI_SPEC.env is not None
+        assert "HF_TOKEN" in PI_SPEC.env
+        assert "PI_SKIP_VERSION_CHECK" in PI_SPEC.env
+
+    def test_build_command(self):
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        assert PI_SPEC.build_command is not None
+        cmd = PI_SPEC.build_command(PI_SPEC, FakeConfig(), FakeTask(), None)
+        assert "pi --no-session" in cmd
+        assert "--no-context-files" in cmd
+
+    def test_build_mcp_config(self):
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        assert PI_SPEC.build_mcp_config is not None
+        content = PI_SPEC.build_mcp_config(PI_SPEC, [], "/workdir")
+        assert "mcpServers" in json.loads(content)
+
+
+class TestOpenCodeSpec:
+    def test_registered(self):
+        from openenv.core.harness.agents import get_agent_spec
+
+        spec = get_agent_spec("opencode")
+        assert spec.name == "opencode"
+
+
+class TestRegistryAutoImport:
+    @pytest.mark.parametrize("name", ["pi", "opencode"])
+    def test_auto_import(self, name):
+        from openenv.core.harness.agents import get_agent_spec
+
+        spec = get_agent_spec(name)
+        assert spec.name == name
+
+    def test_list_agents_includes_current(self):
+        import openenv.core.harness.agents.opencode  # noqa: F401
+        import openenv.core.harness.agents.pi  # noqa: F401
+        from openenv.core.harness.agents import list_agents
+
+        agents = list_agents()
+        for name in ["opencode", "pi"]:
+            assert name in agents, f"{name} not in {agents}"

From 81e37a2c4bced06024bc1bf434b33d36c73f99a8 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Thu, 14 May 2026 13:48:48 +0530
Subject: [PATCH 08/35] chore: migrate opencode_env to coding_agent_env

---
 docs/source/environments.md                   |   8 +-
 docs/source/environments/coding_agent.md      |   2 +
 docs/source/environments/opencode.md          |   2 -
 .../.dockerignore                             |   0
 .../.gitignore                                |   0
 .../README.md                                 | 104 +++----
 .../__init__.py                               |  30 +-
 .../client.py                                 |  32 ++-
 .../config.py                                 |   6 +-
 .../harness.py                                |  55 ++--
 .../models.py                                 |   8 +-
 .../opencode_runtime.py                       |  24 +-
 .../openenv.yaml                              |   2 +-
 .../pyproject.toml                            |  17 +-
 .../sandbox/__init__.py                       |   0
 .../sandbox/build_template.py                 |  12 +-
 .../server/Dockerfile                         |   6 +-
 .../server/__init__.py                        |   2 +-
 .../server/app.py                             |  22 +-
 .../server/catalog.py                         |   0
 .../server/coding_environment.py}             | 262 ++++++++++++++----
 .../server/gradio_ui.py                       |  59 ++--
 .../task.py                                   |  14 +-
 .../uv.lock                                   |  70 ++---
 ...v_simple.py => coding_agent_env_simple.py} |  24 +-
 ...encode_env.py => test_coding_agent_env.py} | 140 +++++++---
 26 files changed, 566 insertions(+), 335 deletions(-)
 create mode 100644 docs/source/environments/coding_agent.md
 delete mode 100644 docs/source/environments/opencode.md
 rename envs/{opencode_env => coding_agent_env}/.dockerignore (100%)
 rename envs/{opencode_env => coding_agent_env}/.gitignore (100%)
 rename envs/{opencode_env => coding_agent_env}/README.md (67%)
 rename envs/{opencode_env => coding_agent_env}/__init__.py (63%)
 rename envs/{opencode_env => coding_agent_env}/client.py (85%)
 rename envs/{opencode_env => coding_agent_env}/config.py (94%)
 rename envs/{opencode_env => coding_agent_env}/harness.py (87%)
 rename envs/{opencode_env => coding_agent_env}/models.py (90%)
 rename envs/{opencode_env => coding_agent_env}/opencode_runtime.py (86%)
 rename envs/{opencode_env => coding_agent_env}/openenv.yaml (76%)
 rename envs/{opencode_env => coding_agent_env}/pyproject.toml (68%)
 rename envs/{opencode_env => coding_agent_env}/sandbox/__init__.py (100%)
 rename envs/{opencode_env => coding_agent_env}/sandbox/build_template.py (91%)
 rename envs/{opencode_env => coding_agent_env}/server/Dockerfile (91%)
 rename envs/{opencode_env => coding_agent_env}/server/__init__.py (79%)
 rename envs/{opencode_env => coding_agent_env}/server/app.py (81%)
 rename envs/{opencode_env => coding_agent_env}/server/catalog.py (100%)
 rename envs/{opencode_env/server/opencode_environment.py => coding_agent_env/server/coding_environment.py} (70%)
 rename envs/{opencode_env => coding_agent_env}/server/gradio_ui.py (92%)
 rename envs/{opencode_env => coding_agent_env}/task.py (73%)
 rename envs/{opencode_env => coding_agent_env}/uv.lock (99%)
 rename examples/{opencode_env_simple.py => coding_agent_env_simple.py} (83%)
 rename tests/envs/{test_opencode_env.py => test_coding_agent_env.py} (73%)

diff --git a/docs/source/environments.md b/docs/source/environments.md
index a14564a1a..58f36c155 100644
--- a/docs/source/environments.md
+++ b/docs/source/environments.md
@@ -549,13 +549,13 @@ AgentWorldModel-1K — 1,000 synthetic MCP tool-use environments with 10,000 tas
 ```
 ````
 
-````{grid-item-card} Opencode
+````{grid-item-card} Coding Agent
 :class-card: sd-border-1
 
-`opencode_env` runs the OpenCode coding agent inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, optionally capturing per-token logpr...
+`coding_agent_env` runs coding-agent harnesses (currently OpenCode + Pi) inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, optionally capturing per-token logpr...
 
 +++
-```{button-link} environments/opencode.html
+```{button-link} environments/coding_agent.html
 :color: primary
 :outline:
 
@@ -633,5 +633,5 @@ environments/tbench2
 environments/unity
 environments/wildfire
 environments/agent_world_model
-environments/opencode
+environments/coding_agent
 ```
diff --git a/docs/source/environments/coding_agent.md b/docs/source/environments/coding_agent.md
new file mode 100644
index 000000000..2903e2322
--- /dev/null
+++ b/docs/source/environments/coding_agent.md
@@ -0,0 +1,2 @@
+```{include} ../../../envs/coding_agent_env/README.md
+```
diff --git a/docs/source/environments/opencode.md b/docs/source/environments/opencode.md
deleted file mode 100644
index 9a93ebe33..000000000
--- a/docs/source/environments/opencode.md
+++ /dev/null
@@ -1,2 +0,0 @@
-```{include} ../../../envs/opencode_env/README.md
-```
diff --git a/envs/opencode_env/.dockerignore b/envs/coding_agent_env/.dockerignore
similarity index 100%
rename from envs/opencode_env/.dockerignore
rename to envs/coding_agent_env/.dockerignore
diff --git a/envs/opencode_env/.gitignore b/envs/coding_agent_env/.gitignore
similarity index 100%
rename from envs/opencode_env/.gitignore
rename to envs/coding_agent_env/.gitignore
diff --git a/envs/opencode_env/README.md b/envs/coding_agent_env/README.md
similarity index 67%
rename from envs/opencode_env/README.md
rename to envs/coding_agent_env/README.md
index 79ebc6ed3..11fb88188 100644
--- a/envs/opencode_env/README.md
+++ b/envs/coding_agent_env/README.md
@@ -1,5 +1,5 @@
 ---
-title: OpenCode Environment Server
+title: Coding Agent Environment Server
 emoji: 🛠️
 colorFrom: indigo
 colorTo: purple
@@ -9,23 +9,24 @@ app_port: 8000
 base_path: /web
 tags:
   - openenv
-short_description: OpenCode coding agent in an E2B sandbox with logprob capture
+short_description: Coding agents (OpenCode, Pi) in E2B with logprob capture
 ---
 
-# OpenCode Environment for OpenEnv
+# Coding Agent Environment for OpenEnv
 
-`opencode_env` runs the [OpenCode](https://opencode.ai) coding agent inside
-an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible
+`coding_agent_env` runs coding-agent harnesses (currently
+[OpenCode](https://opencode.ai) and [Pi](https://github.com/badlogic/pi-mono))
+inside an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible
 LLM endpoint, optionally capturing per-token logprobs for GRPO training.
 
-**🚀 Try it live**: [`AdithyaSK/opencode-env`](https://huggingface.co/spaces/AdithyaSK/opencode-env)
+**🚀 Try it live**: [`AdithyaSK/coding-agent-env`](https://huggingface.co/spaces/AdithyaSK/coding-agent-env)
 
 The deployed Space exposes:
 
-- **Web UI** at [`/web`](https://adithyask-opencode-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward + logprobs.
-- **MCP tool API** at [`/mcp`](https://adithyask-opencode-env.hf.space/mcp) — programmatic `run_rollout` calls.
-- **OpenAPI docs** at [`/docs`](https://adithyask-opencode-env.hf.space/docs).
-- **Health** at [`/health`](https://adithyask-opencode-env.hf.space/health).
+- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward + logprobs.
+- **MCP tool API** at [`/mcp`](https://adithyask-coding-agent-env.hf.space/mcp) — programmatic `run_rollout` calls.
+- **OpenAPI docs** at [`/docs`](https://adithyask-coding-agent-env.hf.space/docs).
+- **Health** at [`/health`](https://adithyask-coding-agent-env.hf.space/health).
 
 The env is **task-agnostic** — every rollout is configured at call-time
 with a uniform Task shape:
@@ -47,20 +48,21 @@ a float to `/home/user/logs/verifier/reward.txt` (override).
 ```python
 import asyncio
 import os
-from opencode_env import OpenCodeEnv
-from opencode_env.client import _extract_text
-from opencode_env.models import RolloutResult
+from coding_agent_env import CodingAgentEnv
+from coding_agent_env.client import _extract_text
+from coding_agent_env.models import RolloutResult
 
 
 async def main():
-    SPACE = "https://adithyask-opencode-env.hf.space"
+    SPACE = "https://adithyask-coding-agent-env.hf.space"
 
-    async with OpenCodeEnv(base_url=SPACE) as env:
+    async with CodingAgentEnv(base_url=SPACE) as env:
         await env.reset()
 
         # The MCP tool returns JSON; deserialize via the typed model.
         raw = await env.call_tool(
             "run_rollout",
+            agent="opencode",                          # opencode | pi
             endpoint="openai",                          # vllm | openai | hf_router
             api_key=os.environ["OPENAI_API_KEY"],       # or set as a Space secret
             instruction=(
@@ -75,7 +77,7 @@ async def main():
                 "import binary_search; "
                 "assert binary_search.binary_search([1,2,3], 2) == 1; print('OK')\"",
             ],
-            template="opencode-rl",                     # prebaked E2B template
+            template="coding-agent-rl",                 # prebaked E2B template
             task_id="binary_search_v1",
         )
         result = RolloutResult.model_validate_json(_extract_text(raw))
@@ -102,10 +104,10 @@ wall: 19.8 s
 
 ```python
 import os
-from opencode_env import OpenCodeEnv
+from coding_agent_env import CodingAgentEnv
 
 # .sync() returns a synchronous wrapper around the async client.
-with OpenCodeEnv(base_url="https://adithyask-opencode-env.hf.space").sync() as env:
+with CodingAgentEnv(base_url="https://adithyask-coding-agent-env.hf.space").sync() as env:
     env.reset()
     # MCP tools are reachable via env.call_tool(...) / env.step(...) sync-wrapped.
     # See the async example above for the full run_rollout signature.
@@ -120,12 +122,12 @@ For trainers that want to drive a sandbox directly without an HTTP boundary:
 
 ```python
 import os
-from opencode_env import (
-    OpenCodeConfig, OpenCodeSessionFactory, OpenCodeTask, E2BSandboxBackend,
+from coding_agent_env import (
+    CodingAgentConfig, CodingAgentSessionFactory, CodingAgentTask, E2BSandboxBackend,
 )
 
-factory = OpenCodeSessionFactory(
-    config=OpenCodeConfig(
+factory = CodingAgentSessionFactory(
+    config=CodingAgentConfig(
         provider="openai_compatible",
         base_url="https://api.openai.com/v1",
         api_key=os.environ["OPENAI_API_KEY"],
@@ -134,7 +136,7 @@ factory = OpenCodeSessionFactory(
     sandbox_backend=E2BSandboxBackend(),
     mode="transparent_proxy",                   # captures per-token logprobs
 )
-session = factory.create(task=OpenCodeTask(instruction="..."))
+session = factory.create(task=CodingAgentTask(instruction="..."))
 session.wait_for_completion()
 turns = session.fetch_proxy_trace()             # per-turn (tokens, logprobs)
 session.close()
@@ -146,22 +148,22 @@ The Dockerfile lives at `server/Dockerfile`. Use the `openenv` CLI from
 the env root:
 
 ```bash
-cd envs/opencode_env
+cd envs/coding_agent_env
 
 openenv validate               # check pyproject.toml + openenv.yaml + server/app.py + uv.lock
-openenv build -t opencode-env  # builds the image (uses server/Dockerfile)
+openenv build -t coding-agent-env  # builds the image (uses server/Dockerfile)
 
 # run locally with E2B credentials
-docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
+docker run -p 8000:8000 -e E2B_API_KEY=e2b_... coding-agent-env
 
 # push to HF Spaces (Docker variant)
-openenv push --repo-id <user>/opencode-env
+openenv push --repo-id <user>/coding-agent-env
 ```
 
 Or build directly without the CLI:
 
 ```bash
-docker build -t opencode-env -f envs/opencode_env/server/Dockerfile envs/opencode_env
+docker build -t coding-agent-env -f envs/coding_agent_env/server/Dockerfile envs/coding_agent_env
 ```
 
 The image:
@@ -174,7 +176,7 @@ The image:
 
 ## The MCP Tool: `run_rollout`
 
-Single tool, two ways to specify the LLM endpoint:
+Single tool, with an `agent` selector plus two ways to specify the LLM endpoint:
 
 **Option A — endpoint shorthand (recommended)**: pass
 `endpoint="vllm"` (or `"openai"` / `"hf_router"`). The server resolves
@@ -186,9 +188,10 @@ directly.
 
 | Arg | Type | Default | Notes |
 |---|---|---|---|
+| `agent` | `str` | `"opencode"` | Harness to run: `"opencode"` or `"pi"`. |
 | `endpoint` | `str` | `""` | One of `"vllm"` / `"openai"` / `"hf_router"`. |
 | `base_url` / `api_key` / `model` | `str` | `""` | Override / supply explicitly. |
-| `instruction` | `str` | required | Prompt passed to `opencode run`. |
+| `instruction` | `str` | required | Prompt passed to the selected harness CLI. |
 | `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. |
 | `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. |
 | `task_id` | `str` | `""` | Echoed back in result. |
@@ -196,8 +199,8 @@ directly.
 | `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. |
 | `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. |
 | `top_logprobs` | `int` | `5` | HF Router cap is 5; OpenAI 0–20; vLLM unbounded. |
-| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for opencode. |
-| `template` | `str` | `""` | E2B template name; `"opencode-rl"` skips ~2 min of install per rollout. |
+| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for the selected harness. |
+| `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. |
 
 Returns `RolloutResult` JSON with: `reward`, `setup_results[]`,
 `verify_results[]`, `proxy_turns[]`, `files{}`, `agent_log_tail`,
@@ -207,8 +210,8 @@ Returns `RolloutResult` JSON with: `reward`, `setup_results[]`,
 
 | Mode | What it does | Best for |
 |---|---|---|
-| **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards opencode's LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. |
-| **`black_box`** | No proxy. opencode talks straight to `base_url`. | Smoke tests, eval, SFT data collection. |
+| **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards harness LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. |
+| **`black_box`** | No proxy. The selected harness talks straight to `base_url`. | Smoke tests, eval, SFT data collection. |
 
 ## Environment Variables
 
@@ -240,21 +243,21 @@ Hyperbolic / Featherless (silent drop) and Groq (HTTP 400).
 ## Pre-baked E2B Template
 
 The first rollout in a fresh E2B sandbox spends ~2 min installing
-opencode and the proxy's Python deps. Build a one-time template that
+harness tooling and the proxy's Python deps. Build a one-time template that
 ships those pre-installed:
 
 ```bash
-.venv/bin/python envs/opencode_env/sandbox/build_template.py
-# → builds `opencode-rl` template in your E2B account (~1m20s, one-time)
+.venv/bin/python envs/coding_agent_env/sandbox/build_template.py
+# → builds `coding-agent-rl` template in your E2B account (~1m20s, one-time)
 ```
 
-After this, pass `template="opencode-rl"` on every `run_rollout` call —
+After this, pass `template="coding-agent-rl"` on every `run_rollout` call —
 each rollout drops to ~20–30s end-to-end.
 
 ## Project Structure
 
 ```
-opencode_env/
+coding_agent_env/
 ├── README.md                       # this file
 ├── openenv.yaml                    # OpenEnv space spec
 ├── pyproject.toml                  # deps + ``server`` entrypoint
@@ -262,33 +265,38 @@ opencode_env/
 ├── .gitignore / .dockerignore      # excludes .env / __pycache__
 ├── __init__.py                     # re-exports primitive + client + models
 │
-├── client.py                       # OpenCodeEnv(MCPToolClient)
-├── models.py                       # RolloutResult / RolloutTurn / OpenCodeState
+├── client.py                       # CodingAgentEnv(MCPToolClient)
+├── models.py                       # RolloutResult / RolloutTurn / CodingAgentState
 │
-├── config.py                       # OpenCodeConfig (primitive)
-├── harness.py                      # OpenCodeSession / OpenCodeSessionFactory (CLI-only)
+├── config.py                       # CodingAgentConfig (primitive)
+├── harness.py                      # CodingAgentSession / CodingAgentSessionFactory (CLI-only)
 ├── opencode_runtime.py             # opencode.json builder + cmds
-├── task.py                         # OpenCodeTask
+├── task.py                         # CodingAgentTask
 │
 ├── server/
 │   ├── __init__.py
 │   ├── app.py                      # FastAPI factory; mounts Gradio at /web
-│   ├── opencode_environment.py     # MCPEnvironment with single ``run_rollout`` tool
+│   ├── coding_environment.py       # MCPEnvironment with single ``run_rollout`` tool
 │   ├── gradio_ui.py                # the /web Gradio Blocks UI
 │   ├── catalog.py                  # endpoint shorthand resolver
 │   └── Dockerfile                  # multi-stage uv build (used by ``openenv build``)
 │
 └── sandbox/
     ├── __init__.py
-    ├── base.py                     # SandboxBackend / SandboxHandle Protocols
-    ├── e2b.py                      # E2B implementation
-    ├── interception.py             # in-sandbox FastAPI proxy (logprob capture)
     └── build_template.py           # one-time E2B template builder
+
+# Shared sandbox runtime (moved to core):
+src/openenv/core/harness/sandbox/
+├── base.py                         # SandboxBackend / SandboxHandle protocols
+├── e2b_backend.py                  # E2B implementation
+├── docker_backend.py               # local Docker backend
+└── interception.py                 # in-sandbox FastAPI proxy (logprob capture)
 ```
 
 ## References
 
 - [OpenEnv docs](https://meta-pytorch.org/OpenEnv/)
 - [OpenCode CLI](https://opencode.ai/docs/cli/)
+- [Pi](https://github.com/badlogic/pi-mono)
 - [E2B Python SDK](https://e2b.dev/docs)
 - [HF Inference Providers logprob matrix](../../../DOCS/HF/hf_inference_providers_logprobs.md)
diff --git a/envs/opencode_env/__init__.py b/envs/coding_agent_env/__init__.py
similarity index 63%
rename from envs/opencode_env/__init__.py
rename to envs/coding_agent_env/__init__.py
index dcd48a01c..6b839e7ea 100644
--- a/envs/opencode_env/__init__.py
+++ b/envs/coding_agent_env/__init__.py
@@ -4,16 +4,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""OpenCode environment for OpenEnv.
+"""Coding-agent environment for OpenEnv.
 
 Two layers in this package:
 
-1. **Harness primitive** -- :class:`OpenCodeSessionFactory` /
-   :class:`OpenCodeSession` / :class:`OpenCodeConfig` /
+1. **Harness primitive** -- :class:`CodingAgentSessionFactory` /
+   :class:`CodingAgentSession` / :class:`CodingAgentConfig` /
    :class:`E2BSandboxBackend`. Built on the generic
    :class:`CLIAgentDriver` from ``openenv.core.harness.agents``.
 
-2. **Deployable env** -- :class:`OpenCodeEnv` (MCP client) talks to the
+2. **Deployable env** -- :class:`CodingAgentEnv` (MCP client) talks to the
    FastAPI server at ``server/app.py`` over HTTP. Use this when the
    sandbox + agent live behind an HTTP boundary (e.g. an HF Space).
    See ``client.py`` and ``server/``.
@@ -22,11 +22,11 @@
 from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
 from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
 
-from .client import OpenCodeEnv
-from .config import OpenCodeConfig, Provider
-from .harness import OpenCodeSession, OpenCodeSessionFactory
-from .models import CommandResult, OpenCodeState, RolloutResult, RolloutTurn
-from .task import OpenCodeTask
+from .client import CodingAgentEnv
+from .config import CodingAgentConfig, Provider
+from .harness import CodingAgentSession, CodingAgentSessionFactory
+from .models import CommandResult, CodingAgentState, RolloutResult, RolloutTurn
+from .task import CodingAgentTask
 
 try:
     from openenv.core.harness.sandbox import E2BSandboxBackend
@@ -35,19 +35,19 @@
 
 __all__ = [
     # Deployed-env client
-    "OpenCodeEnv",
+    "CodingAgentEnv",
     "CallToolAction",
     "ListToolsAction",
     # HTTP API models
     "CommandResult",
-    "OpenCodeState",
+    "CodingAgentState",
     "RolloutResult",
     "RolloutTurn",
     # Harness primitive
-    "OpenCodeConfig",
-    "OpenCodeSession",
-    "OpenCodeSessionFactory",
-    "OpenCodeTask",
+    "CodingAgentConfig",
+    "CodingAgentSession",
+    "CodingAgentSessionFactory",
+    "CodingAgentTask",
     "Provider",
     # Sandbox backend
     "E2BSandboxBackend",
diff --git a/envs/opencode_env/client.py b/envs/coding_agent_env/client.py
similarity index 85%
rename from envs/opencode_env/client.py
rename to envs/coding_agent_env/client.py
index 52e76e2d5..8c512090d 100644
--- a/envs/opencode_env/client.py
+++ b/envs/coding_agent_env/client.py
@@ -4,16 +4,17 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Client for the deployed opencode_env server.
+"""Client for the deployed coding_agent_env server.
 
-The server exposes a single MCP tool ``run_rollout`` that runs one OpenCode
-rollout in an E2B sandbox and returns a JSON-serialized :class:`RolloutResult`.
+The server exposes a single MCP tool ``run_rollout`` that runs one coding-agent
+rollout (OpenCode or Pi) in an E2B sandbox and returns a JSON-serialized
+:class:`RolloutResult`.
 
 Example::
 
-    from opencode_env import OpenCodeEnv
+    from coding_agent_env import CodingAgentEnv
 
-    with OpenCodeEnv(base_url="https://adithya-sk-opencode-env.hf.space") as env:
+    with CodingAgentEnv(base_url="https://your-space.hf.space") as env:
         env.reset()
         result = env.run_rollout(
             base_url="https://api.openai.com/v1",
@@ -40,8 +41,8 @@
     from models import RolloutResult  # type: ignore
 
 
-class OpenCodeEnv(MCPToolClient):
-    """Typed client for the opencode_env MCP server.
+class CodingAgentEnv(MCPToolClient):
+    """Typed client for the coding_agent_env MCP server.
 
     Inherits ``reset`` / ``call_tool`` / ``list_tools`` / ``from_docker_image``
     / context-manager semantics from :class:`MCPToolClient`.
@@ -50,7 +51,8 @@ class OpenCodeEnv(MCPToolClient):
     def run_rollout(
         self,
         *,
-        # Endpoint — pass either the shorthand selector OR explicit fields.
+        # Agent + endpoint — pass either shorthand endpoint or explicit fields.
+        agent: str = "opencode",  # "opencode" | "pi"
         endpoint: str = "",  # "vllm" | "openai" | "hf_router"
         base_url: str = "",
         api_key: str = "",
@@ -68,16 +70,17 @@ def run_rollout(
         agent_timeout_s: float = 600.0,
         template: str = "",
     ) -> RolloutResult:
-        """Run one OpenCode rollout and return the typed result.
+        """Run one coding-agent rollout and return the typed result.
 
         Args:
+            agent: Harness CLI to run in sandbox (``"opencode"`` or ``"pi"``).
             base_url: OpenAI-compatible LLM endpoint (with trailing /v1).
             api_key: Bearer token for the LLM. Use ``"intercepted"`` for vLLM
                 if it doesn't enforce auth.
             model: Model id understood by the LLM endpoint
                 (e.g. ``"gpt-4o-mini"``, ``"Qwen/Qwen3.5-4B"``,
                 ``"Qwen/Qwen3-4B-Instruct-2507:nscale"``).
-            instruction: Prompt passed to ``opencode run``.
+            instruction: Prompt passed to the selected harness CLI.
             setup: Bash commands run sequentially **before** the agent starts.
                 Each command runs in the sandbox; non-zero exit aborts setup.
             verify: Bash commands run sequentially **after** the agent exits.
@@ -90,12 +93,11 @@ def run_rollout(
                 ``chat_template_kwargs.enable_thinking=false`` on forwarded
                 requests. Needed for Qwen3.5 vLLM; harmless on Instruct
                 variants; rejected by OpenAI direct.
-            max_tokens_cap: Clamp on per-turn ``max_tokens``. OpenCode asks
-                for ~32k by default; gpt-4o-mini caps at 16k.
+            max_tokens_cap: Clamp on per-turn ``max_tokens``.
             top_logprobs: Top-k logprobs requested upstream. HF Router caps
                 at 5; OpenAI accepts up to 20; vLLM is unbounded.
-            agent_timeout_s: Hard wall-clock budget for one ``opencode run``.
-            template: E2B template name (e.g. ``"opencode-rl"``). Empty
+            agent_timeout_s: Hard wall-clock budget for one agent run.
+            template: E2B template name (e.g. ``"coding-agent-rl"``). Empty
                 string uses the default (slow) base image.
 
         Returns:
@@ -104,6 +106,7 @@ def run_rollout(
         """
         raw = self.call_tool(
             "run_rollout",
+            agent=agent,
             endpoint=endpoint,
             base_url=base_url,
             api_key=api_key,
@@ -166,3 +169,4 @@ def _extract_text(result: Any) -> str:
             return text
 
     return str(result)
+
diff --git a/envs/opencode_env/config.py b/envs/coding_agent_env/config.py
similarity index 94%
rename from envs/opencode_env/config.py
rename to envs/coding_agent_env/config.py
index 2b6bae0a2..2eac8d16f 100644
--- a/envs/opencode_env/config.py
+++ b/envs/coding_agent_env/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Configuration model for the OpenCode harness primitive."""
+"""Configuration model for the coding-agent harness primitive."""
 
 from __future__ import annotations
 
@@ -16,8 +16,8 @@
 Provider = Literal["openai_compatible", "openai", "anthropic"]
 
 
-class OpenCodeConfig(BaseModel):
-    """All configuration required to launch one OpenCode rollout in a sandbox.
+class CodingAgentConfig(BaseModel):
+    """All configuration required to launch one coding-agent rollout in a sandbox.
 
     Field names are provider-agnostic. The primitive maps ``provider`` onto the
     correct ``opencode.json`` provider block (``@ai-sdk/openai-compatible``,
diff --git a/envs/opencode_env/harness.py b/envs/coding_agent_env/harness.py
similarity index 87%
rename from envs/opencode_env/harness.py
rename to envs/coding_agent_env/harness.py
index 600aafa82..ccbfa2cfc 100644
--- a/envs/opencode_env/harness.py
+++ b/envs/coding_agent_env/harness.py
@@ -4,16 +4,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""OpenCode session factory + session — backed by CLIAgentDriver.
+"""Coding-agent session factory + session — backed by CLIAgentDriver.
 
-This module exposes :class:`OpenCodeSession` and
-:class:`OpenCodeSessionFactory` built on top of the generic
+This module exposes :class:`CodingAgentSession` and
+:class:`CodingAgentSessionFactory` built on top of the generic
 :class:`CLIAgentDriver` / :class:`CLIAgentSession` /
 :class:`CLIAgentSessionFactory` from ``openenv.core.harness.agents``.
 
-OpenCode-specific configuration (``opencode.json`` generation, provider
+Configuration specific to the OpenCode spec (``opencode.json`` generation, provider
 mapping, tool enable/disable) is handled by
-:mod:`opencode_env.opencode_runtime` builders wired into the
+:mod:`coding_agent_env.opencode_runtime` builders wired into the
 :data:`OPENCODE_SPEC` via callable hooks.
 """
 
@@ -31,7 +31,7 @@
 from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
 
-from .config import OpenCodeConfig
+from .config import CodingAgentConfig
 from .opencode_runtime import (
     agent_log_path,
     build_env_vars,
@@ -42,7 +42,7 @@
     opencode_config_path,
     system_prompt_path,
 )
-from .task import OpenCodeTask
+from .task import CodingAgentTask
 
 
 # Inside-sandbox proxy paths (Mode B).
@@ -61,21 +61,19 @@
 )
 
 
-class OpenCodeSession(CLIAgentSession):
-    """One live OpenCode rollout inside a sandbox.
+class CodingAgentSession(CLIAgentSession):
+    """One live coding-agent rollout inside a sandbox.
 
-    Extends :class:`CLIAgentSession` with OpenCode-specific convenience
-    methods (``fetch_trace``, ``wait_for_completion`` with config-aware
-    timeout). Fully backward-compatible with code that used the old
-    ``OpenCodeSession`` API.
+    Extends :class:`CLIAgentSession` with agent-specific (OpenCode spec) convenience
+    methods (``fetch_trace``, ``wait_for_completion`` with config-aware timeout).
     """
 
     def __init__(
         self,
         *,
         sandbox: SandboxHandle,
-        config: OpenCodeConfig,
-        task: OpenCodeTask,
+        config: CodingAgentConfig,
+        task: CodingAgentTask,
         verifier: Verifier | None = None,
         base_url_override: str | None = None,
         proxy_trace_path: str | None = None,
@@ -108,8 +106,7 @@ def wait_for_completion(self, timeout_s: float | None = None) -> int:
     def start_agent(self) -> None:
         """Launch ``opencode run`` as a background subprocess in the sandbox.
 
-        Provided for backward compatibility — the factory now starts the
-        agent during ``create()``, so calling this manually is a no-op
+        The factory starts the agent during ``create()``; this method is a no-op
         if the agent is already running.
         """
         if self._agent_bg_job is not None:
@@ -119,8 +116,8 @@ def start_agent(self) -> None:
         self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs)
 
 
-class OpenCodeSessionFactory(ResourceSessionFactory):
-    """Produce isolated per-rollout :class:`OpenCodeSession` instances.
+class CodingAgentSessionFactory(ResourceSessionFactory):
+    """Produce isolated per-rollout :class:`CodingAgentSession` instances.
 
     The factory owns sandbox provisioning, opencode install, config injection,
     and (Mode B) proxy startup. Each :meth:`create` call returns a fresh
@@ -128,13 +125,13 @@ class OpenCodeSessionFactory(ResourceSessionFactory):
 
     Internally delegates to :class:`CLIAgentDriver` for the generic
     sandbox lifecycle (readiness probing, install retry, proxy startup).
-    OpenCode-specific config generation uses ``opencode_runtime`` builders.
+    Agent-specific (OpenCode spec) config generation uses ``opencode_runtime`` builders.
     """
 
     def __init__(
         self,
         *,
-        config: OpenCodeConfig,
+        config: CodingAgentConfig,
         sandbox_backend: SandboxBackend,
         mode: Literal["black_box", "transparent_proxy"] = "black_box",
         verifier: Verifier | None = None,
@@ -167,12 +164,12 @@ def create(
         task: Any,
         seed: int | None = None,
         episode_id: str | None = None,
-    ) -> OpenCodeSession:
+    ) -> CodingAgentSession:
         import logging
 
         _log = logging.getLogger(__name__)
 
-        oc_task = OpenCodeTask.coerce(task)
+        oc_task = CodingAgentTask.coerce(task)
         sandbox_timeout = int(self._config.agent_timeout_s) + 300
 
         _log.info(
@@ -213,7 +210,7 @@ def create(
             )
             _log.info("factory.create: proxy up at %s", base_url_override)
             # Rewrite opencode.json so opencode points at the proxy.
-            proxy_cfg = OpenCodeConfig(
+            proxy_cfg = CodingAgentConfig(
                 **{
                     **self._config.model_dump(),
                     "provider": "openai_compatible",
@@ -225,7 +222,7 @@ def create(
                 build_opencode_json(proxy_cfg),
             )
 
-        session = OpenCodeSession(
+        session = CodingAgentSession(
             sandbox=sandbox,
             config=self._config,
             task=oc_task,
@@ -244,7 +241,7 @@ def create(
     def _bootstrap_sandbox(
         self,
         sandbox: SandboxHandle,
-        task: OpenCodeTask,
+        task: CodingAgentTask,
     ) -> None:
         """Install opencode, write config + task files, run optional setup."""
 
@@ -310,8 +307,8 @@ def _start_proxy(
 
 
 __all__ = [
-    "OpenCodeSession",
-    "OpenCodeSessionFactory",
-    "OpenCodeTask",
+    "CodingAgentSession",
+    "CodingAgentSessionFactory",
+    "CodingAgentTask",
     "Verifier",
 ]
diff --git a/envs/opencode_env/models.py b/envs/coding_agent_env/models.py
similarity index 90%
rename from envs/opencode_env/models.py
rename to envs/coding_agent_env/models.py
index b218c5f78..3e31962fb 100644
--- a/envs/opencode_env/models.py
+++ b/envs/coding_agent_env/models.py
@@ -4,11 +4,11 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Pydantic models for the deployed opencode_env HTTP server.
+"""Pydantic models for the deployed coding_agent_env HTTP server.
 
 The server exposes a single MCP tool ``run_rollout`` that takes a Task
 (instruction + setup commands + verify commands) plus an LLM endpoint
-config, runs one OpenCode rollout end-to-end inside an E2B sandbox, and
+config, runs one coding-agent rollout end-to-end inside an E2B sandbox, and
 returns a :class:`RolloutResult` JSON.
 """
 
@@ -80,8 +80,8 @@ class RolloutResult(BaseModel):
     error: str | None = None
 
 
-class OpenCodeState(State):
-    """Per-session env state across calls to one OpenCodeEnvironment instance.
+class CodingAgentState(State):
+    """Per-session env state across calls to one CodingAgentEnvironment instance.
 
     Each HTTP session gets its own env (``SUPPORTS_CONCURRENT_SESSIONS=True``
     on the server class), so this state is per-session.
diff --git a/envs/opencode_env/opencode_runtime.py b/envs/coding_agent_env/opencode_runtime.py
similarity index 86%
rename from envs/opencode_env/opencode_runtime.py
rename to envs/coding_agent_env/opencode_runtime.py
index 75fed41e3..49855528b 100644
--- a/envs/opencode_env/opencode_runtime.py
+++ b/envs/coding_agent_env/opencode_runtime.py
@@ -16,34 +16,34 @@
 import json
 from typing import Any
 
-from .config import OpenCodeConfig, provider_npm_package
+from .config import CodingAgentConfig, provider_npm_package
 
 
-def opencode_config_path(config: OpenCodeConfig) -> str:
+def opencode_config_path(config: CodingAgentConfig) -> str:
     return f"{config.sandbox_home}/.config/opencode/opencode.json"
 
 
-def instruction_path(config: OpenCodeConfig) -> str:
+def instruction_path(config: CodingAgentConfig) -> str:
     return f"{config.sandbox_home}/task/instruction.md"
 
 
-def agent_log_path(config: OpenCodeConfig) -> str:
+def agent_log_path(config: CodingAgentConfig) -> str:
     return f"{config.sandbox_home}/logs/agent/opencode.jsonl"
 
 
-def system_prompt_path(config: OpenCodeConfig) -> str:
+def system_prompt_path(config: CodingAgentConfig) -> str:
     return f"{config.sandbox_home}/task/system.md"
 
 
-def verifier_reward_path(config: OpenCodeConfig) -> str:
+def verifier_reward_path(config: CodingAgentConfig) -> str:
     return f"{config.sandbox_home}/logs/verifier/reward.txt"
 
 
-def workdir_path(config: OpenCodeConfig) -> str:
+def workdir_path(config: CodingAgentConfig) -> str:
     return f"{config.sandbox_home}/workdir"
 
 
-def build_opencode_json(config: OpenCodeConfig) -> str:
+def build_opencode_json(config: CodingAgentConfig) -> str:
     """Return the serialized ``opencode.json`` the sandbox should install.
 
     Provider block is keyed by a stable internal name (``intercepted``) so the
@@ -79,7 +79,7 @@ def build_opencode_json(config: OpenCodeConfig) -> str:
     return json.dumps(doc, indent=2)
 
 
-def build_install_cmd(config: OpenCodeConfig) -> str:
+def build_install_cmd(config: CodingAgentConfig) -> str:
     """Return the shell command that installs OpenCode + ensures PATH.
 
     The upstream installer honors ``OPENCODE_VERSION=x.y.z`` for pinning;
@@ -99,7 +99,7 @@ def build_install_cmd(config: OpenCodeConfig) -> str:
     )
 
 
-def build_run_cmd(config: OpenCodeConfig) -> str:
+def build_run_cmd(config: CodingAgentConfig) -> str:
     """Return the shell command that launches OpenCode against a task."""
 
     format_flag = "--format json" if config.run_format == "json" else ""
@@ -112,7 +112,7 @@ def build_run_cmd(config: OpenCodeConfig) -> str:
 
 
 def build_env_vars(
-    config: OpenCodeConfig, *, base_url_override: str | None = None
+    config: CodingAgentConfig, *, base_url_override: str | None = None
 ) -> dict[str, str]:
     """Return env vars to set on the OpenCode process.
 
@@ -129,7 +129,7 @@ def build_env_vars(
     return env
 
 
-def _build_tools_block(config: OpenCodeConfig) -> dict[str, bool]:
+def _build_tools_block(config: CodingAgentConfig) -> dict[str, bool]:
     """Translate enabled/disabled lists into opencode's ``tools`` map."""
 
     if config.enabled_tools is not None:
diff --git a/envs/opencode_env/openenv.yaml b/envs/coding_agent_env/openenv.yaml
similarity index 76%
rename from envs/opencode_env/openenv.yaml
rename to envs/coding_agent_env/openenv.yaml
index 2a534a088..be34c3a51 100644
--- a/envs/opencode_env/openenv.yaml
+++ b/envs/coding_agent_env/openenv.yaml
@@ -1,5 +1,5 @@
 spec_version: 1
-name: opencode_env
+name: coding_agent_env
 type: space
 runtime: fastapi
 app: server.app:app
diff --git a/envs/opencode_env/pyproject.toml b/envs/coding_agent_env/pyproject.toml
similarity index 68%
rename from envs/opencode_env/pyproject.toml
rename to envs/coding_agent_env/pyproject.toml
index 50337baa2..276d3e0be 100644
--- a/envs/opencode_env/pyproject.toml
+++ b/envs/coding_agent_env/pyproject.toml
@@ -9,9 +9,9 @@ requires = ["setuptools>=45", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "openenv-opencode-env"
+name = "openenv-coding-agent-env"
 version = "0.1.0"
-description = "OpenCode coding-agent environment for OpenEnv — runs the OpenCode CLI in an E2B sandbox against any OpenAI-compatible LLM, optionally capturing per-token logprobs."
+description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints, optionally capturing per-token logprobs."
 requires-python = ">=3.10"
 dependencies = [
     # Core OpenEnv (server + MCP). 0.3.0 ships the harness runtime.
@@ -40,17 +40,16 @@ dev = [
 
 [project.scripts]
 # Server entrypoint — enables ``uv run --project . server``.
-server = "opencode_env.server.app:main"
+server = "coding_agent_env.server.app:main"
 
 [tool.setuptools]
 include-package-data = true
 packages = [
-    "opencode_env",
-    "opencode_env.sandbox",
-    "opencode_env.server",
-    "opencode_env.tests",
+    "coding_agent_env",
+    "coding_agent_env.sandbox",
+    "coding_agent_env.server",
 ]
-package-dir = { "opencode_env" = ".", "opencode_env.sandbox" = "sandbox", "opencode_env.server" = "server", "opencode_env.tests" = "tests" }
+package-dir = { "coding_agent_env" = ".", "coding_agent_env.sandbox" = "sandbox", "coding_agent_env.server" = "server" }
 
 [tool.setuptools.package-data]
-opencode_env = ["**/*.md"]
+coding_agent_env = ["**/*.md"]
diff --git a/envs/opencode_env/sandbox/__init__.py b/envs/coding_agent_env/sandbox/__init__.py
similarity index 100%
rename from envs/opencode_env/sandbox/__init__.py
rename to envs/coding_agent_env/sandbox/__init__.py
diff --git a/envs/opencode_env/sandbox/build_template.py b/envs/coding_agent_env/sandbox/build_template.py
similarity index 91%
rename from envs/opencode_env/sandbox/build_template.py
rename to envs/coding_agent_env/sandbox/build_template.py
index 6e0ba4f75..e22b30185 100644
--- a/envs/opencode_env/sandbox/build_template.py
+++ b/envs/coding_agent_env/sandbox/build_template.py
@@ -25,10 +25,10 @@
 
 Usage::
 
-    .venv/bin/python envs/opencode_env/tests/build_e2b_template.py
-    # → builds (or rebuilds) ``opencode-rl`` template, prints template id
+    .venv/bin/python envs/coding_agent_env/sandbox/build_template.py
+    # → builds (or rebuilds) ``coding-agent-rl`` template, prints template id
 
-Then ``test_five_sorts_e2e.py`` will use it via ``--template opencode-rl``.
+Then rollout tests can use it via ``--template coding-agent-rl``.
 
 Requires ``E2B_API_KEY`` in the environment. First build is ~3-8 min;
 subsequent builds reuse the cache and can finish in <60s.
@@ -113,8 +113,8 @@ def main(argv: list[str] | None = None) -> int:
     p = argparse.ArgumentParser(prog="build_e2b_template")
     p.add_argument(
         "--name",
-        default="opencode-rl",
-        help="Template name (default: opencode-rl).",
+        default="coding-agent-rl",
+        help="Template name (default: coding-agent-rl).",
     )
     p.add_argument(
         "--skip-cache",
@@ -123,7 +123,7 @@ def main(argv: list[str] | None = None) -> int:
     )
     args = p.parse_args(argv)
 
-    _load_env(_REPO_ROOT / "envs" / "opencode_env" / "sandbox" / ".env")
+    _load_env(_REPO_ROOT / "envs" / "coding_agent_env" / "sandbox" / ".env")
     if not os.environ.get("E2B_API_KEY"):
         print("ERROR: E2B_API_KEY required.", file=sys.stderr)
         return 2
diff --git a/envs/opencode_env/server/Dockerfile b/envs/coding_agent_env/server/Dockerfile
similarity index 91%
rename from envs/opencode_env/server/Dockerfile
rename to envs/coding_agent_env/server/Dockerfile
index ad8319423..97e880343 100644
--- a/envs/opencode_env/server/Dockerfile
+++ b/envs/coding_agent_env/server/Dockerfile
@@ -4,14 +4,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 #
-# opencode_env Dockerfile — mirrors the standard OpenEnv multi-stage uv
+# coding_agent_env Dockerfile — mirrors the standard OpenEnv multi-stage uv
 # build used by echo_env / repl_env / jupyter_agent.
 #
 # Build:
-#   docker build -t opencode-env .
+#   docker build -t coding-agent-env .
 #
 # Run:
-#   docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
+#   docker run -p 8000:8000 -e E2B_API_KEY=e2b_... coding-agent-env
 
 ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
 FROM ${BASE_IMAGE} AS builder
diff --git a/envs/opencode_env/server/__init__.py b/envs/coding_agent_env/server/__init__.py
similarity index 79%
rename from envs/opencode_env/server/__init__.py
rename to envs/coding_agent_env/server/__init__.py
index 56363edaa..2eac4fb05 100644
--- a/envs/opencode_env/server/__init__.py
+++ b/envs/coding_agent_env/server/__init__.py
@@ -4,4 +4,4 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Server-side for the deployed opencode_env."""
+"""Server-side for the deployed coding_agent_env."""
diff --git a/envs/opencode_env/server/app.py b/envs/coding_agent_env/server/app.py
similarity index 81%
rename from envs/opencode_env/server/app.py
rename to envs/coding_agent_env/server/app.py
index 0757ef229..df40b507f 100644
--- a/envs/opencode_env/server/app.py
+++ b/envs/coding_agent_env/server/app.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""FastAPI app for the opencode_env MCP server.
+"""FastAPI app for the coding_agent_env MCP server.
 
 Mirrors the standard OpenEnv pattern (echo_env / repl_env / jupyter_agent)
 plus the custom Gradio UI mounted at ``/web`` per the
@@ -16,7 +16,7 @@
     E2B_API_KEY=... uvicorn server.app:app --host 0.0.0.0 --port 8000
 
     # Docker:
-    docker run -p 8000:8000 -e E2B_API_KEY=... opencode-env
+    docker run -p 8000:8000 -e E2B_API_KEY=... coding-agent-env
 
     # HF Space: deploys via the root ``Dockerfile``.
 
@@ -58,13 +58,13 @@ def _load_env_file() -> None:
     from openenv.core.env_server.http_server import create_app
     from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
 
-    from .gradio_ui import opencode_gradio_builder
-    from .opencode_environment import OpenCodeEnvironment
+    from .gradio_ui import coding_agent_gradio_builder
+    from .coding_environment import CodingAgentEnvironment
 except ImportError:  # pragma: no cover
     from openenv.core.env_server.http_server import create_app
     from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
-    from server.gradio_ui import opencode_gradio_builder  # type: ignore
-    from server.opencode_environment import OpenCodeEnvironment  # type: ignore
+    from server.gradio_ui import coding_agent_gradio_builder  # type: ignore
+    from server.coding_environment import CodingAgentEnvironment  # type: ignore
 
 
 # Always expose the Gradio UI at /web. Set ENABLE_WEB_INTERFACE=false to
@@ -80,22 +80,22 @@ def _custom_gradio_builder(
     title,
     quick_start_md,
 ):
-    """Hand off to ``server.gradio_ui.opencode_gradio_builder``."""
-    return opencode_gradio_builder(
+    """Hand off to ``server.gradio_ui.coding_agent_gradio_builder``."""
+    return coding_agent_gradio_builder(
         web_manager,
         action_fields,
         metadata,
         is_chat_env,
-        title or "opencode_env",
+        title or "coding_agent_env",
         quick_start_md,
     )
 
 
 app = create_app(
-    OpenCodeEnvironment,
+    CodingAgentEnvironment,
     CallToolAction,
     CallToolObservation,
-    env_name="opencode_env",
+    env_name="coding_agent_env",
     max_concurrent_envs=int(os.getenv("MAX_CONCURRENT_ENVS", "4")),
     gradio_builder=_custom_gradio_builder,
 )
diff --git a/envs/opencode_env/server/catalog.py b/envs/coding_agent_env/server/catalog.py
similarity index 100%
rename from envs/opencode_env/server/catalog.py
rename to envs/coding_agent_env/server/catalog.py
diff --git a/envs/opencode_env/server/opencode_environment.py b/envs/coding_agent_env/server/coding_environment.py
similarity index 70%
rename from envs/opencode_env/server/opencode_environment.py
rename to envs/coding_agent_env/server/coding_environment.py
index 638dd5473..3f8eabd13 100644
--- a/envs/opencode_env/server/opencode_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -4,11 +4,11 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""OpenCode MCP environment.
+"""Coding-agent MCP environment.
 
-Single MCP tool ``run_rollout`` that takes a uniform Task shape:
+Single MCP tool ``run_rollout`` with a uniform task shape:
 
-  - ``instruction``  — prompt for the agent
+  - ``instruction``  — prompt for the selected agent
   - ``setup``        — bash commands run BEFORE the agent (in the sandbox)
   - ``verify``       — bash commands run AFTER the agent
 
@@ -28,6 +28,7 @@
 from uuid import uuid4
 
 from fastmcp import FastMCP
+from pydantic import BaseModel, Field
 
 try:
     from openenv.core.env_server.mcp_environment import MCPEnvironment
@@ -40,7 +41,7 @@
     from server.catalog import ENDPOINT_KINDS, resolve_endpoint  # type: ignore
 
 
-# One rollout (sandbox cold start + opencode install + opencode run +
+# One rollout (sandbox cold start + harness install + agent run +
 # verifier) typically takes 30-180s; can spike to ~600s under load. Override
 # OpenEnv's 30s MCP-tool default so the server doesn't cut us off.
 _RUN_ROLLOUT_TIMEOUT_S = 900.0
@@ -53,9 +54,27 @@
 PROXY_LOG = f"{HOME}/logs/agent/proxy.log"
 AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"
 VERIFY_TIMEOUT_S = 120
+_SUPPORTED_AGENTS = ("opencode", "pi")
+_AGENT_LOG_BY_AGENT: dict[str, str] = {
+    "opencode": f"{HOME}/logs/agent/opencode.jsonl",
+    "pi": f"{HOME}/logs/agent/pi.txt",
+}
 
 
-class OpenCodeEnvironment(MCPEnvironment):
+class _GenericAgentConfig(BaseModel):
+    """Minimal config shape for CLIAgentSessionFactory-backed agents."""
+
+    base_url: str  # _build_agent_config strips any trailing "/" first
+    api_key: str
+    model: str
+    agent_timeout_s: float = 600.0
+    sandbox_home: str = HOME
+    provider: str | None = None  # _build_agent_config: "openai" if proxied, else inferred
+    thinking: str | None = "off"  # builder passes "off" if disable_thinking, else None
+    extra_env: dict[str, str] = Field(default_factory=dict)
+
+
+class CodingAgentEnvironment(MCPEnvironment):
     """Per-session environment exposing a single ``run_rollout`` MCP tool."""
 
     SUPPORTS_CONCURRENT_SESSIONS = True
@@ -65,33 +84,37 @@ def __init__(self) -> None:
         try:
             from ..models import (
                 CommandResult,
-                OpenCodeState,
+                CodingAgentState,
                 RolloutResult,
                 RolloutTurn,
             )
         except ImportError:  # pragma: no cover
             from models import (  # type: ignore
                 CommandResult,
-                OpenCodeState,
+                CodingAgentState,
                 RolloutResult,
                 RolloutTurn,
             )
 
-        from opencode_env import (
+        from openenv.core.harness.agents import get_agent_spec
+        from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
+        from coding_agent_env import (
             E2BSandboxBackend,
-            OpenCodeConfig,
-            OpenCodeSessionFactory,
-            OpenCodeTask,
+            CodingAgentConfig,
+            CodingAgentSessionFactory,
+            CodingAgentTask,
         )
 
         self._CommandResult = CommandResult
         self._RolloutResult = RolloutResult
         self._RolloutTurn = RolloutTurn
-        self._OpenCodeState = OpenCodeState
-        self._OpenCodeConfig = OpenCodeConfig
-        self._OpenCodeSessionFactory = OpenCodeSessionFactory
-        self._OpenCodeTask = OpenCodeTask
+        self._CodingAgentState = CodingAgentState
+        self._CodingAgentConfig = CodingAgentConfig
+        self._CodingAgentSessionFactory = CodingAgentSessionFactory
+        self._CodingAgentTask = CodingAgentTask
         self._E2BSandboxBackend = E2BSandboxBackend
+        self._CLIAgentSessionFactory = CLIAgentSessionFactory
+        self._get_agent_spec = get_agent_spec
 
         # Don't raise on missing E2B_API_KEY here — OpenEnv's web-interface
         # layer instantiates the env at import time for schema introspection,
@@ -99,12 +122,14 @@ def __init__(self) -> None:
         # just exploring. The real check happens lazily in
         # ``_run_rollout_impl`` (any rollout without creds fails fast there
         # with a clear error in the result payload).
-        self._state = self._OpenCodeState(episode_id=str(uuid4()))
+        self._state = self._CodingAgentState(episode_id=str(uuid4()))
 
-        mcp = FastMCP("opencode_env")
+        mcp = FastMCP("coding_agent_env")
 
         @mcp.tool
         def run_rollout(
+            # Agent — harness CLI to run inside the sandbox (see _SUPPORTED_AGENTS).
+            agent: str = "opencode",
             # Endpoint — either a shorthand (resolved from env vars + catalog
             # defaults) OR explicit base_url+api_key+model. Explicit fields
             # always win over the catalog.
@@ -125,14 +150,17 @@ def run_rollout(
             agent_timeout_s: float = 600.0,
             template: str = "",
         ) -> str:
-            """Run one OpenCode rollout end-to-end.
+            """Run one coding-agent rollout end-to-end.
+
+            ``agent`` selects the harness CLI to run inside the sandbox.
+            Currently supported: ``"opencode"``, ``"pi"``.
 
             ``endpoint`` is the shorthand selector (one of
             ``"vllm"`` / ``"openai"`` / ``"hf_router"``) — the server
             resolves base_url / api_key / model from env vars + catalog
             defaults. Pass any of those explicitly to override.
 
-            See ``opencode_env.client.OpenCodeEnv.run_rollout`` for full
+            See ``coding_agent_env.client.CodingAgentEnv.run_rollout`` for full
             arg docs. Returns a JSON-serialized ``RolloutResult``.
             """
             # Resolve via catalog when shorthand is provided.
@@ -149,6 +177,11 @@ def run_rollout(
             if disable_thinking_resolved is None:
                 disable_thinking_resolved = False
 
+            agent = (agent or "opencode").strip()
+            if agent not in _SUPPORTED_AGENTS:
+                raise ValueError(
+                    f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}"
+                )
             if not (base_url and api_key and model):
                 raise ValueError(
                     "must provide either ``endpoint`` (one of "
@@ -158,6 +191,7 @@ def run_rollout(
                 raise ValueError("instruction is required")
 
             return self._run_rollout_impl(
+                agent=agent,
                 base_url=base_url,
                 api_key=api_key,
                 model=model,
@@ -183,13 +217,15 @@ def reset(
         episode_id: Optional[str] = None,
         **_: Any,
     ) -> Observation:
-        self._state = self._OpenCodeState(episode_id=episode_id or str(uuid4()))
+        self._state = self._CodingAgentState(episode_id=episode_id or str(uuid4()))
         return Observation(
             done=False,
             reward=None,
             metadata={
                 "status": "ready",
-                "message": ("opencode_env ready. Call run_rollout(...) with a task."),
+                "message": (
+                    "coding_agent_env ready. Call run_rollout(agent=..., ...) with a task."
+                ),
             },
         )
 
@@ -239,6 +275,7 @@ def state(self) -> Any:
     def _run_rollout_impl(
         self,
         *,
+        agent: str,
         base_url: str,
         api_key: str,
         model: str,
@@ -279,19 +316,18 @@ def _emit(msg: str) -> None:
             _emit("error: E2B_API_KEY missing on server")
             return result.model_dump_json()
 
-        _emit(f"resolving config (model={model}, mode={mode})")
+        _emit(f"resolving config (agent={agent}, model={model}, mode={mode})")
 
-        # Build OpenCodeConfig + factory. We keep the proxy in charge of
-        # ``model_override`` / ``logprobs`` / ``max_tokens``-cap injection.
-        config = self._OpenCodeConfig(
-            provider="openai_compatible",
-            base_url=base_url.rstrip("/"),
+        config = self._build_agent_config(
+            agent=agent,
+            mode=mode,
+            base_url=base_url,
             api_key=api_key,
             model=model,
             agent_timeout_s=agent_timeout_s,
-            proxy_disable_thinking=disable_thinking,
-            proxy_top_logprobs=top_logprobs,
-            proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
+            disable_thinking=disable_thinking,
+            top_logprobs=top_logprobs,
+            max_tokens_cap=max_tokens_cap,
         )
 
         # Concatenate setup commands into a single ``set -e`` script and let
@@ -300,21 +336,19 @@ def _emit(msg: str) -> None:
         # each command in a wrapper that captures exit/stdout/stderr.
         # That way the primitive still aborts on setup failure AND we get
         # observability in the response.
-        instruction_payload = instruction
-        opencode_task = self._OpenCodeTask(
-            instruction=instruction_payload,
-            metadata={"task_id": task_id},
+        rollout_task = self._CodingAgentTask(
+            instruction=instruction,
+            metadata={"task_id": task_id, "agent": agent},
         )
 
-        backend_kwargs: dict[str, Any] = {}
-        if template:
-            backend_kwargs["template"] = template
-
-        factory = self._OpenCodeSessionFactory(
+        factory = self._build_session_factory(
+            agent=agent,
             config=config,
-            sandbox_backend=self._E2BSandboxBackend(**backend_kwargs),
             mode=mode,
-            verifier=None,
+            template=template,
+            disable_thinking=disable_thinking,
+            top_logprobs=top_logprobs,
+            max_tokens_cap=max_tokens_cap,
         )
 
         session = None
@@ -323,7 +357,7 @@ def _emit(msg: str) -> None:
                 f"creating E2B sandbox (template={template or 'default'}) — "
                 "this is the slow phase (~5–60s cold, ~5s with template)"
             )
-            session = factory.create(task=opencode_task)
+            session = factory.create(task=rollout_task)
             result.sandbox_id = session.sandbox.sandbox_id
             _emit(
                 f"sandbox ready: {result.sandbox_id} — agent started "
@@ -336,7 +370,7 @@ def _emit(msg: str) -> None:
             # we'd need to restructure. As a pragmatic compromise we run
             # setup IMMEDIATELY after create(), which races with the agent
             # for ~1-2s but is fine for typical pip/git/download work
-            # because opencode itself takes >=20s to make its first model
+            # because most agent CLIs take a while before their first model
             # call.
             for i, cmd in enumerate(setup, 1):
                 _emit(f"setup [{i}/{len(setup)}]: {cmd[:80]}")
@@ -352,7 +386,7 @@ def _emit(msg: str) -> None:
             # Block until the agent is done (or setup already failed).
             if result.error is None:
                 _emit(
-                    f"agent running — opencode CLI in sandbox "
+                    f"agent running — {agent} CLI in sandbox "
                     f"(timeout {int(agent_timeout_s)}s)"
                 )
                 try:
@@ -387,7 +421,7 @@ def _emit(msg: str) -> None:
             result.files, result.files_extra = self._collect_files(session.sandbox)
             result.proxy_turns = self._collect_proxy_turns(session)
             result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
-            result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
+            result.agent_log_tail = self._collect_agent_log_tail(session, agent)
             _emit(
                 f"collected: {len(result.files)} file(s), "
                 f"{len(result.proxy_turns)} proxy turn(s), "
@@ -400,9 +434,7 @@ def _emit(msg: str) -> None:
                 result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[
                     -2000:
                 ]
-                result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[
-                    -2000:
-                ]
+                result.agent_log_tail = self._collect_agent_log_tail(session, agent)
         finally:
             if session is not None:
                 try:
@@ -422,6 +454,104 @@ def _emit(msg: str) -> None:
 
         return result.model_dump_json()
 
+    def _build_agent_config(
+        self,
+        *,
+        agent: str,
+        mode: str,
+        base_url: str,
+        api_key: str,
+        model: str,
+        agent_timeout_s: float,
+        disable_thinking: bool,
+        top_logprobs: int,  # only read by the opencode branch of this method
+        max_tokens_cap: int,  # only read by the opencode branch; <= 0 becomes None
+    ) -> Any:
+        if agent == "opencode":  # opencode gets its own config class with proxy_* fields
+            return self._CodingAgentConfig(
+                provider="openai_compatible",
+                base_url=base_url.rstrip("/"),
+                api_key=api_key,
+                model=model,
+                agent_timeout_s=agent_timeout_s,
+                proxy_disable_thinking=disable_thinking,
+                proxy_top_logprobs=top_logprobs,
+                proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
+            )
+
+        provider = (  # any non-opencode agent: provider depends on mode / base_url
+            "openai" if mode == "transparent_proxy" else self._infer_pi_provider(base_url)
+        )
+        return _GenericAgentConfig(
+            base_url=base_url.rstrip("/"),
+            api_key=api_key,
+            model=model,
+            agent_timeout_s=agent_timeout_s,
+            provider=provider,
+            thinking="off" if disable_thinking else None,  # None when not disabled
+        )
+
+    def _build_session_factory(
+        self,
+        *,
+        agent: str,
+        config: Any,
+        mode: str,
+        template: str,
+        disable_thinking: bool,
+        top_logprobs: int,
+        max_tokens_cap: int,
+    ) -> Any:
+        backend_kwargs: dict[str, Any] = {}
+        if template:  # empty template: construct the backend with no template kwarg
+            backend_kwargs["template"] = template
+        backend = self._E2BSandboxBackend(**backend_kwargs)
+
+        if agent == "opencode":  # no proxy_* kwargs are passed to this factory
+            return self._CodingAgentSessionFactory(
+                config=config,
+                sandbox_backend=backend,
+                mode=mode,
+                verifier=None,
+            )
+
+        spec = self._get_agent_spec(agent)  # every other agent: spec looked up by name
+        return self._CLIAgentSessionFactory(  # proxy settings are passed here instead
+            spec=spec,
+            config=config,
+            sandbox_backend=backend,
+            mode=mode,
+            verifier=None,
+            proxy_disable_thinking=disable_thinking,
+            proxy_top_logprobs=top_logprobs,
+            proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
+        )
+
+    @staticmethod
+    def _infer_pi_provider(base_url: str) -> str:
+        """Infer the provider from ``base_url``; falls back to ``openai``."""
+        lowered = (base_url or "").lower()
+        if "router.huggingface.co" in lowered:
+            return "huggingface"
+        if "anthropic" in lowered:
+            return "anthropic"
+        is_gemini = "googleapis.com" in lowered or "generativelanguage" in lowered
+        return "gemini" if is_gemini else "openai"
+
+    def _collect_agent_log_tail(self, session: Any, agent: str) -> str:
+        """Return the last 2000 chars of the agent log (artifacts, else file)."""
+        if hasattr(session, "collect_artifacts"):
+            try:
+                bundle = session.collect_artifacts()
+                if isinstance(bundle, dict) and "agent_log" in bundle:
+                    log = bundle["agent_log"]
+                    text = log if isinstance(log, str) else json.dumps(log, default=str)
+                    return text[-2000:]
+            except Exception:  # best-effort: fall through to the log-file read
+                pass
+        fallback = _AGENT_LOG_BY_AGENT.get(agent, AGENT_LOG)
+        return self._safe_read(session.sandbox, fallback)[-2000:]
+
     # ── Helpers ────────────────────────────────────────────────────────────
 
     def _exec_command(self, sandbox: Any, cmd: str) -> Any:
@@ -471,18 +601,33 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]:
 
     def _collect_proxy_turns(self, session: Any) -> list[Any]:
         turns: list[Any] = []
-        proxy_trace_path = getattr(session, "_proxy_trace_path", None)
-        if not proxy_trace_path:
-            return turns
-        raw = self._safe_read(session.sandbox, proxy_trace_path)
-        for line in raw.splitlines():
-            line = line.strip()
-            if not line:
-                continue
+
+        records: list[dict[str, Any]] = []
+        if hasattr(session, "fetch_proxy_trace"):
             try:
-                rec = json.loads(line)
+                fetched = session.fetch_proxy_trace()
+                if isinstance(fetched, list):
+                    records = [r for r in fetched if isinstance(r, dict)]
             except Exception:
-                continue
+                records = []
+
+        if not records:
+            proxy_trace_path = getattr(session, "_proxy_trace_path", None)
+            if not proxy_trace_path:
+                return turns
+            raw = self._safe_read(session.sandbox, proxy_trace_path)
+            for line in raw.splitlines():
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    rec = json.loads(line)
+                except Exception:
+                    continue
+                if isinstance(rec, dict):
+                    records.append(rec)
+
+        for rec in records:
             response = rec.get("response") or {}
             turns.append(
                 self._RolloutTurn(
@@ -509,3 +654,4 @@ def _safe_read(sandbox: Any, path: str) -> str:
             return sandbox.read_text(path) or ""
         except Exception:
             return ""
+
diff --git a/envs/opencode_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py
similarity index 92%
rename from envs/opencode_env/server/gradio_ui.py
rename to envs/coding_agent_env/server/gradio_ui.py
index d1ee6e403..5497ef0f2 100644
--- a/envs/opencode_env/server/gradio_ui.py
+++ b/envs/coding_agent_env/server/gradio_ui.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Minimal Gradio UI for opencode_env.
+"""Minimal Gradio UI for coding_agent_env.
 
 Mounts under the standard OpenEnv ``/web`` path via the
 ``gradio_builder=`` callback documented at
@@ -32,14 +32,14 @@
 
 try:
     from .catalog import catalog_summary, ENDPOINT_KINDS, resolve_endpoint
-    from .opencode_environment import OpenCodeEnvironment
+    from .coding_environment import CodingAgentEnvironment
 except ImportError:  # pragma: no cover
     from server.catalog import (  # type: ignore
         catalog_summary,
         ENDPOINT_KINDS,
         resolve_endpoint,
     )
-    from server.opencode_environment import OpenCodeEnvironment  # type: ignore
+    from server.coding_environment import CodingAgentEnvironment  # type: ignore
 
 
 # ────────────────────────────────────────────────────────────────────────────
@@ -190,8 +190,8 @@ def _logprobs_md(turns: list[dict[str, Any]]) -> str:
         toks = first["completion_tokens"][:10]
         lps = first.get("per_token_logps") or []
         lines.append(
-            f"\n**first productive turn (first 10 tokens)**\n\n"
-            f"```\n"
+            "\n**first productive turn (first 10 tokens)**\n\n"
+            "```\n"
             + "\n".join(
                 f"  {tok!r:<14}  {lp:+.3f}" if i < len(lps) else f"  {tok!r:<14}  -"
                 for i, (tok, lp) in enumerate(zip(toks, lps + [None] * len(toks)))
@@ -202,6 +202,7 @@ def _logprobs_md(turns: list[dict[str, Any]]) -> str:
 
 
 def _live_status_md(
+    agent: str,
     endpoint_kind: str,
     model: str,
     mode: str,
@@ -211,7 +212,7 @@ def _live_status_md(
     """Render a live phase log (latest at the bottom) with elapsed timestamps."""
     head = (
         f"### running…  `elapsed={elapsed_s:.1f}s`\n\n"
-        f"_endpoint=`{endpoint_kind}`  model=`{model}`  mode=`{mode}`_\n\n"
+        f"_agent=`{agent}`  endpoint=`{endpoint_kind}`  model=`{model}`  mode=`{mode}`_\n\n"
     )
     if not lines:
         body = "_(waiting for first phase update…)_"
@@ -255,7 +256,7 @@ def _catalog_banner() -> str:
 # ────────────────────────────────────────────────────────────────────────────
 
 
-def opencode_gradio_builder(
+def coding_agent_gradio_builder(
     web_manager,  # noqa: ARG001 (unused: we instantiate the env directly)
     action_fields,  # noqa: ARG001
     metadata,  # noqa: ARG001
@@ -263,16 +264,17 @@ def opencode_gradio_builder(
     title,
     quick_start_md,  # noqa: ARG001
 ) -> gr.Blocks:
-    """Build the opencode_env console.
+    """Build the coding_agent_env console.
 
     Compatible with ``create_app(..., gradio_builder=...)``. We ignore
-    ``web_manager`` and instantiate :class:`OpenCodeEnvironment` ourselves
-    inside the run handler — opencode_env's run_rollout doesn't need any
+    ``web_manager`` and instantiate :class:`CodingAgentEnvironment` ourselves
+    inside the run handler — coding_agent_env's run_rollout doesn't need any
     per-session state beyond the env's own bookkeeping, and instantiating
     is cheap (no sandbox is created until the tool fires).
     """
 
     def run(
+        agent: str,
         endpoint: str,
         model: str,
         base_url: str,
@@ -317,7 +319,7 @@ def run(
         else:
             dt = None
 
-        env = OpenCodeEnvironment()
+        env = CodingAgentEnvironment()
 
         # The worker fires _run_rollout_impl in a background thread and
         # streams progress messages into a queue; this generator polls the
@@ -331,6 +333,7 @@ def _cb(msg: str) -> None:
         def _worker():
             try:
                 payload = env._run_rollout_impl(
+                    agent=agent,
                     base_url=resolved.base_url,
                     api_key=resolved.api_key,
                     model=resolved.model,
@@ -361,7 +364,7 @@ def _worker():
 
         # First yield: announce we've started. Empty result panels.
         yield (
-            f"### running…\n\n_endpoint=`{resolved.kind}`  model=`{resolved.model}`  mode=`{mode}`_",
+            f"### running…\n\n_agent=`{agent}`  endpoint=`{resolved.kind}`  model=`{resolved.model}`  mode=`{mode}`_",
             [],
             [],
             "",
@@ -387,7 +390,12 @@ def _worker():
             # Render the live status pane.
             elapsed = time.time() - t_start
             md = _live_status_md(
-                resolved.kind, resolved.model, mode, elapsed, status_lines
+                agent,
+                resolved.kind,
+                resolved.model,
+                mode,
+                elapsed,
+                status_lines,
             )
             yield (md, [], [], "", "", "", {})
 
@@ -409,6 +417,7 @@ def _worker():
                 "",
                 "",
                 _live_status_md(
+                    agent,
                     resolved.kind,
                     resolved.model,
                     mode,
@@ -427,8 +436,9 @@ def _worker():
             _files_md(result.get("files") or {}),
             _logprobs_md(result.get("proxy_turns") or []),
             (
-                f"### live phase log\n\n"
+                "### live phase log\n\n"
                 + _live_status_md(
+                    agent,
                     resolved.kind,
                     resolved.model,
                     mode,
@@ -445,17 +455,24 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         p = PRESETS.get(name) or {"instruction": "", "setup": "", "verify": ""}
         return p["instruction"], p["setup"], p["verify"]
 
-    with gr.Blocks(title=title or "opencode_env") as app:
-        gr.Markdown(f"# {title or 'opencode_env'}")
+    with gr.Blocks(title=title or "coding_agent_env") as app:
+        gr.Markdown(f"# {title or 'coding_agent_env'}")
         gr.Markdown(
-            "Run one OpenCode rollout in an E2B sandbox against your chosen "
-            "LLM endpoint. Pick an endpoint, write the task as `(instruction, "
-            "setup, verify)`, and inspect the reward + per-token logprobs."
+            "Run one coding-agent rollout in an E2B sandbox against your chosen "
+            "LLM endpoint. Pick an agent + endpoint, write the task as "
+            "`(instruction, setup, verify)`, and inspect reward + per-token "
+            "logprobs."
         )
 
         gr.Markdown(_catalog_banner())
 
         with gr.Row():
+            agent = gr.Dropdown(
+                choices=["opencode", "pi"],
+                value="opencode",
+                label="Agent",
+                scale=1,
+            )
             endpoint = gr.Dropdown(
                 choices=list(ENDPOINT_KINDS),
                 value="openai",
@@ -481,7 +498,7 @@ def apply_preset(name: str) -> tuple[str, str, str]:
             )
 
         instruction = gr.Textbox(
-            label="Instruction (the prompt opencode runs)",
+            label="Instruction (the prompt the selected agent runs)",
             lines=4,
             value=PRESETS["binary_search"]["instruction"],
         )
@@ -567,6 +584,7 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         run_btn.click(
             fn=run,
             inputs=[
+                agent,
                 endpoint,
                 model,
                 base_url,
@@ -593,3 +611,4 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         )
 
     return app
+
diff --git a/envs/opencode_env/task.py b/envs/coding_agent_env/task.py
similarity index 73%
rename from envs/opencode_env/task.py
rename to envs/coding_agent_env/task.py
index f9d208d84..8633eb7aa 100644
--- a/envs/opencode_env/task.py
+++ b/envs/coding_agent_env/task.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Task payload accepted by :class:`OpenCodeSessionFactory`."""
+"""Task payload accepted by :class:`CodingAgentSessionFactory`."""
 
 from __future__ import annotations
 
@@ -13,8 +13,8 @@
 from pydantic import BaseModel, Field
 
 
-class OpenCodeTask(BaseModel):
-    """One task for an OpenCode rollout.
+class CodingAgentTask(BaseModel):
+    """One task for a coding-agent rollout.
 
     The primitive only needs ``instruction`` (the prompt handed to ``opencode
     run``). Callers may attach ``setup_shell`` (run once inside the sandbox
@@ -29,8 +29,8 @@ class OpenCodeTask(BaseModel):
     metadata: dict[str, Any] = Field(default_factory=dict)
 
     @classmethod
-    def coerce(cls, value: Any) -> "OpenCodeTask":
-        """Accept a bare string, a dict, or an existing ``OpenCodeTask``."""
+    def coerce(cls, value: Any) -> "CodingAgentTask":
+        """Accept a bare string, a dict, or an existing ``CodingAgentTask``."""
         if isinstance(value, cls):
             return value
         if isinstance(value, str):
@@ -38,6 +38,6 @@ def coerce(cls, value: Any) -> "OpenCodeTask":
         if isinstance(value, dict):
             return cls(**value)
         raise TypeError(
-            f"Cannot coerce {type(value).__name__} to OpenCodeTask; "
-            "pass a str, dict, or OpenCodeTask."
+            f"Cannot coerce {type(value).__name__} to CodingAgentTask; "
+            "pass a str, dict, or CodingAgentTask."
         )
diff --git a/envs/opencode_env/uv.lock b/envs/coding_agent_env/uv.lock
similarity index 99%
rename from envs/opencode_env/uv.lock
rename to envs/coding_agent_env/uv.lock
index 80dd00ba0..aa35531cc 100644
--- a/envs/opencode_env/uv.lock
+++ b/envs/coding_agent_env/uv.lock
@@ -1664,38 +1664,7 @@ wheels = [
 ]
 
 [[package]]
-name = "openenv-core"
-version = "0.2.3"
-source = { git = "https://github.com/adithya-s-k/OpenEnv.git?rev=opencode-harness#aabcdbb9d52aa62a842ec69472b2a1106acb831a" }
-dependencies = [
-    { name = "fastapi" },
-    { name = "fastmcp" },
-    { name = "gradio" },
-    { name = "httpx" },
-    { name = "huggingface-hub" },
-    { name = "openai" },
-    { name = "pydantic" },
-    { name = "pyyaml" },
-    { name = "requests" },
-    { name = "rich" },
-    { name = "tomli" },
-    { name = "tomli-w" },
-    { name = "typer" },
-    { name = "uvicorn" },
-    { name = "websockets" },
-]
-
-[package.optional-dependencies]
-core = [
-    { name = "fastapi" },
-    { name = "pydantic" },
-    { name = "requests" },
-    { name = "uvicorn" },
-    { name = "websockets" },
-]
-
-[[package]]
-name = "openenv-opencode-env"
+name = "openenv-coding-agent-env"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
@@ -1724,7 +1693,7 @@ requires-dist = [
     { name = "fastmcp", specifier = ">=2.0.0" },
     { name = "gradio", specifier = ">=6.0.0" },
     { name = "httpx", specifier = ">=0.27.0" },
-    { name = "openenv-core", extras = ["core"], git = "https://github.com/adithya-s-k/OpenEnv.git?rev=opencode-harness" },
+    { name = "openenv-core", extras = ["core"], specifier = ">=0.3.0" },
     { name = "pydantic", specifier = ">=2.0.0" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
     { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" },
@@ -1734,6 +1703,41 @@ requires-dist = [
 ]
 provides-extras = ["dev"]
 
+[[package]]
+name = "openenv-core"
+version = "0.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "fastapi" },
+    { name = "fastmcp" },
+    { name = "gradio" },
+    { name = "httpx" },
+    { name = "huggingface-hub" },
+    { name = "openai" },
+    { name = "pydantic" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "rich" },
+    { name = "tomli" },
+    { name = "tomli-w" },
+    { name = "typer" },
+    { name = "uvicorn" },
+    { name = "websockets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ce/d6/3bebe8afb55fcc3ea9251c4c2dfbab2879e31089bc91a8fe9696e5ce019b/openenv_core-0.3.0.tar.gz", hash = "sha256:c7fee2035badab5be497eb6f4afb2cb417de000f82cc19afd72fb5ec332c431d", size = 164720, upload-time = "2026-05-11T11:37:57.274Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f8/f5/aafa43138589bfd5d369a8d02ea365aae9d6fe55ac0b3894368d6d69bd03/openenv_core-0.3.0-py3-none-any.whl", hash = "sha256:859e875c9d5211b157c30fb9abc681606fcf0bf1b6ffcdf404678992823a1df0", size = 194313, upload-time = "2026-05-11T11:37:55.537Z" },
+]
+
+[package.optional-dependencies]
+core = [
+    { name = "fastapi" },
+    { name = "pydantic" },
+    { name = "requests" },
+    { name = "uvicorn" },
+    { name = "websockets" },
+]
+
 [[package]]
 name = "opentelemetry-api"
 version = "1.41.1"
diff --git a/examples/opencode_env_simple.py b/examples/coding_agent_env_simple.py
similarity index 83%
rename from examples/opencode_env_simple.py
rename to examples/coding_agent_env_simple.py
index 1713880fb..f8996e586 100644
--- a/examples/opencode_env_simple.py
+++ b/examples/coding_agent_env_simple.py
@@ -5,18 +5,18 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""End-to-end opencode_env example: write binary_search.py and verify it.
+"""End-to-end coding_agent_env example: write binary_search.py and verify it.
 
-Hits the deployed HF Space ``AdithyaSK/opencode-env`` (override via
-``OPENCODE_ENV_SPACE`` env var to point at your own Space or a local
+Hits the deployed HF Space ``AdithyaSK/coding-agent-env`` (override via
+``CODING_AGENT_ENV_SPACE`` env var to point at your own Space or a local
 container). The single MCP tool ``run_rollout`` does:
 
-  1. Spawns a fresh E2B sandbox (using the prebaked ``opencode-rl``
+  1. Spawns a fresh E2B sandbox (using the prebaked ``coding-agent-rl``
      template — falls back to a cold install if the template isn't
      present in your E2B account).
   2. Bootstraps an in-sandbox FastAPI proxy that captures per-token
      logprobs (``mode="transparent_proxy"``).
-  3. Runs ``opencode run`` with the instruction.
+  3. Runs the selected harness CLI with the instruction.
   4. Executes the verify bash commands; reward = passed / total.
   5. Returns a ``RolloutResult`` with reward + per-turn logprobs +
      the file contents the agent produced.
@@ -29,7 +29,7 @@
 
 Usage::
 
-    PYTHONPATH=src:envs uv run python examples/opencode_env_simple.py
+    PYTHONPATH=src:envs uv run python examples/coding_agent_env_simple.py
 
 Expected output (~20s with the prebaked template)::
 
@@ -49,12 +49,12 @@
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "envs"))
 
-from opencode_env import OpenCodeEnv  # noqa: E402
-from opencode_env.client import _extract_text  # noqa: E402
-from opencode_env.models import RolloutResult  # noqa: E402
+from coding_agent_env import CodingAgentEnv  # noqa: E402
+from coding_agent_env.client import _extract_text  # noqa: E402
+from coding_agent_env.models import RolloutResult  # noqa: E402
 
 
-SPACE = os.environ.get("OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space")
+SPACE = os.environ.get("CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space")
 
 INSTRUCTION = (
     "Create a single Python file named `binary_search.py` in the current "
@@ -91,7 +91,7 @@ async def main() -> int:
     print(f"Instruction:     {INSTRUCTION.splitlines()[0]} ...")
     print()
 
-    async with OpenCodeEnv(base_url=SPACE) as env:
+    async with CodingAgentEnv(base_url=SPACE) as env:
         await env.reset()
         raw = await env.call_tool(
             "run_rollout",
@@ -101,7 +101,7 @@ async def main() -> int:
             instruction=INSTRUCTION,
             setup=[],  # no setup commands
             verify=VERIFY,
-            template="opencode-rl",  # prebaked E2B template
+            template="coding-agent-rl",  # prebaked E2B template
             task_id="binary_search_simple",
             agent_timeout_s=600,
         )
diff --git a/tests/envs/test_opencode_env.py b/tests/envs/test_coding_agent_env.py
similarity index 73%
rename from tests/envs/test_opencode_env.py
rename to tests/envs/test_coding_agent_env.py
index 5e930b8bc..ec1f66fa5 100644
--- a/tests/envs/test_opencode_env.py
+++ b/tests/envs/test_coding_agent_env.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Smoke tests for ``opencode_env``.
+"""Smoke tests for ``coding_agent_env``.
 
 The default suite runs in CI without any external dependencies (no E2B,
 no LLM, no network). It covers:
@@ -13,7 +13,7 @@
   - The endpoint catalog (`vllm` / `openai` / `hf_router`) resolves
     explicit + env-var + default-value precedence correctly.
   - Pydantic models accept their expected shapes.
-  - The `OpenCodeTask` coercion helper handles str / dict / `OpenCodeTask`.
+  - The `CodingAgentTask` coercion helper handles str / dict / `CodingAgentTask`.
 
 A second class is marked ``@pytest.mark.integration`` and exercises the
 deployed Space end-to-end. It only runs when ``E2B_API_KEY`` and at least
@@ -45,15 +45,15 @@
 
 def test_public_api_imports() -> None:
     """Top-level package re-exports the documented surface."""
-    from opencode_env import (  # noqa: F401
+    from coding_agent_env import (  # noqa: F401
         CommandResult,
         E2BSandboxBackend,
-        OpenCodeConfig,
-        OpenCodeEnv,
-        OpenCodeSession,
-        OpenCodeSessionFactory,
-        OpenCodeState,
-        OpenCodeTask,
+        CodingAgentConfig,
+        CodingAgentEnv,
+        CodingAgentSession,
+        CodingAgentSessionFactory,
+        CodingAgentState,
+        CodingAgentTask,
         Provider,
         RolloutResult,
         RolloutTurn,
@@ -64,14 +64,14 @@ def test_public_api_imports() -> None:
 
 def test_server_modules_import() -> None:
     """Server-side modules (FastAPI app, MCP env, catalog) import cleanly."""
-    from opencode_env.server.app import app  # noqa: F401
-    from opencode_env.server.catalog import (  # noqa: F401
+    from coding_agent_env.server.app import app  # noqa: F401
+    from coding_agent_env.server.catalog import (  # noqa: F401
         catalog_summary,
         ENDPOINT_KINDS,
         resolve_endpoint,
     )
-    from opencode_env.server.opencode_environment import (  # noqa: F401
-        OpenCodeEnvironment,
+    from coding_agent_env.server.coding_environment import (  # noqa: F401
+        CodingAgentEnvironment,
     )
 
 
@@ -81,14 +81,14 @@ def test_server_modules_import() -> None:
 
 
 def test_catalog_kinds() -> None:
-    from opencode_env.server.catalog import ENDPOINT_KINDS
+    from coding_agent_env.server.catalog import ENDPOINT_KINDS
 
     assert ENDPOINT_KINDS == ("vllm", "openai", "hf_router")
 
 
 def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) -> None:
     """Explicit args beat env vars beat catalog defaults."""
-    from opencode_env.server.catalog import resolve_endpoint
+    from coding_agent_env.server.catalog import resolve_endpoint
 
     monkeypatch.setenv("OPENAI_API_KEY", "from-env")
     r = resolve_endpoint(
@@ -107,7 +107,7 @@ def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) ->
 def test_resolve_endpoint_env_var_used_when_arg_missing(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    from opencode_env.server.catalog import resolve_endpoint
+    from coding_agent_env.server.catalog import resolve_endpoint
 
     monkeypatch.setenv("OPENAI_API_KEY", "key-from-env")
     monkeypatch.setenv("OPENAI_MODEL", "gpt-4o")
@@ -121,7 +121,7 @@ def test_resolve_endpoint_normalizes_v1_suffix(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """Base URL gets ``/v1`` appended if missing, otherwise left alone."""
-    from opencode_env.server.catalog import resolve_endpoint
+    from coding_agent_env.server.catalog import resolve_endpoint
 
     monkeypatch.setenv("VLLM_URL", "https://my-vllm.example/")
     monkeypatch.setenv("VLLM_API_KEY", "x")
@@ -134,7 +134,7 @@ def test_resolve_endpoint_normalizes_v1_suffix(
 
 
 def test_resolve_endpoint_unknown_kind_raises() -> None:
-    from opencode_env.server.catalog import resolve_endpoint
+    from coding_agent_env.server.catalog import resolve_endpoint
 
     with pytest.raises(ValueError, match="unknown endpoint kind"):
         resolve_endpoint("bogus", base_url="x", api_key="y", model="z")
@@ -143,7 +143,7 @@ def test_resolve_endpoint_unknown_kind_raises() -> None:
 def test_resolve_endpoint_missing_creds_raises(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    from opencode_env.server.catalog import resolve_endpoint
+    from coding_agent_env.server.catalog import resolve_endpoint
 
     # Strip any inherited env vars.
     for k in ("OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_MODEL"):
@@ -153,7 +153,7 @@ def test_resolve_endpoint_missing_creds_raises(
 
 
 def test_catalog_summary_shape() -> None:
-    from opencode_env.server.catalog import catalog_summary
+    from coding_agent_env.server.catalog import catalog_summary
 
     summary = catalog_summary()
     assert {entry["kind"] for entry in summary} == {"vllm", "openai", "hf_router"}
@@ -166,13 +166,67 @@ def test_catalog_summary_shape() -> None:
         } <= entry.keys()
 
 
+def test_build_agent_config_opencode() -> None:
+    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
+
+    env = CodingAgentEnvironment()
+    cfg = env._build_agent_config(
+        agent="opencode",  # opencode branch: proxy_* fields are set on the config
+        mode="transparent_proxy",
+        base_url="https://api.openai.com/v1",
+        api_key="sk-test",
+        model="gpt-4o-mini",
+        agent_timeout_s=123.0,
+        disable_thinking=True,
+        top_logprobs=7,
+        max_tokens_cap=2048,
+    )
+    assert isinstance(cfg, env._CodingAgentConfig)
+    assert cfg.proxy_disable_thinking is True
+    assert cfg.proxy_top_logprobs == 7
+    assert cfg.proxy_max_tokens_cap == 2048  # a positive cap is passed through as-is
+
+
+def test_build_agent_config_pi() -> None:
+    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
+
+    env = CodingAgentEnvironment()
+    cfg = env._build_agent_config(
+        agent="pi",
+        mode="black_box",  # not transparent_proxy: provider is inferred from base_url
+        base_url="https://router.huggingface.co/v1",
+        api_key="hf_xxx",
+        model="zai-org/GLM-5.1",
+        agent_timeout_s=180.0,
+        disable_thinking=True,
+        top_logprobs=5,
+        max_tokens_cap=4096,
+    )
+    assert cfg.provider == "huggingface"  # matched on "router.huggingface.co"
+    assert cfg.thinking == "off"  # because disable_thinking=True
+    assert cfg.model == "zai-org/GLM-5.1"
+
+    cfg_proxy = env._build_agent_config(
+        agent="pi",
+        mode="transparent_proxy",  # this mode sets provider to "openai"
+        base_url="https://router.huggingface.co/v1",
+        api_key="hf_xxx",
+        model="zai-org/GLM-5.1",
+        agent_timeout_s=180.0,
+        disable_thinking=False,
+        top_logprobs=5,
+        max_tokens_cap=4096,
+    )
+    assert cfg_proxy.provider == "openai"
+
+
 # ---------------------------------------------------------------------------
 # Models + task coercion
 # ---------------------------------------------------------------------------
 
 
 def test_rollout_result_serializes_round_trip() -> None:
-    from opencode_env import CommandResult, RolloutResult, RolloutTurn
+    from coding_agent_env import CommandResult, RolloutResult, RolloutTurn
 
     r = RolloutResult(
         task_id="t1",
@@ -201,40 +255,40 @@ def test_rollout_result_serializes_round_trip() -> None:
     assert rebuilt.proxy_turns[0].completion_tokens == ["hi"]
 
 
-def test_opencode_task_coerce_str() -> None:
-    from opencode_env import OpenCodeTask
+def test_coding_agent_task_coerce_str() -> None:
+    from coding_agent_env import CodingAgentTask
 
-    t = OpenCodeTask.coerce("write fizzbuzz.py")
+    t = CodingAgentTask.coerce("write fizzbuzz.py")
     assert t.instruction == "write fizzbuzz.py"
     assert t.setup_shell is None
     assert t.upload_files == {}
 
 
-def test_opencode_task_coerce_dict() -> None:
-    from opencode_env import OpenCodeTask
+def test_coding_agent_task_coerce_dict() -> None:
+    from coding_agent_env import CodingAgentTask
 
-    t = OpenCodeTask.coerce({"instruction": "x", "setup_shell": "pip install pandas"})
+    t = CodingAgentTask.coerce({"instruction": "x", "setup_shell": "pip install pandas"})
     assert t.instruction == "x"
     assert t.setup_shell == "pip install pandas"
 
 
-def test_opencode_task_coerce_existing_passthrough() -> None:
-    from opencode_env import OpenCodeTask
+def test_coding_agent_task_coerce_existing_passthrough() -> None:
+    from coding_agent_env import CodingAgentTask
 
-    src = OpenCodeTask(instruction="y")
-    assert OpenCodeTask.coerce(src) is src
+    src = CodingAgentTask(instruction="y")
+    assert CodingAgentTask.coerce(src) is src
 
 
-def test_opencode_task_coerce_rejects_unknown_type() -> None:
-    from opencode_env import OpenCodeTask
+def test_coding_agent_task_coerce_rejects_unknown_type() -> None:
+    from coding_agent_env import CodingAgentTask
 
     with pytest.raises(TypeError, match="Cannot coerce"):
-        OpenCodeTask.coerce(42)  # type: ignore[arg-type]
+        CodingAgentTask.coerce(42)  # type: ignore[arg-type]
 
 
 def test_start_proxy_keeps_upstream_key_out_of_command() -> None:
     """The proxy API key must be passed via env, not shell argv."""
-    from opencode_env import OpenCodeConfig, OpenCodeSessionFactory
+    from coding_agent_env import CodingAgentConfig, CodingAgentSessionFactory
 
     class FakeExecResult:
         exit_code = 0
@@ -278,13 +332,13 @@ def kill(self) -> None:
 
     secret = "sk-test '$(leak)"
     model = "provider/model'; touch /tmp/pwn #"
-    config = OpenCodeConfig(
+    config = CodingAgentConfig(
         base_url="https://example.test/v1?x='y",
         api_key=secret,
         model=model,
     )
     sandbox = FakeSandbox()
-    factory = OpenCodeSessionFactory(
+    factory = CodingAgentSessionFactory(
         config=config,
         sandbox_backend=object(),  # unused by this protected-method test
         mode="transparent_proxy",
@@ -354,16 +408,16 @@ def test_run_rollout_e2e_via_deployed_space() -> None:
 
     import asyncio
 
-    from opencode_env import OpenCodeEnv
-    from opencode_env.client import _extract_text
-    from opencode_env.models import RolloutResult
+    from coding_agent_env import CodingAgentEnv
+    from coding_agent_env.client import _extract_text
+    from coding_agent_env.models import RolloutResult
 
     SPACE = os.environ.get(
-        "OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space"
+        "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space"
     )
 
     async def _go() -> RolloutResult:
-        async with OpenCodeEnv(base_url=SPACE) as env:
+        async with CodingAgentEnv(base_url=SPACE) as env:
             await env.reset()
             raw = await env.call_tool(
                 "run_rollout",
@@ -382,7 +436,7 @@ async def _go() -> RolloutResult:
                     "import binary_search; "
                     "assert binary_search.binary_search([1,2,3,4,5], 3) == 2; print('OK')\"",
                 ],
-                template="opencode-rl",
+                template="coding-agent-rl",
                 agent_timeout_s=600,
             )
             return RolloutResult.model_validate_json(_extract_text(raw))

From ddf1313bd7e9f3695d08a8d079fabbeb8b8d9608 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Thu, 14 May 2026 15:06:07 +0530
Subject: [PATCH 09/35] feat: hf sandbox backend - tests

---
 src/openenv/core/harness/sandbox/__init__.py  |  19 +-
 .../core/harness/sandbox/hf_backend.py        | 306 ++++++++++++++++++
 tests/core/test_hf_sandbox_backend.py         | 221 +++++++++++++
 3 files changed, 544 insertions(+), 2 deletions(-)
 create mode 100644 src/openenv/core/harness/sandbox/hf_backend.py
 create mode 100644 tests/core/test_hf_sandbox_backend.py

diff --git a/src/openenv/core/harness/sandbox/__init__.py b/src/openenv/core/harness/sandbox/__init__.py
index 83d37fb48..208fe54d5 100644
--- a/src/openenv/core/harness/sandbox/__init__.py
+++ b/src/openenv/core/harness/sandbox/__init__.py
@@ -37,9 +37,16 @@
 except ImportError:
     pass  # e2b not installed
 
+try:
+    from .hf_backend import HFBgJob, HFSandboxBackend, HFSandboxHandle  # noqa: F401
+
+    __all__.extend(["HFBgJob", "HFSandboxBackend", "HFSandboxHandle"])
+except ImportError:
+    pass  # hf-sandbox not installed
+
 
 def create_sandbox_backend(
-    backend: Literal["e2b", "docker"] = "e2b",
+    backend: Literal["e2b", "docker", "hf"] = "e2b",
     **kwargs: Any,
 ) -> SandboxBackend:
     """Create a sandbox backend by name.
@@ -48,6 +55,8 @@ def create_sandbox_backend(
     (set ``E2B_API_URL``).
 
     For ``"docker"``: local Docker, no external dependencies.
+
+    For ``"hf"``: Hugging Face Jobs via ``hf-sandbox``.
     """
     if backend == "e2b":
         from .e2b_backend import E2BSandboxBackend
@@ -55,4 +64,10 @@ def create_sandbox_backend(
         return E2BSandboxBackend(**kwargs)
     elif backend == "docker":
         return DockerSandboxBackend(**kwargs)
-    raise ValueError(f"Unknown sandbox backend: {backend!r}. Use 'e2b' or 'docker'.")
+    elif backend == "hf":
+        from .hf_backend import HFSandboxBackend
+
+        return HFSandboxBackend(**kwargs)
+    raise ValueError(
+        f"Unknown sandbox backend: {backend!r}. Use 'e2b', 'docker', or 'hf'."
+    )
diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py
new file mode 100644
index 000000000..410a8daea
--- /dev/null
+++ b/src/openenv/core/harness/sandbox/hf_backend.py
@@ -0,0 +1,306 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Hugging Face Sandbox implementation of :class:`SandboxBackend`.
+
+Wraps `hf-sandbox` (https://github.com/huggingface/hf-sandbox) so OpenEnv
+harnesses can use it through the same protocol.
+"""
+
+from __future__ import annotations
+
+import re
+import time
+import uuid
+from pathlib import PurePosixPath
+from threading import Event
+from typing import Any
+
+from hf_sandbox import Sandbox
+from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxHandle
+
+_ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
+
+
+class HFSandboxError(RuntimeError):
+    """Base class for HF sandbox backend errors."""
+
+
+class HFSandboxCreateError(HFSandboxError):
+    """Raised when backend cannot create a sandbox."""
+
+
+class HFBgJob:
+    """Background process handle for :class:`HFSandboxHandle`."""
+
+    def __init__(
+        self,
+        sandbox: "HFSandboxHandle",
+        *,
+        pid: int,
+        marker_path: str,
+        poll_interval_s: float = 0.5,
+    ) -> None:
+        self._sandbox = sandbox
+        self._pid = pid
+        self._marker_path = marker_path
+        self._poll_interval_s = poll_interval_s
+        self._done = Event()
+        self._exit_code: int | None = None
+
+    @property
+    def pid(self) -> int:
+        return self._pid
+
+    def wait(self, timeout: float | None = None) -> int:
+        """Poll until the job exits; return its status or raise ``TimeoutError``."""
+        deadline = None if timeout is None else (time.monotonic() + timeout)
+        while True:
+            if self._done.is_set():
+                return self._exit_code if self._exit_code is not None else 0
+            if deadline is not None and time.monotonic() > deadline:
+                raise TimeoutError(
+                    f"Background command (pid={self._pid}) "
+                    f"did not exit within {timeout}s"
+                )
+            # Probe before reading the marker so a mid-poll exit is not reported as 1.
+            alive = self._sandbox.exec(f"kill -0 {self._pid}", timeout=10)
+            marker = self._sandbox.exec(
+                f"cat {_shell_quote(self._marker_path)}",
+                timeout=10,
+            )
+            if marker.exit_code == 0 and marker.stdout.strip():
+                self._exit_code = _parse_exit_code(marker.stdout.strip(), default=0)
+                self._done.set()
+                return self._exit_code
+            # Gone with no marker: no exit status was recorded, so report failure.
+            if alive.exit_code != 0:
+                self._exit_code = 1
+                self._done.set()
+                return self._exit_code
+            time.sleep(self._poll_interval_s)
+
+    def kill(self) -> None:
+        if self._done.is_set():
+            return
+        try:
+            self._sandbox.exec(f"kill -9 {self._pid}", timeout=5)
+        except Exception:
+            pass
+        self._exit_code = 137
+        self._done.set()
+
+
+class HFSandboxHandle:
+    """Wraps a live ``hf_sandbox.Sandbox`` to satisfy :class:`SandboxHandle`."""
+
+    def __init__(
+        self,
+        sandbox: Any,
+        *,
+        default_envs: dict[str, str] | None = None,
+    ) -> None:
+        self._sbx = sandbox
+        self._default_envs = dict(default_envs or {})
+        self._bg_jobs: list[HFBgJob] = []
+
+    @property
+    def sandbox_id(self) -> str:
+        return str(getattr(self._sbx, "job_id", "hf-sandbox"))
+
+    @property
+    def raw(self) -> Any:
+        """Escape hatch for callers that need the underlying SDK object."""
+        return self._sbx
+
+    def exec(
+        self,
+        cmd: str,
+        *,
+        envs: dict[str, str] | None = None,
+        cwd: str | None = None,
+        timeout: float | None = 60,
+    ) -> ExecResult:
+        merged_envs = dict(self._default_envs)
+        merged_envs.update(envs or {})
+        shell_cmd = _with_env_prefix(cmd, merged_envs)
+        timeout_s = _normalize_exec_timeout(timeout)
+        try:
+            result = self._sbx.exec(
+                "bash",
+                "-lc",
+                shell_cmd,
+                workdir=cwd,
+                timeout=timeout_s,
+            )
+            return ExecResult(
+                exit_code=int(getattr(result, "returncode", 1)),
+                stdout=str(getattr(result, "stdout", "") or ""),
+                stderr=str(getattr(result, "stderr", "") or ""),
+            )
+        except Exception as exc:
+            return ExecResult(exit_code=-1, stdout="", stderr=str(exc))
+
+    def start_bg(
+        self,
+        cmd: str,
+        *,
+        envs: dict[str, str] | None = None,
+        cwd: str | None = None,
+    ) -> BgJob:
+        marker_path = f"/tmp/.openenv_bg_{uuid.uuid4().hex[:12]}.exit"
+        # Newline (not ";") before the status capture: survives a trailing "&" or "#".
+        wrapped = f"{cmd}\nrc=$?; echo $rc > {_shell_quote(marker_path)}"
+        launch_cmd = f"nohup bash -lc {_shell_quote(wrapped)} >/dev/null 2>&1 & echo $!"
+        result = self.exec(launch_cmd, envs=envs, cwd=cwd, timeout=30)
+        if result.exit_code != 0:
+            raise RuntimeError(
+                f"Failed to start background command: {result.stderr or result.stdout}"
+            )
+
+        pid = _parse_pid(result.stdout)
+        if pid is None:
+            raise RuntimeError(f"Could not extract PID from start_bg output: {result.stdout!r}")
+
+        job = HFBgJob(self, pid=pid, marker_path=marker_path)
+        self._bg_jobs.append(job)
+        return job
+
+    def write_text(self, path: str, content: str) -> None:
+        parent = str(PurePosixPath(path).parent)
+        if parent not in ("", "/"):
+            r = self.exec(f"mkdir -p {_shell_quote(parent)}", timeout=10)
+            if r.exit_code != 0:
+                raise RuntimeError(f"Failed to create parent directory {parent!r}: {r.stderr}")
+        self._sbx.write_file(path, content)
+
+    def read_text(self, path: str) -> str:
+        return str(self._sbx.read_file(path, text=True))
+
+    def exists(self, path: str) -> bool:
+        r = self.exec(f"test -e {_shell_quote(path)}", timeout=10)
+        return r.exit_code == 0
+
+    def kill(self) -> None:
+        for job in self._bg_jobs:
+            try:
+                job.kill()
+            except Exception:
+                pass
+        self._bg_jobs.clear()
+        try:
+            self._sbx.terminate()
+        except Exception:
+            pass
+
+
+class HFSandboxBackend:
+    """Creates HF sandboxes for harness rollouts via ``hf-sandbox``."""
+
+    def __init__(
+        self,
+        *,
+        image: str = "python:3.12",
+        flavor: str = "cpu-basic",
+        timeout: str | None = None,
+        forward_hf_token: bool = False,
+        create_retries: int = 3,
+        create_backoff_s: float = 2.0,
+    ) -> None:
+        self._image = image
+        self._flavor = flavor
+        self._timeout = timeout
+        self._forward_hf_token = forward_hf_token
+        self._create_retries = max(1, int(create_retries))
+        self._create_backoff_s = max(0.0, float(create_backoff_s))
+
+    def create(
+        self,
+        *,
+        timeout_s: int = 900,
+        envs: dict[str, str] | None = None,
+        metadata: dict[str, str] | None = None,
+    ) -> SandboxHandle:
+        # `hf-sandbox` does not support metadata at create-time yet.
+        del metadata
+
+        timeout = self._timeout or _format_timeout(timeout_s)
+        last_error: Exception | None = None
+
+        for attempt in range(self._create_retries):
+            try:
+                sbx = Sandbox.create(
+                    image=self._image,
+                    flavor=self._flavor,
+                    timeout=timeout,
+                    forward_hf_token=self._forward_hf_token,
+                )
+                return HFSandboxHandle(sbx, default_envs=envs)
+            except Exception as exc:  # noqa: BLE001
+                last_error = exc
+                if attempt + 1 < self._create_retries:
+                    time.sleep(self._create_backoff_s * (2**attempt))
+
+        assert last_error is not None
+        raise HFSandboxCreateError(
+            f"Failed to create HF sandbox after {self._create_retries} attempts: "
+            f"{last_error}"
+        ) from last_error
+
+
+def _with_env_prefix(cmd: str, envs: dict[str, str]) -> str:
+    if not envs:
+        return cmd
+    parts: list[str] = []
+    for key, value in envs.items():
+        if not _ENV_KEY_RE.match(key):
+            raise ValueError(f"Invalid environment variable name: {key!r}")
+        parts.append(f"export {key}={_shell_quote(str(value))};")
+    return " ".join(parts) + f" {cmd}"
+
+
+def _normalize_exec_timeout(timeout: float | None) -> int:
+    if timeout is None:
+        return 24 * 60 * 60
+    return max(1, int(timeout))
+
+
+def _format_timeout(timeout_s: int) -> str:
+    timeout_s = max(1, int(timeout_s))
+    if timeout_s % 3600 == 0:
+        return f"{timeout_s // 3600}h"
+    if timeout_s % 60 == 0:
+        return f"{timeout_s // 60}m"
+    return f"{timeout_s}s"
+
+
+def _parse_pid(stdout: str) -> int | None:
+    for line in reversed(stdout.strip().splitlines()):
+        raw = line.strip()
+        if raw.isdigit():
+            return int(raw)
+    return None
+
+
+def _parse_exit_code(raw: str, *, default: int) -> int:
+    try:
+        return int(raw.splitlines()[-1].strip())
+    except (ValueError, IndexError):  # empty text or a non-numeric last line
+        return default
+
+
+def _shell_quote(s: str) -> str:
+    """Single-quote a string for shell, escaping embedded single quotes."""
+    return "'" + s.replace("'", "'\\''") + "'"
+
+
+__all__ = [
+    "HFBgJob",
+    "HFSandboxBackend",
+    "HFSandboxCreateError",
+    "HFSandboxError",
+    "HFSandboxHandle",
+]
diff --git a/tests/core/test_hf_sandbox_backend.py b/tests/core/test_hf_sandbox_backend.py
new file mode 100644
index 000000000..d301b2b2b
--- /dev/null
+++ b/tests/core/test_hf_sandbox_backend.py
@@ -0,0 +1,221 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Unit tests for the HF sandbox backend.
+
+These tests mock ``hf-sandbox`` so they run without network or HF credentials.
+"""
+
+from __future__ import annotations
+
+import importlib
+import re
+import subprocess
+import sys
+import types
+from dataclasses import dataclass, field
+
+import pytest
+
+
+@dataclass
+class _FakeSandbox:
+    job_id: str
+    files: dict[str, str] = field(default_factory=dict)
+    marker_files: dict[str, str] = field(default_factory=dict)
+    bg_jobs: dict[int, dict] = field(default_factory=dict)
+    next_pid: int = 1000
+    terminated: bool = False
+
+    def exec(
+        self,
+        *cmd: str,
+        workdir: str | None = None,
+        stdin: str | None = None,
+        timeout: int = 600,
+    ) -> subprocess.CompletedProcess:
+        del workdir, stdin, timeout
+        if len(cmd) < 3:
+            return subprocess.CompletedProcess(cmd, 1, "", "invalid command")
+        script = cmd[2]
+
+        if "ok_cmd" in script:
+            return subprocess.CompletedProcess(cmd, 0, "ok\n", "")
+        if "fail_cmd" in script:
+            return subprocess.CompletedProcess(cmd, 42, "", "failed")
+        if "timeout_cmd" in script:
+            return subprocess.CompletedProcess(cmd, -1, "", "timeout")
+
+        if "mkdir -p" in script:
+            return subprocess.CompletedProcess(cmd, 0, "", "")
+
+        if "test -e " in script:
+            match = re.search(r"test -e '([^']+)'", script)
+            assert match is not None
+            path = match.group(1)
+            exists = path in self.files or path in self.marker_files
+            return subprocess.CompletedProcess(cmd, 0 if exists else 1, "", "")
+
+        if "cat '/tmp/.openenv_bg_" in script:
+            match = re.search(r"cat '([^']+)'", script)
+            assert match is not None
+            marker = match.group(1)
+            if marker in self.marker_files:
+                return subprocess.CompletedProcess(
+                    cmd,
+                    0,
+                    f"{self.marker_files[marker]}\n",
+                    "",
+                )
+            return subprocess.CompletedProcess(cmd, 1, "", "missing")
+
+        if script.strip().startswith("kill -0 "):
+            pid = int(script.strip().split()[2])
+            alive = self.bg_jobs.get(pid, {}).get("alive", False)
+            return subprocess.CompletedProcess(cmd, 0 if alive else 1, "", "")
+
+        if script.strip().startswith("kill -9 "):
+            pid = int(script.strip().split()[2])
+            if pid in self.bg_jobs:
+                self.bg_jobs[pid]["alive"] = False
+                marker = self.bg_jobs[pid]["marker"]
+                self.marker_files[marker] = "137"
+            return subprocess.CompletedProcess(cmd, 0, "", "")
+
+        if "echo $!" in script:
+            marker_match = re.search(r"(/tmp/\.openenv_bg_[A-Za-z0-9]+\.exit)", script)
+            assert marker_match is not None
+            marker = marker_match.group(1)
+            pid = self.next_pid
+            self.next_pid += 1
+            long_running = "sleep 300" in script
+            self.bg_jobs[pid] = {
+                "marker": marker,
+                "alive": long_running,
+            }
+            if not long_running:
+                self.marker_files[marker] = "0"
+            return subprocess.CompletedProcess(cmd, 0, f"{pid}\n", "")
+
+        return subprocess.CompletedProcess(cmd, 0, "", "")
+
+    def write_file(
+        self,
+        path: str,
+        content: str | bytes | bytearray | memoryview,
+    ) -> None:
+        if isinstance(content, str):
+            normalized = content
+        else:
+            normalized = bytes(content).decode("utf-8", "replace")
+        self.files[path] = normalized
+
+    def read_file(self, path: str, text: bool = True) -> str | bytes:
+        if path not in self.files:
+            raise FileNotFoundError(path)
+        return self.files[path] if text else self.files[path].encode()
+
+    def terminate(self) -> None:
+        self.terminated = True
+
+
+class _FakeSandboxAPI:
+    calls: list[dict] = []
+
+    @classmethod
+    def create(
+        cls,
+        image: str,
+        flavor: str,
+        timeout: str,
+        forward_hf_token: bool,
+    ) -> _FakeSandbox:
+        cls.calls.append(
+            {
+                "image": image,
+                "flavor": flavor,
+                "timeout": timeout,
+                "forward_hf_token": forward_hf_token,
+            }
+        )
+        return _FakeSandbox(job_id="job-123")
+
+
+def _install_fake_hf_sandbox(monkeypatch) -> None:
+    fake_module = types.ModuleType("hf_sandbox")
+    setattr(fake_module, "Sandbox", _FakeSandboxAPI)
+    monkeypatch.setitem(sys.modules, "hf_sandbox", fake_module)
+
+
+class TestHFSandboxBackend:
+    def test_exported_from_package(self, monkeypatch):
+        _install_fake_hf_sandbox(monkeypatch)
+
+        import openenv.core.harness.sandbox as sandbox_pkg
+
+        importlib.reload(sandbox_pkg)
+        assert hasattr(sandbox_pkg, "HFSandboxBackend")
+        assert hasattr(sandbox_pkg, "HFSandboxHandle")
+        assert hasattr(sandbox_pkg, "HFBgJob")
+
+    def test_create_exec_write_read_exists_bg_and_kill(self, monkeypatch):
+        import openenv.core.harness.sandbox.hf_backend as hf_backend
+
+        _install_fake_hf_sandbox(monkeypatch)
+        importlib.reload(hf_backend)
+
+        _FakeSandboxAPI.calls.clear()
+        monkeypatch.setattr(hf_backend, "Sandbox", _FakeSandboxAPI)
+
+        backend = hf_backend.HFSandboxBackend(
+            image="python:3.12",
+            flavor="cpu-basic",
+            forward_hf_token=True,
+        )
+        sandbox = backend.create(timeout_s=120, envs={"GLOBAL_ENV": "on"})
+
+        assert sandbox.sandbox_id == "job-123"
+        assert _FakeSandboxAPI.calls[-1]["timeout"] == "2m"
+
+        ok = sandbox.exec("ok_cmd")
+        assert ok.exit_code == 0
+
+        failed = sandbox.exec("fail_cmd")
+        assert failed.exit_code == 42
+
+        timed = sandbox.exec("timeout_cmd")
+        assert timed.exit_code == -1
+
+        sandbox.write_text("/tmp/hello.txt", "hello")
+        assert sandbox.exists("/tmp/hello.txt")
+        assert sandbox.read_text("/tmp/hello.txt") == "hello"
+
+        short_job = sandbox.start_bg("echo done > /tmp/bg.txt")
+        assert short_job.wait(timeout=2) == 0
+
+        long_job = sandbox.start_bg("sleep 300")
+        with pytest.raises(TimeoutError):
+            long_job.wait(timeout=0.1)
+        long_job.kill()
+        assert isinstance(long_job.wait(timeout=2), int)
+
+        sandbox.kill()
+        raw = getattr(sandbox, "raw", None)
+        assert raw is not None
+        assert raw.terminated is True
+
+    def test_factory_creates_hf_backend(self, monkeypatch):
+        _install_fake_hf_sandbox(monkeypatch)
+
+        import openenv.core.harness.sandbox.hf_backend as hf_backend
+        import openenv.core.harness.sandbox as sandbox_pkg
+
+        importlib.reload(hf_backend)
+        importlib.reload(sandbox_pkg)
+
+        monkeypatch.setattr(hf_backend, "Sandbox", _FakeSandboxAPI)
+        backend = sandbox_pkg.create_sandbox_backend("hf", image="python:3.12")
+        assert isinstance(backend, hf_backend.HFSandboxBackend)

From 9d856401edd6d2e23bebec130716d9674a5eba45 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Fri, 15 May 2026 14:23:56 +0530
Subject: [PATCH 10/35] chore: ruff + usort format pass

---
 envs/agent_world_model_env/server/web_ui.py   |  4 +-
 envs/chat_env/models.py                       |  4 +-
 envs/chat_env/server/chat_environment.py      |  4 +-
 envs/coding_agent_env/client.py               |  1 -
 .../sandbox/build_template.py                 |  2 +-
 .../server/coding_environment.py              |  5 +-
 envs/coding_agent_env/server/gradio_ui.py     |  1 -
 .../server/coding_tools_env_environment.py    | 65 ++++++++++---
 envs/coding_tools_env/server/e2b_sandbox.py   | 23 ++++-
 envs/coding_tools_env/server/gradio_ui.py     | 94 +++++++++++++------
 .../jupyter_env/server/jupyter_environment.py | 10 +-
 envs/repl_env/server/repl_environment.py      |  4 +-
 .../server/terminus_env_environment.py        | 10 +-
 envs/textarena_env/server/gradio_ui.py        |  8 +-
 .../core/harness/sandbox/hf_backend.py        |  8 +-
 tests/core/test_hf_sandbox_backend.py         |  2 +-
 tests/envs/test_coding_agent_env.py           |  8 +-
 17 files changed, 178 insertions(+), 75 deletions(-)

diff --git a/envs/agent_world_model_env/server/web_ui.py b/envs/agent_world_model_env/server/web_ui.py
index 84b10c6b2..09b445d3f 100644
--- a/envs/agent_world_model_env/server/web_ui.py
+++ b/envs/agent_world_model_env/server/web_ui.py
@@ -21,9 +21,7 @@
 
 
 # Keep in sync with DEFAULT_REWARD_CONFIG in config.py.
-_DEFAULT_REWARD_JSON = json.dumps(
-    DEFAULT_REWARD_CONFIG, indent=2
-)
+_DEFAULT_REWARD_JSON = json.dumps(DEFAULT_REWARD_CONFIG, indent=2)
 
 
 def _format_obs_md(payload: dict | None) -> str:
diff --git a/envs/chat_env/models.py b/envs/chat_env/models.py
index 8bc10f09e..da994cbe3 100644
--- a/envs/chat_env/models.py
+++ b/envs/chat_env/models.py
@@ -55,7 +55,9 @@ class ChatState(State):
     """State of the ChatEnvironment containing message history."""
 
     history_messages: list[Message] = Field(default_factory=list)
-    history_tokens: list[list[int]] = Field(default_factory=list)  # Same len as messages
+    history_tokens: list[list[int]] = Field(
+        default_factory=list
+    )  # Same len as messages
 
 
 class ChatObservation(Observation):
diff --git a/envs/chat_env/server/chat_environment.py b/envs/chat_env/server/chat_environment.py
index 90b2d01f0..f66f3e790 100644
--- a/envs/chat_env/server/chat_environment.py
+++ b/envs/chat_env/server/chat_environment.py
@@ -90,7 +90,9 @@ def _coerce_tokens(self, tokens) -> list[int]:
     def _tokenize_conversation(self, conversation: list[Message]) -> list[int]:
         """Tokenize a conversation with a chat-template fallback for base tokenizers."""
         try:
-            tokens = self.tokenizer.apply_chat_template(conversation=conversation, tokenize=True)
+            tokens = self.tokenizer.apply_chat_template(
+                conversation=conversation, tokenize=True
+            )
         except Exception:
             # Some tokenizers (e.g. gpt2) do not define `chat_template`.
             fallback_text = "".join(
diff --git a/envs/coding_agent_env/client.py b/envs/coding_agent_env/client.py
index 8c512090d..7e2a21696 100644
--- a/envs/coding_agent_env/client.py
+++ b/envs/coding_agent_env/client.py
@@ -169,4 +169,3 @@ def _extract_text(result: Any) -> str:
             return text
 
     return str(result)
-
diff --git a/envs/coding_agent_env/sandbox/build_template.py b/envs/coding_agent_env/sandbox/build_template.py
index e22b30185..e1fdac50a 100644
--- a/envs/coding_agent_env/sandbox/build_template.py
+++ b/envs/coding_agent_env/sandbox/build_template.py
@@ -114,7 +114,7 @@ def main(argv: list[str] | None = None) -> int:
     p.add_argument(
         "--name",
         default="coding-agent-rl",
-        help="Template name (default: coding-agent-rl)."
+        help="Template name (default: coding-agent-rl).",
     )
     p.add_argument(
         "--skip-cache",
diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index 3f8eabd13..e389eb759 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -480,7 +480,9 @@ def _build_agent_config(
             )
 
         provider = (
-            "openai" if mode == "transparent_proxy" else self._infer_pi_provider(base_url)
+            "openai"
+            if mode == "transparent_proxy"
+            else self._infer_pi_provider(base_url)
         )
         return _GenericAgentConfig(
             base_url=base_url.rstrip("/"),
@@ -654,4 +656,3 @@ def _safe_read(sandbox: Any, path: str) -> str:
             return sandbox.read_text(path) or ""
         except Exception:
             return ""
-
diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py
index 5497ef0f2..ef3f94aeb 100644
--- a/envs/coding_agent_env/server/gradio_ui.py
+++ b/envs/coding_agent_env/server/gradio_ui.py
@@ -611,4 +611,3 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         )
 
     return app
-
diff --git a/envs/coding_tools_env/server/coding_tools_env_environment.py b/envs/coding_tools_env/server/coding_tools_env_environment.py
index 615e7770f..d0ef86675 100644
--- a/envs/coding_tools_env/server/coding_tools_env_environment.py
+++ b/envs/coding_tools_env/server/coding_tools_env_environment.py
@@ -45,16 +45,28 @@ def bash(command: str, timeout: float | None = 30) -> str:
                 return "Error: environment not reset. Call reset() first."
             timeout_value = 30 if timeout is None else float(timeout)
             result = self._sandbox.run_shell(command, timeout_s=timeout_value)
-            self._record("bash", result.ok, result.output, result.error, result.metadata)
-            return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip()
+            self._record(
+                "bash", result.ok, result.output, result.error, result.metadata
+            )
+            return (
+                result.output
+                if result.ok
+                else f"ERROR: {result.error}\n{result.output}".strip()
+            )
 
         @mcp.tool
-        def read(file_path: str, offset: int | None = None, limit: int | None = None) -> str:
+        def read(
+            file_path: str, offset: int | None = None, limit: int | None = None
+        ) -> str:
             """Read file contents using computer instance."""
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
-            result = self._sandbox.read_file(file_path=file_path, offset=offset, limit=limit)
-            self._record("read", result.ok, result.output, result.error, result.metadata)
+            result = self._sandbox.read_file(
+                file_path=file_path, offset=offset, limit=limit
+            )
+            self._record(
+                "read", result.ok, result.output, result.error, result.metadata
+            )
             return result.output if result.ok else f"ERROR: {result.error}"
 
         @mcp.tool
@@ -63,7 +75,9 @@ def write(file_path: str, content: str) -> str:
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
             result = self._sandbox.write_file(file_path=file_path, content=content)
-            self._record("write", result.ok, result.output, result.error, result.metadata)
+            self._record(
+                "write", result.ok, result.output, result.error, result.metadata
+            )
             return result.output if result.ok else f"ERROR: {result.error}"
 
         @mcp.tool
@@ -88,10 +102,14 @@ def edit(
                 updated = original.replace(old_string, new_string)
             else:
                 updated = original.replace(old_string, new_string, 1)
-            write_result = self._sandbox.write_file(file_path=file_path, content=updated)
+            write_result = self._sandbox.write_file(
+                file_path=file_path, content=updated
+            )
             ok = write_result.ok
             msg = "edit ok" if ok else ""
-            self._record("edit", ok, msg, write_result.error, {"replace_all": replace_all})
+            self._record(
+                "edit", ok, msg, write_result.error, {"replace_all": replace_all}
+            )
             return msg if ok else f"ERROR: {write_result.error}"
 
         @mcp.tool
@@ -129,7 +147,11 @@ def multi_edit(file_path: str, edits: list[dict[str, Any]]) -> str:
                 write_result.error,
                 {"applied": applied},
             )
-            return f"applied {applied} edits" if write_result.ok else f"ERROR: {write_result.error}"
+            return (
+                f"applied {applied} edits"
+                if write_result.ok
+                else f"ERROR: {write_result.error}"
+            )
 
         @mcp.tool
         def glob(pattern: str, path: str | None = None) -> str:
@@ -137,17 +159,27 @@ def glob(pattern: str, path: str | None = None) -> str:
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
             result = self._sandbox.glob_files(pattern=pattern, path=path)
-            self._record("glob", result.ok, result.output, result.error, result.metadata)
+            self._record(
+                "glob", result.ok, result.output, result.error, result.metadata
+            )
             return result.output if result.ok else f"ERROR: {result.error}"
 
         @mcp.tool
-        def grep(pattern: str, path: str | None = None, include: str | None = None) -> str:
+        def grep(
+            pattern: str, path: str | None = None, include: str | None = None
+        ) -> str:
             """Search for patterns in files."""
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
             result = self._sandbox.grep(pattern=pattern, path=path, include=include)
-            self._record("grep", result.ok, result.output, result.error, result.metadata)
-            return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip()
+            self._record(
+                "grep", result.ok, result.output, result.error, result.metadata
+            )
+            return (
+                result.output
+                if result.ok
+                else f"ERROR: {result.error}\n{result.output}".strip()
+            )
 
         @mcp.tool
         def ls(path: str = ".", ignore: list[str] | None = None) -> str:
@@ -177,7 +209,9 @@ def todo_write(todos: list[dict[str, Any]]) -> str:
                     self._record("todo_write", False, "", msg, None)
                     return msg
             self._state.todos = validated
-            self._record("todo_write", True, f"stored {len(validated)} todos", None, None)
+            self._record(
+                "todo_write", True, f"stored {len(validated)} todos", None, None
+            )
             return f"stored {len(validated)} todos"
 
         @mcp.tool
@@ -281,7 +315,8 @@ def reset(
                             "sandbox_id": self._state.sandbox_id,
                             "message": "Setup command failed.",
                             "setup_results": [
-                                entry.model_dump() for entry in self._state.setup_results
+                                entry.model_dump()
+                                for entry in self._state.setup_results
                             ],
                         },
                     )
diff --git a/envs/coding_tools_env/server/e2b_sandbox.py b/envs/coding_tools_env/server/e2b_sandbox.py
index 5833c7ecb..d6f77373b 100644
--- a/envs/coding_tools_env/server/e2b_sandbox.py
+++ b/envs/coding_tools_env/server/e2b_sandbox.py
@@ -94,7 +94,11 @@ def read_file(
     def write_file(self, file_path: str, content: str) -> ToolResult:
         try:
             self._sbx.files.write(file_path, content.encode("utf-8"))
-            return ToolResult(ok=True, output="write ok", metadata={"bytes": len(content.encode("utf-8"))})
+            return ToolResult(
+                ok=True,
+                output="write ok",
+                metadata={"bytes": len(content.encode("utf-8"))},
+            )
         except Exception as exc:
             return ToolResult(ok=False, error=f"write failed: {exc}")
 
@@ -111,7 +115,9 @@ def glob_files(self, pattern: str, path: str | None = None) -> ToolResult:
         if result is None:
             return ToolResult(ok=False, error=_format_error(execution))
         matches = result.get("matches", [])
-        return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches})
+        return ToolResult(
+            ok=True, output="\n".join(matches), metadata={"matches": matches}
+        )
 
     def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResult:
         ignore = ignore or []
@@ -137,10 +143,15 @@ def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResu
         if not result.get("ok", False):
             return ToolResult(ok=False, error=str(result.get("error", "ls failed")))
         items = result.get("items", [])
-        lines = [f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}" for item in items]
+        lines = [
+            f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}"
+            for item in items
+        ]
         return ToolResult(ok=True, output="\n".join(lines), metadata={"items": items})
 
-    def grep(self, pattern: str, path: str | None = None, include: str | None = None) -> ToolResult:
+    def grep(
+        self, pattern: str, path: str | None = None, include: str | None = None
+    ) -> ToolResult:
         root = path or "."
         code = (
             "from pathlib import Path\n"
@@ -173,7 +184,9 @@ def grep(self, pattern: str, path: str | None = None, include: str | None = None
         if not result.get("ok", False):
             return ToolResult(ok=False, error=str(result.get("error", "grep failed")))
         matches = result.get("matches", [])
-        return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches})
+        return ToolResult(
+            ok=True, output="\n".join(matches), metadata={"matches": matches}
+        )
 
     def kill(self) -> None:
         try:
diff --git a/envs/coding_tools_env/server/gradio_ui.py b/envs/coding_tools_env/server/gradio_ui.py
index c0d670a99..1f3845141 100644
--- a/envs/coding_tools_env/server/gradio_ui.py
+++ b/envs/coding_tools_env/server/gradio_ui.py
@@ -105,7 +105,9 @@ def _extract_tool_error(result: dict[str, Any]) -> bool:
 
 def _format_status(state: dict[str, Any]) -> str:
     if not state:
-        return "**No active session.** Configure setup/verify and click *Reset sandbox*."
+        return (
+            "**No active session.** Configure setup/verify and click *Reset sandbox*."
+        )
     sandbox_id = state.get("sandbox_id") or "—"
     step_count = state.get("step_count", 0)
     submitted = state.get("submitted", False)
@@ -227,9 +229,9 @@ def state_payload() -> dict[str, Any]:
                         label="edits (JSON array)",
                         language="json",
                         value=(
-                            '[\n'
+                            "[\n"
                             '  {"old_string": "TODO", "new_string": "DONE", "replace_all": false}\n'
-                            ']'
+                            "]"
                         ),
                         lines=8,
                     )
@@ -260,10 +262,10 @@ def state_payload() -> dict[str, Any]:
                         label="todos (JSON array)",
                         language="json",
                         value=(
-                            '[\n'
+                            "[\n"
                             '  {"id":"1","content":"Inspect files",'
                             '"status":"in_progress","priority":"high"}\n'
-                            ']'
+                            "]"
                         ),
                         lines=8,
                     )
@@ -337,23 +339,33 @@ def on_tool_change(tool: str):
             return [help_md, *updates]
 
         tool_dropdown.change(
-            on_tool_change, inputs=[tool_dropdown], outputs=[tool_help, *group_components]
+            on_tool_change,
+            inputs=[tool_dropdown],
+            outputs=[tool_help, *group_components],
         )
 
         # ───────── Result rendering helper ─────────
-        def render_result(tool: str, raw: dict[str, Any]) -> tuple[str, str, str, str, str, list[list[str]]]:
+        def render_result(
+            tool: str, raw: dict[str, Any]
+        ) -> tuple[str, str, str, str, str, list[list[str]]]:
             text = _extract_tool_text(raw)
-            is_error = _extract_tool_error(raw) or text.startswith("ERROR:") or text.startswith("Error:")
+            is_error = (
+                _extract_tool_error(raw)
+                or text.startswith("ERROR:")
+                or text.startswith("Error:")
+            )
             badge = "❌ error" if is_error else "✅ ok"
             status_line = f"**{tool}** — {badge}"
             state = state_payload()
             return (
-                status_line,                       # output_status
-                text,                              # output_view
-                json.dumps(raw, indent=2),         # raw_response
-                _format_status(state),             # state_summary (top + summary panel — same content)
+                status_line,  # output_status
+                text,  # output_view
+                json.dumps(raw, indent=2),  # raw_response
+                _format_status(
+                    state
+                ),  # state_summary (top + summary panel — same content)
                 json.dumps(state, indent=2, default=str),  # state_json
-                _format_history(state),            # history_table
+                _format_history(state),  # history_table
             )
 
         # ───────── Session handlers ─────────
@@ -398,21 +410,33 @@ async def on_close():
         async def on_run(
             tool: str,
             # bash
-            bash_command: str, bash_timeout: float,
+            bash_command: str,
+            bash_timeout: float,
             # read
-            read_path: str, read_offset: float | None, read_limit: float | None,
+            read_path: str,
+            read_offset: float | None,
+            read_limit: float | None,
             # write
-            write_path: str, write_content: str,
+            write_path: str,
+            write_content: str,
             # edit
-            edit_path: str, edit_old: str, edit_new: str, edit_replace_all: bool,
+            edit_path: str,
+            edit_old: str,
+            edit_new: str,
+            edit_replace_all: bool,
             # multi_edit
-            multi_edit_path: str, multi_edit_json: str,
+            multi_edit_path: str,
+            multi_edit_json: str,
             # glob
-            glob_pattern: str, glob_path: str,
+            glob_pattern: str,
+            glob_path: str,
             # grep
-            grep_pattern: str, grep_path: str, grep_include: str,
+            grep_pattern: str,
+            grep_path: str,
+            grep_include: str,
             # ls
-            ls_path: str, ls_ignore: str,
+            ls_path: str,
+            ls_ignore: str,
             # todo_write
             todo_json: str,
         ):
@@ -493,14 +517,26 @@ async def on_run(
         # ───────── Wire up events ─────────
         all_inputs = [
             tool_dropdown,
-            bash_command, bash_timeout,
-            read_path, read_offset, read_limit,
-            write_path, write_content,
-            edit_path, edit_old, edit_new, edit_replace_all,
-            multi_edit_path, multi_edit_json,
-            glob_pattern, glob_path,
-            grep_pattern, grep_path, grep_include,
-            ls_path, ls_ignore,
+            bash_command,
+            bash_timeout,
+            read_path,
+            read_offset,
+            read_limit,
+            write_path,
+            write_content,
+            edit_path,
+            edit_old,
+            edit_new,
+            edit_replace_all,
+            multi_edit_path,
+            multi_edit_json,
+            glob_pattern,
+            glob_path,
+            grep_pattern,
+            grep_path,
+            grep_include,
+            ls_path,
+            ls_ignore,
             todo_json,
         ]
         all_outputs = [
diff --git a/envs/jupyter_env/server/jupyter_environment.py b/envs/jupyter_env/server/jupyter_environment.py
index bc622ae22..b7902e5d2 100644
--- a/envs/jupyter_env/server/jupyter_environment.py
+++ b/envs/jupyter_env/server/jupyter_environment.py
@@ -348,7 +348,10 @@ def step(
     ) -> Observation:
         self._state.step_count += 1
         obs = super().step(action, timeout_s=timeout_s, **kwargs)
-        if self._state.submitted_answer is not None and self._state.last_reward is not None:
+        if (
+            self._state.submitted_answer is not None
+            and self._state.last_reward is not None
+        ):
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
@@ -361,7 +364,10 @@ async def step_async(
     ) -> Observation:
         self._state.step_count += 1
         obs = await super().step_async(action, timeout_s=timeout_s, **kwargs)
-        if self._state.submitted_answer is not None and self._state.last_reward is not None:
+        if (
+            self._state.submitted_answer is not None
+            and self._state.last_reward is not None
+        ):
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
diff --git a/envs/repl_env/server/repl_environment.py b/envs/repl_env/server/repl_environment.py
index f2e6f5d98..13a759c29 100644
--- a/envs/repl_env/server/repl_environment.py
+++ b/envs/repl_env/server/repl_environment.py
@@ -272,9 +272,7 @@ def reset(
         # reset() are treated as equal and don't trigger a redundant rebuild.
         resolved_model = self._resolve_model(llm_model)
         has_runtime_llm = self._runtime_controller is not None
-        model_changed = (
-            has_runtime_llm and resolved_model != self._current_llm_model
-        )
+        model_changed = has_runtime_llm and resolved_model != self._current_llm_model
         token_provided = hf_token is not None
         if not self.llm_query_fn or model_changed or token_provided:
             effective_token = (
diff --git a/envs/terminus_env/server/terminus_env_environment.py b/envs/terminus_env/server/terminus_env_environment.py
index c6f9e1c02..03de18baa 100644
--- a/envs/terminus_env/server/terminus_env_environment.py
+++ b/envs/terminus_env/server/terminus_env_environment.py
@@ -183,7 +183,10 @@ def step(
     ) -> Observation:
         self._state.step_count += 1
         obs = super().step(action, timeout_s=timeout_s, **kwargs)
-        if self._state.submitted_answer is not None and self._state.last_reward is not None:
+        if (
+            self._state.submitted_answer is not None
+            and self._state.last_reward is not None
+        ):
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
@@ -196,7 +199,10 @@ async def step_async(
     ) -> Observation:
         self._state.step_count += 1
         obs = await super().step_async(action, timeout_s=timeout_s, **kwargs)
-        if self._state.submitted_answer is not None and self._state.last_reward is not None:
+        if (
+            self._state.submitted_answer is not None
+            and self._state.last_reward is not None
+        ):
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
diff --git a/envs/textarena_env/server/gradio_ui.py b/envs/textarena_env/server/gradio_ui.py
index 45728fc00..c9bb88cae 100644
--- a/envs/textarena_env/server/gradio_ui.py
+++ b/envs/textarena_env/server/gradio_ui.py
@@ -71,7 +71,9 @@ def _sudoku_demo_html() -> str:
         for col in range(9):
             value = givens.get((row, col), "")
             border_right = "3px solid #0f172a" if col in {2, 5} else "1px solid #94a3b8"
-            border_bottom = "3px solid #0f172a" if row in {2, 5} else "1px solid #94a3b8"
+            border_bottom = (
+                "3px solid #0f172a" if row in {2, 5} else "1px solid #94a3b8"
+            )
             background = "#e2e8f0" if value else "#ffffff"
             cells.append(
                 f"""
@@ -82,7 +84,7 @@ def _sudoku_demo_html() -> str:
   align-items: center;
   justify-content: center;
   font-size: 1.1rem;
-  font-weight: {'700' if value else '400'};
+  font-weight: {"700" if value else "400"};
   color: #0f172a;
   background: {background};
   border-right: {border_right};
@@ -105,7 +107,7 @@ def _sudoku_demo_html() -> str:
     border: 3px solid #0f172a;
     background: #ffffff;
   ">
-    {''.join(cells)}
+    {"".join(cells)}
   </div>
   <p style="margin-top: 16px; color: #475569; font-size: 0.95rem; line-height: 1.45;">
     Use the <strong>Playground</strong> tab to reset the game and submit moves in the
diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py
index 410a8daea..bb41356e2 100644
--- a/src/openenv/core/harness/sandbox/hf_backend.py
+++ b/src/openenv/core/harness/sandbox/hf_backend.py
@@ -163,7 +163,9 @@ def start_bg(
 
         pid = _parse_pid(result.stdout)
         if pid is None:
-            raise RuntimeError(f"Could not extract PID from start_bg output: {result.stdout!r}")
+            raise RuntimeError(
+                f"Could not extract PID from start_bg output: {result.stdout!r}"
+            )
 
         job = HFBgJob(self, pid=pid, marker_path=marker_path)
         self._bg_jobs.append(job)
@@ -174,7 +176,9 @@ def write_text(self, path: str, content: str) -> None:
         if parent not in ("", "/"):
             r = self.exec(f"mkdir -p {_shell_quote(parent)}", timeout=10)
             if r.exit_code != 0:
-                raise RuntimeError(f"Failed to create parent directory {parent!r}: {r.stderr}")
+                raise RuntimeError(
+                    f"Failed to create parent directory {parent!r}: {r.stderr}"
+                )
         self._sbx.write_file(path, content)
 
     def read_text(self, path: str) -> str:
diff --git a/tests/core/test_hf_sandbox_backend.py b/tests/core/test_hf_sandbox_backend.py
index d301b2b2b..cd235c748 100644
--- a/tests/core/test_hf_sandbox_backend.py
+++ b/tests/core/test_hf_sandbox_backend.py
@@ -210,8 +210,8 @@ def test_create_exec_write_read_exists_bg_and_kill(self, monkeypatch):
     def test_factory_creates_hf_backend(self, monkeypatch):
         _install_fake_hf_sandbox(monkeypatch)
 
-        import openenv.core.harness.sandbox.hf_backend as hf_backend
         import openenv.core.harness.sandbox as sandbox_pkg
+        import openenv.core.harness.sandbox.hf_backend as hf_backend
 
         importlib.reload(hf_backend)
         importlib.reload(sandbox_pkg)
diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py
index ec1f66fa5..3a89a3ce6 100644
--- a/tests/envs/test_coding_agent_env.py
+++ b/tests/envs/test_coding_agent_env.py
@@ -46,14 +46,14 @@
 def test_public_api_imports() -> None:
     """Top-level package re-exports the documented surface."""
     from coding_agent_env import (  # noqa: F401
-        CommandResult,
-        E2BSandboxBackend,
         CodingAgentConfig,
         CodingAgentEnv,
         CodingAgentSession,
         CodingAgentSessionFactory,
         CodingAgentState,
         CodingAgentTask,
+        CommandResult,
+        E2BSandboxBackend,
         Provider,
         RolloutResult,
         RolloutTurn,
@@ -267,7 +267,9 @@ def test_coding_agent_task_coerce_str() -> None:
 def test_coding_agent_task_coerce_dict() -> None:
     from coding_agent_env import CodingAgentTask
 
-    t = CodingAgentTask.coerce({"instruction": "x", "setup_shell": "pip install pandas"})
+    t = CodingAgentTask.coerce(
+        {"instruction": "x", "setup_shell": "pip install pandas"}
+    )
     assert t.instruction == "x"
     assert t.setup_shell == "pip install pandas"
 

From 2f9435cb5d8936567cad5cdc272a12cb16668963 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Fri, 15 May 2026 23:35:31 +0530
Subject: [PATCH 11/35] refactor: remove transparent_proxy mode and in-sandbox
 interception proxy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The transparent proxy was a passive forwarder that captured logprobs
by injecting logprobs=true into upstream requests. It is replaced by
the interception_gate mode, where the trainer owns the forward pass
entirely — no proxy is needed inside the sandbox.
---
 envs/coding_agent_env/config.py               |  13 -
 .../sandbox/build_template.py                 |  73 +-
 src/openenv/core/harness/agents/base.py       |   3 -
 src/openenv/core/harness/agents/opencode.py   |   1 -
 src/openenv/core/harness/agents/pi.py         |   1 -
 .../core/harness/sandbox/interception.py      | 660 ------------------
 6 files changed, 3 insertions(+), 748 deletions(-)
 delete mode 100644 src/openenv/core/harness/sandbox/interception.py

diff --git a/envs/coding_agent_env/config.py b/envs/coding_agent_env/config.py
index 2eac8d16f..b3243253e 100644
--- a/envs/coding_agent_env/config.py
+++ b/envs/coding_agent_env/config.py
@@ -51,19 +51,6 @@ class CodingAgentConfig(BaseModel):
     # ``/home/user``. Override when using a root-privileged backend (Docker).
     sandbox_home: str = "/home/user"
 
-    # --- Transparent-proxy tuning --------------------------------------------
-    # Cap ``max_tokens`` / ``max_completion_tokens`` on forwarded requests.
-    # OpenCode defaults to a very large number (~32000) which exceeds some
-    # provider limits (e.g. gpt-4o-mini = 16384). Only used in
-    # ``mode="transparent_proxy"``. ``None`` disables the cap.
-    proxy_max_tokens_cap: int | None = 16384
-    # Per-turn top-k logprobs the proxy requests from the upstream.
-    proxy_top_logprobs: int = 5
-    # Disable reasoning/thinking mode for Qwen3 / Qwen3.5 models. Proxy sets
-    # ``extra_body.chat_template_kwargs.enable_thinking=false`` on forwarded
-    # requests. Ignored by providers that don't support the field.
-    proxy_disable_thinking: bool = False
-
 
 _PROVIDER_NPM = {
     "openai_compatible": "@ai-sdk/openai-compatible",
diff --git a/envs/coding_agent_env/sandbox/build_template.py b/envs/coding_agent_env/sandbox/build_template.py
index e1fdac50a..01978b520 100644
--- a/envs/coding_agent_env/sandbox/build_template.py
+++ b/envs/coding_agent_env/sandbox/build_template.py
@@ -4,35 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Build a pre-baked E2B template with opencode + proxy deps already installed.
-
-Run-time per rollout drops from ~3 min (cold install) to ~30s once the
-template is built, because we skip:
-
-  - ``curl https://opencode.ai/install | bash`` (~30-90s)
-  - ``pip install fastapi uvicorn httpx`` (~30-60s)
-  - directory layout setup
-  - copying the proxy source
-
-The template ships:
-
-  - opencode CLI at ``/home/user/.opencode/bin/opencode``
-  - Python deps for the in-sandbox proxy
-  - The proxy source at ``/home/user/proxy/interception.py``
-  - Pre-created dirs: ``~/.config/opencode``, ``~/logs/{agent,verifier}``,
-    ``~/task``, ``~/workdir``, ``~/proxy``
-  - Default workdir: ``/home/user/workdir``
-
-Usage::
-
-    .venv/bin/python envs/coding_agent_env/sandbox/build_template.py
-    # → builds (or rebuilds) ``coding-agent-rl`` template, prints template id
-
-Then rollout tests can use it via ``--template coding-agent-rl``.
-
-Requires ``E2B_API_KEY`` in the environment. First build is ~3-8 min;
-subsequent builds reuse the cache and can finish in <60s.
-"""
+"""Build a pre-baked E2B template with opencode already installed."""
 
 from __future__ import annotations
 
@@ -43,11 +15,7 @@
 
 from e2b import default_build_logger, Template
 
-
 _REPO_ROOT = Path(__file__).resolve().parents[3]
-_PROXY_SOURCE = (
-    _REPO_ROOT / "src" / "openenv" / "core" / "harness" / "sandbox" / "interception.py"
-)
 
 
 def _load_env(path: Path) -> None:
@@ -65,25 +33,9 @@ def _load_env(path: Path) -> None:
 
 
 def build_template(name: str, *, skip_cache: bool = False) -> str:
-    if not _PROXY_SOURCE.exists():
-        raise RuntimeError(f"proxy source missing at {_PROXY_SOURCE}")
-
-    # Template.copy() resolves relative paths against the caller's source
-    # file directory. This script lives next to ``interception.py`` so the
-    # bare filename works.
-
-    # Stage 1 (root): system-wide pip deps for the proxy.
-    # Stage 2 (user): opencode install + dir layout + proxy copy.
     template = (
         Template()
         .from_python_image("3.12")
-        .pip_install(
-            [
-                "fastapi>=0.104",
-                "uvicorn[standard]>=0.24",
-                "httpx>=0.27",
-            ]
-        )
         .set_user("user")
         .run_cmd("curl -fsSL https://opencode.ai/install | bash")
         .run_cmd("/home/user/.opencode/bin/opencode --version")
@@ -92,13 +44,10 @@ def build_template(name: str, *, skip_cache: bool = False) -> str:
         .make_dir("/home/user/logs/verifier")
         .make_dir("/home/user/task")
         .make_dir("/home/user/workdir")
-        .make_dir("/home/user/proxy")
-        .copy(str(_PROXY_SOURCE), "/home/user/proxy/interception.py")
         .set_workdir("/home/user/workdir")
     )
     if skip_cache:
         template = template.skip_cache()
-
     info = Template.build(
         template,
         name,
@@ -111,31 +60,15 @@ def build_template(name: str, *, skip_cache: bool = False) -> str:
 
 def main(argv: list[str] | None = None) -> int:
     p = argparse.ArgumentParser(prog="build_e2b_template")
-    p.add_argument(
-        "--name",
-        default="coding-agent-rl",
-        help="Template name (default: coding-agent-rl).",
-    )
-    p.add_argument(
-        "--skip-cache",
-        action="store_true",
-        help="Force a clean rebuild, ignoring cache.",
-    )
+    p.add_argument("--name", default="coding-agent-rl")
+    p.add_argument("--skip-cache", action="store_true")
     args = p.parse_args(argv)
-
     _load_env(_REPO_ROOT / "envs" / "coding_agent_env" / "sandbox" / ".env")
     if not os.environ.get("E2B_API_KEY"):
         print("ERROR: E2B_API_KEY required.", file=sys.stderr)
         return 2
-
-    print(f"Building template '{args.name}' (proxy source: {_PROXY_SOURCE})")
-    print(f"Skip cache: {args.skip_cache}")
-    print()
-
     template_id = build_template(args.name, skip_cache=args.skip_cache)
-    print()
     print(f"Built. Template id/name: {template_id}")
-    print(f"Use in code: Sandbox.create(template='{args.name}')")
     return 0
 
 
diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py
index 72cc9a6cf..ded9ba3b8 100644
--- a/src/openenv/core/harness/agents/base.py
+++ b/src/openenv/core/harness/agents/base.py
@@ -174,9 +174,6 @@ class CLIAgentSpec:
     mcp_config: MCPConfigSpec
     """How MCP tool configuration is injected."""
 
-    supports_logprob_proxy: bool = True
-    """Whether this agent can be routed through the interception proxy."""
-
     default_timeout_s: float = 600.0
     """Default per-rollout timeout in seconds."""
 
diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py
index d0146b008..13c17fa04 100644
--- a/src/openenv/core/harness/agents/opencode.py
+++ b/src/openenv/core/harness/agents/opencode.py
@@ -177,7 +177,6 @@ def _system_prompt_content(task: Any, config: Any) -> str | None:
         method="config_file",
         path_template="{home}/.config/opencode/opencode.json",
     ),
-    supports_logprob_proxy=True,
     default_timeout_s=900.0,
     setup=(
         "set -e && "
diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
index 63e2eb0c3..d7b60569f 100644
--- a/src/openenv/core/harness/agents/pi.py
+++ b/src/openenv/core/harness/agents/pi.py
@@ -111,7 +111,6 @@ def _parse_events(line: str) -> AgentEvent | None:
         method="config_file",
         path_template="{workdir}/.mcp.json",
     ),
-    supports_logprob_proxy=True,
     default_timeout_s=600.0,
     setup=(
         "set -e && "
diff --git a/src/openenv/core/harness/sandbox/interception.py b/src/openenv/core/harness/sandbox/interception.py
deleted file mode 100644
index 4e7c857ac..000000000
--- a/src/openenv/core/harness/sandbox/interception.py
+++ /dev/null
@@ -1,660 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""Transparent OpenAI-compatible forwarding proxy with logprob capture.
-
-The proxy is a small FastAPI app that agent CLIs (OpenCode, Claude Code,
-Codex, Pi, etc.) talk to instead of the upstream LLM endpoint. It:
-
-1. Forwards every ``POST /v1/chat/completions`` request to the real upstream
-   URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream
-   returns per-token logprobs.
-2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines
-   trace file.
-3. Returns the upstream response to the agent verbatim (minus the ``logprobs``
-   field, which we strip so the CLI never sees anything unexpected).
-
-The proxy is stateless beyond the trace file. One proxy instance runs per
-session, normally inside the sandbox on ``localhost:7000``.
-
-Run standalone::
-
-    UPSTREAM_API_KEY=... python -m openenv.core.harness.sandbox.interception \\
-        --upstream-url https://vllm.example/v1 \\
-        --trace /tmp/trace.jsonl \\
-        --port 7000
-"""
-
-from __future__ import annotations
-
-import argparse
-import asyncio
-import copy
-import json
-import logging
-import os
-import socket
-import threading
-import time
-from contextlib import asynccontextmanager, closing
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any
-
-import httpx
-import uvicorn
-from fastapi import FastAPI, Request, Response
-from fastapi.responses import JSONResponse, StreamingResponse
-
-
-CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
-_LOG = logging.getLogger(__name__)
-
-
-@dataclass
-class ProxyConfig:
-    """Runtime configuration for one :class:`InterceptionProxy`."""
-
-    upstream_url: str
-    upstream_api_key: str = "intercepted"
-    trace_path: str = "/tmp/opencode-proxy-trace.jsonl"
-    host: str = "127.0.0.1"
-    port: int = 7000
-    top_logprobs: int = 5
-    request_timeout_s: float = 600.0
-    # Cap ``max_tokens`` before forwarding. OpenCode historically asks for very
-    # large values (e.g. 32000) that exceed gpt-4o-mini's 16384 cap; capping
-    # here avoids spurious upstream 400s without requiring the caller to know
-    # per-model limits.
-    max_tokens_cap: int | None = 16384
-    # Disable Qwen-style reasoning/thinking by injecting
-    # ``chat_template_kwargs.enable_thinking=false`` into forwarded requests.
-    disable_thinking: bool = False
-    # Override the ``model`` field on every forwarded request. Some opencode
-    # builds emit a stripped model id (e.g. ``Qwen3.5-4B`` instead of the
-    # ``Qwen/Qwen3.5-4B`` the upstream serves) for their internal
-    # title-generation call. Setting this to the exact upstream model id
-    # bypasses that mismatch.
-    model_override: str | None = None
-
-
-@dataclass
-class TurnRecord:
-    """One intercepted turn, written to the trace file as JSON-lines."""
-
-    turn: int
-    request: dict[str, Any]
-    response: dict[str, Any]
-    logprobs: list[dict[str, Any]] | None
-    completion_tokens: list[str]
-    completion_token_ids: list[int]
-    per_token_logps: list[float]
-    finish_reason: str | None
-    latency_s: float
-    timestamp: float = field(default_factory=time.time)
-
-    def to_json(self) -> str:
-        return json.dumps(self.__dict__, default=str)
-
-
-def _build_app(cfg: ProxyConfig) -> FastAPI:
-    """Construct the FastAPI app that serves one proxy session."""
-
-    state: dict[str, Any] = {"turn": 0, "lock": asyncio.Lock()}
-
-    # HTTP client reused across requests. ``None`` auth header — we let each
-    # request carry its own ``Authorization`` populated from ``upstream_api_key``.
-    client = httpx.AsyncClient(timeout=cfg.request_timeout_s)
-    trace_file = open(cfg.trace_path, "a", buffering=1)
-
-    @asynccontextmanager
-    async def lifespan(_: FastAPI) -> Any:
-        try:
-            yield
-        finally:
-            await client.aclose()
-            trace_file.close()
-
-    app = FastAPI(title="opencode-interception-proxy", lifespan=lifespan)
-
-    @app.get("/healthz")
-    def healthz() -> dict[str, str]:
-        return {"status": "ok"}
-
-    @app.post(CHAT_COMPLETIONS_PATH)
-    async def chat_completions(request: Request) -> Response:
-        raw_body = await request.body()
-        try:
-            body = json.loads(raw_body)
-        except json.JSONDecodeError:
-            return JSONResponse(status_code=400, content={"error": "invalid json body"})
-
-        forwarded_body = _prepare_forwarded_body(body, cfg)
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {cfg.upstream_api_key}",
-        }
-        upstream_url = _resolve_upstream_url(cfg.upstream_url)
-
-        async with state["lock"]:
-            state["turn"] += 1
-            turn_idx = state["turn"]
-
-        if forwarded_body.get("stream"):
-            return await _proxy_streaming(
-                client=client,
-                upstream_url=upstream_url,
-                headers=headers,
-                forwarded_body=forwarded_body,
-                original_body=body,
-                trace_file=trace_file,
-                turn_idx=turn_idx,
-            )
-        return await _proxy_unary(
-            client=client,
-            upstream_url=upstream_url,
-            headers=headers,
-            forwarded_body=forwarded_body,
-            original_body=body,
-            trace_file=trace_file,
-            turn_idx=turn_idx,
-        )
-
-    return app
-
-
-def _prepare_forwarded_body(body: dict[str, Any], cfg: ProxyConfig) -> dict[str, Any]:
-    """Return the body we actually send upstream.
-
-    - Injects ``logprobs=true`` + ``top_logprobs`` so the upstream emits
-      per-token logprobs.
-    - Caps ``max_tokens`` / ``max_completion_tokens`` to ``max_tokens_cap``.
-    - For models that reject ``max_tokens`` (e.g. gpt-5.x), translates to
-      ``max_completion_tokens``.
-    """
-    forwarded = copy.deepcopy(body)
-    forwarded.setdefault("logprobs", True)
-    forwarded.setdefault("top_logprobs", cfg.top_logprobs)
-
-    # GPT-5.x and newer: ``max_tokens`` is rejected; must use
-    # ``max_completion_tokens``. Detect via model string so we don't break
-    # gpt-4.x or vLLM-hosted models that accept ``max_tokens``.
-    model = str(forwarded.get("model", ""))
-    needs_translation = _model_uses_max_completion_tokens(model)
-    if needs_translation and "max_tokens" in forwarded:
-        value = forwarded.pop("max_tokens")
-        forwarded.setdefault("max_completion_tokens", value)
-
-    if cfg.max_tokens_cap is not None:
-        for key in ("max_tokens", "max_completion_tokens"):
-            value = forwarded.get(key)
-            if isinstance(value, int) and value > cfg.max_tokens_cap:
-                forwarded[key] = cfg.max_tokens_cap
-
-    if cfg.disable_thinking:
-        # vLLM applies chat_template_kwargs to the tokenizer's chat template
-        # for Qwen3/Qwen3.5 models, turning off <think>...</think> generation.
-        extra = forwarded.setdefault("chat_template_kwargs", {})
-        extra.setdefault("enable_thinking", False)
-
-    if cfg.model_override:
-        forwarded["model"] = cfg.model_override
-
-    return forwarded
-
-
-def _model_uses_max_completion_tokens(model: str) -> bool:
-    """Heuristic: ``True`` for models that reject ``max_tokens``."""
-    # Strip a provider prefix opencode may have prepended (e.g. "intercepted/").
-    bare = model.split("/", 1)[-1].lower()
-    return bare.startswith(("gpt-5", "o1", "o3", "o4"))
-
-
-def _resolve_upstream_url(upstream: str) -> str:
-    """Build the fully qualified chat-completions URL from a base URL."""
-    base = upstream.rstrip("/")
-    if base.endswith("/v1"):
-        return f"{base}/chat/completions"
-    return f"{base}{CHAT_COMPLETIONS_PATH}"
-
-
-async def _proxy_unary(
-    *,
-    client: httpx.AsyncClient,
-    upstream_url: str,
-    headers: dict[str, str],
-    forwarded_body: dict[str, Any],
-    original_body: dict[str, Any],
-    trace_file: Any,
-    turn_idx: int,
-) -> Response:
-    start = time.time()
-    upstream_response = await client.post(
-        upstream_url, content=json.dumps(forwarded_body), headers=headers
-    )
-    latency = time.time() - start
-    try:
-        response_json = upstream_response.json()
-    except Exception:
-        return Response(
-            content=upstream_response.content,
-            status_code=upstream_response.status_code,
-            media_type=upstream_response.headers.get(
-                "content-type", "application/json"
-            ),
-        )
-
-    record = _build_turn_record(
-        turn_idx=turn_idx,
-        request_body=forwarded_body,
-        response_json=response_json,
-        latency_s=latency,
-    )
-    trace_file.write(record.to_json() + "\n")
-    sanitized = _strip_logprobs(response_json)
-    return JSONResponse(content=sanitized, status_code=upstream_response.status_code)
-
-
-async def _proxy_streaming(
-    *,
-    client: httpx.AsyncClient,
-    upstream_url: str,
-    headers: dict[str, str],
-    forwarded_body: dict[str, Any],
-    original_body: dict[str, Any],
-    trace_file: Any,
-    turn_idx: int,
-) -> Response:
-    """Forward an SSE stream while accumulating the full response.
-
-    Opens the upstream stream and inspects the status. On non-2xx, reads the
-    full body (an error JSON, not SSE) and returns it to the caller as a
-    regular JSON response — previously we silently emitted an empty
-    ``text/event-stream`` which opencode interpreted as an empty assistant
-    turn. Both the error body and the latency are written to the trace file
-    so debugging a broken rollout doesn't require another round-trip.
-    """
-
-    start = time.time()
-
-    # Open the stream outside the generator so we can branch on status before
-    # committing to a streaming response shape.
-    upstream_cm = client.stream(
-        "POST",
-        upstream_url,
-        content=json.dumps(forwarded_body),
-        headers=headers,
-    )
-    upstream = await upstream_cm.__aenter__()
-
-    if upstream.status_code >= 400:
-        # Upstream responded with an error body (not SSE). Read it fully and
-        # return as a non-streaming JSON payload.
-        error_bytes = await upstream.aread()
-        await upstream_cm.__aexit__(None, None, None)
-        latency = time.time() - start
-        try:
-            error_json = json.loads(error_bytes.decode() or "{}")
-        except Exception:
-            error_json = {"error": error_bytes.decode(errors="replace")[:4000]}
-        record = _build_turn_record(
-            turn_idx=turn_idx,
-            request_body=forwarded_body,
-            response_json={
-                "choices": [],
-                "usage": None,
-                "upstream_status": upstream.status_code,
-                "upstream_error": error_json,
-            },
-            latency_s=latency,
-        )
-        trace_file.write(record.to_json() + "\n")
-        _LOG.warning(
-            "proxy turn %s: upstream %s: %s",
-            turn_idx,
-            upstream.status_code,
-            str(error_json)[:400],
-        )
-        return JSONResponse(content=error_json, status_code=upstream.status_code)
-
-    async def _stream() -> Any:
-        accumulated: dict[str, Any] = {
-            "content_by_idx": {},
-            "tool_calls_by_idx": {},
-            "finish_by_idx": {},
-            "logprobs_by_idx": {},
-        }
-        last_chunk: dict[str, Any] = {}
-        try:
-            async for line in upstream.aiter_lines():
-                if not line:
-                    yield "\n"
-                    continue
-                yield line + "\n"
-                if not line.startswith("data:"):
-                    continue
-                data = line[len("data:") :].strip()
-                if data == "[DONE]":
-                    continue
-                try:
-                    chunk = json.loads(data)
-                except json.JSONDecodeError:
-                    continue
-                last_chunk = chunk
-                _accumulate_stream_chunk(chunk, accumulated)
-        finally:
-            await upstream_cm.__aexit__(None, None, None)
-
-        latency = time.time() - start
-        response_json = _assemble_streamed_response(last_chunk, accumulated)
-        record = _build_turn_record(
-            turn_idx=turn_idx,
-            request_body=forwarded_body,
-            response_json=response_json,
-            latency_s=latency,
-        )
-        trace_file.write(record.to_json() + "\n")
-
-    return StreamingResponse(_stream(), media_type="text/event-stream")
-
-
-def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None:
-    for choice in chunk.get("choices", []) or []:
-        idx = choice.get("index", 0)
-        delta = choice.get("delta") or {}
-        content = delta.get("content")
-        if content:
-            acc["content_by_idx"].setdefault(idx, []).append(content)
-        # HF-Router's Qwen thinking mode streams the chain-of-thought under a
-        # separate ``reasoning`` field (per Together/Scaleway). Accumulate it
-        # so the assembled response surfaces it — otherwise it's dropped and
-        # proxy_turn observability is lost for thinking-mode rollouts.
-        reasoning = delta.get("reasoning")
-        if reasoning:
-            acc.setdefault("reasoning_by_idx", {}).setdefault(idx, []).append(reasoning)
-        for tc in delta.get("tool_calls") or []:
-            tc_idx = tc.get("index", 0)
-            bucket = acc["tool_calls_by_idx"].setdefault(
-                (idx, tc_idx),
-                {
-                    "id": None,
-                    "type": "function",
-                    "function": {"name": "", "arguments": ""},
-                },
-            )
-            if tc.get("id"):
-                bucket["id"] = tc["id"]
-            fn = tc.get("function") or {}
-            if fn.get("name"):
-                bucket["function"]["name"] += fn["name"]
-            if fn.get("arguments"):
-                bucket["function"]["arguments"] += fn["arguments"]
-        if choice.get("finish_reason"):
-            acc["finish_by_idx"][idx] = choice["finish_reason"]
-        lp = choice.get("logprobs") or {}
-        content_lp = lp.get("content")
-        if content_lp:
-            acc["logprobs_by_idx"].setdefault(idx, []).extend(content_lp)
-
-
-def _assemble_streamed_response(
-    last_chunk: dict[str, Any], acc: dict[str, Any]
-) -> dict[str, Any]:
-    indices = sorted(
-        set(acc["content_by_idx"])
-        | set(acc["finish_by_idx"])
-        | {k[0] for k in acc["tool_calls_by_idx"]}
-        | set(acc["logprobs_by_idx"])
-        | {0}
-    )
-    choices: list[dict[str, Any]] = []
-    for idx in indices:
-        tool_calls = [
-            acc["tool_calls_by_idx"][k]
-            for k in sorted(acc["tool_calls_by_idx"])
-            if k[0] == idx
-        ]
-        message: dict[str, Any] = {"role": "assistant"}
-        content = "".join(acc["content_by_idx"].get(idx, []))
-        if content:
-            message["content"] = content
-        reasoning = "".join((acc.get("reasoning_by_idx") or {}).get(idx, []))
-        if reasoning:
-            message["reasoning"] = reasoning
-        if tool_calls:
-            message["tool_calls"] = tool_calls
-        choice: dict[str, Any] = {
-            "index": idx,
-            "message": message,
-            "finish_reason": acc["finish_by_idx"].get(idx),
-        }
-        if acc["logprobs_by_idx"].get(idx):
-            choice["logprobs"] = {"content": acc["logprobs_by_idx"][idx]}
-        choices.append(choice)
-    return {
-        "id": last_chunk.get("id", ""),
-        "object": "chat.completion",
-        "model": last_chunk.get("model", ""),
-        "choices": choices,
-        "usage": last_chunk.get("usage"),
-    }
-
-
-def _build_turn_record(
-    *,
-    turn_idx: int,
-    request_body: dict[str, Any],
-    response_json: dict[str, Any],
-    latency_s: float,
-) -> TurnRecord:
-    """Extract per-token logprobs into a normalized :class:`TurnRecord`."""
-
-    choice = (response_json.get("choices") or [{}])[0]
-    logprobs_field = choice.get("logprobs") or {}
-    content_lp = logprobs_field.get("content") or []
-
-    tokens: list[str] = []
-    token_ids: list[int] = []
-    per_token_logps: list[float] = []
-    for entry in content_lp:
-        tokens.append(entry.get("token", ""))
-        # OpenAI returns no raw token ids; vLLM returns them as ``token_id``.
-        token_id = entry.get("token_id")
-        if token_id is not None:
-            token_ids.append(int(token_id))
-        lp = entry.get("logprob")
-        if lp is not None:
-            per_token_logps.append(float(lp))
-
-    return TurnRecord(
-        turn=turn_idx,
-        request=request_body,
-        response=response_json,
-        logprobs=content_lp,
-        completion_tokens=tokens,
-        completion_token_ids=token_ids,
-        per_token_logps=per_token_logps,
-        finish_reason=choice.get("finish_reason"),
-        latency_s=latency_s,
-    )
-
-
-def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]:
-    """Return a copy of the response with ``choices[*].logprobs`` removed."""
-
-    out = dict(response_json)
-    choices = out.get("choices")
-    if isinstance(choices, list):
-        out["choices"] = [
-            {k: v for k, v in (ch or {}).items() if k != "logprobs"} for ch in choices
-        ]
-    return out
-
-
-# ---------------------------------------------------------------------------
-# Standalone runner (used inside the sandbox)
-# ---------------------------------------------------------------------------
-
-
-def serve(cfg: ProxyConfig) -> None:
-    """Start the proxy and block (for use as the sandbox-side entry point)."""
-
-    app = _build_app(cfg)
-    uvicorn.run(app, host=cfg.host, port=cfg.port, log_level="warning")
-
-
-class InterceptionProxy:
-    """Thread-backed controller for running the proxy locally.
-
-    Used by unit tests and by any in-process driver that wants a short-lived
-    proxy on the local machine. Inside a sandbox we invoke :func:`serve`
-    directly via ``python -m openenv.core.harness.sandbox.interception``.
-    """
-
-    def __init__(self, cfg: ProxyConfig) -> None:
-        self._cfg = cfg
-        self._server: uvicorn.Server | None = None
-        self._thread: threading.Thread | None = None
-        self._ready = threading.Event()
-
-    @property
-    def url(self) -> str:
-        return f"http://{self._cfg.host}:{self._cfg.port}/v1"
-
-    @property
-    def config(self) -> ProxyConfig:
-        return self._cfg
-
-    def start(self) -> None:
-        app = _build_app(self._cfg)
-        config = uvicorn.Config(
-            app,
-            host=self._cfg.host,
-            port=self._cfg.port,
-            log_level="warning",
-            lifespan="on",
-        )
-        self._server = uvicorn.Server(config)
-        self._thread = threading.Thread(target=self._run_server, daemon=True)
-        self._thread.start()
-        # Wait for the server to accept connections.
-        deadline = time.time() + 10
-        while time.time() < deadline:
-            if _port_open(self._cfg.host, self._cfg.port):
-                self._ready.set()
-                return
-            time.sleep(0.05)
-        raise RuntimeError("InterceptionProxy failed to start within 10s")
-
-    def _run_server(self) -> None:
-        assert self._server is not None
-        self._server.run()
-
-    def stop(self) -> None:
-        if self._server is None:
-            return
-        self._server.should_exit = True
-        if self._thread is not None:
-            self._thread.join(timeout=5)
-        self._server = None
-        self._thread = None
-
-    def __enter__(self) -> "InterceptionProxy":
-        self.start()
-        return self
-
-    def __exit__(self, *exc) -> None:
-        self.stop()
-
-
-def _port_open(host: str, port: int) -> bool:
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-        s.settimeout(0.2)
-        return s.connect_ex((host, port)) == 0
-
-
-# ---------------------------------------------------------------------------
-# Trace reader (used by the session to pull captured turns back)
-# ---------------------------------------------------------------------------
-
-
-def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]:
-    """Read a proxy trace file into a list of dicts."""
-
-    trace: list[dict[str, Any]] = []
-    p = Path(path)
-    if not p.exists():
-        return trace
-    for line in p.read_text().splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        trace.append(json.loads(line))
-    return trace
-
-
-# ---------------------------------------------------------------------------
-# CLI entry point
-# ---------------------------------------------------------------------------
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(prog="openenv.core.harness.sandbox.interception")
-    parser.add_argument("--upstream-url", required=True)
-    parser.add_argument(
-        "--upstream-api-key",
-        default=None,
-        help=(
-            "Upstream API key. Prefer OPENCODE_UPSTREAM_API_KEY so the key "
-            "does not appear in process argv."
-        ),
-    )
-    parser.add_argument("--trace", default="/tmp/opencode-proxy-trace.jsonl")
-    parser.add_argument("--host", default="127.0.0.1")
-    parser.add_argument("--port", type=int, default=7000)
-    parser.add_argument("--top-logprobs", type=int, default=5)
-    parser.add_argument("--request-timeout", type=float, default=600.0)
-    parser.add_argument(
-        "--max-tokens-cap",
-        type=int,
-        default=None,
-        help="Clamp max_tokens/max_completion_tokens on forwarded requests.",
-    )
-    parser.add_argument(
-        "--disable-thinking",
-        action="store_true",
-        help="Inject chat_template_kwargs.enable_thinking=false (Qwen3/Qwen3.5).",
-    )
-    parser.add_argument(
-        "--model-override",
-        default=None,
-        help="Rewrite the `model` field on every forwarded request.",
-    )
-    args = parser.parse_args()
-    upstream_api_key = (
-        args.upstream_api_key
-        or os.environ.get("OPENCODE_UPSTREAM_API_KEY")
-        or os.environ.get("UPSTREAM_API_KEY")
-        or "intercepted"
-    )
-
-    cfg = ProxyConfig(
-        upstream_url=args.upstream_url,
-        upstream_api_key=upstream_api_key,
-        trace_path=args.trace,
-        host=args.host,
-        port=args.port,
-        top_logprobs=args.top_logprobs,
-        request_timeout_s=args.request_timeout,
-        max_tokens_cap=args.max_tokens_cap,
-        disable_thinking=args.disable_thinking,
-        model_override=args.model_override,
-    )
-    serve(cfg)
-
-
-if __name__ == "__main__":
-    main()

From 71bd9e96326d2618ba3c28646d803faa4ec4f9b3 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Fri, 15 May 2026 23:35:44 +0530
Subject: [PATCH 12/35] feat: InterceptionServer + interception_gate mode for
 trainer-owned generation

InterceptionServer (aiohttp) runs on the trainer host. Each rollout
registers its own queue, and the agent's OPENAI_BASE_URL points at
`{base_url}/rollout/{id}/v1`. When the agent makes an LLM call, the
request blocks at the server. The training loop dequeues the request,
calls vLLM with logprobs=True and return_token_ids=True, and delivers
the response back to the agent via deliver_response().
---
 pyproject.toml                                |   1 +
 src/openenv/core/harness/agents/__init__.py   |   4 +
 src/openenv/core/harness/agents/cli_driver.py | 444 +++++-------------
 .../harness/agents/interception_server.py     | 324 +++++++++++++
 4 files changed, 445 insertions(+), 328 deletions(-)
 create mode 100644 src/openenv/core/harness/agents/interception_server.py

diff --git a/pyproject.toml b/pyproject.toml
index 08f1bb6d3..e40b79c9a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     # Web UI dependencies
     "gradio>=4.0.0",
     "httpx>=0.28.1",
+    "aiohttp>=3.13.5",
 ]
 
 [project.optional-dependencies]
diff --git a/src/openenv/core/harness/agents/__init__.py b/src/openenv/core/harness/agents/__init__.py
index 8ef31976b..b715582a4 100644
--- a/src/openenv/core/harness/agents/__init__.py
+++ b/src/openenv/core/harness/agents/__init__.py
@@ -28,6 +28,7 @@
     CLIAgentSpec,
     MCPConfigSpec,
 )
+from .interception_server import deliver_response, InterceptionServer
 
 # Registry
 
@@ -104,4 +105,7 @@ def _auto_import(name: str) -> None:
     "ArtifactSpec",
     "CLIAgentSpec",
     "MCPConfigSpec",
+    # Interception gate
+    "InterceptionServer",
+    "deliver_response",
 ]
diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 42ac460f1..0e58af9e0 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -6,26 +6,24 @@
 
 """Shared CLI agent driver, session, and session factory.
 
-The :class:`CLIAgentDriver` factors out the common 70% of CLI harness
-lifecycle — sandbox creation, MCP config injection, interception proxy
-setup, subprocess management, and result collection.
+Two modes are supported:
 
-It is **fully generic**: it reads the :class:`CLIAgentSpec`'s declarative
-data fields and executes them mechanically. No per-agent code lives here.
-
-The :class:`CLIAgentSession` implements :class:`ResourceSession` and
-the :class:`CLIAgentSessionFactory` implements :class:`ResourceSessionFactory`,
-so the CLI agent driver integrates seamlessly with the existing harness
-runtime from PR #603.
+- ``black_box`` — the agent talks directly to the upstream LLM. No logprob
+  capture. For eval and demos.
+- ``interception_gate`` — the agent's LLM calls are routed to an
+  :class:`InterceptionServer` running on the trainer host. The training
+  loop owns the forward pass and delivers responses back. For RL training.
 """
 
 from __future__ import annotations
 
+import asyncio
 import json
 import logging
 import shlex
+import threading
 import time
-from pathlib import Path
+import uuid
 from typing import Any, Callable, Literal
 
 from openenv.core.env_server.mcp_types import Tool
@@ -39,37 +37,16 @@
 from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
 
 from .base import CLIAgentSpec
+from .interception_server import deliver_response, InterceptionServer
 
 
 _log = logging.getLogger(__name__)
 
-# Interception proxy defaults
-_PROXY_PORT = 7000
-_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl"
-_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log"
-
-# Where the proxy source lives on disk. Uploaded into sandboxes that don't
-# already have it baked in.
-_PROXY_SOURCE_PATH = Path(__file__).resolve().parents[1] / "sandbox" / "interception.py"
-
-# Verifier type — callable that checks the agent's work and returns a result
 Verifier = Callable[..., VerifyResult]
 
 
-# CLIAgentSession
-
-
 class CLIAgentSession(ResourceSession):
-    """Per-rollout session wrapping one sandbox with one running agent CLI.
-
-    The session is created already-running: :meth:`CLIAgentSessionFactory.create`
-    launches the agent before returning. Typical usage::
-
-        session = factory.create(task)
-        session.wait_for_completion()
-        result = session.verify([])
-        session.close()
-    """
+    """Per-rollout session wrapping one sandbox with one running agent CLI."""
 
     def __init__(
         self,
@@ -80,9 +57,10 @@ def __init__(
         config: Any,
         verifier: Verifier | None = None,
         base_url_override: str | None = None,
-        proxy_trace_path: str | None = None,
-        proxy_bg_job: BgJob | None = None,
         agent_bg_job: BgJob | None = None,
+        interception_server: InterceptionServer | None = None,
+        interception_rollout_id: str | None = None,
+        interception_queue: asyncio.Queue | None = None,
     ) -> None:
         self.spec = spec
         self.sandbox = sandbox
@@ -90,11 +68,10 @@ def __init__(
         self.config = config
         self._verifier = verifier
         self._base_url_override = base_url_override
-        self._proxy_trace_path = proxy_trace_path
-        self._proxy_bg_job = proxy_bg_job
         self._agent_bg_job = agent_bg_job
-
-    # ResourceSession contract
+        self._interception_server = interception_server
+        self._interception_rollout_id = interception_rollout_id
+        self._interception_queue = interception_queue
 
     def initial_messages(self) -> list[Message]:
         instruction = (
@@ -105,7 +82,6 @@ def initial_messages(self) -> list[Message]:
         return [{"role": "user", "content": instruction}]
 
     def list_tools(self) -> list[Tool]:
-        # CLI agents own their own tool loop — none are exposed to the harness.
         return []
 
     def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult:
@@ -132,16 +108,13 @@ def close(self) -> None:
             except Exception:
                 pass
             self._agent_bg_job = None
-        if self._proxy_bg_job is not None:
-            try:
-                self._proxy_bg_job.kill()
-            except Exception:
-                pass
-            self._proxy_bg_job = None
+        if (
+            self._interception_server is not None
+            and self._interception_rollout_id is not None
+        ):
+            self._interception_server.unregister_rollout(self._interception_rollout_id)
         self.sandbox.kill()
 
-    # CLI-agent-specific API
-
     def wait_for_completion(self, timeout_s: float | None = None) -> int:
         """Block until the agent exits, returning its exit code."""
         budget = timeout_s if timeout_s is not None else self.spec.default_timeout_s
@@ -152,11 +125,7 @@ def wait_for_completion(self, timeout_s: float | None = None) -> int:
         return self._agent_bg_job.wait(timeout=budget)
 
     def collect_artifacts(self) -> dict[str, Any]:
-        """Collect all artifacts declared in ``spec.artifacts`` from the sandbox.
-
-        Returns a dict keyed by artifact name. Missing optional artifacts are
-        silently skipped.
-        """
+        """Collect all artifacts declared in ``spec.artifacts`` from the sandbox."""
         result: dict[str, Any] = {}
         if not self.spec.artifacts:
             return result
@@ -166,9 +135,6 @@ def collect_artifacts(self) -> dict[str, Any]:
                 if artifact_spec.format == "json":
                     result[name] = json.loads(content)
                 elif artifact_spec.format == "jsonl":
-                    # Parse valid JSON lines, skip non-JSON preamble
-                    # (e.g. opencode emits database migration messages
-                    # before the first JSON event).
                     records = []
                     for line in content.splitlines():
                         line = line.strip()
@@ -195,72 +161,85 @@ def collect_artifacts(self) -> dict[str, Any]:
                 )
         return result
 
-    def fetch_proxy_trace(self) -> list[dict[str, Any]]:
-        """Return per-turn proxy-captured records (transparent_proxy mode only).
+    # interception_gate API
 
-        Each entry has ``request``, ``response``, ``completion_tokens``,
-        ``completion_token_ids``, ``per_token_logps``, ``finish_reason``,
-        and ``latency_s``. Returns ``[]`` in black_box mode.
+    async def next_request(
+        self, timeout_s: float | None = None
+    ) -> dict[str, Any] | None:
+        """Await the next LLM request from the agent (interception_gate only).
+
+        Returns the intercept dict or ``None`` (agent exited); raises ``TimeoutError``.
         """
-        if self._proxy_trace_path is None:
-            return []
-        try:
-            content = self.sandbox.read_text(self._proxy_trace_path)
-        except Exception:
-            return []
-        records: list[dict[str, Any]] = []
-        for line in content.splitlines():
-            line = line.strip()
-            if not line:
+        if self._interception_queue is None:
+            raise RuntimeError(
+                "next_request() is only available in interception_gate mode."
+            )
+        server = self._interception_server
+        assert server is not None
+
+        deadline = time.time() + (timeout_s or self.spec.default_timeout_s)
+        while True:
+            remaining = deadline - time.time()
+            if remaining <= 0:
+                raise TimeoutError(
+                    f"{self.spec.name} interception_gate: no request within timeout"
+                )
+            try:
+                request_id = await asyncio.wait_for(
+                    self._interception_queue.get(),
+                    timeout=min(remaining, 1.0),
+                )
+                return server.intercepts[request_id]
+            except asyncio.TimeoutError:
+                if self._agent_bg_job is not None:
+                    done_event = getattr(self._agent_bg_job, "_done", None)
+                    if (
+                        done_event is not None
+                        and isinstance(done_event, threading.Event)
+                        and done_event.is_set()
+                    ):
+                        return None
                 continue
-            records.append(json.loads(line))
-        return records
-
 
-# CLIAgentDriver — shared lifecycle
+    async def deliver(
+        self, intercept: dict[str, Any], response_dict: dict[str, Any]
+    ) -> None:
+        """Return a trainer-generated response to the waiting agent."""
+        await deliver_response(intercept, response_dict)
 
 
 class CLIAgentDriver:
-    """Shared driver for all CLI-based agentic harnesses.
-
-    Implements the common lifecycle:
-
-    1. Create sandbox (via :class:`SandboxBackend`)
-    2. Wait for sandbox ready (``echo ok`` probe)
-    3. Install agent CLI — run ``spec.setup`` commands (skipped if
-       ``spec.install_check_cmd`` succeeds, i.e. pre-baked template)
-    4. Upload ``spec.files`` into the sandbox
-    5. Write MCP config (via ``spec.build_mcp_config``)
-    6. Set environment variables from ``spec.env`` (with placeholder
-       resolution)
-    7. Optionally start interception proxy (transparent_proxy mode)
-    8. Build CLI command (via ``spec.build_command``)
-    9. Launch agent as bg process
-    10. Return a :class:`CLIAgentSession`
-    """
+    """Shared driver for all CLI-based agentic harnesses."""
 
     def __init__(
         self,
         spec: CLIAgentSpec,
         sandbox_backend: SandboxBackend,
-        mode: Literal["black_box", "transparent_proxy"] = "black_box",
+        mode: Literal["black_box", "interception_gate"] = "black_box",
         *,
         install_timeout_s: int = 240,
         setup_timeout_s: int = 300,
-        proxy_top_logprobs: int = 5,
-        proxy_max_tokens_cap: int | None = 16384,
-        proxy_disable_thinking: bool = False,
+        interception_server: InterceptionServer | None = None,
+        interception_base_url: str | None = None,
     ) -> None:
-        if mode not in {"black_box", "transparent_proxy"}:
+        if mode not in {"black_box", "interception_gate"}:
             raise ValueError(f"Unknown mode: {mode!r}")
+        if mode == "interception_gate":
+            if interception_server is None:
+                raise ValueError(
+                    "interception_gate mode requires an InterceptionServer instance."
+                )
+            if interception_base_url is None:
+                raise ValueError(
+                    "interception_gate mode requires interception_base_url."
+                )
         self.spec = spec
         self.sandbox_backend = sandbox_backend
         self.mode = mode
         self._install_timeout_s = install_timeout_s
         self._setup_timeout_s = setup_timeout_s
-        self._proxy_top_logprobs = proxy_top_logprobs
-        self._proxy_max_tokens_cap = proxy_max_tokens_cap
-        self._proxy_disable_thinking = proxy_disable_thinking
+        self._interception_server = interception_server
+        self._interception_base_url = interception_base_url
 
     def create_session(
         self,
@@ -271,35 +250,16 @@ def create_session(
         seed: int | None = None,
         episode_id: str | None = None,
     ) -> CLIAgentSession:
-        """Create a fully bootstrapped session with a running agent.
-
-        This is the main entry point. It:
-        1. Creates a sandbox
-        2. Bootstraps it (install agent, upload files, write MCP config)
-        3. Optionally starts the interception proxy
-        4. Launches the agent subprocess
-        5. Returns a ready-to-use :class:`CLIAgentSession`
-        """
         timeout_s = (
             config.agent_timeout_s
             if hasattr(config, "agent_timeout_s")
             else self.spec.default_timeout_s
         )
         sandbox_timeout = int(timeout_s) + 300
-
-        _log.info(
-            "%s driver: creating sandbox timeout=%ds mode=%s",
-            self.spec.name,
-            sandbox_timeout,
-            self.mode,
-        )
         sandbox = self.sandbox_backend.create(
             timeout_s=sandbox_timeout,
             metadata={"episode_id": episode_id} if episode_id else None,
         )
-        sid = getattr(sandbox, "sandbox_id", "?")
-        _log.info("%s driver: sandbox=%s — bootstrapping…", self.spec.name, sid)
-
         try:
             self._bootstrap_sandbox(sandbox, task, config)
         except Exception as exc:
@@ -308,33 +268,21 @@ def create_session(
             raise
 
         base_url_override: str | None = None
-        proxy_trace_path: str | None = None
-        proxy_bg_job: BgJob | None = None
-
-        if self.mode == "transparent_proxy":
-            base_url = config.base_url if hasattr(config, "base_url") else ""
-            api_key = config.api_key if hasattr(config, "api_key") else "intercepted"
-            model = config.model if hasattr(config, "model") else ""
-
-            _log.info(
-                "%s driver: starting interception proxy on :%d → %s",
-                self.spec.name,
-                _PROXY_PORT,
-                base_url,
-            )
-            proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy(
-                sandbox,
-                base_url=base_url,
-                api_key=api_key,
-                model=model,
+        interception_rollout_id: str | None = None
+        interception_queue: asyncio.Queue | None = None
+
+        if self.mode == "interception_gate":
+            assert self._interception_server is not None
+            assert self._interception_base_url is not None
+            rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}"
+            interception_rollout_id = rollout_id
+            interception_queue = self._interception_server.register_rollout(rollout_id)
+            base_url_override = (
+                f"{self._interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1"
             )
-            _log.info("%s driver: proxy up at %s", self.spec.name, base_url_override)
 
         agent_bg_job = self._start_agent(
-            sandbox,
-            task,
-            config,
-            base_url_override=base_url_override,
+            sandbox, task, config, base_url_override=base_url_override
         )
 
         return CLIAgentSession(
@@ -344,35 +292,20 @@ def create_session(
             config=config,
             verifier=verifier,
             base_url_override=base_url_override,
-            proxy_trace_path=proxy_trace_path,
-            proxy_bg_job=proxy_bg_job,
             agent_bg_job=agent_bg_job,
+            interception_server=self._interception_server,
+            interception_rollout_id=interception_rollout_id,
+            interception_queue=interception_queue,
         )
 
-    # Bootstrap stages
-
     def _bootstrap_sandbox(
-        self,
-        sandbox: SandboxHandle,
-        task: Any,
-        config: Any,
+        self, sandbox: SandboxHandle, task: Any, config: Any
     ) -> None:
-        """Install agent, upload files, write MCP config."""
-
-        # Stage 1: wait for sandbox readiness
         self._wait_for_sandbox_ready(sandbox)
-
-        # Stage 2: install agent CLI (skip if pre-baked)
         if not self._agent_already_installed(sandbox):
             self._install_agent(sandbox)
-
-        # Stage 3: upload spec.files
         self._upload_files(sandbox, task, config)
-
-        # Stage 4: write MCP config (if the spec provides a builder)
         self._write_mcp_config(sandbox, config)
-
-        # Stage 5: run task.setup_shell if present
         setup_shell = task.setup_shell if hasattr(task, "setup_shell") else None
         if setup_shell:
             r = sandbox.exec(setup_shell, timeout=self._setup_timeout_s)
@@ -382,13 +315,8 @@ def _bootstrap_sandbox(
                 )
 
     def _wait_for_sandbox_ready(
-        self,
-        sandbox: SandboxHandle,
-        *,
-        attempts: int = 15,
-        delay_s: float = 1.0,
+        self, sandbox: SandboxHandle, *, attempts: int = 15, delay_s: float = 1.0
     ) -> None:
-        """Probe sandbox until ``echo ok`` succeeds."""
         last_err = ""
         for _ in range(attempts):
             try:
@@ -405,7 +333,6 @@ def _wait_for_sandbox_ready(
         )
 
     def _agent_already_installed(self, sandbox: SandboxHandle) -> bool:
-        """Check if the agent CLI is already available in the sandbox."""
         cmd = " ".join(shlex.quote(c) for c in self.spec.install_check_cmd)
         try:
             r = sandbox.exec(cmd, timeout=10)
@@ -414,11 +341,9 @@ def _agent_already_installed(self, sandbox: SandboxHandle) -> bool:
             return False
 
     def _install_agent(self, sandbox: SandboxHandle) -> None:
-        """Run ``spec.setup`` commands to install the agent CLI."""
         if self.spec.setup is None:
             raise RuntimeError(
-                f"Agent {self.spec.name!r} is not installed in the sandbox "
-                "and no setup commands are provided in the spec."
+                f"Agent {self.spec.name!r} is not installed and no setup commands provided."
             )
         commands = (
             [self.spec.setup] if isinstance(self.spec.setup, str) else self.spec.setup
@@ -433,34 +358,22 @@ def _install_agent(self, sandbox: SandboxHandle) -> None:
                 label=f"{self.spec.name} install",
             )
 
-    def _upload_files(
-        self,
-        sandbox: SandboxHandle,
-        task: Any,
-        config: Any,
-    ) -> None:
-        """Upload ``spec.files`` into the sandbox, resolving callables."""
+    def _upload_files(self, sandbox: SandboxHandle, task: Any, config: Any) -> None:
         if not self.spec.files:
             return
         for path, content_or_fn in self.spec.files.items():
-            if callable(content_or_fn):
-                content = content_or_fn(task, config)
-            else:
-                content = content_or_fn
+            content = (
+                content_or_fn(task, config)
+                if callable(content_or_fn)
+                else content_or_fn
+            )
             if content is not None:
                 sandbox.write_text(path, content)
-
-        # Also upload task.upload_files if the task has them.
         upload_files = task.upload_files if hasattr(task, "upload_files") else {}
         for path, content in upload_files.items():
             sandbox.write_text(path, content)
 
-    def _write_mcp_config(
-        self,
-        sandbox: SandboxHandle,
-        config: Any,
-    ) -> None:
-        """Write MCP configuration using the spec's builder."""
+    def _write_mcp_config(self, sandbox: SandboxHandle, config: Any) -> None:
         if self.spec.build_mcp_config is None:
             return
         if (
@@ -476,15 +389,12 @@ def _write_mcp_config(
                 config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
             )
             mcp_path = self.spec.mcp_config.path_template.format(
-                workdir=workdir,
-                home=home,
+                workdir=workdir, home=home
             )
             mcp_content = self.spec.build_mcp_config(self.spec, [], workdir)
             if mcp_content:
                 sandbox.write_text(mcp_path, mcp_content)
 
-    # Agent launch
-
     def _start_agent(
         self,
         sandbox: SandboxHandle,
@@ -493,17 +403,14 @@ def _start_agent(
         *,
         base_url_override: str | None = None,
     ) -> BgJob:
-        """Build CLI command, resolve env vars, and launch as bg process."""
-        # Build command via spec hook
         if self.spec.build_command is not None:
             cmd = self.spec.build_command(self.spec, config, task, None)
         else:
             cmd = " ".join(shlex.quote(c) for c in self.spec.base_command)
-
-        # Resolve environment variables
         envs = self._resolve_env_vars(config, base_url_override=base_url_override)
-
-        _log.info("%s driver: launching agent", self.spec.name)
+        if self.mode == "interception_gate" and self._interception_server is not None:
+            envs["OPENAI_API_KEY"] = self._interception_server.secret
+            envs["ANTHROPIC_API_KEY"] = self._interception_server.secret
         return sandbox.start_bg(cmd, envs=envs)
 
     def _resolve_env_vars(
@@ -512,124 +419,24 @@ def _resolve_env_vars(
         *,
         base_url_override: str | None = None,
     ) -> dict[str, str]:
-        """Build the env var dict for the agent process.
-
-        If ``spec.build_env_vars`` is provided, delegate to it.
-        Otherwise resolve ``{placeholder}`` substitutions in ``spec.env``.
-        """
         if self.spec.build_env_vars is not None:
             return self.spec.build_env_vars(self.spec, config)
-
         if not self.spec.env:
             return {}
-
         base_url = base_url_override or (
             config.base_url if hasattr(config, "base_url") else ""
         )
         api_key = config.api_key if hasattr(config, "api_key") else "intercepted"
         model = config.model if hasattr(config, "model") else ""
-
-        substitutions = {
-            "base_url": base_url,
-            "api_key": api_key,
-            "model": model,
-        }
-
+        substitutions = {"base_url": base_url, "api_key": api_key, "model": model}
         resolved: dict[str, str] = {}
         for key, value in self.spec.env.items():
             try:
                 resolved[key] = value.format(**substitutions)
             except KeyError:
-                # If a placeholder isn't in our substitutions, keep it as-is.
                 resolved[key] = value
         return resolved
 
-    # Interception proxy
-
-    def _start_proxy(
-        self,
-        sandbox: SandboxHandle,
-        *,
-        base_url: str,
-        api_key: str,
-        model: str,
-    ) -> tuple[BgJob, str, str]:
-        """Install deps, start proxy as bg job, wait for healthz.
-
-        Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``.
-        """
-        proxy_already_present = sandbox.exists("/home/user/proxy/interception.py")
-
-        if not proxy_already_present:
-            self._exec_with_retry(
-                sandbox,
-                "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
-                "'httpx>=0.27' 2>&1 | tail -20",
-                timeout=180,
-                attempts=3,
-                backoff_s=2.0,
-                label="proxy deps install",
-            )
-            sandbox.write_text(
-                "/home/user/proxy/interception.py",
-                _PROXY_SOURCE_PATH.read_text(),
-            )
-            sandbox.write_text("/home/user/proxy/__init__.py", "")
-
-        proxy_args = [
-            "python",
-            "interception.py",
-            "--upstream-url",
-            base_url,
-            "--trace",
-            _PROXY_TRACE_PATH,
-            "--port",
-            str(_PROXY_PORT),
-            "--top-logprobs",
-            str(self._proxy_top_logprobs),
-        ]
-        if self._proxy_max_tokens_cap is not None:
-            proxy_args.extend(["--max-tokens-cap", str(self._proxy_max_tokens_cap)])
-        if self._proxy_disable_thinking:
-            proxy_args.append("--disable-thinking")
-        if model:
-            proxy_args.extend(["--model-override", model])
-
-        quoted = " ".join(shlex.quote(a) for a in proxy_args)
-        proxy_cmd = (
-            f"cd /home/user/proxy && {quoted} > {shlex.quote(_PROXY_LOG_PATH)} 2>&1"
-        )
-        proxy_env = {"OPENCODE_UPSTREAM_API_KEY": api_key}
-        proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env)
-
-        # Wait for proxy healthz
-        attempts = 120
-        interval_s = 0.5
-        for _ in range(attempts):
-            r = sandbox.exec(
-                f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz",
-                timeout=5,
-            )
-            if r.exit_code == 0:
-                break
-            time.sleep(interval_s)
-        else:
-            log_content = ""
-            try:
-                log_content = sandbox.read_text(_PROXY_LOG_PATH)
-            except Exception:
-                pass
-            proxy_job.kill()
-            raise RuntimeError(
-                f"proxy did not start within {attempts * interval_s:.0f}s. "
-                f"log:\n{log_content[-2000:]}"
-            )
-
-        override_url = f"http://127.0.0.1:{_PROXY_PORT}/v1"
-        return proxy_job, override_url, _PROXY_TRACE_PATH
-
-    # Utilities
-
     def _exec_with_retry(
         self,
         sandbox: SandboxHandle,
@@ -640,7 +447,6 @@ def _exec_with_retry(
         backoff_s: float = 3.0,
         label: str = "cmd",
     ) -> Any:
-        """Run ``sandbox.exec`` with exponential backoff on transient failure."""
         last_stdout = ""
         last_stderr = ""
         last_exit = 0
@@ -666,30 +472,19 @@ def _exec_with_retry(
         )
 
 
-# CLIAgentSessionFactory
-
-
 class CLIAgentSessionFactory(ResourceSessionFactory):
-    """Factory that produces :class:`CLIAgentSession` instances for any
-    registered agent.
-
-    Wraps :class:`CLIAgentDriver` to satisfy the
-    :class:`ResourceSessionFactory` contract from PR #603.
-    """
-
     def __init__(
         self,
         *,
         spec: CLIAgentSpec,
         config: Any,
         sandbox_backend: SandboxBackend,
-        mode: Literal["black_box", "transparent_proxy"] = "black_box",
+        mode: Literal["black_box", "interception_gate"] = "black_box",
         verifier: Verifier | None = None,
         install_timeout_s: int = 240,
         setup_timeout_s: int = 300,
-        proxy_top_logprobs: int = 5,
-        proxy_max_tokens_cap: int | None = 16384,
-        proxy_disable_thinking: bool = False,
+        interception_server: InterceptionServer | None = None,
+        interception_base_url: str | None = None,
     ) -> None:
         self._spec = spec
         self._config = config
@@ -700,9 +495,8 @@ def __init__(
             mode=mode,
             install_timeout_s=install_timeout_s,
             setup_timeout_s=setup_timeout_s,
-            proxy_top_logprobs=proxy_top_logprobs,
-            proxy_max_tokens_cap=proxy_max_tokens_cap,
-            proxy_disable_thinking=proxy_disable_thinking,
+            interception_server=interception_server,
+            interception_base_url=interception_base_url,
         )
 
     def create(
@@ -711,7 +505,6 @@ def create(
         seed: int | None = None,
         episode_id: str | None = None,
     ) -> CLIAgentSession:
-        """Create one isolated session for a rollout."""
         return self._driver.create_session(
             task=task,
             config=self._config,
@@ -721,9 +514,4 @@ def create(
         )
 
 
-__all__ = [
-    "CLIAgentDriver",
-    "CLIAgentSession",
-    "CLIAgentSessionFactory",
-    "Verifier",
-]
+__all__ = ["CLIAgentDriver", "CLIAgentSession", "CLIAgentSessionFactory", "Verifier"]
diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
new file mode 100644
index 000000000..a075ec8b4
--- /dev/null
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -0,0 +1,324 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Host-side interception server for trainer-owned generation.
+
+The :class:`InterceptionServer` runs on the trainer node, outside any
+sandbox. Each sandbox's agent is pointed at::
+
+    http://<host>:<port>/rollout/<rollout_id>/v1
+
+When the agent makes an LLM call it blocks at this server. The training
+loop calls :meth:`~InterceptionServer.register_rollout` to get a queue,
+``await queue.get()`` to dequeue the pending request's ID, runs its own
+vLLM forward pass, then calls :func:`deliver_response` to unblock the agent.
+
+The caller is responsible for making the server reachable from the sandbox.
+For Docker sandboxes on the same machine, ``host.docker.internal:<port>``
+works. For remote sandboxes (E2B, HF Sandbox), set up your own tunnel
+(ngrok, frp, public IP, VPN) and pass the URL as
+``interception_base_url``.
+
+Usage — training loop::
+
+    server = InterceptionServer(port=8765)
+    await server.start()
+
+    # Make the server reachable — your responsibility.
+    # Docker: base_url = f"http://host.docker.internal:{server.port}"
+    # Remote: base_url = your_tunnel_or_public_url
+
+    queue = server.register_rollout(rollout_id)
+    # Agent runs with OPENAI_BASE_URL = f"{base_url}/rollout/{rollout_id}/v1"
+
+    while True:
+        request_id = await asyncio.wait_for(queue.get(), timeout=...)
+        intercept = server.intercepts[request_id]
+        response = await vllm.generate(intercept["messages"], ...)
+        await deliver_response(intercept, response)
+
+    server.unregister_rollout(rollout_id)
+    await server.stop()
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hmac
+import json
+import logging
+import secrets
+import time
+import uuid
+from typing import Any
+
+from aiohttp import web
+
+
+_log = logging.getLogger(__name__)
+
+_KEEPALIVE_INTERVAL_S = 3.0
+_MAX_REQUEST_BODY = 16 * 1024 * 1024
+
+
+class InterceptionServer:
+    """Async HTTP server that gates chat-completion calls from sandboxed agents.
+
+    One shared instance handles all concurrent rollouts. Each rollout is
+    identified by a ``rollout_id`` in the URL path.
+    """
+
+    def __init__(self, port: int = 0, secret: str | None = None) -> None:
+        self.port = port
+        self.secret = secret or secrets.token_urlsafe(32)
+        self._app: web.Application | None = None
+        self._runner: web.AppRunner | None = None
+        self._site: web.TCPSite | None = None
+        self._lock = asyncio.Lock()
+        self.active_rollouts: dict[str, dict[str, Any]] = {}
+        self.intercepts: dict[str, dict[str, Any]] = {}
+
+    async def start(self) -> None:
+        async with self._lock:
+            if self._app is not None:
+                return
+            app = web.Application(client_max_size=_MAX_REQUEST_BODY)
+            app.router.add_post(
+                "/rollout/{rollout_id}/v1/chat/completions",
+                self._handle_chat_completions,
+            )
+            app.router.add_get("/health", self._handle_health)
+            runner = web.AppRunner(app)
+            await runner.setup()
+            site = web.TCPSite(runner, "0.0.0.0", self.port)
+            await site.start()
+            if self.port == 0:
+                server = getattr(site, "_server", None)
+                sockets = getattr(server, "sockets", None) if server else None
+                if sockets:
+                    self.port = sockets[0].getsockname()[1]
+            if self.port == 0:
+                raise RuntimeError("Failed to resolve OS-assigned port")
+            self._app = app
+            self._runner = runner
+            self._site = site
+            _log.info("InterceptionServer listening on :%d", self.port)
+
+    async def stop(self) -> None:
+        async with self._lock:
+            if self._runner is None:
+                return
+            for intercept in list(self.intercepts.values()):
+                fut: asyncio.Future | None = intercept.get("response_future")
+                if fut and not fut.done():
+                    fut.cancel()
+                cq: asyncio.Queue | None = intercept.get("chunk_queue")
+                if cq is not None:
+                    try:
+                        cq.put_nowait(None)
+                    except asyncio.QueueFull:
+                        pass
+            self.intercepts.clear()
+            self.active_rollouts.clear()
+            try:
+                await self._runner.cleanup()
+            except RuntimeError:
+                pass
+            self._runner = None
+            self._site = None
+            self._app = None
+
+    def register_rollout(
+        self,
+        rollout_id: str,
+        state: dict[str, Any] | None = None,
+    ) -> asyncio.Queue:
+        queue: asyncio.Queue = asyncio.Queue()
+        self.active_rollouts[rollout_id] = {
+            "request_id_queue": queue,
+            "state": state,
+        }
+        return queue
+
+    def unregister_rollout(self, rollout_id: str) -> None:
+        for request_id in list(self.intercepts):
+            intercept = self.intercepts.get(request_id)
+            if intercept and intercept.get("rollout_id") == rollout_id:
+                fut: asyncio.Future | None = intercept.get("response_future")
+                if fut and not fut.done():
+                    fut.cancel()
+                cq: asyncio.Queue | None = intercept.get("chunk_queue")
+                if cq is not None:
+                    try:
+                        cq.put_nowait(None)
+                    except asyncio.QueueFull:
+                        pass
+                del self.intercepts[request_id]
+        self.active_rollouts.pop(rollout_id, None)
+
+    def _authorized(self, request: web.Request) -> bool:
+        auth = request.headers.get("Authorization", "")
+        api_key = request.headers.get("x-api-key", "")
+        return hmac.compare_digest(
+            auth, f"Bearer {self.secret}"
+        ) or hmac.compare_digest(api_key, self.secret)
+
+    async def _handle_health(self, request: web.Request) -> web.Response:
+        return web.json_response({"status": "ok"})
+
+    async def _handle_chat_completions(
+        self, request: web.Request
+    ) -> web.StreamResponse | web.Response:
+        if not self._authorized(request):
+            return web.json_response({"error": "Unauthorized"}, status=401)
+
+        rollout_id = request.match_info["rollout_id"]
+        context = self.active_rollouts.get(rollout_id)
+        if not context:
+            return web.json_response({"error": "rollout not found"}, status=404)
+
+        try:
+            body = await request.json()
+        except Exception as exc:
+            return web.json_response({"error": f"invalid JSON: {exc}"}, status=400)
+
+        is_streaming = bool(body.get("stream"))
+        request_id = f"req_{uuid.uuid4().hex[:8]}"
+        chunk_queue: asyncio.Queue | None = asyncio.Queue() if is_streaming else None
+
+        intercept: dict[str, Any] = {
+            "request_id": request_id,
+            "rollout_id": rollout_id,
+            "messages": body.get("messages"),
+            "model": body.get("model"),
+            "tools": body.get("tools"),
+            "stream": is_streaming,
+            "chunk_queue": chunk_queue,
+            "response_future": asyncio.get_event_loop().create_future(),
+            "body": body,
+        }
+        self.intercepts[request_id] = intercept
+        await context["request_id_queue"].put(request_id)
+
+        if is_streaming:
+            return await self._stream_response(request, intercept)
+
+        try:
+            response_dict = await intercept["response_future"]
+        except asyncio.CancelledError:
+            return web.json_response({"error": "rollout cancelled"}, status=499)
+        except Exception as exc:
+            return web.json_response({"error": str(exc)}, status=500)
+
+        return web.json_response(response_dict)
+
+    async def _stream_response(
+        self, request: web.Request, intercept: dict[str, Any]
+    ) -> web.StreamResponse:
+        chunk_queue: asyncio.Queue = intercept["chunk_queue"]
+        resp = web.StreamResponse(
+            status=200,
+            headers={
+                "Content-Type": "text/event-stream",
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+        await resp.prepare(request)
+        get_task: asyncio.Task | None = None
+        try:
+            while True:
+                if get_task is None:
+                    get_task = asyncio.create_task(chunk_queue.get())
+                done, _ = await asyncio.wait({get_task}, timeout=_KEEPALIVE_INTERVAL_S)
+                if get_task not in done:
+                    await resp.write(b": keepalive\n\n")
+                    continue
+                chunk = get_task.result()
+                get_task = None
+                if chunk is None:
+                    await resp.write(b"data: [DONE]\n\n")
+                    break
+                await resp.write(f"data: {json.dumps(chunk)}\n\n".encode())
+                await asyncio.sleep(0)
+        except (asyncio.CancelledError, ConnectionResetError):
+            pass
+        finally:
+            if get_task and not get_task.done():
+                get_task.cancel()
+        try:
+            await resp.write_eof()
+        except Exception:
+            pass
+        return resp
+
+
+async def deliver_response(
+    intercept: dict[str, Any], response_dict: dict[str, Any]
+) -> None:
+    """Unblock the agent's HTTP handler with ``response_dict``.
+
+    For non-streaming requests, resolves the future directly.
+    For streaming requests, synthesizes SSE chunks from the complete
+    response and signals EOF.
+    """
+    is_streaming = intercept.get("stream", False)
+    chunk_queue: asyncio.Queue | None = intercept.get("chunk_queue")
+    future: asyncio.Future | None = intercept.get("response_future")
+
+    if not is_streaming:
+        if future and not future.done():
+            future.set_result(response_dict)
+        return
+
+    if chunk_queue is None:
+        raise RuntimeError("chunk_queue missing on streaming intercept")
+
+    choices = response_dict.get("choices") or []
+    for choice in choices:
+        msg = choice.get("message") or {}
+        content_chunk = {
+            "id": response_dict.get("id", ""),
+            "object": "chat.completion.chunk",
+            "created": response_dict.get("created", int(time.time())),
+            "model": response_dict.get("model", ""),
+            "choices": [
+                {
+                    "index": choice.get("index", 0),
+                    "delta": {
+                        "role": "assistant",
+                        "content": msg.get("content"),
+                        "tool_calls": msg.get("tool_calls"),
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        }
+        await chunk_queue.put(content_chunk)
+        finish_chunk = {
+            "id": response_dict.get("id", ""),
+            "object": "chat.completion.chunk",
+            "created": response_dict.get("created", int(time.time())),
+            "model": response_dict.get("model", ""),
+            "choices": [
+                {
+                    "index": choice.get("index", 0),
+                    "delta": {},
+                    "finish_reason": choice.get("finish_reason"),
+                }
+            ],
+        }
+        await chunk_queue.put(finish_chunk)
+
+    await chunk_queue.put(None)
+    if future and not future.done():
+        future.set_result(response_dict)
+
+
+__all__ = [
+    "InterceptionServer",
+    "deliver_response",
+]

From 171a3eaf23da0b0f109bd40af635b398ac10482c Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Fri, 15 May 2026 23:35:54 +0530
Subject: [PATCH 13/35] refactor: wire coding_agent_env with interception_gate

---
 envs/coding_agent_env/client.py               |   7 +-
 envs/coding_agent_env/harness.py              | 163 ++----------------
 envs/coding_agent_env/models.py               |   2 +-
 .../server/coding_environment.py              |  68 +-------
 envs/coding_agent_env/server/gradio_ui.py     |   6 +-
 5 files changed, 26 insertions(+), 220 deletions(-)

diff --git a/envs/coding_agent_env/client.py b/envs/coding_agent_env/client.py
index 7e2a21696..c1e0f6f92 100644
--- a/envs/coding_agent_env/client.py
+++ b/envs/coding_agent_env/client.py
@@ -63,7 +63,7 @@ def run_rollout(
         verify: list[str] | None = None,
         # Bookkeeping / tunables
         task_id: str = "",
-        mode: str = "transparent_proxy",
+        mode: str = "black_box",
         disable_thinking: bool | None = None,
         max_tokens_cap: int = 4096,
         top_logprobs: int = 5,
@@ -87,8 +87,9 @@ def run_rollout(
                 Reward = ``passed_count / total`` unless any command writes a
                 float to ``/home/user/logs/verifier/reward.txt`` (override).
             task_id: Echoed back in the result for traceability.
-            mode: ``"transparent_proxy"`` (captures per-token logprobs via
-                an in-sandbox FastAPI proxy) or ``"black_box"`` (no proxy).
+            mode: ``"black_box"`` (agent talks directly to the LLM) or
+                ``"interception_gate"`` (LLM calls routed to trainer-side
+                InterceptionServer for trainer-owned generation).
             disable_thinking: Inject
                 ``chat_template_kwargs.enable_thinking=false`` on forwarded
                 requests. Needed for Qwen3.5 vLLM; harmless on Instruct
diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py
index ccbfa2cfc..295b07ac3 100644
--- a/envs/coding_agent_env/harness.py
+++ b/envs/coding_agent_env/harness.py
@@ -4,22 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Coding-agent session factory + session — backed by CLIAgentDriver.
-
-This module exposes :class:`CodingAgentSession` and
-:class:`CodingAgentSessionFactory` built on top of the generic
-:class:`CLIAgentDriver` / :class:`CLIAgentSession` /
-:class:`CLIAgentSessionFactory` from ``openenv.core.harness.agents``.
-
-Agent-specific (OpenCode spec) configuration (``opencode.json`` generation, provider
-mapping, tool enable/disable) is handled by
-:mod:`coding_agent_env.opencode_runtime` builders wired into the
-:data:`OPENCODE_SPEC` via callable hooks.
-"""
+"""Coding-agent session factory + session — backed by CLIAgentDriver."""
 
 from __future__ import annotations
 
-from pathlib import Path
 from typing import Any, Literal
 
 from openenv.core.harness import ResourceSessionFactory
@@ -28,8 +16,9 @@
     CLIAgentSession,
     Verifier,
 )
+from openenv.core.harness.agents.interception_server import InterceptionServer
 from openenv.core.harness.agents.opencode import OPENCODE_SPEC
-from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
+from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
 
 from .config import CodingAgentConfig
 from .opencode_runtime import (
@@ -45,29 +34,7 @@
 from .task import CodingAgentTask
 
 
-# Inside-sandbox proxy paths (Mode B).
-_PROXY_PORT = 7000
-_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl"
-_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log"
-
-_PROXY_SOURCE_PATH = (
-    Path(__file__).resolve().parents[2]
-    / "src"
-    / "openenv"
-    / "core"
-    / "harness"
-    / "sandbox"
-    / "interception.py"
-)
-
-
 class CodingAgentSession(CLIAgentSession):
-    """One live coding-agent rollout inside a sandbox.
-
-    Extends :class:`CLIAgentSession` with Agent-specific (OpenCode spec) convenience
-    methods (``fetch_trace``, ``wait_for_completion`` with config-aware timeout).
-    """
-
     def __init__(
         self,
         *,
@@ -76,9 +43,6 @@ def __init__(
         task: CodingAgentTask,
         verifier: Verifier | None = None,
         base_url_override: str | None = None,
-        proxy_trace_path: str | None = None,
-        proxy_bg_job: BgJob | None = None,
-        agent_bg_job: BgJob | None = None,
     ) -> None:
         super().__init__(
             spec=OPENCODE_SPEC,
@@ -87,28 +51,18 @@ def __init__(
             config=config,
             verifier=verifier,
             base_url_override=base_url_override,
-            proxy_trace_path=proxy_trace_path,
-            proxy_bg_job=proxy_bg_job,
-            agent_bg_job=agent_bg_job,
         )
 
     def fetch_trace(self) -> str:
-        """Return the raw ``opencode run`` log (JSONL when ``run_format=json``)."""
         return self.sandbox.read_text(agent_log_path(self.config))
 
     def wait_for_completion(self, timeout_s: float | None = None) -> int:
-        """Block until the agent exits, returning its exit code."""
         budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s
         if self._agent_bg_job is None:
-            raise RuntimeError("Agent not started; call start_agent() first.")
+            raise RuntimeError("Agent not started.")
         return self._agent_bg_job.wait(timeout=budget)
 
     def start_agent(self) -> None:
-        """Launch ``opencode run`` as a background subprocess in the sandbox.
-
-        The factory starts the agent during ``create()``; this method is a no-op
-        if the agent is already running.
-        """
         if self._agent_bg_job is not None:
             return
         cmd = build_run_cmd(self.config)
@@ -117,28 +71,19 @@ def start_agent(self) -> None:
 
 
 class CodingAgentSessionFactory(ResourceSessionFactory):
-    """Produce isolated per-rollout :class:`CodingAgentSession` instances.
-
-    The factory owns sandbox provisioning, opencode install, config injection,
-    and (Mode B) proxy startup. Each :meth:`create` call returns a fresh
-    sandbox with a running agent.
-
-    Internally delegates to :class:`CLIAgentDriver` for the generic
-    sandbox lifecycle (readiness probing, install retry, proxy startup).
-    Agent-specific (OpenCode spec) config generation uses ``opencode_runtime`` builders.
-    """
-
     def __init__(
         self,
         *,
         config: CodingAgentConfig,
         sandbox_backend: SandboxBackend,
-        mode: Literal["black_box", "transparent_proxy"] = "black_box",
+        mode: Literal["black_box", "interception_gate"] = "black_box",
         verifier: Verifier | None = None,
         install_timeout_s: int = 240,
         setup_timeout_s: int = 300,
+        interception_server: InterceptionServer | None = None,
+        interception_base_url: str | None = None,
     ) -> None:
-        if mode not in {"black_box", "transparent_proxy"}:
+        if mode not in {"black_box", "interception_gate"}:
             raise ValueError(f"Unknown mode: {mode!r}")
         self._config = config
         self._backend = sandbox_backend
@@ -146,17 +91,14 @@ def __init__(
         self._verifier = verifier
         self._install_timeout_s = install_timeout_s
         self._setup_timeout_s = setup_timeout_s
-
-        # Build a CLIAgentDriver for the shared lifecycle.
         self._driver = CLIAgentDriver(
             spec=OPENCODE_SPEC,
             sandbox_backend=sandbox_backend,
             mode=mode,
             install_timeout_s=install_timeout_s,
             setup_timeout_s=setup_timeout_s,
-            proxy_top_logprobs=config.proxy_top_logprobs,
-            proxy_max_tokens_cap=config.proxy_max_tokens_cap,
-            proxy_disable_thinking=config.proxy_disable_thinking,
+            interception_server=interception_server,
+            interception_base_url=interception_base_url,
         )
 
     def create(
@@ -168,87 +110,29 @@ def create(
         import logging
 
         _log = logging.getLogger(__name__)
-
         oc_task = CodingAgentTask.coerce(task)
         sandbox_timeout = int(self._config.agent_timeout_s) + 300
-
-        _log.info(
-            "factory.create: creating sandbox timeout=%ds mode=%s",
-            sandbox_timeout,
-            self._mode,
-        )
         sandbox = self._backend.create(
             timeout_s=sandbox_timeout,
             metadata={"episode_id": episode_id} if episode_id else None,
         )
-        sid = getattr(sandbox, "sandbox_id", "?")
-        _log.info("factory.create: sandbox=%s — bootstrapping…", sid)
-
         try:
             self._bootstrap_sandbox(sandbox, oc_task)
         except Exception as exc:
             _log.error("factory.create: bootstrap failed: %r", exc)
             sandbox.kill()
             raise
-
-        base_url_override: str | None = None
-        proxy_trace_path: str | None = None
-        proxy_bg_job: BgJob | None = None
-        if self._mode == "transparent_proxy":
-            _log.info(
-                "factory.create: starting interception proxy on :%d → %s",
-                _PROXY_PORT,
-                self._config.base_url,
-            )
-            proxy_bg_job, base_url_override, proxy_trace_path = (
-                self._driver._start_proxy(
-                    sandbox,
-                    base_url=self._config.base_url,
-                    api_key=self._config.api_key,
-                    model=self._config.model,
-                )
-            )
-            _log.info("factory.create: proxy up at %s", base_url_override)
-            # Rewrite opencode.json so opencode points at the proxy.
-            proxy_cfg = CodingAgentConfig(
-                **{
-                    **self._config.model_dump(),
-                    "provider": "openai_compatible",
-                    "base_url": base_url_override,
-                }
-            )
-            sandbox.write_text(
-                opencode_config_path(self._config),
-                build_opencode_json(proxy_cfg),
-            )
-
         session = CodingAgentSession(
             sandbox=sandbox,
             config=self._config,
             task=oc_task,
             verifier=self._verifier,
-            base_url_override=base_url_override,
-            proxy_trace_path=proxy_trace_path,
-            proxy_bg_job=proxy_bg_job,
         )
         session.start_agent()
         return session
 
-    # ------------------------------------------------------------------
-    # Bootstrap — delegates to CLIAgentDriver utilities
-    # ------------------------------------------------------------------
-
-    def _bootstrap_sandbox(
-        self,
-        sandbox: SandboxHandle,
-        task: CodingAgentTask,
-    ) -> None:
-        """Install opencode, write config + task files, run optional setup."""
-
-        # Stage 1: wait for the sandbox to be responsive.
+    def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: CodingAgentTask) -> None:
         self._driver._wait_for_sandbox_ready(sandbox)
-
-        # Stage 2: install opencode (skipped if pre-baked).
         if not self._driver._agent_already_installed(sandbox):
             self._driver._exec_with_retry(
                 sandbox,
@@ -258,24 +142,16 @@ def _bootstrap_sandbox(
                 backoff_s=3.0,
                 label="opencode install",
             )
-
-        # Stage 3: write opencode.json + task files.
         sandbox.write_text(
-            opencode_config_path(self._config),
-            build_opencode_json(self._config),
+            opencode_config_path(self._config), build_opencode_json(self._config)
         )
         sandbox.write_text(instruction_path(self._config), task.instruction)
-
         if self._config.system_prompt:
             sandbox.write_text(
-                system_prompt_path(self._config),
-                self._config.system_prompt,
+                system_prompt_path(self._config), self._config.system_prompt
             )
-
         for remote_path, content in task.upload_files.items():
             sandbox.write_text(remote_path, content)
-
-        # Stage 4: extra setup
         if self._config.extra_setup_shell:
             self._driver._exec_with_retry(
                 sandbox,
@@ -285,7 +161,6 @@ def _bootstrap_sandbox(
                 backoff_s=2.0,
                 label="extra_setup_shell",
             )
-
         if task.setup_shell:
             r = sandbox.exec(task.setup_shell, timeout=self._setup_timeout_s)
             if r.exit_code != 0:
@@ -293,18 +168,6 @@ def _bootstrap_sandbox(
                     f"task.setup_shell failed ({r.exit_code}): {r.stderr}"
                 )
 
-    def _start_proxy(
-        self,
-        sandbox: SandboxHandle,
-    ) -> tuple[BgJob, str, str]:
-        """Start proxy — delegates to driver."""
-        return self._driver._start_proxy(
-            sandbox,
-            base_url=self._config.base_url,
-            api_key=self._config.api_key,
-            model=self._config.model,
-        )
-
 
 __all__ = [
     "CodingAgentSession",
diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py
index 3e31962fb..821b1bd57 100644
--- a/envs/coding_agent_env/models.py
+++ b/envs/coding_agent_env/models.py
@@ -59,7 +59,7 @@ class RolloutResult(BaseModel):
     reward: float | None = None
     agent_exit_code: int | None = None
     wall_s: float = 0.0
-    mode: str = "transparent_proxy"
+    mode: str = "black_box"
 
     # Per-step results
     setup_results: list[CommandResult] = Field(default_factory=list)
diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index e389eb759..0c8598cdd 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -143,7 +143,7 @@ def run_rollout(
             verify: Optional[list[str]] = None,
             # Bookkeeping / tunables
             task_id: str = "",
-            mode: str = "transparent_proxy",
+            mode: str = "black_box",
             disable_thinking: Optional[bool] = None,
             max_tokens_cap: int = 4096,
             top_logprobs: int = 5,
@@ -359,10 +359,7 @@ def _emit(msg: str) -> None:
             )
             session = factory.create(task=rollout_task)
             result.sandbox_id = session.sandbox.sandbox_id
-            _emit(
-                f"sandbox ready: {result.sandbox_id} — agent started "
-                f"({'proxy on :7000, logprobs capturing' if mode == 'transparent_proxy' else 'direct LLM, no logprobs'})"
-            )
+            _emit(f"sandbox ready: {result.sandbox_id} — agent started (mode={mode})")
 
             # Run setup commands one at a time, *before* the agent starts.
             # The factory has already started the agent in start_agent()
@@ -474,16 +471,9 @@ def _build_agent_config(
                 api_key=api_key,
                 model=model,
                 agent_timeout_s=agent_timeout_s,
-                proxy_disable_thinking=disable_thinking,
-                proxy_top_logprobs=top_logprobs,
-                proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
             )
 
-        provider = (
-            "openai"
-            if mode == "transparent_proxy"
-            else self._infer_pi_provider(base_url)
-        )
+        provider = self._infer_pi_provider(base_url)
         return _GenericAgentConfig(
             base_url=base_url.rstrip("/"),
             api_key=api_key,
@@ -524,9 +514,6 @@ def _build_session_factory(
             sandbox_backend=backend,
             mode=mode,
             verifier=None,
-            proxy_disable_thinking=disable_thinking,
-            proxy_top_logprobs=top_logprobs,
-            proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
         )
 
     @staticmethod
@@ -602,53 +589,8 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]:
         return files, extras
 
     def _collect_proxy_turns(self, session: Any) -> list[Any]:
-        turns: list[Any] = []
-
-        records: list[dict[str, Any]] = []
-        if hasattr(session, "fetch_proxy_trace"):
-            try:
-                fetched = session.fetch_proxy_trace()
-                if isinstance(fetched, list):
-                    records = [r for r in fetched if isinstance(r, dict)]
-            except Exception:
-                records = []
-
-        if not records:
-            proxy_trace_path = getattr(session, "_proxy_trace_path", None)
-            if not proxy_trace_path:
-                return turns
-            raw = self._safe_read(session.sandbox, proxy_trace_path)
-            for line in raw.splitlines():
-                line = line.strip()
-                if not line:
-                    continue
-                try:
-                    rec = json.loads(line)
-                except Exception:
-                    continue
-                if isinstance(rec, dict):
-                    records.append(rec)
-
-        for rec in records:
-            response = rec.get("response") or {}
-            turns.append(
-                self._RolloutTurn(
-                    turn=int(rec.get("turn") or 0),
-                    finish_reason=rec.get("finish_reason"),
-                    completion_tokens=list(rec.get("completion_tokens") or []),
-                    completion_token_ids=list(rec.get("completion_token_ids") or []),
-                    per_token_logps=[
-                        float(x)
-                        for x in (rec.get("per_token_logps") or [])
-                        if x is not None
-                    ],
-                    latency_s=float(rec.get("latency_s") or 0.0),
-                    timestamp=float(rec.get("timestamp") or 0.0),
-                    upstream_status=response.get("upstream_status"),
-                    upstream_error=response.get("upstream_error"),
-                )
-            )
-        return turns
+        """Return no turns; the trainer owns logprob capture via interception_gate."""
+        return []
 
     @staticmethod
     def _safe_read(sandbox: Any, path: str) -> str:
diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py
index ef3f94aeb..82f130ce3 100644
--- a/envs/coding_agent_env/server/gradio_ui.py
+++ b/envs/coding_agent_env/server/gradio_ui.py
@@ -158,7 +158,7 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]:
 
 def _logprobs_md(turns: list[dict[str, Any]]) -> str:
     if not turns:
-        return "_No proxy turns captured._\n\nThis is normal in `black_box` mode. In `transparent_proxy` mode, an empty list usually means the agent never made an LLM call (check the agent log)."
+        return "_No proxy turns captured._\n\nLogprob capture is handled by the training loop via `interception_gate` mode."
     n = len(turns)
     productive = sum(1 for t in turns if t.get("completion_tokens"))
     total_toks = sum(len(t.get("completion_tokens") or []) for t in turns)
@@ -523,8 +523,8 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         with gr.Accordion("Tunables", open=False):
             with gr.Row():
                 mode = gr.Dropdown(
-                    choices=["transparent_proxy", "black_box"],
-                    value="transparent_proxy",
+                    choices=["black_box", "interception_gate"],
+                    value="black_box",
                     label="mode",
                 )
                 disable_thinking = gr.Dropdown(

From 52a024e2c616d4cb46c8dc03fb99fcc990201d5d Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Fri, 15 May 2026 23:36:04 +0530
Subject: [PATCH 14/35] chore: update tests for interception_gate, remove proxy
 test cases

---
 tests/core/test_cli_agent_driver.py |  81 ++-----------------
 tests/envs/test_coding_agent_env.py | 118 +++-------------------------
 2 files changed, 16 insertions(+), 183 deletions(-)

diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index 29bf06caa..0b218f19a 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -207,7 +207,6 @@ def test_cli_agent_spec_minimal(self):
             mcp_config=MCPConfigSpec(method="cli_flags"),
         )
         assert spec.name == "test-agent"
-        assert spec.supports_logprob_proxy is True
         assert spec.default_timeout_s == 600.0
         assert spec.setup is None
         assert spec.files is None
@@ -229,7 +228,6 @@ def test_cli_agent_spec_full(self):
             mcp_config=MCPConfigSpec(
                 method="config_file", path_template="{workdir}/mcp.json"
             ),
-            supports_logprob_proxy=True,
             default_timeout_s=900.0,
             setup="npm install -g full-agent",
             files={
@@ -457,41 +455,16 @@ def test_create_session_skips_install_when_prebaked(self):
         assert not any("apt-get install" in cmd for cmd in sbx.executed)
         session.close()
 
-    def test_create_session_with_proxy(self):
+    def test_create_session_interception_gate_requires_server(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentDriver
 
         spec = _make_test_spec()
-        backend = FakeSandboxBackend()
-        driver = CLIAgentDriver(
-            spec=spec,
-            sandbox_backend=backend,
-            mode="transparent_proxy",
-        )
-
-        session = driver.create_session(
-            task=FakeTask(),
-            config=FakeConfig(),
-        )
-
-        sbx = backend.created[0]
-
-        # Proxy source should have been uploaded
-        assert "/home/user/proxy/interception.py" in sbx.written
-        assert "/home/user/proxy/__init__.py" in sbx.written
-
-        # Proxy should have been started as bg (before agent)
-        # and agent as second bg
-        assert len(sbx.bg_commands) == 2
-        proxy_cmd, proxy_envs = sbx.bg_commands[0]
-        assert "interception.py" in proxy_cmd
-        assert proxy_envs == {"OPENCODE_UPSTREAM_API_KEY": "sk-test-key"}
-
-        # Agent env should point at proxy
-        agent_cmd, agent_envs = sbx.bg_commands[1]
-        assert agent_envs is not None
-        assert agent_envs["BASE_URL"] == "http://127.0.0.1:7000/v1"
-
-        session.close()
+        with pytest.raises(ValueError, match="InterceptionServer"):
+            CLIAgentDriver(
+                spec=spec,
+                sandbox_backend=FakeSandboxBackend(),
+                mode="interception_gate",
+            )
 
     def test_create_session_uploads_task_files(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentDriver
@@ -679,49 +652,12 @@ def test_collect_artifacts_missing_required_raises(self):
         with pytest.raises(FileNotFoundError):
             session.collect_artifacts()
 
-    def test_fetch_proxy_trace_black_box(self):
-        from openenv.core.harness.agents.cli_driver import CLIAgentSession
-
-        spec = _make_test_spec()
-        session = CLIAgentSession(
-            spec=spec,
-            sandbox=FakeSandbox(),
-            task=FakeTask(),
-            config=FakeConfig(),
-            proxy_trace_path=None,
-        )
-        assert session.fetch_proxy_trace() == []
-
-    def test_fetch_proxy_trace_with_data(self):
-        from openenv.core.harness.agents.cli_driver import CLIAgentSession
-
-        spec = _make_test_spec()
-        sbx = FakeSandbox()
-        trace_path = "/logs/proxy_trace.jsonl"
-        sbx.written[trace_path] = (
-            json.dumps({"turn": 1, "latency_s": 0.5})
-            + "\n"
-            + json.dumps({"turn": 2, "latency_s": 0.3})
-            + "\n"
-        )
-        session = CLIAgentSession(
-            spec=spec,
-            sandbox=sbx,
-            task=FakeTask(),
-            config=FakeConfig(),
-            proxy_trace_path=trace_path,
-        )
-        trace = session.fetch_proxy_trace()
-        assert len(trace) == 2
-        assert trace[0]["turn"] == 1
-
     def test_close_kills_sandbox_and_jobs(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentSession
 
         spec = _make_test_spec()
         sbx = FakeSandbox()
         agent_job = FakeBgJob()
-        proxy_job = FakeBgJob()
 
         session = CLIAgentSession(
             spec=spec,
@@ -729,12 +665,10 @@ def test_close_kills_sandbox_and_jobs(self):
             task=FakeTask(),
             config=FakeConfig(),
             agent_bg_job=agent_job,
-            proxy_bg_job=proxy_job,
         )
         session.close()
         assert sbx._killed
         assert session._agent_bg_job is None
-        assert session._proxy_bg_job is None
 
 
 class TestCLIAgentSessionFactory:
@@ -814,7 +748,6 @@ def test_spec_fields(self):
             "/home/user/.opencode/bin/opencode",
             "--version",
         ]
-        assert OPENCODE_SPEC.supports_logprob_proxy is True
         assert OPENCODE_SPEC.default_timeout_s == 900.0
         assert OPENCODE_SPEC.mcp_config.method == "config_file"
         assert OPENCODE_SPEC.mcp_config.path_template is not None
diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py
index 3a89a3ce6..6626c1c59 100644
--- a/tests/envs/test_coding_agent_env.py
+++ b/tests/envs/test_coding_agent_env.py
@@ -24,7 +24,6 @@
 from __future__ import annotations
 
 import os
-import shlex
 import sys
 
 import pytest
@@ -172,7 +171,7 @@ def test_build_agent_config_opencode() -> None:
     env = CodingAgentEnvironment()
     cfg = env._build_agent_config(
         agent="opencode",
-        mode="transparent_proxy",
+        mode="black_box",
         base_url="https://api.openai.com/v1",
         api_key="sk-test",
         model="gpt-4o-mini",
@@ -182,9 +181,8 @@ def test_build_agent_config_opencode() -> None:
         max_tokens_cap=2048,
     )
     assert isinstance(cfg, env._CodingAgentConfig)
-    assert cfg.proxy_disable_thinking is True
-    assert cfg.proxy_top_logprobs == 7
-    assert cfg.proxy_max_tokens_cap == 2048
+    assert cfg.model == "gpt-4o-mini"
+    assert cfg.agent_timeout_s == 123.0
 
 
 def test_build_agent_config_pi() -> None:
@@ -206,9 +204,9 @@ def test_build_agent_config_pi() -> None:
     assert cfg.thinking == "off"
     assert cfg.model == "zai-org/GLM-5.1"
 
-    cfg_proxy = env._build_agent_config(
+    cfg_gate = env._build_agent_config(
         agent="pi",
-        mode="transparent_proxy",
+        mode="interception_gate",
         base_url="https://router.huggingface.co/v1",
         api_key="hf_xxx",
         model="zai-org/GLM-5.1",
@@ -217,7 +215,7 @@ def test_build_agent_config_pi() -> None:
         top_logprobs=5,
         max_tokens_cap=4096,
     )
-    assert cfg_proxy.provider == "openai"
+    assert cfg_gate.provider == "huggingface"
 
 
 # ---------------------------------------------------------------------------
@@ -234,7 +232,7 @@ def test_rollout_result_serializes_round_trip() -> None:
         reward=0.75,
         agent_exit_code=0,
         wall_s=12.5,
-        mode="transparent_proxy",
+        mode="black_box",
         setup_results=[CommandResult(cmd="pip install pandas", exit_code=0)],
         verify_results=[CommandResult(cmd="pytest", exit_code=1, stderr="boom")],
         proxy_turns=[
@@ -288,105 +286,6 @@ def test_coding_agent_task_coerce_rejects_unknown_type() -> None:
         CodingAgentTask.coerce(42)  # type: ignore[arg-type]
 
 
-def test_start_proxy_keeps_upstream_key_out_of_command() -> None:
-    """The proxy API key must be passed via env, not shell argv."""
-    from coding_agent_env import CodingAgentConfig, CodingAgentSessionFactory
-
-    class FakeExecResult:
-        exit_code = 0
-        stdout = "ok"
-        stderr = ""
-
-    class FakeBgJob:
-        def wait(self, timeout: float | None = None) -> int:
-            return 0
-
-        def kill(self) -> None:
-            pass
-
-    class FakeSandbox:
-        sandbox_id = "fake-sandbox"
-
-        def __init__(self) -> None:
-            self.started_cmd: str | None = None
-            self.started_envs: dict[str, str] | None = None
-            self.written: dict[str, str] = {}
-
-        def exec(self, *args, **kwargs) -> FakeExecResult:
-            return FakeExecResult()
-
-        def start_bg(self, cmd: str, *, envs=None, cwd=None) -> FakeBgJob:
-            self.started_cmd = cmd
-            self.started_envs = envs
-            return FakeBgJob()
-
-        def write_text(self, path: str, content: str) -> None:
-            self.written[path] = content
-
-        def read_text(self, path: str) -> str:
-            return ""
-
-        def exists(self, path: str) -> bool:
-            return path in self.written
-
-        def kill(self) -> None:
-            pass
-
-    secret = "sk-test '$(leak)"
-    model = "provider/model'; touch /tmp/pwn #"
-    config = CodingAgentConfig(
-        base_url="https://example.test/v1?x='y",
-        api_key=secret,
-        model=model,
-    )
-    sandbox = FakeSandbox()
-    factory = CodingAgentSessionFactory(
-        config=config,
-        sandbox_backend=object(),  # unused by this protected-method test
-        mode="transparent_proxy",
-    )
-
-    # _start_proxy delegates to CLIAgentDriver._start_proxy which runs the
-    # proxy inside the sandbox. The driver handles dep install + source upload.
-    factory._start_proxy(sandbox)
-
-    assert sandbox.started_cmd is not None
-    assert sandbox.started_envs == {"OPENCODE_UPSTREAM_API_KEY": secret}
-    assert secret not in sandbox.started_cmd
-    assert "--upstream-api-key" not in sandbox.started_cmd
-
-    argv = shlex.split(sandbox.started_cmd.split("&&", 1)[1].split(">", 1)[0].strip())
-    assert argv[argv.index("--upstream-url") + 1] == config.base_url
-    assert argv[argv.index("--model-override") + 1] == model
-
-
-def test_interception_cli_reads_upstream_key_from_env(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    from openenv.core.harness.sandbox import interception
-
-    captured = {}
-
-    def fake_serve(cfg) -> None:
-        captured["cfg"] = cfg
-
-    monkeypatch.setattr(interception, "serve", fake_serve)
-    monkeypatch.setenv("OPENCODE_UPSTREAM_API_KEY", "sk-from-env")
-    monkeypatch.setattr(
-        sys,
-        "argv",
-        [
-            "interception.py",
-            "--upstream-url",
-            "https://example.test/v1",
-        ],
-    )
-
-    interception.main()
-
-    assert captured["cfg"].upstream_api_key == "sk-from-env"
-
-
 # ---------------------------------------------------------------------------
 # Integration — only runs when E2B + endpoint creds are present and the
 # user explicitly opts in via ``pytest -m integration``.
@@ -447,7 +346,8 @@ async def _go() -> RolloutResult:
     assert result.reward == 1.0, (
         f"expected reward=1.0 got {result.reward}: {result.error}"
     )
-    assert result.proxy_turns, "expected at least one captured LLM turn"
+    # proxy_turns is deliberately not asserted: it is always empty because
+    # logprob capture is trainer-owned (interception_gate), not the environment's.
     assert any(f.endswith("/binary_search.py") for f in result.files), (
         f"expected binary_search.py in workdir, got {list(result.files)}"
     )

From 4b1b707e749f900706b1d5677335030c15fb19c6 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 02:24:35 +0530
Subject: [PATCH 15/35] chore: address greptile review comments

---
 .../server/coding_environment.py              | 50 ++++++++++---------
 src/openenv/core/harness/agents/cli_driver.py |  2 -
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index 0c8598cdd..ceee49002 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -98,12 +98,14 @@ def __init__(self) -> None:
 
         from openenv.core.harness.agents import get_agent_spec
         from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
-        from coding_agent_env import (
-            E2BSandboxBackend,
-            CodingAgentConfig,
-            CodingAgentSessionFactory,
-            CodingAgentTask,
-        )
+        from coding_agent_env.config import CodingAgentConfig
+        from coding_agent_env.harness import CodingAgentSessionFactory
+        from coding_agent_env.task import CodingAgentTask
+
+        try:
+            from openenv.core.harness.sandbox import E2BSandboxBackend
+        except ImportError:
+            E2BSandboxBackend = None  # type: ignore[assignment,misc]
 
         self._CommandResult = CommandResult
         self._RolloutResult = RolloutResult
@@ -330,14 +332,18 @@ def _emit(msg: str) -> None:
             max_tokens_cap=max_tokens_cap,
         )
 
-        # Concatenate setup commands into a single ``set -e`` script and let
-        # the primitive run it as ``task.setup_shell`` before the agent
-        # starts. The per-command tracking happens here too — we re-run
-        # each command in a wrapper that captures exit/stdout/stderr.
-        # That way the primitive still aborts on setup failure AND we get
-        # observability in the response.
+        # Concatenate setup commands into a single ``set -e`` script so the
+        # primitive runs them inside _bootstrap_sandbox BEFORE the agent
+        # starts. This avoids the race where the agent's first tool call
+        # depends on files or packages that setup is still installing.
+        setup_shell: str | None = None
+        if setup:
+            # ``set -e`` makes the script abort on the first failing command.
+            setup_shell = "set -e\n" + "\n".join(setup)
+
         rollout_task = self._CodingAgentTask(
             instruction=instruction,
+            setup_shell=setup_shell,
             metadata={"task_id": task_id, "agent": agent},
         )
 
@@ -361,23 +367,21 @@ def _emit(msg: str) -> None:
             result.sandbox_id = session.sandbox.sandbox_id
             _emit(f"sandbox ready: {result.sandbox_id} — agent started (mode={mode})")
 
-            # Run setup commands one at a time, *before* the agent starts.
-            # The factory has already started the agent in start_agent()
-            # during create(); to keep the order "setup → agent → verify"
-            # we'd need to restructure. As a pragmatic compromise we run
-            # setup IMMEDIATELY after create(), which races with the agent
-            # for ~1-2s but is fine for typical pip/git/download work
-            # because most agent CLIs take a while before their first model
-            # call.
+            # Re-run setup commands individually for per-command
+            # observability in the response. The commands already ran
+            # atomically via setup_shell above, so these re-runs are
+            # idempotent — they exist only to populate
+            # result.setup_results with per-command exit/stdout/stderr.
             for i, cmd in enumerate(setup, 1):
-                _emit(f"setup [{i}/{len(setup)}]: {cmd[:80]}")
                 cr = self._exec_command(session.sandbox, cmd)
                 result.setup_results.append(cr)
                 if cr.exit_code != 0:
+                    # Should not happen — setup_shell already succeeded
+                    # during bootstrap, but record it for diagnostics.
                     result.error = (
-                        f"setup command failed (exit {cr.exit_code}): {cmd[:120]}"
+                        f"setup replay failed (exit {cr.exit_code}): {cmd[:120]}"
                     )
-                    _emit(f"setup FAILED at [{i}]: exit={cr.exit_code}")
+                    _emit(f"setup replay FAILED at [{i}]: exit={cr.exit_code}")
                     break
 
             # Block until the agent is done (or setup already failed).
diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 0e58af9e0..1d934777d 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -458,8 +458,6 @@ def _exec_with_retry(
                 last_stdout = r.stdout or ""
                 last_stderr = r.stderr or ""
                 last_exit = r.exit_code
-                if last_stderr.strip():
-                    break
             except Exception as exc:
                 last_stderr = f"{type(exc).__name__}: {exc}"
                 last_exit = -1

From a478fa8bf0c19384c8c7bf3eae74d2206d8ca9ae Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 20:35:47 +0530
Subject: [PATCH 16/35] refactor: extract sandbox bootstrap to driver and fix
 interception races

---
 envs/coding_agent_env/README.md               |  34 ++---
 envs/coding_agent_env/client.py               |   9 +-
 envs/coding_agent_env/harness.py              |  54 ++------
 envs/coding_agent_env/models.py               |  11 +-
 envs/coding_agent_env/pyproject.toml          |   4 +-
 .../server/coding_environment.py              |  69 ++++++----
 examples/coding_agent_env_simple.py           |  26 +---
 src/openenv/core/harness/agents/cli_driver.py |  40 ++++--
 .../harness/agents/interception_server.py     | 101 ++++++++++----
 src/openenv/core/harness/agents/opencode.py   |  11 +-
 src/openenv/core/harness/agents/pi.py         |  10 +-
 src/openenv/core/harness/sandbox/_util.py     |  12 ++
 .../core/harness/sandbox/docker_backend.py    |  43 +++---
 .../core/harness/sandbox/hf_backend.py        |  22 ++--
 tests/core/test_cli_agent_driver.py           | 114 ++++++++++++++++
 tests/core/test_docker_sandbox_backend.py     |   4 +-
 tests/core/test_hf_sandbox_backend.py         |   6 +-
 tests/core/test_interception_server.py        | 124 ++++++++++++++++++
 tests/envs/test_coding_agent_env.py           |  52 ++++++++
 19 files changed, 541 insertions(+), 205 deletions(-)
 create mode 100644 src/openenv/core/harness/sandbox/_util.py
 create mode 100644 tests/core/test_interception_server.py

diff --git a/envs/coding_agent_env/README.md b/envs/coding_agent_env/README.md
index 11fb88188..7825e5c25 100644
--- a/envs/coding_agent_env/README.md
+++ b/envs/coding_agent_env/README.md
@@ -9,7 +9,7 @@ app_port: 8000
 base_path: /web
 tags:
   - openenv
-short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B with logprob capture
+short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B
 ---
 
 # Coding Agent Environment for OpenEnv
@@ -17,13 +17,13 @@ short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B with lo
 `coding_agent_env` runs coding-agent harnesses (currently
 [OpenCode](https://opencode.ai) and [Pi](https://github.com/badlogic/pi-mono))
 inside an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible
-LLM endpoint, optionally capturing per-token logprobs for GRPO training.
+LLM endpoint, with optional trainer-owned interception for RL training.
 
 **🚀 Try it live**: [`AdithyaSK/coding-agent-env`](https://huggingface.co/spaces/AdithyaSK/coding-agent-env)
 
 The deployed Space exposes:
 
-- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward + logprobs.
+- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward.
 - **MCP tool API** at [`/mcp`](https://adithyask-coding-agent-env.hf.space/mcp) — programmatic `run_rollout` calls.
 - **OpenAPI docs** at [`/docs`](https://adithyask-coding-agent-env.hf.space/docs).
 - **Health** at [`/health`](https://adithyask-coding-agent-env.hf.space/health).
@@ -83,7 +83,6 @@ async def main():
         result = RolloutResult.model_validate_json(_extract_text(raw))
 
         print("reward:", result.reward)
-        print("turns:", len(result.proxy_turns))
         print("files:", list(result.files.keys()))
         print("wall:", result.wall_s, "s")
 
@@ -95,7 +94,6 @@ Expected output (~20s with the prebaked template):
 
 ```
 reward: 1.0
-turns: 3
 files: ['/home/user/workdir/binary_search.py', ...]
 wall: 19.8 s
 ```
@@ -134,11 +132,10 @@ factory = CodingAgentSessionFactory(
         model="gpt-4o-mini",
     ),
     sandbox_backend=E2BSandboxBackend(),
-    mode="transparent_proxy",                   # captures per-token logprobs
+    mode="interception_gate",                  # trainer-owned interception mode
 )
 session = factory.create(task=CodingAgentTask(instruction="..."))
 session.wait_for_completion()
-turns = session.fetch_proxy_trace()             # per-turn (tokens, logprobs)
 session.close()
 ```
 
@@ -195,23 +192,23 @@ directly.
 | `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. |
 | `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. |
 | `task_id` | `str` | `""` | Echoed back in result. |
-| `mode` | `str` | `"transparent_proxy"` | Or `"black_box"` (no logprobs). |
+| `mode` | `str` | `"black_box"` | Or `"interception_gate"` for trainer-owned generation. |
 | `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. |
 | `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. |
-| `top_logprobs` | `int` | `5` | HF Router cap is 5; OpenAI 0–20; vLLM unbounded. |
+| `top_logprobs` | `int` | `5` | Reserved for trainer-owned interception workflows. |
 | `agent_timeout_s` | `float` | `600.0` | Hard wall budget for the selected harness. |
 | `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. |
 
 Returns `RolloutResult` JSON with: `reward`, `setup_results[]`,
-`verify_results[]`, `proxy_turns[]`, `files{}`, `agent_log_tail`,
+`verify_results[]`, `files{}`, `agent_log_tail`,
 `proxy_log_tail`, `wall_s`, `agent_exit_code`, `sandbox_id`, `error`.
 
 ## Two Operating Modes
 
 | Mode | What it does | Best for |
 |---|---|---|
-| **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards harness LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. |
-| **`black_box`** | No proxy. The selected harness talks straight to `base_url`. | Smoke tests, eval, SFT data collection. |
+| **`black_box`** (default) | The selected harness talks directly to `base_url`. | Smoke tests, eval, SFT data collection. |
+| **`interception_gate`** | Agent calls are routed through trainer-host interception endpoints. Trainer owns forward pass + trajectory capture. | RL training with trainer-owned generation. |
 
 ## Environment Variables
 
@@ -230,21 +227,17 @@ sibling `.env` file; on HF Spaces, set them as **Space secrets**.
 | **OpenAI endpoint** | | |
 | `OPENAI_API_KEY` | required for `endpoint="openai"` | Standard OpenAI key. |
 | `OPENAI_BASE_URL` | no | Defaults to `https://api.openai.com/v1`. |
-| `OPENAI_MODEL` | no | Defaults to `gpt-4o-mini` (gpt-5.x and o-series refuse logprobs). |
+| `OPENAI_MODEL` | no | Defaults to `gpt-4o-mini`. |
 | **HF Router endpoint** | | |
 | `HF_ROUTER_API_KEY` | required for `endpoint="hf_router"` | HF user token. |
 | `HF_ROUTER_BASE_URL` | no | Defaults to `https://router.huggingface.co/v1`. |
 | `HF_ROUTER_MODEL` | no | Defaults to `Qwen/Qwen3-4B-Instruct-2507:nscale`. |
 
-Pick `provider:` suffixes that actually return logprobs:
-**Together / Nscale / Scaleway / SambaNova / Cerebras**. Avoid Novita /
-Hyperbolic / Featherless (silent drop) and Groq (HTTP 400).
 
 ## Pre-baked E2B Template
 
 The first rollout in a fresh E2B sandbox spends ~2 min installing
-harness tooling and the proxy's Python deps. Build a one-time template that
-ships those pre-installed:
+harness tooling. Build a one-time template that ships those pre-installed:
 
 ```bash
 .venv/bin/python envs/coding_agent_env/sandbox/build_template.py
@@ -290,7 +283,8 @@ src/openenv/core/harness/sandbox/
 ├── base.py                         # SandboxBackend / SandboxHandle protocols
 ├── e2b_backend.py                  # E2B implementation
 ├── docker_backend.py               # local Docker backend
-└── interception.py                 # in-sandbox FastAPI proxy (logprob capture)
+├── hf_backend.py                   # HF sandbox backend
+└── _util.py                        # shared sandbox shell utilities
 ```
 
 ## References
@@ -299,4 +293,4 @@ src/openenv/core/harness/sandbox/
 - [OpenCode CLI](https://opencode.ai/docs/cli/)
 - [Pi](https://github.com/badlogic/pi-mono)
 - [E2B Python SDK](https://e2b.dev/docs)
-- [HF Inference Providers logprob matrix](../../../DOCS/HF/hf_inference_providers_logprobs.md)
+
diff --git a/envs/coding_agent_env/client.py b/envs/coding_agent_env/client.py
index c1e0f6f92..492060a25 100644
--- a/envs/coding_agent_env/client.py
+++ b/envs/coding_agent_env/client.py
@@ -25,7 +25,7 @@
             verify=["python /home/user/test.py"],
             task_id="binary_search_v1",
         )
-        print(result.reward, len(result.proxy_turns))
+        print(result.reward)
 """
 
 from __future__ import annotations
@@ -95,15 +95,14 @@ def run_rollout(
                 requests. Needed for Qwen3.5 vLLM; harmless on Instruct
                 variants; rejected by OpenAI direct.
             max_tokens_cap: Clamp on per-turn ``max_tokens``.
-            top_logprobs: Top-k logprobs requested upstream. HF Router caps
-                at 5; OpenAI accepts up to 20; vLLM is unbounded.
+            top_logprobs: Reserved for trainer-owned interception workflows.
             agent_timeout_s: Hard wall-clock budget for one agent run.
             template: E2B template name (e.g. ``"coding-agent-rl"``). Empty
                 string uses the default (slow) base image.
 
         Returns:
-            A :class:`RolloutResult` with reward, per-turn logprobs, file
-            outputs, setup/verify results, and diagnostic tails.
+            A :class:`RolloutResult` with reward, file outputs,
+            setup/verify results, and diagnostic tails.
         """
         raw = self.call_tool(
             "run_rollout",
diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py
index 295b07ac3..2355260f5 100644
--- a/envs/coding_agent_env/harness.py
+++ b/envs/coding_agent_env/harness.py
@@ -24,12 +24,7 @@
 from .opencode_runtime import (
     agent_log_path,
     build_env_vars,
-    build_install_cmd,
-    build_opencode_json,
     build_run_cmd,
-    instruction_path,
-    opencode_config_path,
-    system_prompt_path,
 )
 from .task import CodingAgentTask
 
@@ -87,10 +82,7 @@ def __init__(
             raise ValueError(f"Unknown mode: {mode!r}")
         self._config = config
         self._backend = sandbox_backend
-        self._mode = mode
         self._verifier = verifier
-        self._install_timeout_s = install_timeout_s
-        self._setup_timeout_s = setup_timeout_s
         self._driver = CLIAgentDriver(
             spec=OPENCODE_SPEC,
             sandbox_backend=sandbox_backend,
@@ -111,6 +103,16 @@ def create(
 
         _log = logging.getLogger(__name__)
         oc_task = CodingAgentTask.coerce(task)
+        setup_parts: list[str] = []
+        if self._config.extra_setup_shell:
+            setup_parts.append(self._config.extra_setup_shell)
+        if oc_task.setup_shell:
+            setup_parts.append(oc_task.setup_shell)
+        if setup_parts:
+            oc_task = oc_task.model_copy(
+                update={"setup_shell": "set -e\n" + "\n".join(setup_parts)}
+            )
+
         sandbox_timeout = int(self._config.agent_timeout_s) + 300
         sandbox = self._backend.create(
             timeout_s=sandbox_timeout,
@@ -132,41 +134,7 @@ def create(
         return session
 
     def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: CodingAgentTask) -> None:
-        self._driver._wait_for_sandbox_ready(sandbox)
-        if not self._driver._agent_already_installed(sandbox):
-            self._driver._exec_with_retry(
-                sandbox,
-                build_install_cmd(self._config),
-                timeout=self._install_timeout_s,
-                attempts=3,
-                backoff_s=3.0,
-                label="opencode install",
-            )
-        sandbox.write_text(
-            opencode_config_path(self._config), build_opencode_json(self._config)
-        )
-        sandbox.write_text(instruction_path(self._config), task.instruction)
-        if self._config.system_prompt:
-            sandbox.write_text(
-                system_prompt_path(self._config), self._config.system_prompt
-            )
-        for remote_path, content in task.upload_files.items():
-            sandbox.write_text(remote_path, content)
-        if self._config.extra_setup_shell:
-            self._driver._exec_with_retry(
-                sandbox,
-                self._config.extra_setup_shell,
-                timeout=self._setup_timeout_s,
-                attempts=2,
-                backoff_s=2.0,
-                label="extra_setup_shell",
-            )
-        if task.setup_shell:
-            r = sandbox.exec(task.setup_shell, timeout=self._setup_timeout_s)
-            if r.exit_code != 0:
-                raise RuntimeError(
-                    f"task.setup_shell failed ({r.exit_code}): {r.stderr}"
-                )
+        self._driver.bootstrap_sandbox(sandbox, task, self._config)
 
 
 __all__ = [
diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py
index 821b1bd57..2111d84d5 100644
--- a/envs/coding_agent_env/models.py
+++ b/envs/coding_agent_env/models.py
@@ -21,7 +21,7 @@
 
 
 class RolloutTurn(BaseModel):
-    """One intercepted LLM turn captured by the in-sandbox proxy (Mode B)."""
+    """One intercepted LLM turn shape (trainer-owned in interception_gate mode)."""
 
     turn: int
     finish_reason: str | None = None
@@ -45,11 +45,7 @@ class CommandResult(BaseModel):
 
 
 class RolloutResult(BaseModel):
-    """Full payload returned from one ``run_rollout`` invocation.
-
-    The trainer (or any client) decodes this from the MCP tool result JSON
-    and feeds ``proxy_turns`` + ``reward`` into GRPO.
-    """
+    """Full payload returned from one ``run_rollout`` invocation."""
 
     # Identifiers
     task_id: str = ""
@@ -65,7 +61,8 @@ class RolloutResult(BaseModel):
     setup_results: list[CommandResult] = Field(default_factory=list)
     verify_results: list[CommandResult] = Field(default_factory=list)
 
-    # Per-turn LLM trajectory (empty in black_box mode)
+    # Per-turn LLM trajectory placeholder. Capture is trainer-owned in
+    # interception_gate mode; environment currently leaves this empty.
     proxy_turns: list[RolloutTurn] = Field(default_factory=list)
 
     # Filesystem the agent produced (path -> contents, truncated)
diff --git a/envs/coding_agent_env/pyproject.toml b/envs/coding_agent_env/pyproject.toml
index 276d3e0be..d935a0bf5 100644
--- a/envs/coding_agent_env/pyproject.toml
+++ b/envs/coding_agent_env/pyproject.toml
@@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "openenv-coding-agent-env"
 version = "0.1.0"
-description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints, optionally capturing per-token logprobs."
+description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints."
 requires-python = ">=3.10"
 dependencies = [
     # Core OpenEnv (server + MCP). 0.3.0 ships the harness runtime.
@@ -26,7 +26,7 @@ dependencies = [
     # behavior drift on Space rebuilds.
     "gradio>=6.0.0",
 
-    # OpenCode harness primitive — sandbox + proxy + agent driver
+    # OpenCode harness primitive — sandbox + agent driver
     "httpx>=0.27.0",
     "e2b>=1.0.0",
 ]
diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index ceee49002..af70b292e 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -15,8 +15,8 @@
 Reward = ``passed_verify_commands / total`` unless a verify command writes
 a float to ``/home/user/logs/verifier/reward.txt`` (override).
 
-Returns a JSON-serialized :class:`RolloutResult` with reward + per-turn
-logprobs (Mode B) + setup/verify command results + file outputs.
+Returns a JSON-serialized :class:`RolloutResult` with reward,
+setup/verify command results, and file outputs.
 """
 
 from __future__ import annotations
@@ -184,6 +184,11 @@ def run_rollout(
                 raise ValueError(
                     f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}"
                 )
+            if mode not in {"black_box", "interception_gate"}:
+                raise ValueError(
+                    "unsupported mode {!r}; supported modes: ('black_box', "
+                    "'interception_gate')".format(mode)
+                )
             if not (base_url and api_key and model):
                 raise ValueError(
                     "must provide either ``endpoint`` (one of "
@@ -303,6 +308,12 @@ def _emit(msg: str) -> None:
                 except Exception:
                     pass
 
+        if mode not in {"black_box", "interception_gate"}:
+            raise ValueError(
+                "unsupported mode {!r}; supported modes: ('black_box', "
+                "'interception_gate')".format(mode)
+            )
+
         result = self._RolloutResult(task_id=task_id, mode=mode)
         t0 = time.time()
 
@@ -347,18 +358,17 @@ def _emit(msg: str) -> None:
             metadata={"task_id": task_id, "agent": agent},
         )
 
-        factory = self._build_session_factory(
-            agent=agent,
-            config=config,
-            mode=mode,
-            template=template,
-            disable_thinking=disable_thinking,
-            top_logprobs=top_logprobs,
-            max_tokens_cap=max_tokens_cap,
-        )
-
         session = None
         try:
+            factory = self._build_session_factory(
+                agent=agent,
+                config=config,
+                mode=mode,
+                template=template,
+                disable_thinking=disable_thinking,
+                top_logprobs=top_logprobs,
+                max_tokens_cap=max_tokens_cap,
+            )
             _emit(
                 f"creating E2B sandbox (template={template or 'default'}) — "
                 "this is the slow phase (~5–60s cold, ~5s with template)"
@@ -367,24 +377,22 @@ def _emit(msg: str) -> None:
             result.sandbox_id = session.sandbox.sandbox_id
             _emit(f"sandbox ready: {result.sandbox_id} — agent started (mode={mode})")
 
-            # Re-run setup commands individually for per-command
-            # observability in the response. The commands already ran
-            # atomically via setup_shell above, so these re-runs are
-            # idempotent — they exist only to populate
-            # result.setup_results with per-command exit/stdout/stderr.
-            for i, cmd in enumerate(setup, 1):
-                cr = self._exec_command(session.sandbox, cmd)
-                result.setup_results.append(cr)
-                if cr.exit_code != 0:
-                    # Should not happen — setup_shell already succeeded
-                    # during bootstrap, but record it for diagnostics.
-                    result.error = (
-                        f"setup replay failed (exit {cr.exit_code}): {cmd[:120]}"
+            # Setup commands already ran as one ``set -e`` script during sandbox
+            # bootstrap. Do not re-run them here: many setup scripts are not
+            # idempotent (e.g., migrations, one-shot installs, destructive prep).
+            # Record one placeholder entry per command (not the real output).
+            for cmd in setup:
+                result.setup_results.append(
+                    self._CommandResult(
+                        cmd=cmd,
+                        exit_code=0,
+                        stdout="executed during bootstrap",
+                        stderr="",
+                        duration_s=0.0,
                     )
-                    _emit(f"setup replay FAILED at [{i}]: exit={cr.exit_code}")
-                    break
+                )
 
-            # Block until the agent is done (or setup already failed).
+            # Block until the agent is done.
             if result.error is None:
                 _emit(
                     f"agent running — {agent} CLI in sandbox "
@@ -498,6 +506,11 @@ def _build_session_factory(
         top_logprobs: int,
         max_tokens_cap: int,
     ) -> Any:
+        if self._E2BSandboxBackend is None:
+            raise RuntimeError(
+                "E2BSandboxBackend unavailable: install optional dependency 'e2b'."
+            )
+
         backend_kwargs: dict[str, Any] = {}
         if template:
             backend_kwargs["template"] = template
diff --git a/examples/coding_agent_env_simple.py b/examples/coding_agent_env_simple.py
index f8996e586..caf81bad8 100644
--- a/examples/coding_agent_env_simple.py
+++ b/examples/coding_agent_env_simple.py
@@ -14,12 +14,9 @@
   1. Spawns a fresh E2B sandbox (using the prebaked ``coding-agent-rl``
      template — falls back to a cold install if the template isn't
      present in your E2B account).
-  2. Bootstraps an in-sandbox FastAPI proxy that captures per-token
-     logprobs (``mode="transparent_proxy"``).
-  3. Runs the selected harness CLI with the instruction.
-  4. Executes the verify bash commands; reward = passed / total.
-  5. Returns a ``RolloutResult`` with reward + per-turn logprobs +
-     the file contents the agent produced.
+  2. Runs the selected harness CLI with the instruction.
+  3. Executes the verify bash commands; reward = passed / total.
+  4. Returns a ``RolloutResult`` with reward + produced file contents.
 
 Prerequisites
 -------------
@@ -34,7 +31,6 @@
 Expected output (~20s with the prebaked template)::
 
     reward: 1.0
-    turns:  3
     files:  ['/home/user/workdir/binary_search.py', ...]
     wall:   19.8 s
 """
@@ -54,7 +50,9 @@
 from coding_agent_env.models import RolloutResult  # noqa: E402
 
 
-SPACE = os.environ.get("CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space")
+SPACE = os.environ.get(
+    "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space"
+)
 
 INSTRUCTION = (
     "Create a single Python file named `binary_search.py` in the current "
@@ -109,8 +107,6 @@ async def main() -> int:
 
     print("--- result ---")
     print(f"reward:    {result.reward}")
-    print(f"turns:     {len(result.proxy_turns)}")
-    print(f"tokens:    {sum(len(t.completion_tokens) for t in result.proxy_turns)}")
     print(f"sandbox:   {result.sandbox_id}")
     print(f"wall_s:    {result.wall_s}")
     print(f"files:     {sorted(result.files)}")
@@ -118,16 +114,6 @@ async def main() -> int:
     if result.error:
         print(f"error:     {result.error}")
 
-    if result.proxy_turns:
-        first = next((t for t in result.proxy_turns if t.completion_tokens), None)
-        if first:
-            print()
-            print("--- first productive turn (first 8 tokens with logprobs) ---")
-            toks = first.completion_tokens[:8]
-            lps = first.per_token_logps[:8]
-            for tok, lp in zip(toks, lps):
-                print(f"  {tok!r:<14}  {lp:+.3f}")
-
     return 0 if result.reward == 1.0 else 1
 
 
diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 1d934777d..42161bfec 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -21,7 +21,6 @@
 import json
 import logging
 import shlex
-import threading
 import time
 import uuid
 from typing import Any, Callable, Literal
@@ -117,11 +116,14 @@ def close(self) -> None:
 
     def wait_for_completion(self, timeout_s: float | None = None) -> int:
         """Block until the agent exits, returning its exit code."""
-        budget = timeout_s if timeout_s is not None else self.spec.default_timeout_s
-        if hasattr(self.config, "agent_timeout_s"):
-            budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s
         if self._agent_bg_job is None:
             raise RuntimeError("Agent not started.")
+        default_timeout = (
+            self.config.agent_timeout_s
+            if hasattr(self.config, "agent_timeout_s")
+            else self.spec.default_timeout_s
+        )
+        budget = timeout_s if timeout_s is not None else default_timeout
         return self._agent_bg_job.wait(timeout=budget)
 
     def collect_artifacts(self) -> dict[str, Any]:
@@ -189,17 +191,19 @@ async def next_request(
                     self._interception_queue.get(),
                     timeout=min(remaining, 1.0),
                 )
-                return server.intercepts[request_id]
+                intercept = server.get_intercept(request_id)
+                if intercept is not None:
+                    return intercept
             except asyncio.TimeoutError:
-                if self._agent_bg_job is not None:
-                    done_event = getattr(self._agent_bg_job, "_done", None)
-                    if (
-                        done_event is not None
-                        and isinstance(done_event, threading.Event)
-                        and done_event.is_set()
-                    ):
-                        return None
-                continue
+                pass
+
+            if self._agent_bg_job is not None:
+                try:
+                    self._agent_bg_job.wait(timeout=0)
+                    return None
+                except TimeoutError:
+                    pass
+            continue
 
     async def deliver(
         self, intercept: dict[str, Any], response_dict: dict[str, Any]
@@ -241,6 +245,14 @@ def __init__(
         self._interception_server = interception_server
         self._interception_base_url = interception_base_url
 
+    def bootstrap_sandbox(self, sandbox: SandboxHandle, task: Any, config: Any) -> None:
+        """Public bootstrap hook used by external wrappers.
+
+        Runs readiness checks, optional install, file upload, MCP config write,
+        and task setup shell execution.
+        """
+        self._bootstrap_sandbox(sandbox, task, config)
+
     def create_session(
         self,
         task: Any,
diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index a075ec8b4..1aa4edf57 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -36,7 +36,9 @@
 
     while True:
         request_id = await asyncio.wait_for(queue.get(), timeout=...)
-        intercept = server.intercepts[request_id]
+        intercept = server.get_intercept(request_id)
+        if intercept is None:
+            continue
         response = await vllm.generate(intercept["messages"], ...)
         await deliver_response(intercept, response)
 
@@ -51,6 +53,7 @@
 import json
 import logging
 import secrets
+import threading
 import time
 import uuid
 from typing import Any
@@ -71,13 +74,20 @@ class InterceptionServer:
     identified by a ``rollout_id`` in the URL path.
     """
 
-    def __init__(self, port: int = 0, secret: str | None = None) -> None:
+    def __init__(
+        self,
+        port: int = 0,
+        secret: str | None = None,
+        host: str = "127.0.0.1",
+    ) -> None:
         self.port = port
+        self.host = host
         self.secret = secret or secrets.token_urlsafe(32)
         self._app: web.Application | None = None
         self._runner: web.AppRunner | None = None
         self._site: web.TCPSite | None = None
         self._lock = asyncio.Lock()
+        self._state_lock = threading.RLock()
         self.active_rollouts: dict[str, dict[str, Any]] = {}
         self.intercepts: dict[str, dict[str, Any]] = {}
 
@@ -93,7 +103,9 @@ async def start(self) -> None:
             app.router.add_get("/health", self._handle_health)
             runner = web.AppRunner(app)
             await runner.setup()
-            site = web.TCPSite(runner, "0.0.0.0", self.port)
+            if self.host == "0.0.0.0":
+                _log.warning("InterceptionServer exposed on all interfaces (0.0.0.0).")
+            site = web.TCPSite(runner, self.host, self.port)
             await site.start()
             if self.port == 0:
                 server = getattr(site, "_server", None)
@@ -111,7 +123,11 @@ async def stop(self) -> None:
         async with self._lock:
             if self._runner is None:
                 return
-            for intercept in list(self.intercepts.values()):
+            with self._state_lock:
+                intercepts = list(self.intercepts.values())
+                self.intercepts.clear()
+                self.active_rollouts.clear()
+            for intercept in intercepts:
                 fut: asyncio.Future | None = intercept.get("response_future")
                 if fut and not fut.done():
                     fut.cancel()
@@ -121,8 +137,6 @@ async def stop(self) -> None:
                         cq.put_nowait(None)
                     except asyncio.QueueFull:
                         pass
-            self.intercepts.clear()
-            self.active_rollouts.clear()
             try:
                 await self._runner.cleanup()
             except RuntimeError:
@@ -137,27 +151,39 @@ def register_rollout(
         state: dict[str, Any] | None = None,
     ) -> asyncio.Queue:
         queue: asyncio.Queue = asyncio.Queue()
-        self.active_rollouts[rollout_id] = {
-            "request_id_queue": queue,
-            "state": state,
-        }
+        with self._state_lock:
+            self.active_rollouts[rollout_id] = {
+                "request_id_queue": queue,
+                "state": state,
+            }
         return queue
 
     def unregister_rollout(self, rollout_id: str) -> None:
-        for request_id in list(self.intercepts):
-            intercept = self.intercepts.get(request_id)
-            if intercept and intercept.get("rollout_id") == rollout_id:
-                fut: asyncio.Future | None = intercept.get("response_future")
-                if fut and not fut.done():
-                    fut.cancel()
-                cq: asyncio.Queue | None = intercept.get("chunk_queue")
-                if cq is not None:
-                    try:
-                        cq.put_nowait(None)
-                    except asyncio.QueueFull:
-                        pass
+        with self._state_lock:
+            matching_ids = [
+                request_id
+                for request_id, intercept in self.intercepts.items()
+                if intercept.get("rollout_id") == rollout_id
+            ]
+            matching_intercepts = [self.intercepts[i] for i in matching_ids]
+            for request_id in matching_ids:
                 del self.intercepts[request_id]
-        self.active_rollouts.pop(rollout_id, None)
+            self.active_rollouts.pop(rollout_id, None)
+
+        for intercept in matching_intercepts:
+            fut: asyncio.Future | None = intercept.get("response_future")
+            if fut and not fut.done():
+                fut.cancel()
+            cq: asyncio.Queue | None = intercept.get("chunk_queue")
+            if cq is not None:
+                try:
+                    cq.put_nowait(None)
+                except asyncio.QueueFull:
+                    pass
+
+    def get_intercept(self, request_id: str) -> dict[str, Any] | None:
+        with self._state_lock:
+            return self.intercepts.get(request_id)
 
     def _authorized(self, request: web.Request) -> bool:
         auth = request.headers.get("Authorization", "")
@@ -176,7 +202,8 @@ async def _handle_chat_completions(
             return web.json_response({"error": "Unauthorized"}, status=401)
 
         rollout_id = request.match_info["rollout_id"]
-        context = self.active_rollouts.get(rollout_id)
+        with self._state_lock:
+            context = self.active_rollouts.get(rollout_id)
         if not context:
             return web.json_response({"error": "rollout not found"}, status=404)
 
@@ -197,11 +224,16 @@ async def _handle_chat_completions(
             "tools": body.get("tools"),
             "stream": is_streaming,
             "chunk_queue": chunk_queue,
-            "response_future": asyncio.get_event_loop().create_future(),
+            "response_future": asyncio.get_running_loop().create_future(),
             "body": body,
         }
-        self.intercepts[request_id] = intercept
-        await context["request_id_queue"].put(request_id)
+        with self._state_lock:
+            context = self.active_rollouts.get(rollout_id)
+            if context is None:
+                return web.json_response({"error": "rollout not found"}, status=404)
+            self.intercepts[request_id] = intercept
+            request_queue: asyncio.Queue = context["request_id_queue"]
+        await request_queue.put(request_id)
 
         if is_streaming:
             return await self._stream_response(request, intercept)
@@ -210,8 +242,12 @@ async def _handle_chat_completions(
             response_dict = await intercept["response_future"]
         except asyncio.CancelledError:
             return web.json_response({"error": "rollout cancelled"}, status=499)
-        except Exception as exc:
-            return web.json_response({"error": str(exc)}, status=500)
+        except Exception:
+            _log.exception("interception request %s failed", request_id)
+            return web.json_response({"error": "internal error"}, status=500)
+        finally:
+            with self._state_lock:
+                self.intercepts.pop(request_id, None)
 
         return web.json_response(response_dict)
 
@@ -249,6 +285,13 @@ async def _stream_response(
         finally:
             if get_task and not get_task.done():
                 get_task.cancel()
+            fut: asyncio.Future | None = intercept.get("response_future")
+            if fut and not fut.done():
+                fut.cancel()
+            request_id = intercept.get("request_id")
+            if isinstance(request_id, str):
+                with self._state_lock:
+                    self.intercepts.pop(request_id, None)
         try:
             await resp.write_eof()
         except Exception:
diff --git a/src/openenv/core/harness/agents/opencode.py b/src/openenv/core/harness/agents/opencode.py
index 13c17fa04..9a829c3e2 100644
--- a/src/openenv/core/harness/agents/opencode.py
+++ b/src/openenv/core/harness/agents/opencode.py
@@ -19,6 +19,7 @@
 from __future__ import annotations
 
 import json
+import shlex
 from typing import Any
 
 from . import register_agent
@@ -39,11 +40,15 @@ def _build_opencode_command(
     log_file = f"{home}/logs/agent/opencode.jsonl"
     workdir = f"{home}/workdir"
 
+    workdir_q = shlex.quote(workdir)
+    instruction_q = shlex.quote(instruction_file)
+    log_q = shlex.quote(log_file)
+
     return (
         f'export PATH="$HOME/.opencode/bin:$PATH" && '
-        f"cd {workdir} && git init -q 2>/dev/null; "
-        f'opencode run {format_flag} "$(cat {instruction_file})" '
-        f"2>&1 | tee {log_file}"
+        f"cd {workdir_q} && git init -q 2>/dev/null; "
+        f'opencode run {format_flag} "$(cat {instruction_q})" '
+        f"2>&1 | tee {log_q}"
     ).strip()
 
 
diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
index d7b60569f..dcc552842 100644
--- a/src/openenv/core/harness/agents/pi.py
+++ b/src/openenv/core/harness/agents/pi.py
@@ -61,12 +61,16 @@ def _build_command(
     if hasattr(config, "thinking") and config.thinking:
         thinking = f" --thinking {shlex.quote(config.thinking)}"
 
+    workdir_q = shlex.quote(workdir)
+    instruction_q = shlex.quote(instruction_file)
+    log_q = shlex.quote(log_file)
+
     return (
-        f"cd {workdir} && git init -q 2>/dev/null; "
+        f"cd {workdir_q} && git init -q 2>/dev/null; "
         f"pi --no-session --no-context-files"
         f"{provider}{model}{thinking}"
-        f" -p @{instruction_file}"
-        f" 2>&1 | tee {log_file}"
+        f" -p @{instruction_q}"
+        f" 2>&1 | tee {log_q}"
     )
 
 
diff --git a/src/openenv/core/harness/sandbox/_util.py b/src/openenv/core/harness/sandbox/_util.py
new file mode 100644
index 000000000..6291b0fb3
--- /dev/null
+++ b/src/openenv/core/harness/sandbox/_util.py
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+
+def shell_quote(s: str) -> str:
+    """Single-quote a string for shell, escaping embedded single quotes."""
+    return "'" + s.replace("'", "'\\''") + "'"
diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py
index 559817d1b..28447ce2e 100644
--- a/src/openenv/core/harness/sandbox/docker_backend.py
+++ b/src/openenv/core/harness/sandbox/docker_backend.py
@@ -31,6 +31,7 @@
 import uuid
 from pathlib import PurePosixPath
 
+from openenv.core.harness.sandbox._util import shell_quote
 from openenv.core.harness.sandbox.base import BgJob, ExecResult
 
 _log = logging.getLogger(__name__)
@@ -45,12 +46,14 @@ class DockerBgJob:
     """
 
     def __init__(
-        self, container_id: str, pid: int, poll_thread: threading.Thread
+        self,
+        container_id: str,
+        pid: int,
+        poll_thread: threading.Thread | None = None,
     ) -> None:
         self._container_id = container_id
         self._pid = pid
         self._exit_code: int | None = None
-        self._error: BaseException | None = None
         self._done = threading.Event()
         self._poll_thread = poll_thread
 
@@ -63,8 +66,6 @@ def wait(self, timeout: float | None = None) -> int:
             raise TimeoutError(
                 f"Background command (pid={self._pid}) did not exit within {timeout}s"
             )
-        if self._error is not None:
-            raise self._error
         return self._exit_code if self._exit_code is not None else 0
 
     def kill(self) -> None:
@@ -127,8 +128,8 @@ def start_bg(
         envs: dict[str, str] | None = None,
         cwd: str | None = None,
     ) -> BgJob:
-        marker = f"/tmp/.bg_{uuid.uuid4().hex[:8]}"
-        wrapped = f"bash -c {_shell_quote(cmd + f'; echo $? > {marker}')} &\necho $!"
+        marker = f"/tmp/.bg_{uuid.uuid4().hex}"
+        wrapped = f"bash -c {shell_quote(cmd + f'; echo $? > {marker}')} &\necho $!"
         docker_cmd = self._build_exec_cmd(envs=envs, cwd=cwd)
         docker_cmd.extend(["bash", "-c", wrapped])
         result = subprocess.run(docker_cmd, capture_output=True, text=True, timeout=10)
@@ -147,7 +148,7 @@ def start_bg(
             )
         pid = int(pid_line)
 
-        job = DockerBgJob(self._container_id, pid, poll_thread=None)  # type: ignore[arg-type]
+        job = DockerBgJob(self._container_id, pid)
         poll_thread = threading.Thread(
             target=self._poll_bg_job,
             args=(job, marker),
@@ -174,7 +175,7 @@ def write_text(self, path: str, content: str) -> None:
                 self._container_id,
                 "bash",
                 "-c",
-                f"cat > {_shell_quote(path)}",
+                f"cat > {shell_quote(path)}",
             ],
             input=content.encode(),
             capture_output=True,
@@ -233,6 +234,7 @@ def _build_exec_cmd(
         return cmd
 
     def _poll_bg_job(self, job: DockerBgJob, marker: str) -> None:
+        consecutive_failures = 0
         while not job._done.is_set():
             try:
                 result = subprocess.run(
@@ -245,22 +247,38 @@ def _poll_bg_job(self, job: DockerBgJob, marker: str) -> None:
                     job._exit_code = int(result.stdout.strip())
                     job._done.set()
                     return
+                if "No such container" in (result.stderr or ""):
+                    job._exit_code = 1
+                    job._done.set()
+                    return
             except Exception:
-                pass
+                consecutive_failures += 1
+            else:
+                consecutive_failures = 0
 
             # Also check if PID is gone (crash without writing marker).
             try:
                 check = subprocess.run(
                     ["docker", "exec", self._container_id, "kill", "-0", str(job._pid)],
                     capture_output=True,
+                    text=True,
                     timeout=5,
                 )
                 if check.returncode != 0:
                     job._exit_code = 1
                     job._done.set()
                     return
+                if "No such container" in (check.stderr or ""):
+                    job._exit_code = 1
+                    job._done.set()
+                    return
             except Exception:
-                pass
+                consecutive_failures += 1
+
+            if consecutive_failures >= 10:
+                job._exit_code = 1
+                job._done.set()
+                return
 
             time.sleep(0.5)
 
@@ -332,8 +350,3 @@ def create(
             "Docker sandbox created: %s (image=%s)", container_id[:12], self._image
         )
         return DockerSandboxHandle(container_id, user=self._user)
-
-
-def _shell_quote(s: str) -> str:
-    """Single-quote a string for shell, escaping embedded single quotes."""
-    return "'" + s.replace("'", "'\\''") + "'"
diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py
index bb41356e2..43ec5ad95 100644
--- a/src/openenv/core/harness/sandbox/hf_backend.py
+++ b/src/openenv/core/harness/sandbox/hf_backend.py
@@ -20,6 +20,7 @@
 from typing import Any
 
 from hf_sandbox import Sandbox
+from openenv.core.harness.sandbox._util import shell_quote
 from openenv.core.harness.sandbox.base import BgJob, ExecResult, SandboxHandle
 
 _ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
@@ -67,7 +68,7 @@ def wait(self, timeout: float | None = None) -> int:
                 )
 
             marker = self._sandbox.exec(
-                f"cat {_shell_quote(self._marker_path)}",
+                f"cat {shell_quote(self._marker_path)}",
                 timeout=10,
             )
             if marker.exit_code == 0 and marker.stdout.strip():
@@ -152,8 +153,8 @@ def start_bg(
         cwd: str | None = None,
     ) -> BgJob:
         marker_path = f"/tmp/.openenv_bg_{uuid.uuid4().hex[:12]}.exit"
-        wrapped = f"{cmd}; rc=$?; echo $rc > {_shell_quote(marker_path)}"
-        launch_cmd = f"nohup bash -lc {_shell_quote(wrapped)} >/dev/null 2>&1 & echo $!"
+        wrapped = f"{cmd}; rc=$?; echo $rc > {shell_quote(marker_path)}"
+        launch_cmd = f"nohup bash -lc {shell_quote(wrapped)} >/dev/null 2>&1 & echo $!"
 
         result = self.exec(launch_cmd, envs=envs, cwd=cwd, timeout=30)
         if result.exit_code != 0:
@@ -174,7 +175,7 @@ def start_bg(
     def write_text(self, path: str, content: str) -> None:
         parent = str(PurePosixPath(path).parent)
         if parent not in ("", "/"):
-            r = self.exec(f"mkdir -p {_shell_quote(parent)}", timeout=10)
+            r = self.exec(f"mkdir -p {shell_quote(parent)}", timeout=10)
             if r.exit_code != 0:
                 raise RuntimeError(
                     f"Failed to create parent directory {parent!r}: {r.stderr}"
@@ -185,7 +186,7 @@ def read_text(self, path: str) -> str:
         return str(self._sbx.read_file(path, text=True))
 
     def exists(self, path: str) -> bool:
-        r = self.exec(f"test -e {_shell_quote(path)}", timeout=10)
+        r = self.exec(f"test -e {shell_quote(path)}", timeout=10)
         return r.exit_code == 0
 
     def kill(self) -> None:
@@ -250,8 +251,8 @@ def create(
 
         assert last_error is not None
         raise HFSandboxCreateError(
-            f"Failed to create HF sandbox after {self._create_retries} attempts: "
-            f"{last_error}"
+            f"Failed to create HF sandbox after {self._create_retries} attempts "
+            f"({type(last_error).__name__})."
         ) from last_error
 
 
@@ -262,7 +263,7 @@ def _with_env_prefix(cmd: str, envs: dict[str, str]) -> str:
     for key, value in envs.items():
         if not _ENV_KEY_RE.match(key):
             raise ValueError(f"Invalid environment variable name: {key!r}")
-        parts.append(f"export {key}={_shell_quote(str(value))};")
+        parts.append(f"export {key}={shell_quote(str(value))};")
     return " ".join(parts) + f" {cmd}"
 
 
@@ -296,11 +297,6 @@ def _parse_exit_code(raw: str, *, default: int) -> int:
         return default
 
 
-def _shell_quote(s: str) -> str:
-    """Single-quote a string for shell, escaping embedded single quotes."""
-    return "'" + s.replace("'", "'\\''") + "'"
-
-
 __all__ = [
     "HFBgJob",
     "HFSandboxBackend",
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index 0b218f19a..af9629970 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -483,6 +483,52 @@ def test_create_session_uploads_task_files(self):
         assert sbx.written["/extra/data.json"] == '{"key": "value"}'
         session.close()
 
+    def test_opencode_black_box_api_key_stays_out_of_command_argv(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        secret = "sk-test '$(leak)"
+        config = FakeConfig(api_key=secret)
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(
+            spec=OPENCODE_SPEC,
+            sandbox_backend=backend,
+            mode="black_box",
+        )
+
+        session = driver.create_session(task=FakeTask(), config=config)
+        sbx = backend.created[0]
+        cmd, envs = sbx.bg_commands[-1]
+        assert secret not in cmd
+        assert envs is not None
+        assert envs["OPENAI_API_KEY"] == secret
+        session.close()
+
+    def test_opencode_interception_gate_uses_server_secret_not_user_key(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+        from openenv.core.harness.agents.interception_server import InterceptionServer
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        secret = "sk-test '$(leak)"
+        config = FakeConfig(api_key=secret)
+        backend = FakeSandboxBackend()
+        server = InterceptionServer(port=0, secret="gate-secret")
+        driver = CLIAgentDriver(
+            spec=OPENCODE_SPEC,
+            sandbox_backend=backend,
+            mode="interception_gate",
+            interception_server=server,
+            interception_base_url="http://127.0.0.1:8765",
+        )
+
+        session = driver.create_session(task=FakeTask(), config=config)
+        sbx = backend.created[0]
+        cmd, envs = sbx.bg_commands[-1]
+        assert secret not in cmd
+        assert envs is not None
+        assert envs["OPENAI_API_KEY"] == "gate-secret"
+        session.close()
+
     def test_create_session_runs_task_setup_shell(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentDriver
 
@@ -670,6 +716,32 @@ def test_close_kills_sandbox_and_jobs(self):
         assert sbx._killed
         assert session._agent_bg_job is None
 
+    @pytest.mark.asyncio
+    async def test_next_request_handles_missing_intercept_without_keyerror(self):
+        import asyncio
+
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+        from openenv.core.harness.agents.interception_server import InterceptionServer
+
+        spec = _make_test_spec()
+        sbx = FakeSandbox()
+        queue: asyncio.Queue[str] = asyncio.Queue()
+        await queue.put("req_missing")
+
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+            agent_bg_job=FakeBgJob(),
+            interception_server=InterceptionServer(secret="s"),
+            interception_rollout_id="rollout-1",
+            interception_queue=queue,
+        )
+
+        # Missing request IDs can happen if unregister_rollout races with queue.get().
+        assert await session.next_request(timeout_s=0.2) is None
+
 
 class TestCLIAgentSessionFactory:
     """Tests for the ResourceSessionFactory wrapper."""
@@ -775,6 +847,25 @@ class OcConfig:
         assert "--format json" in cmd
         assert "/home/user/task/instruction.md" in cmd
 
+    def test_build_command_quotes_paths(self):
+        from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+
+        @dataclass
+        class OcConfig:
+            sandbox_home: str = "/home/user with space"
+            run_format: str = "json"
+
+        assert OPENCODE_SPEC.build_command is not None
+        cmd = OPENCODE_SPEC.build_command(
+            OPENCODE_SPEC,
+            OcConfig(),
+            FakeTask(instruction="Write hello.py"),
+            None,
+        )
+        assert "cd '/home/user with space/workdir'" in cmd
+        assert "cat '/home/user with space/task/instruction.md'" in cmd
+        assert "tee '/home/user with space/logs/agent/opencode.jsonl'" in cmd
+
     def test_build_mcp_config(self):
         from openenv.core.harness.agents.opencode import OPENCODE_SPEC
 
@@ -888,6 +979,29 @@ def test_opencode_driver_integration(self):
         session.close()
 
 
+class TestPiSpec:
+    def test_build_command_quotes_paths(self):
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        @dataclass
+        class PiConfig:
+            sandbox_home: str = "/home/user with space"
+            provider: str = "openai"
+            model: str = "model/name"
+            thinking: str = "off"
+
+        assert PI_SPEC.build_command is not None
+        cmd = PI_SPEC.build_command(
+            PI_SPEC,
+            PiConfig(),
+            FakeTask(instruction="Write hello.py"),
+            None,
+        )
+        assert "cd '/home/user with space/workdir'" in cmd
+        assert "-p @'/home/user with space/task/instruction.txt'" in cmd
+        assert "tee '/home/user with space/logs/agent/pi.txt'" in cmd
+
+
 # Env var resolution
 
 
diff --git a/tests/core/test_docker_sandbox_backend.py b/tests/core/test_docker_sandbox_backend.py
index b47f6bd4e..b2eebddd2 100644
--- a/tests/core/test_docker_sandbox_backend.py
+++ b/tests/core/test_docker_sandbox_backend.py
@@ -289,13 +289,13 @@ def test_factory_creates_docker_backend(self):
             sandbox.kill()
 
     def test_satisfies_sandbox_handle_protocol(self):
-        from openenv.core.harness.sandbox import SandboxBackend
+        from openenv.core.harness.sandbox import SandboxHandle
         from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend
 
         backend = DockerSandboxBackend(image="ubuntu:22.04")
         sandbox = backend.create(timeout_s=60)
         try:
-            assert isinstance(sandbox, SandboxBackend) or hasattr(sandbox, "exec")
+            assert isinstance(sandbox, SandboxHandle)
             assert hasattr(sandbox, "sandbox_id")
             assert hasattr(sandbox, "exec")
             assert hasattr(sandbox, "start_bg")
diff --git a/tests/core/test_hf_sandbox_backend.py b/tests/core/test_hf_sandbox_backend.py
index cd235c748..9cd94b5d8 100644
--- a/tests/core/test_hf_sandbox_backend.py
+++ b/tests/core/test_hf_sandbox_backend.py
@@ -150,6 +150,11 @@ def _install_fake_hf_sandbox(monkeypatch) -> None:
     monkeypatch.setitem(sys.modules, "hf_sandbox", fake_module)
 
 
+@pytest.fixture(autouse=True)
+def _reset_fake_hf_calls() -> None:
+    _FakeSandboxAPI.calls.clear()
+
+
 class TestHFSandboxBackend:
     def test_exported_from_package(self, monkeypatch):
         _install_fake_hf_sandbox(monkeypatch)
@@ -167,7 +172,6 @@ def test_create_exec_write_read_exists_bg_and_kill(self, monkeypatch):
         _install_fake_hf_sandbox(monkeypatch)
         importlib.reload(hf_backend)
 
-        _FakeSandboxAPI.calls.clear()
         monkeypatch.setattr(hf_backend, "Sandbox", _FakeSandboxAPI)
 
         backend = hf_backend.HFSandboxBackend(
diff --git a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py
new file mode 100644
index 000000000..45922849c
--- /dev/null
+++ b/tests/core/test_interception_server.py
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import asyncio
+
+import aiohttp
+import pytest
+
+from openenv.core.harness.agents.interception_server import (
+    InterceptionServer,
+    deliver_response,
+)
+
+
+@pytest.mark.asyncio
+async def test_interception_server_rejects_unauthorized_requests() -> None:
+    server = InterceptionServer(port=0, secret="secret-token")
+    await server.start()
+    try:
+        async with aiohttp.ClientSession() as client:
+            resp = await client.post(
+                f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions",
+                json={"messages": []},
+            )
+            assert resp.status == 401
+    finally:
+        await server.stop()
+
+
+@pytest.mark.asyncio
+async def test_interception_server_returns_404_for_unknown_rollout() -> None:
+    server = InterceptionServer(port=0, secret="secret-token")
+    await server.start()
+    try:
+        async with aiohttp.ClientSession() as client:
+            resp = await client.post(
+                f"http://127.0.0.1:{server.port}/rollout/missing/v1/chat/completions",
+                headers={"Authorization": "Bearer secret-token"},
+                json={"messages": []},
+            )
+            assert resp.status == 404
+    finally:
+        await server.stop()
+
+
+@pytest.mark.asyncio
+async def test_interception_server_non_stream_roundtrip_cleans_intercept() -> None:
+    server = InterceptionServer(port=0, secret="secret-token")
+    await server.start()
+    queue = server.register_rollout("r1")
+    try:
+        async with aiohttp.ClientSession() as client:
+            request_task = asyncio.create_task(
+                client.post(
+                    f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions",
+                    headers={"Authorization": "Bearer secret-token"},
+                    json={
+                        "messages": [{"role": "user", "content": "hi"}],
+                        "stream": False,
+                    },
+                )
+            )
+            request_id = await asyncio.wait_for(queue.get(), timeout=1.0)
+            intercept = server.get_intercept(request_id)
+            assert intercept is not None
+
+            await deliver_response(
+                intercept,
+                {
+                    "id": "resp-1",
+                    "model": "test-model",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "message": {"role": "assistant", "content": "hello"},
+                            "finish_reason": "stop",
+                        }
+                    ],
+                },
+            )
+
+            resp = await request_task
+            assert resp.status == 200
+            payload = await resp.json()
+            assert payload["id"] == "resp-1"
+
+            # Request entries should not leak after completion.
+            assert server.get_intercept(request_id) is None
+    finally:
+        server.unregister_rollout("r1")
+        await server.stop()
+
+
+@pytest.mark.asyncio
+async def test_interception_server_unregister_rollout_cancels_pending_request() -> None:
+    server = InterceptionServer(port=0, secret="secret-token")
+    await server.start()
+    queue = server.register_rollout("r1")
+    try:
+        async with aiohttp.ClientSession() as client:
+            request_task = asyncio.create_task(
+                client.post(
+                    f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions",
+                    headers={"Authorization": "Bearer secret-token"},
+                    json={
+                        "messages": [{"role": "user", "content": "hi"}],
+                        "stream": False,
+                    },
+                )
+            )
+            _request_id = await asyncio.wait_for(queue.get(), timeout=1.0)
+            server.unregister_rollout("r1")
+
+            resp = await request_task
+            assert resp.status == 499
+            payload = await resp.json()
+            assert payload["error"] == "rollout cancelled"
+    finally:
+        await server.stop()
diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py
index 6626c1c59..bfd37758a 100644
--- a/tests/envs/test_coding_agent_env.py
+++ b/tests/envs/test_coding_agent_env.py
@@ -165,6 +165,29 @@ def test_catalog_summary_shape() -> None:
         } <= entry.keys()
 
 
+def test_run_rollout_rejects_unknown_mode() -> None:
+    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
+
+    env = CodingAgentEnvironment()
+    with pytest.raises(ValueError, match="unsupported mode"):
+        env._run_rollout_impl(
+            agent="opencode",
+            base_url="https://api.openai.com/v1",
+            api_key="sk-test",
+            model="gpt-4o-mini",
+            instruction="hello",
+            setup=[],
+            verify=[],
+            task_id="",
+            mode="legacy_proxy",
+            disable_thinking=False,
+            max_tokens_cap=1024,
+            top_logprobs=5,
+            agent_timeout_s=30.0,
+            template="",
+        )
+
+
 def test_build_agent_config_opencode() -> None:
     from coding_agent_env.server.coding_environment import CodingAgentEnvironment
 
@@ -218,6 +241,35 @@ def test_build_agent_config_pi() -> None:
     assert cfg_gate.provider == "huggingface"
 
 
+def test_build_session_factory_requires_e2b_dependency() -> None:
+    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
+
+    env = CodingAgentEnvironment()
+    env._E2BSandboxBackend = None
+    cfg = env._build_agent_config(
+        agent="pi",
+        mode="black_box",
+        base_url="https://router.huggingface.co/v1",
+        api_key="hf_xxx",
+        model="zai-org/GLM-5.1",
+        agent_timeout_s=180.0,
+        disable_thinking=False,
+        top_logprobs=5,
+        max_tokens_cap=4096,
+    )
+
+    with pytest.raises(RuntimeError, match="E2BSandboxBackend unavailable"):
+        env._build_session_factory(
+            agent="pi",
+            config=cfg,
+            mode="black_box",
+            template="",
+            disable_thinking=False,
+            top_logprobs=5,
+            max_tokens_cap=4096,
+        )
+
+
 # ---------------------------------------------------------------------------
 # Models + task coercion
 # ---------------------------------------------------------------------------

From b18caf229f8341021f9aacb23d4ebe83d2465e34 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 20:39:29 +0530
Subject: [PATCH 17/35] chore: remove unsupported mode checks from
 CodingAgentEnvironment

---
 .../server/coding_environment.py              | 11 ---------
 tests/envs/test_coding_agent_env.py           | 23 -------------------
 2 files changed, 34 deletions(-)

diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index af70b292e..b1e7f47ef 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -184,11 +184,6 @@ def run_rollout(
                 raise ValueError(
                     f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}"
                 )
-            if mode not in {"black_box", "interception_gate"}:
-                raise ValueError(
-                    "unsupported mode {!r}; supported modes: ('black_box', "
-                    "'interception_gate')".format(mode)
-                )
             if not (base_url and api_key and model):
                 raise ValueError(
                     "must provide either ``endpoint`` (one of "
@@ -308,12 +303,6 @@ def _emit(msg: str) -> None:
                 except Exception:
                     pass
 
-        if mode not in {"black_box", "interception_gate"}:
-            raise ValueError(
-                "unsupported mode {!r}; supported modes: ('black_box', "
-                "'interception_gate')".format(mode)
-            )
-
         result = self._RolloutResult(task_id=task_id, mode=mode)
         t0 = time.time()
 
diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py
index bfd37758a..905713e7a 100644
--- a/tests/envs/test_coding_agent_env.py
+++ b/tests/envs/test_coding_agent_env.py
@@ -165,29 +165,6 @@ def test_catalog_summary_shape() -> None:
         } <= entry.keys()
 
 
-def test_run_rollout_rejects_unknown_mode() -> None:
-    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
-
-    env = CodingAgentEnvironment()
-    with pytest.raises(ValueError, match="unsupported mode"):
-        env._run_rollout_impl(
-            agent="opencode",
-            base_url="https://api.openai.com/v1",
-            api_key="sk-test",
-            model="gpt-4o-mini",
-            instruction="hello",
-            setup=[],
-            verify=[],
-            task_id="",
-            mode="legacy_proxy",
-            disable_thinking=False,
-            max_tokens_cap=1024,
-            top_logprobs=5,
-            agent_timeout_s=30.0,
-            template="",
-        )
-
-
 def test_build_agent_config_opencode() -> None:
     from coding_agent_env.server.coding_environment import CodingAgentEnvironment
 

From bfc730542f899a231060082c3a76475915b01863 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 20:40:47 +0530
Subject: [PATCH 18/35] chore: revert lint-only reformatting of out-of-scope files

---
 envs/agent_world_model_env/server/web_ui.py   |  4 +-
 envs/chat_env/models.py                       |  4 +-
 envs/chat_env/server/chat_environment.py      |  4 +-
 .../server/coding_tools_env_environment.py    | 65 +++----------
 envs/coding_tools_env/server/e2b_sandbox.py   | 23 +----
 envs/coding_tools_env/server/gradio_ui.py     | 94 ++++++-------------
 .../jupyter_env/server/jupyter_environment.py | 10 +-
 envs/repl_env/server/repl_environment.py      |  4 +-
 .../server/terminus_env_environment.py        | 10 +-
 envs/textarena_env/server/gradio_ui.py        |  8 +-
 10 files changed, 64 insertions(+), 162 deletions(-)

diff --git a/envs/agent_world_model_env/server/web_ui.py b/envs/agent_world_model_env/server/web_ui.py
index 09b445d3f..84b10c6b2 100644
--- a/envs/agent_world_model_env/server/web_ui.py
+++ b/envs/agent_world_model_env/server/web_ui.py
@@ -21,7 +21,9 @@
 
 
 # Keep in sync with DEFAULT_REWARD_CONFIG in config.py.
-_DEFAULT_REWARD_JSON = json.dumps(DEFAULT_REWARD_CONFIG, indent=2)
+_DEFAULT_REWARD_JSON = json.dumps(
+    DEFAULT_REWARD_CONFIG, indent=2
+)
 
 
 def _format_obs_md(payload: dict | None) -> str:
diff --git a/envs/chat_env/models.py b/envs/chat_env/models.py
index da994cbe3..8bc10f09e 100644
--- a/envs/chat_env/models.py
+++ b/envs/chat_env/models.py
@@ -55,9 +55,7 @@ class ChatState(State):
     """State of the ChatEnvironment containing message history."""
 
     history_messages: list[Message] = Field(default_factory=list)
-    history_tokens: list[list[int]] = Field(
-        default_factory=list
-    )  # Same len as messages
+    history_tokens: list[list[int]] = Field(default_factory=list)  # Same len as messages
 
 
 class ChatObservation(Observation):
diff --git a/envs/chat_env/server/chat_environment.py b/envs/chat_env/server/chat_environment.py
index f66f3e790..90b2d01f0 100644
--- a/envs/chat_env/server/chat_environment.py
+++ b/envs/chat_env/server/chat_environment.py
@@ -90,9 +90,7 @@ def _coerce_tokens(self, tokens) -> list[int]:
     def _tokenize_conversation(self, conversation: list[Message]) -> list[int]:
         """Tokenize a conversation with a chat-template fallback for base tokenizers."""
         try:
-            tokens = self.tokenizer.apply_chat_template(
-                conversation=conversation, tokenize=True
-            )
+            tokens = self.tokenizer.apply_chat_template(conversation=conversation, tokenize=True)
         except Exception:
             # Some tokenizers (e.g. gpt2) do not define `chat_template`.
             fallback_text = "".join(
diff --git a/envs/coding_tools_env/server/coding_tools_env_environment.py b/envs/coding_tools_env/server/coding_tools_env_environment.py
index d0ef86675..615e7770f 100644
--- a/envs/coding_tools_env/server/coding_tools_env_environment.py
+++ b/envs/coding_tools_env/server/coding_tools_env_environment.py
@@ -45,28 +45,16 @@ def bash(command: str, timeout: float | None = 30) -> str:
                 return "Error: environment not reset. Call reset() first."
             timeout_value = 30 if timeout is None else float(timeout)
             result = self._sandbox.run_shell(command, timeout_s=timeout_value)
-            self._record(
-                "bash", result.ok, result.output, result.error, result.metadata
-            )
-            return (
-                result.output
-                if result.ok
-                else f"ERROR: {result.error}\n{result.output}".strip()
-            )
+            self._record("bash", result.ok, result.output, result.error, result.metadata)
+            return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip()
 
         @mcp.tool
-        def read(
-            file_path: str, offset: int | None = None, limit: int | None = None
-        ) -> str:
+        def read(file_path: str, offset: int | None = None, limit: int | None = None) -> str:
             """Read file contents using computer instance."""
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
-            result = self._sandbox.read_file(
-                file_path=file_path, offset=offset, limit=limit
-            )
-            self._record(
-                "read", result.ok, result.output, result.error, result.metadata
-            )
+            result = self._sandbox.read_file(file_path=file_path, offset=offset, limit=limit)
+            self._record("read", result.ok, result.output, result.error, result.metadata)
             return result.output if result.ok else f"ERROR: {result.error}"
 
         @mcp.tool
@@ -75,9 +63,7 @@ def write(file_path: str, content: str) -> str:
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
             result = self._sandbox.write_file(file_path=file_path, content=content)
-            self._record(
-                "write", result.ok, result.output, result.error, result.metadata
-            )
+            self._record("write", result.ok, result.output, result.error, result.metadata)
             return result.output if result.ok else f"ERROR: {result.error}"
 
         @mcp.tool
@@ -102,14 +88,10 @@ def edit(
                 updated = original.replace(old_string, new_string)
             else:
                 updated = original.replace(old_string, new_string, 1)
-            write_result = self._sandbox.write_file(
-                file_path=file_path, content=updated
-            )
+            write_result = self._sandbox.write_file(file_path=file_path, content=updated)
             ok = write_result.ok
             msg = "edit ok" if ok else ""
-            self._record(
-                "edit", ok, msg, write_result.error, {"replace_all": replace_all}
-            )
+            self._record("edit", ok, msg, write_result.error, {"replace_all": replace_all})
             return msg if ok else f"ERROR: {write_result.error}"
 
         @mcp.tool
@@ -147,11 +129,7 @@ def multi_edit(file_path: str, edits: list[dict[str, Any]]) -> str:
                 write_result.error,
                 {"applied": applied},
             )
-            return (
-                f"applied {applied} edits"
-                if write_result.ok
-                else f"ERROR: {write_result.error}"
-            )
+            return f"applied {applied} edits" if write_result.ok else f"ERROR: {write_result.error}"
 
         @mcp.tool
         def glob(pattern: str, path: str | None = None) -> str:
@@ -159,27 +137,17 @@ def glob(pattern: str, path: str | None = None) -> str:
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
             result = self._sandbox.glob_files(pattern=pattern, path=path)
-            self._record(
-                "glob", result.ok, result.output, result.error, result.metadata
-            )
+            self._record("glob", result.ok, result.output, result.error, result.metadata)
             return result.output if result.ok else f"ERROR: {result.error}"
 
         @mcp.tool
-        def grep(
-            pattern: str, path: str | None = None, include: str | None = None
-        ) -> str:
+        def grep(pattern: str, path: str | None = None, include: str | None = None) -> str:
             """Search for patterns in files."""
             if not self._sandbox:
                 return "Error: environment not reset. Call reset() first."
             result = self._sandbox.grep(pattern=pattern, path=path, include=include)
-            self._record(
-                "grep", result.ok, result.output, result.error, result.metadata
-            )
-            return (
-                result.output
-                if result.ok
-                else f"ERROR: {result.error}\n{result.output}".strip()
-            )
+            self._record("grep", result.ok, result.output, result.error, result.metadata)
+            return result.output if result.ok else f"ERROR: {result.error}\n{result.output}".strip()
 
         @mcp.tool
         def ls(path: str = ".", ignore: list[str] | None = None) -> str:
@@ -209,9 +177,7 @@ def todo_write(todos: list[dict[str, Any]]) -> str:
                     self._record("todo_write", False, "", msg, None)
                     return msg
             self._state.todos = validated
-            self._record(
-                "todo_write", True, f"stored {len(validated)} todos", None, None
-            )
+            self._record("todo_write", True, f"stored {len(validated)} todos", None, None)
             return f"stored {len(validated)} todos"
 
         @mcp.tool
@@ -315,8 +281,7 @@ def reset(
                             "sandbox_id": self._state.sandbox_id,
                             "message": "Setup command failed.",
                             "setup_results": [
-                                entry.model_dump()
-                                for entry in self._state.setup_results
+                                entry.model_dump() for entry in self._state.setup_results
                             ],
                         },
                     )
diff --git a/envs/coding_tools_env/server/e2b_sandbox.py b/envs/coding_tools_env/server/e2b_sandbox.py
index d6f77373b..5833c7ecb 100644
--- a/envs/coding_tools_env/server/e2b_sandbox.py
+++ b/envs/coding_tools_env/server/e2b_sandbox.py
@@ -94,11 +94,7 @@ def read_file(
     def write_file(self, file_path: str, content: str) -> ToolResult:
         try:
             self._sbx.files.write(file_path, content.encode("utf-8"))
-            return ToolResult(
-                ok=True,
-                output="write ok",
-                metadata={"bytes": len(content.encode("utf-8"))},
-            )
+            return ToolResult(ok=True, output="write ok", metadata={"bytes": len(content.encode("utf-8"))})
         except Exception as exc:
             return ToolResult(ok=False, error=f"write failed: {exc}")
 
@@ -115,9 +111,7 @@ def glob_files(self, pattern: str, path: str | None = None) -> ToolResult:
         if result is None:
             return ToolResult(ok=False, error=_format_error(execution))
         matches = result.get("matches", [])
-        return ToolResult(
-            ok=True, output="\n".join(matches), metadata={"matches": matches}
-        )
+        return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches})
 
     def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResult:
         ignore = ignore or []
@@ -143,15 +137,10 @@ def list_dir(self, path: str = ".", ignore: list[str] | None = None) -> ToolResu
         if not result.get("ok", False):
             return ToolResult(ok=False, error=str(result.get("error", "ls failed")))
         items = result.get("items", [])
-        lines = [
-            f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}"
-            for item in items
-        ]
+        lines = [f"{'[dir]' if item['is_dir'] else '[file]'} {item['name']}" for item in items]
         return ToolResult(ok=True, output="\n".join(lines), metadata={"items": items})
 
-    def grep(
-        self, pattern: str, path: str | None = None, include: str | None = None
-    ) -> ToolResult:
+    def grep(self, pattern: str, path: str | None = None, include: str | None = None) -> ToolResult:
         root = path or "."
         code = (
             "from pathlib import Path\n"
@@ -184,9 +173,7 @@ def grep(
         if not result.get("ok", False):
             return ToolResult(ok=False, error=str(result.get("error", "grep failed")))
         matches = result.get("matches", [])
-        return ToolResult(
-            ok=True, output="\n".join(matches), metadata={"matches": matches}
-        )
+        return ToolResult(ok=True, output="\n".join(matches), metadata={"matches": matches})
 
     def kill(self) -> None:
         try:
diff --git a/envs/coding_tools_env/server/gradio_ui.py b/envs/coding_tools_env/server/gradio_ui.py
index 1f3845141..c0d670a99 100644
--- a/envs/coding_tools_env/server/gradio_ui.py
+++ b/envs/coding_tools_env/server/gradio_ui.py
@@ -105,9 +105,7 @@ def _extract_tool_error(result: dict[str, Any]) -> bool:
 
 def _format_status(state: dict[str, Any]) -> str:
     if not state:
-        return (
-            "**No active session.** Configure setup/verify and click *Reset sandbox*."
-        )
+        return "**No active session.** Configure setup/verify and click *Reset sandbox*."
     sandbox_id = state.get("sandbox_id") or "—"
     step_count = state.get("step_count", 0)
     submitted = state.get("submitted", False)
@@ -229,9 +227,9 @@ def state_payload() -> dict[str, Any]:
                         label="edits (JSON array)",
                         language="json",
                         value=(
-                            "[\n"
+                            '[\n'
                             '  {"old_string": "TODO", "new_string": "DONE", "replace_all": false}\n'
-                            "]"
+                            ']'
                         ),
                         lines=8,
                     )
@@ -262,10 +260,10 @@ def state_payload() -> dict[str, Any]:
                         label="todos (JSON array)",
                         language="json",
                         value=(
-                            "[\n"
+                            '[\n'
                             '  {"id":"1","content":"Inspect files",'
                             '"status":"in_progress","priority":"high"}\n'
-                            "]"
+                            ']'
                         ),
                         lines=8,
                     )
@@ -339,33 +337,23 @@ def on_tool_change(tool: str):
             return [help_md, *updates]
 
         tool_dropdown.change(
-            on_tool_change,
-            inputs=[tool_dropdown],
-            outputs=[tool_help, *group_components],
+            on_tool_change, inputs=[tool_dropdown], outputs=[tool_help, *group_components]
         )
 
         # ───────── Result rendering helper ─────────
-        def render_result(
-            tool: str, raw: dict[str, Any]
-        ) -> tuple[str, str, str, str, str, list[list[str]]]:
+        def render_result(tool: str, raw: dict[str, Any]) -> tuple[str, str, str, str, str, list[list[str]]]:
             text = _extract_tool_text(raw)
-            is_error = (
-                _extract_tool_error(raw)
-                or text.startswith("ERROR:")
-                or text.startswith("Error:")
-            )
+            is_error = _extract_tool_error(raw) or text.startswith("ERROR:") or text.startswith("Error:")
             badge = "❌ error" if is_error else "✅ ok"
             status_line = f"**{tool}** — {badge}"
             state = state_payload()
             return (
-                status_line,  # output_status
-                text,  # output_view
-                json.dumps(raw, indent=2),  # raw_response
-                _format_status(
-                    state
-                ),  # state_summary (top + summary panel — same content)
+                status_line,                       # output_status
+                text,                              # output_view
+                json.dumps(raw, indent=2),         # raw_response
+                _format_status(state),             # state_summary (top + summary panel — same content)
                 json.dumps(state, indent=2, default=str),  # state_json
-                _format_history(state),  # history_table
+                _format_history(state),            # history_table
             )
 
         # ───────── Session handlers ─────────
@@ -410,33 +398,21 @@ async def on_close():
         async def on_run(
             tool: str,
             # bash
-            bash_command: str,
-            bash_timeout: float,
+            bash_command: str, bash_timeout: float,
             # read
-            read_path: str,
-            read_offset: float | None,
-            read_limit: float | None,
+            read_path: str, read_offset: float | None, read_limit: float | None,
             # write
-            write_path: str,
-            write_content: str,
+            write_path: str, write_content: str,
             # edit
-            edit_path: str,
-            edit_old: str,
-            edit_new: str,
-            edit_replace_all: bool,
+            edit_path: str, edit_old: str, edit_new: str, edit_replace_all: bool,
             # multi_edit
-            multi_edit_path: str,
-            multi_edit_json: str,
+            multi_edit_path: str, multi_edit_json: str,
             # glob
-            glob_pattern: str,
-            glob_path: str,
+            glob_pattern: str, glob_path: str,
             # grep
-            grep_pattern: str,
-            grep_path: str,
-            grep_include: str,
+            grep_pattern: str, grep_path: str, grep_include: str,
             # ls
-            ls_path: str,
-            ls_ignore: str,
+            ls_path: str, ls_ignore: str,
             # todo_write
             todo_json: str,
         ):
@@ -517,26 +493,14 @@ async def on_run(
         # ───────── Wire up events ─────────
         all_inputs = [
             tool_dropdown,
-            bash_command,
-            bash_timeout,
-            read_path,
-            read_offset,
-            read_limit,
-            write_path,
-            write_content,
-            edit_path,
-            edit_old,
-            edit_new,
-            edit_replace_all,
-            multi_edit_path,
-            multi_edit_json,
-            glob_pattern,
-            glob_path,
-            grep_pattern,
-            grep_path,
-            grep_include,
-            ls_path,
-            ls_ignore,
+            bash_command, bash_timeout,
+            read_path, read_offset, read_limit,
+            write_path, write_content,
+            edit_path, edit_old, edit_new, edit_replace_all,
+            multi_edit_path, multi_edit_json,
+            glob_pattern, glob_path,
+            grep_pattern, grep_path, grep_include,
+            ls_path, ls_ignore,
             todo_json,
         ]
         all_outputs = [
diff --git a/envs/jupyter_env/server/jupyter_environment.py b/envs/jupyter_env/server/jupyter_environment.py
index b7902e5d2..bc622ae22 100644
--- a/envs/jupyter_env/server/jupyter_environment.py
+++ b/envs/jupyter_env/server/jupyter_environment.py
@@ -348,10 +348,7 @@ def step(
     ) -> Observation:
         self._state.step_count += 1
         obs = super().step(action, timeout_s=timeout_s, **kwargs)
-        if (
-            self._state.submitted_answer is not None
-            and self._state.last_reward is not None
-        ):
+        if self._state.submitted_answer is not None and self._state.last_reward is not None:
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
@@ -364,10 +361,7 @@ async def step_async(
     ) -> Observation:
         self._state.step_count += 1
         obs = await super().step_async(action, timeout_s=timeout_s, **kwargs)
-        if (
-            self._state.submitted_answer is not None
-            and self._state.last_reward is not None
-        ):
+        if self._state.submitted_answer is not None and self._state.last_reward is not None:
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
diff --git a/envs/repl_env/server/repl_environment.py b/envs/repl_env/server/repl_environment.py
index 13a759c29..f2e6f5d98 100644
--- a/envs/repl_env/server/repl_environment.py
+++ b/envs/repl_env/server/repl_environment.py
@@ -272,7 +272,9 @@ def reset(
         # reset() are treated as equal and don't trigger a redundant rebuild.
         resolved_model = self._resolve_model(llm_model)
         has_runtime_llm = self._runtime_controller is not None
-        model_changed = has_runtime_llm and resolved_model != self._current_llm_model
+        model_changed = (
+            has_runtime_llm and resolved_model != self._current_llm_model
+        )
         token_provided = hf_token is not None
         if not self.llm_query_fn or model_changed or token_provided:
             effective_token = (
diff --git a/envs/terminus_env/server/terminus_env_environment.py b/envs/terminus_env/server/terminus_env_environment.py
index 03de18baa..c6f9e1c02 100644
--- a/envs/terminus_env/server/terminus_env_environment.py
+++ b/envs/terminus_env/server/terminus_env_environment.py
@@ -183,10 +183,7 @@ def step(
     ) -> Observation:
         self._state.step_count += 1
         obs = super().step(action, timeout_s=timeout_s, **kwargs)
-        if (
-            self._state.submitted_answer is not None
-            and self._state.last_reward is not None
-        ):
+        if self._state.submitted_answer is not None and self._state.last_reward is not None:
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
@@ -199,10 +196,7 @@ async def step_async(
     ) -> Observation:
         self._state.step_count += 1
         obs = await super().step_async(action, timeout_s=timeout_s, **kwargs)
-        if (
-            self._state.submitted_answer is not None
-            and self._state.last_reward is not None
-        ):
+        if self._state.submitted_answer is not None and self._state.last_reward is not None:
             obs.done = True
             obs.reward = self._state.last_reward
         return obs
diff --git a/envs/textarena_env/server/gradio_ui.py b/envs/textarena_env/server/gradio_ui.py
index c9bb88cae..45728fc00 100644
--- a/envs/textarena_env/server/gradio_ui.py
+++ b/envs/textarena_env/server/gradio_ui.py
@@ -71,9 +71,7 @@ def _sudoku_demo_html() -> str:
         for col in range(9):
             value = givens.get((row, col), "")
             border_right = "3px solid #0f172a" if col in {2, 5} else "1px solid #94a3b8"
-            border_bottom = (
-                "3px solid #0f172a" if row in {2, 5} else "1px solid #94a3b8"
-            )
+            border_bottom = "3px solid #0f172a" if row in {2, 5} else "1px solid #94a3b8"
             background = "#e2e8f0" if value else "#ffffff"
             cells.append(
                 f"""
@@ -84,7 +82,7 @@ def _sudoku_demo_html() -> str:
   align-items: center;
   justify-content: center;
   font-size: 1.1rem;
-  font-weight: {"700" if value else "400"};
+  font-weight: {'700' if value else '400'};
   color: #0f172a;
   background: {background};
   border-right: {border_right};
@@ -107,7 +105,7 @@ def _sudoku_demo_html() -> str:
     border: 3px solid #0f172a;
     background: #ffffff;
   ">
-    {"".join(cells)}
+    {''.join(cells)}
   </div>
   <p style="margin-top: 16px; color: #475569; font-size: 0.95rem; line-height: 1.45;">
     Use the <strong>Playground</strong> tab to reset the game and submit moves in the

From c2ca0c8734c510c175968b77ff5383bc56d3be8c Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 20:52:53 +0530
Subject: [PATCH 19/35] feat: host-side tool routing and Pi gate models
 bootstrap

---
 src/openenv/core/harness/agents/cli_driver.py |  85 ++++++++++-
 .../harness/agents/interception_server.py     | 136 +++++++++++++++++-
 tests/core/test_cli_agent_driver.py           |  37 +++++
 tests/core/test_interception_server.py        | 124 ++++++++++++++++
 4 files changed, 379 insertions(+), 3 deletions(-)

diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 42161bfec..b499d3c5f 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -36,7 +36,11 @@
 from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
 
 from .base import CLIAgentSpec
-from .interception_server import deliver_response, InterceptionServer
+from .interception_server import (
+    deliver_response,
+    InterceptionServer,
+    ToolHandler,
+)
 
 
 _log = logging.getLogger(__name__)
@@ -44,6 +48,19 @@
 Verifier = Callable[..., VerifyResult]
 
 
+class _ConfigOverrideView:
+    """Read-only attribute view with optional overrides."""
+
+    def __init__(self, base: Any, **overrides: Any) -> None:
+        self._base = base
+        self._overrides = overrides
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._overrides:
+            return self._overrides[name]
+        return getattr(self._base, name)
+
+
 class CLIAgentSession(ResourceSession):
     """Per-rollout session wrapping one sandbox with one running agent CLI."""
 
@@ -211,6 +228,25 @@ async def deliver(
         """Return a trainer-generated response to the waiting agent."""
         await deliver_response(intercept, response_dict)
 
+    def register_tool_handler(
+        self,
+        tool_name: str,
+        handler: ToolHandler,
+        *,
+        tool_definition: dict[str, Any] | None = None,
+    ) -> None:
+        """Register a host-side interception tool for this rollout."""
+        if self._interception_server is None or self._interception_rollout_id is None:
+            raise RuntimeError(
+                "register_tool_handler() is only available in interception_gate mode."
+            )
+        self._interception_server.register_tool_handler(
+            self._interception_rollout_id,
+            tool_name,
+            handler,
+            tool_definition=tool_definition,
+        )
+
 
 class CLIAgentDriver:
     """Shared driver for all CLI-based agentic harnesses."""
@@ -415,8 +451,23 @@ def _start_agent(
         *,
         base_url_override: str | None = None,
     ) -> BgJob:
+        command_config = config
+        if (
+            self.mode == "interception_gate"
+            and self._interception_server is not None
+            and self.spec.name == "pi"
+            and base_url_override
+        ):
+            self._write_pi_models_config(
+                sandbox,
+                config,
+                rollout_url=base_url_override,
+                api_key=self._interception_server.secret,
+            )
+            command_config = _ConfigOverrideView(config, provider="openenv")
+
         if self.spec.build_command is not None:
-            cmd = self.spec.build_command(self.spec, config, task, None)
+            cmd = self.spec.build_command(self.spec, command_config, task, None)
         else:
             cmd = " ".join(shlex.quote(c) for c in self.spec.base_command)
         envs = self._resolve_env_vars(config, base_url_override=base_url_override)
@@ -425,6 +476,36 @@ def _start_agent(
             envs["ANTHROPIC_API_KEY"] = self._interception_server.secret
         return sandbox.start_bg(cmd, envs=envs)
 
+    def _write_pi_models_config(
+        self,
+        sandbox: SandboxHandle,
+        config: Any,
+        *,
+        rollout_url: str,
+        api_key: str,
+    ) -> None:
+        home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
+        model = config.model if hasattr(config, "model") else "model"
+        content = json.dumps(
+            {
+                "providers": {
+                    "openenv": {
+                        "baseUrl": rollout_url,
+                        "api": "openai-completions",
+                        "apiKey": api_key,
+                        "compat": {
+                            "supportsDeveloperRole": False,
+                            "supportsReasoningEffort": False,
+                        },
+                        "models": [{"id": model, "reasoning": False}],
+                    }
+                }
+            },
+            indent=2,
+        )
+        for path in {f"{home}/.pi/agent/models.json", "/root/.pi/agent/models.json"}:
+            sandbox.write_text(path, content)
+
     def _resolve_env_vars(
         self,
         config: Any,
diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index 1aa4edf57..70f8c4247 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -56,7 +56,7 @@
 import threading
 import time
 import uuid
-from typing import Any
+from typing import Any, Awaitable, Callable
 
 from aiohttp import web
 
@@ -66,6 +66,8 @@
 _KEEPALIVE_INTERVAL_S = 3.0
 _MAX_REQUEST_BODY = 16 * 1024 * 1024
 
+ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]
+
 
 class InterceptionServer:
     """Async HTTP server that gates every LLM call from sandboxed agents.
@@ -100,6 +102,10 @@ async def start(self) -> None:
                 "/rollout/{rollout_id}/v1/chat/completions",
                 self._handle_chat_completions,
             )
+            app.router.add_post(
+                "/rollout/{rollout_id}/v1/tools/{tool_name}",
+                self._handle_tool_call,
+            )
             app.router.add_get("/health", self._handle_health)
             runner = web.AppRunner(app)
             await runner.setup()
@@ -155,6 +161,8 @@ def register_rollout(
             self.active_rollouts[rollout_id] = {
                 "request_id_queue": queue,
                 "state": state,
+                "tool_handlers": {},
+                "tool_defs": {},
             }
         return queue
 
@@ -185,6 +193,74 @@ def get_intercept(self, request_id: str) -> dict[str, Any] | None:
         with self._state_lock:
             return self.intercepts.get(request_id)
 
+    def register_tool_handler(
+        self,
+        rollout_id: str,
+        tool_name: str,
+        handler: ToolHandler,
+        *,
+        tool_definition: dict[str, Any] | None = None,
+    ) -> None:
+        """Attach ``handler`` to ``tool_name`` for one registered rollout.
+
+        ``POST /rollout/{rollout_id}/v1/tools/{tool_name}`` dispatches to the
+        handler, passing it the request's ``arguments`` object.
+
+        When ``tool_definition`` (an OpenAI tool schema) is given, it is stored
+        so intercepted chat-completion requests for the rollout gain the tool
+        unless they already declare one with the same name.
+
+        Raises ``KeyError`` if ``rollout_id`` is not registered.
+        """
+        with self._state_lock:
+            rollout = self.active_rollouts.get(rollout_id)
+            if rollout is None:
+                raise KeyError(f"rollout not found: {rollout_id}")
+            rollout["tool_handlers"][tool_name] = handler
+            if tool_definition is not None:
+                rollout["tool_defs"][tool_name] = tool_definition
+
+    def unregister_tool_handler(self, rollout_id: str, tool_name: str) -> None:
+        """Drop the handler and schema registered under ``tool_name``."""
+        with self._state_lock:
+            rollout = self.active_rollouts.get(rollout_id)
+            if rollout is None:
+                return  # unknown rollout: nothing to remove
+            # Absent names are ignored, so repeating the call is harmless.
+            for registry in ("tool_handlers", "tool_defs"):
+                rollout.get(registry, {}).pop(tool_name, None)
+
+    @staticmethod
+    def _tool_name(tool: dict[str, Any]) -> str | None:
+        """Return ``tool["function"]["name"]`` if it is a non-empty string."""
+        spec = tool.get("function") if isinstance(tool, dict) else None
+        candidate = spec.get("name") if isinstance(spec, dict) else None
+        if isinstance(candidate, str) and candidate:
+            return candidate
+        # Malformed or unnamed entries have no usable name.
+        return None
+
+    def _merge_rollout_tools(
+        self,
+        tools: Any,
+        tool_defs: dict[str, dict[str, Any]],
+    ) -> list[dict[str, Any]] | None:
+        """Combine a request's ``tools`` with the rollout's registered schemas.
+
+        Non-dict entries of ``tools`` are dropped, and a registered schema is
+        skipped when the request already names that tool.  Returns ``None``
+        when the merged list is empty.
+        """
+        requested = tools if isinstance(tools, list) else []
+        merged = [entry for entry in requested if isinstance(entry, dict)]
+
+        # Tools the request already declares are not overridden.
+        taken = {self._tool_name(entry) for entry in merged} - {None}
+        merged.extend(
+            schema for name, schema in tool_defs.items() if name not in taken
+        )
+        return merged or None
+
     def _authorized(self, request: web.Request) -> bool:
         auth = request.headers.get("Authorization", "")
         api_key = request.headers.get("x-api-key", "")
@@ -195,6 +271,59 @@ def _authorized(self, request: web.Request) -> bool:
     async def _handle_health(self, request: web.Request) -> web.Response:
         return web.json_response({"status": "ok"})
 
+    async def _handle_tool_call(self, request: web.Request) -> web.Response:
+        if not self._authorized(request):
+            return web.json_response({"error": "Unauthorized"}, status=401)
+
+        # Look the handler up under the state lock; it is awaited after release.
+        rollout_id = request.match_info["rollout_id"]
+        tool_name = request.match_info["tool_name"]
+        with self._state_lock:
+            rollout = self.active_rollouts.get(rollout_id)
+            if rollout is None:
+                return web.json_response({"error": "rollout not found"}, status=404)
+            handler = rollout.get("tool_handlers", {}).get(tool_name)
+        if handler is None:
+            return web.json_response({"error": "tool not found"}, status=404)
+
+        # A body that cannot be parsed as JSON is reported as a client error.
+        try:
+            payload = await request.json()
+        except Exception as exc:
+            return web.json_response({"error": f"invalid JSON: {exc}"}, status=400)
+
+        # Accept either ``{"arguments": {...}}`` or the bare arguments object;
+        # a JSON ``null`` in either position means "no arguments".
+        wrapped = isinstance(payload, dict) and "arguments" in payload
+        raw_args: Any = payload.get("arguments") if wrapped else payload
+        if raw_args is None:
+            raw_args = {}
+        if not isinstance(raw_args, dict):
+            return web.json_response(
+                {"error": "tool arguments must be a JSON object"},
+                status=400,
+            )
+
+        # Handler failures are logged with their traceback; the caller only
+        # sees a generic 500.
+        try:
+            result = await handler(raw_args)
+        except Exception:
+            _log.exception(
+                "tool handler failed (rollout=%s, tool=%s)",
+                rollout_id,
+                tool_name,
+            )
+            return web.json_response({"error": "tool execution failed"}, status=500)
+
+        # Only JSON objects are valid handler results.
+        if isinstance(result, dict):
+            return web.json_response(result)
+        return web.json_response(
+            {"error": "tool handler must return a JSON object"},
+            status=500,
+        )
+
     async def _handle_chat_completions(
         self, request: web.Request
     ) -> web.StreamResponse | web.Response:
@@ -212,6 +341,11 @@ async def _handle_chat_completions(
         except Exception as exc:
             return web.json_response({"error": f"invalid JSON: {exc}"}, status=400)
 
+        tool_defs: dict[str, dict[str, Any]] = dict(context.get("tool_defs", {}))
+        merged_tools = self._merge_rollout_tools(body.get("tools"), tool_defs)
+        if merged_tools is not None:
+            body["tools"] = merged_tools
+
         is_streaming = bool(body.get("stream"))
         request_id = f"req_{uuid.uuid4().hex[:8]}"
         chunk_queue: asyncio.Queue | None = asyncio.Queue() if is_streaming else None
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index af9629970..11110e34c 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -529,6 +529,43 @@ def test_opencode_interception_gate_uses_server_secret_not_user_key(self):
         assert envs["OPENAI_API_KEY"] == "gate-secret"
         session.close()
 
+    def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+        from openenv.core.harness.agents.interception_server import InterceptionServer
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        backend = FakeSandboxBackend()
+        server = InterceptionServer(port=0, secret="gate-secret")
+        driver = CLIAgentDriver(
+            spec=PI_SPEC,
+            sandbox_backend=backend,
+            mode="interception_gate",
+            interception_server=server,
+            interception_base_url="http://127.0.0.1:8765",
+        )
+
+        session = driver.create_session(task=FakeTask(), config=FakeConfig())
+        sbx = backend.created[0]
+
+        # Command should force the custom provider backed by models.json.
+        cmd, _envs = sbx.bg_commands[-1]
+        assert "--provider openenv" in cmd
+
+        home_models = "/home/user/.pi/agent/models.json"
+        root_models = "/root/.pi/agent/models.json"
+        assert home_models in sbx.written
+        assert root_models in sbx.written
+
+        cfg = json.loads(sbx.written[home_models])
+        provider = cfg["providers"]["openenv"]
+        assert provider["api"] == "openai-completions"
+        assert provider["apiKey"] == "gate-secret"
+        assert provider["models"][0]["id"] == "test-model"
+        assert "/rollout/" in provider["baseUrl"]
+        assert provider["baseUrl"].endswith("/v1")
+
+        session.close()
+
     def test_create_session_runs_task_setup_shell(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentDriver
 
diff --git a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py
index 45922849c..77d844aff 100644
--- a/tests/core/test_interception_server.py
+++ b/tests/core/test_interception_server.py
@@ -17,6 +17,22 @@
 )
 
 
+_ANSWER_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "answer",
+        "description": "Submit final answer for grading",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "answer": {"type": "string"},
+            },
+            "required": ["answer"],
+        },
+    },
+}
+
+
 @pytest.mark.asyncio
 async def test_interception_server_rejects_unauthorized_requests() -> None:
     server = InterceptionServer(port=0, secret="secret-token")
@@ -122,3 +138,111 @@ async def test_interception_server_unregister_rollout_cancels_pending_request()
             assert payload["error"] == "rollout cancelled"
     finally:
         await server.stop()
+
+
+@pytest.mark.asyncio
+async def test_interception_server_tool_endpoint_executes_registered_handler() -> None:
+    server = InterceptionServer(port=0, secret="secret-token")
+    await server.start()
+    server.register_rollout("r1")
+    seen: dict[str, object] = {}
+
+    async def _handler(arguments: dict) -> dict:
+        seen["arguments"] = arguments
+        return {"content": [{"type": "text", "text": "✅"}]}
+
+    server.register_tool_handler("r1", "answer", _handler)
+    try:
+        async with aiohttp.ClientSession() as client:
+            resp = await client.post(
+                f"http://127.0.0.1:{server.port}/rollout/r1/v1/tools/answer",
+                headers={"Authorization": "Bearer secret-token"},
+                json={"arguments": {"answer": "42"}},
+            )
+            assert resp.status == 200
+            payload = await resp.json()
+            assert payload["content"][0]["text"] == "✅"
+            assert seen["arguments"] == {"answer": "42"}
+    finally:
+        server.unregister_rollout("r1")
+        await server.stop()
+
+
+@pytest.mark.asyncio
+async def test_interception_server_tool_endpoint_returns_404_for_unknown_tool() -> None:
+    server = InterceptionServer(port=0, secret="secret-token")
+    await server.start()
+    server.register_rollout("r1")
+    try:
+        async with aiohttp.ClientSession() as client:
+            resp = await client.post(
+                f"http://127.0.0.1:{server.port}/rollout/r1/v1/tools/missing",
+                headers={"Authorization": "Bearer secret-token"},
+                json={"arguments": {}},
+            )
+            assert resp.status == 404
+    finally:
+        server.unregister_rollout("r1")
+        await server.stop()
+
+
+@pytest.mark.asyncio
+async def test_interception_server_injects_registered_tool_defs_into_intercept() -> (
+    None
+):
+    server = InterceptionServer(port=0, secret="secret-token")
+    await server.start()
+    queue = server.register_rollout("r1")
+
+    async def _handler(arguments: dict) -> dict:
+        return {"content": [{"type": "text", "text": str(arguments)}]}
+
+    server.register_tool_handler(
+        "r1",
+        "answer",
+        _handler,
+        tool_definition=_ANSWER_TOOL,
+    )
+
+    try:
+        async with aiohttp.ClientSession() as client:
+            request_task = asyncio.create_task(
+                client.post(
+                    f"http://127.0.0.1:{server.port}/rollout/r1/v1/chat/completions",
+                    headers={"Authorization": "Bearer secret-token"},
+                    json={
+                        "messages": [{"role": "user", "content": "grade this"}],
+                        "stream": False,
+                    },
+                )
+            )
+            request_id = await asyncio.wait_for(queue.get(), timeout=1.0)
+            intercept = server.get_intercept(request_id)
+            assert intercept is not None
+            tool_names = {
+                tool["function"]["name"]
+                for tool in intercept.get("tools", [])
+                if isinstance(tool, dict) and isinstance(tool.get("function"), dict)
+            }
+            assert "answer" in tool_names
+
+            await deliver_response(
+                intercept,
+                {
+                    "id": "resp-1",
+                    "model": "test-model",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "message": {"role": "assistant", "content": "done"},
+                            "finish_reason": "stop",
+                        }
+                    ],
+                },
+            )
+
+            resp = await request_task
+            assert resp.status == 200
+    finally:
+        server.unregister_rollout("r1")
+        await server.stop()

From 1b1d9fbff307fb11f450906778063f3930be7604 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 20:53:56 +0530
Subject: [PATCH 20/35] fix: support configurable Pi workdir for command and
 MCP config

---
 src/openenv/core/harness/agents/cli_driver.py | 10 +++---
 src/openenv/core/harness/agents/pi.py         |  6 +++-
 tests/core/test_cli_agent_driver.py           | 35 +++++++++++++++++++
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index b499d3c5f..3a6c50f51 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -428,14 +428,14 @@ def _write_mcp_config(self, sandbox: SandboxHandle, config: Any) -> None:
             self.spec.mcp_config.method == "config_file"
             and self.spec.mcp_config.path_template
         ):
-            workdir = (
-                config.sandbox_home + "/workdir"
-                if hasattr(config, "sandbox_home")
-                else "/home/user/workdir"
-            )
             home = (
                 config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
             )
+            workdir = (
+                config.workdir
+                if hasattr(config, "workdir") and getattr(config, "workdir")
+                else f"{home}/workdir"
+            )
             mcp_path = self.spec.mcp_config.path_template.format(
                 workdir=workdir, home=home
             )
diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
index dcc552842..6d553eee4 100644
--- a/src/openenv/core/harness/agents/pi.py
+++ b/src/openenv/core/harness/agents/pi.py
@@ -49,7 +49,11 @@ def _build_command(
     home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
     instruction_file = f"{home}/task/instruction.txt"
     log_file = f"{home}/logs/agent/pi.txt"
-    workdir = f"{home}/workdir"
+    workdir = (
+        config.workdir
+        if hasattr(config, "workdir") and getattr(config, "workdir")
+        else f"{home}/workdir"
+    )
 
     provider = ""
     if hasattr(config, "provider") and config.provider:
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index 11110e34c..d27ca00cd 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -156,6 +156,7 @@ class FakeConfig:
     model: str = "test-model"
     agent_timeout_s: float = 300.0
     sandbox_home: str = "/home/user"
+    workdir: str | None = None
     extra_env: dict[str, str] = field(default_factory=dict)
 
 
@@ -438,6 +439,20 @@ def test_create_session_full_lifecycle(self):
         session.close()
         assert sbx._killed
 
+    def test_create_session_honors_configured_workdir_for_mcp_file(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec()
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        config = FakeConfig(workdir="/testbed")
+        session = driver.create_session(task=FakeTask(), config=config)
+
+        sbx = backend.created[0]
+        assert "/testbed/mcp.json" in sbx.written
+        session.close()
+
     def test_create_session_skips_install_when_prebaked(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentDriver
 
@@ -1038,6 +1053,26 @@ class PiConfig:
         assert "-p @'/home/user with space/task/instruction.txt'" in cmd
         assert "tee '/home/user with space/logs/agent/pi.txt'" in cmd
 
+    def test_build_command_uses_config_workdir_when_present(self):
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        @dataclass
+        class PiConfig:
+            sandbox_home: str = "/home/user"
+            workdir: str = "/testbed"
+            provider: str = "openai"
+            model: str = "model/name"
+            thinking: str = "off"
+
+        assert PI_SPEC.build_command is not None
+        cmd = PI_SPEC.build_command(
+            PI_SPEC,
+            PiConfig(),
+            FakeTask(instruction="Write hello.py"),
+            None,
+        )
+        assert "cd /testbed" in cmd
+
 
 # Env var resolution
 

From 2f52e4858f3cc8647b89f2d5ae9b78855fdfa1e1 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 20:55:24 +0530
Subject: [PATCH 21/35] fix: Docker host gateway mapping and per-create image
 override

---
 src/openenv/core/harness/sandbox/base.py      |  7 ++++-
 .../core/harness/sandbox/docker_backend.py    | 17 ++++++++--
 .../core/harness/sandbox/e2b_backend.py       |  2 ++
 .../core/harness/sandbox/hf_backend.py        |  3 +-
 tests/core/test_docker_sandbox_backend.py     | 31 +++++++++++++++++++
 5 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/src/openenv/core/harness/sandbox/base.py b/src/openenv/core/harness/sandbox/base.py
index d84e267e1..22f096310 100644
--- a/src/openenv/core/harness/sandbox/base.py
+++ b/src/openenv/core/harness/sandbox/base.py
@@ -96,5 +96,10 @@ def create(
         timeout_s: int = 900,
         envs: dict[str, str] | None = None,
         metadata: dict[str, str] | None = None,
+        image: str | None = None,
     ) -> SandboxHandle:
-        """Create and return a new, ready-to-use sandbox."""
+        """Create and return a new, ready-to-use sandbox.
+
+        ``image`` is backend-specific and may be ignored by providers that do
+        not support per-sandbox image selection.
+        """
diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py
index 28447ce2e..a64070a46 100644
--- a/src/openenv/core/harness/sandbox/docker_backend.py
+++ b/src/openenv/core/harness/sandbox/docker_backend.py
@@ -299,9 +299,16 @@ def __init__(
         user: str | None = None,
     ) -> None:
         self._image = image
-        self._docker_args = docker_args or []
+        self._docker_args = list(docker_args or [])
         self._user = user
 
+        # Linux Docker Engine does not auto-resolve host.docker.internal
+        # unless we explicitly map it.
+        if "host.docker.internal:host-gateway" not in self._docker_args:
+            self._docker_args.extend(
+                ["--add-host", "host.docker.internal:host-gateway"]
+            )
+
         try:
             subprocess.run(
                 ["docker", "version"],
@@ -324,6 +331,7 @@ def create(
         timeout_s: int = 900,
         envs: dict[str, str] | None = None,
         metadata: dict[str, str] | None = None,
+        image: str | None = None,
     ) -> DockerSandboxHandle:
         cmd = [
             "docker",
@@ -338,7 +346,8 @@ def create(
         for k, v in (envs or {}).items():
             cmd.extend(["-e", f"{k}={v}"])
         cmd.extend(self._docker_args)
-        cmd.extend([self._image, "sleep", str(timeout_s)])
+        effective_image = image or self._image
+        cmd.extend([effective_image, "sleep", str(timeout_s)])
 
         result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
         if result.returncode != 0:
@@ -347,6 +356,8 @@ def create(
             )
         container_id = result.stdout.strip()
         _log.info(
-            "Docker sandbox created: %s (image=%s)", container_id[:12], self._image
+            "Docker sandbox created: %s (image=%s)",
+            container_id[:12],
+            effective_image,
         )
         return DockerSandboxHandle(container_id, user=self._user)
diff --git a/src/openenv/core/harness/sandbox/e2b_backend.py b/src/openenv/core/harness/sandbox/e2b_backend.py
index 29c9d952d..c0cbf75ba 100644
--- a/src/openenv/core/harness/sandbox/e2b_backend.py
+++ b/src/openenv/core/harness/sandbox/e2b_backend.py
@@ -184,7 +184,9 @@ def create(
         timeout_s: int = 900,
         envs: dict[str, str] | None = None,
         metadata: dict[str, str] | None = None,
+        image: str | None = None,
     ) -> SandboxHandle:
+        del image
         sbx = Sandbox.create(
             template=self._template,
             timeout=timeout_s,
diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py
index 43ec5ad95..3857ea7e6 100644
--- a/src/openenv/core/harness/sandbox/hf_backend.py
+++ b/src/openenv/core/harness/sandbox/hf_backend.py
@@ -228,9 +228,10 @@ def create(
         timeout_s: int = 900,
         envs: dict[str, str] | None = None,
         metadata: dict[str, str] | None = None,
+        image: str | None = None,
     ) -> SandboxHandle:
         # `hf-sandbox` does not support metadata at create-time yet.
-        del metadata
+        del metadata, image
 
         timeout = self._timeout or _format_timeout(timeout_s)
         last_error: Exception | None = None
diff --git a/tests/core/test_docker_sandbox_backend.py b/tests/core/test_docker_sandbox_backend.py
index b2eebddd2..c309e63b3 100644
--- a/tests/core/test_docker_sandbox_backend.py
+++ b/tests/core/test_docker_sandbox_backend.py
@@ -69,6 +69,37 @@ def test_create_sandbox_backend_unknown_raises(self):
         with pytest.raises(ValueError, match="Unknown sandbox backend"):
             create_sandbox_backend("bogus")  # type: ignore[arg-type]
 
+    def test_create_adds_host_gateway_and_supports_image_override(self, monkeypatch):
+        import openenv.core.harness.sandbox.docker_backend as docker_backend
+
+        calls: list[list[str]] = []
+
+        def _fake_run(cmd, *args, **kwargs):
+            calls.append(list(cmd))
+            if cmd[:2] == ["docker", "version"]:
+                return subprocess.CompletedProcess(cmd, 0, "", "")
+            if cmd[:2] == ["docker", "run"]:
+                return subprocess.CompletedProcess(
+                    cmd,
+                    0,
+                    "1234567890abcdef\n",
+                    "",
+                )
+            return subprocess.CompletedProcess(cmd, 0, "", "")
+
+        monkeypatch.setattr(docker_backend.subprocess, "run", _fake_run)
+
+        backend = docker_backend.DockerSandboxBackend(image="base:latest")
+        handle = backend.create(image="override:latest")
+        assert handle.sandbox_id == "1234567890ab"
+
+        run_cmds = [cmd for cmd in calls if cmd[:2] == ["docker", "run"]]
+        assert len(run_cmds) == 1
+        run_cmd = run_cmds[0]
+        assert "--add-host" in run_cmd
+        assert "host.docker.internal:host-gateway" in run_cmd
+        assert "override:latest" in run_cmd
+
     @pytest.mark.skipif(_DOCKER_AVAILABLE, reason="Only test error when Docker missing")
     def test_backend_raises_without_docker(self):
         from openenv.core.harness.sandbox.docker_backend import DockerSandboxBackend

From f3fede2354f072dc438677eb6d66da5edba4f8ee Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Sat, 16 May 2026 20:57:00 +0530
Subject: [PATCH 22/35] feat: configurable extension directory support for CLI
 agents

---
 src/openenv/core/harness/agents/base.py       |  8 ++++++
 src/openenv/core/harness/agents/cli_driver.py | 26 +++++++++++++++++++
 src/openenv/core/harness/agents/pi.py         |  1 +
 tests/core/test_cli_agent_driver.py           | 21 +++++++++++++++
 4 files changed, 56 insertions(+)

diff --git a/src/openenv/core/harness/agents/base.py b/src/openenv/core/harness/agents/base.py
index ded9ba3b8..4ec1c297a 100644
--- a/src/openenv/core/harness/agents/base.py
+++ b/src/openenv/core/harness/agents/base.py
@@ -206,6 +206,14 @@ class CLIAgentSpec:
     resolved from the rollout config at runtime.
     """
 
+    extension_dir_template: str | None = None
+    """Optional extension install directory template.
+
+    Receives ``{home}`` substitution at runtime (e.g.
+    ``"{home}/.pi/agent/extensions"``). Drivers may use this to create
+    extension directories in the correct sandbox user home.
+    """
+
     build_command: Callable[..., str] | None = None
     """``(spec, config, task, mcp_config_path) -> str``
 
diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 3a6c50f51..2ff07e4d2 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -352,6 +352,7 @@ def _bootstrap_sandbox(
         self._wait_for_sandbox_ready(sandbox)
         if not self._agent_already_installed(sandbox):
             self._install_agent(sandbox)
+        self._ensure_extension_dir(sandbox, config)
         self._upload_files(sandbox, task, config)
         self._write_mcp_config(sandbox, config)
         setup_shell = task.setup_shell if hasattr(task, "setup_shell") else None
@@ -406,6 +407,31 @@ def _install_agent(self, sandbox: SandboxHandle) -> None:
                 label=f"{self.spec.name} install",
             )
 
+    def _resolve_sandbox_home(self, sandbox: SandboxHandle, config: Any) -> str:
+        """Return the configured home, else the sandbox's ``$HOME``, else a default."""
+        home = getattr(config, "sandbox_home", None)
+        if isinstance(home, str) and home.strip():
+            return home
+        try:
+            probe = sandbox.exec('printf %s "$HOME"', timeout=5)
+            detected = (probe.stdout or "").strip()
+            succeeded = probe.exit_code == 0
+        except Exception:  # best-effort probe; any failure selects the default
+            return "/home/user"
+        return detected if succeeded and detected else "/home/user"
+
+    def _ensure_extension_dir(self, sandbox: SandboxHandle, config: Any) -> None:
+        """Create the spec's extension directory in the sandbox, if it declares one."""
+        if not self.spec.extension_dir_template:
+            return
+        path = self.spec.extension_dir_template.format(
+            home=self._resolve_sandbox_home(sandbox, config)
+        )
+        res = sandbox.exec(f"mkdir -p {shlex.quote(path)}", timeout=10)
+        if res.exit_code == 0:
+            return
+        raise RuntimeError(f"failed to create extension dir {path!r}: {res.stderr}")
+
     def _upload_files(self, sandbox: SandboxHandle, task: Any, config: Any) -> None:
         if not self.spec.files:
             return
diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
index 6d553eee4..03946c552 100644
--- a/src/openenv/core/harness/agents/pi.py
+++ b/src/openenv/core/harness/agents/pi.py
@@ -144,6 +144,7 @@ def _parse_events(line: str) -> AgentEvent | None:
         "PI_SKIP_VERSION_CHECK": "1",
         "PI_TELEMETRY": "0",
     },
+    extension_dir_template="{home}/.pi/agent/extensions",
     build_command=_build_command,
     build_mcp_config=_build_mcp_config,
     parse_events=_parse_events,
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index d27ca00cd..977bf6703 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -213,6 +213,7 @@ def test_cli_agent_spec_minimal(self):
         assert spec.files is None
         assert spec.artifacts is None
         assert spec.env is None
+        assert spec.extension_dir_template is None
         assert spec.build_command is None
 
     def test_cli_agent_spec_full(self):
@@ -453,6 +454,21 @@ def test_create_session_honors_configured_workdir_for_mcp_file(self):
         assert "/testbed/mcp.json" in sbx.written
         session.close()
 
+    def test_create_session_creates_extension_dir_when_spec_declares_one(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+
+        spec = _make_test_spec(extension_dir_template="{home}/.agent/extensions")
+        backend = FakeSandboxBackend()
+        driver = CLIAgentDriver(spec=spec, sandbox_backend=backend, mode="black_box")
+
+        session = driver.create_session(task=FakeTask(), config=FakeConfig())
+        sbx = backend.created[0]
+        assert any(
+            cmd.startswith("mkdir -p /home/user/.agent/extensions")
+            for cmd in sbx.executed
+        )
+        session.close()
+
     def test_create_session_skips_install_when_prebaked(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentDriver
 
@@ -1073,6 +1089,11 @@ class PiConfig:
         )
         assert "cd /testbed" in cmd
 
+    def test_spec_declares_extension_dir_template(self):
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        assert PI_SPEC.extension_dir_template == "{home}/.pi/agent/extensions"
+
 
 # Env var resolution
 

From 448f6905f258aa1ca0032812b4cb454bd31380ea Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 18 May 2026 13:18:09 +0530
Subject: [PATCH 23/35] fix: thread-safe queue handling

---
 .../harness/agents/interception_server.py     | 96 +++++++++++++++++--
 .../core/harness/sandbox/hf_backend.py        |  5 +-
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index 70f8c4247..7bc67fedc 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -164,6 +164,12 @@ def register_rollout(
                 "tool_handlers": {},
                 "tool_defs": {},
             }
+            active = len(self.active_rollouts)
+        _log.info(
+            "interception_rollout_registered rollout_id=%s active_rollouts=%d",
+            rollout_id,
+            active,
+        )
         return queue
 
     def unregister_rollout(self, rollout_id: str) -> None:
@@ -176,7 +182,9 @@ def unregister_rollout(self, rollout_id: str) -> None:
             matching_intercepts = [self.intercepts[i] for i in matching_ids]
             for request_id in matching_ids:
                 del self.intercepts[request_id]
-            self.active_rollouts.pop(rollout_id, None)
+            removed = self.active_rollouts.pop(rollout_id, None) is not None
+            active = len(self.active_rollouts)
+            pending = len(self.intercepts)
 
         for intercept in matching_intercepts:
             fut: asyncio.Future | None = intercept.get("response_future")
@@ -189,10 +197,27 @@ def unregister_rollout(self, rollout_id: str) -> None:
                 except asyncio.QueueFull:
                     pass
 
+        _log.info(
+            "interception_rollout_unregistered rollout_id=%s removed=%s "
+            "active_rollouts=%d pending_intercepts=%d",
+            rollout_id,
+            removed,
+            active,
+            pending,
+        )
+
     def get_intercept(self, request_id: str) -> dict[str, Any] | None:
         with self._state_lock:
             return self.intercepts.get(request_id)
 
+    def stats(self) -> dict[str, int]:
+        """Snapshot the rollout and pending-intercept counts under the state lock."""
+        with self._state_lock:
+            sizes = (len(self.active_rollouts), len(self.intercepts))
+        # Key order matches the tuple above.
+        names = ("active_rollouts", "pending_intercepts")
+        return dict(zip(names, sizes))
+
     def register_tool_handler(
         self,
         rollout_id: str,
@@ -269,7 +294,7 @@ def _authorized(self, request: web.Request) -> bool:
         ) or hmac.compare_digest(api_key, self.secret)
 
     async def _handle_health(self, request: web.Request) -> web.Response:
-        return web.json_response({"status": "ok"})
+        return web.json_response({"status": "ok", **self.stats()})
 
     async def _handle_tool_call(self, request: web.Request) -> web.Response:
         if not self._authorized(request):
@@ -433,6 +458,53 @@ async def _stream_response(
         return resp
 
 
+def _resolve_future_threadsafe(
+    future: asyncio.Future, value: Any
+) -> None:
+    """Set a future's result from any thread.
+
+    ``asyncio.Future`` is not thread-safe: calling ``set_result`` from a
+    thread that is not running the future's event loop can silently fail
+    to wake the coroutine awaiting it.  This helper detects cross-loop
+    calls and uses ``call_soon_threadsafe`` to schedule the resolution on
+    the correct loop.
+    """
+    if future.done():
+        return
+    loop = future.get_loop()
+    try:
+        running = asyncio.get_running_loop()
+    except RuntimeError:
+        running = None
+    if running is loop:
+        future.set_result(value)
+    else:
+        loop.call_soon_threadsafe(future.set_result, value)
+
+
+def _put_queue_threadsafe(
+    q: asyncio.Queue, item: Any
+) -> None:
+    """Put an item on an asyncio.Queue from any thread."""
+    loop = getattr(q, "_loop", None)
+    if loop is None:
+        # Fallback: try put_nowait which is simpler.
+        try:
+            q.put_nowait(item)
+            return
+        except asyncio.QueueFull:
+            pass
+        return
+    try:
+        running = asyncio.get_running_loop()
+    except RuntimeError:
+        running = None
+    if running is loop:
+        q.put_nowait(item)
+    else:
+        loop.call_soon_threadsafe(q.put_nowait, item)
+
+
 async def deliver_response(
     intercept: dict[str, Any], response_dict: dict[str, Any]
 ) -> None:
@@ -441,14 +513,20 @@ async def deliver_response(
     For non-streaming requests, resolves the future directly.
     For streaming requests, synthesizes SSE chunks from the complete
     response and signals EOF.
+
+    Thread-safe: can be called from any thread, not just the event loop
+    that owns the future/queue.  This is required because the rollout
+    worker may run ``deliver_response`` from its own ``asyncio.run()``
+    in a daemon thread while the ``InterceptionServer``'s aiohttp
+    handler awaits the future on a different loop.
     """
     is_streaming = intercept.get("stream", False)
     chunk_queue: asyncio.Queue | None = intercept.get("chunk_queue")
     future: asyncio.Future | None = intercept.get("response_future")
 
     if not is_streaming:
-        if future and not future.done():
-            future.set_result(response_dict)
+        if future:
+            _resolve_future_threadsafe(future, response_dict)
         return
 
     if chunk_queue is None:
@@ -474,7 +552,7 @@ async def deliver_response(
                 }
             ],
         }
-        await chunk_queue.put(content_chunk)
+        _put_queue_threadsafe(chunk_queue, content_chunk)
         finish_chunk = {
             "id": response_dict.get("id", ""),
             "object": "chat.completion.chunk",
@@ -488,11 +566,11 @@ async def deliver_response(
                 }
             ],
         }
-        await chunk_queue.put(finish_chunk)
+        _put_queue_threadsafe(chunk_queue, finish_chunk)
 
-    await chunk_queue.put(None)
-    if future and not future.done():
-        future.set_result(response_dict)
+    _put_queue_threadsafe(chunk_queue, None)
+    if future:
+        _resolve_future_threadsafe(future, response_dict)
 
 
 __all__ = [
diff --git a/src/openenv/core/harness/sandbox/hf_backend.py b/src/openenv/core/harness/sandbox/hf_backend.py
index 3857ea7e6..3b7b060b5 100644
--- a/src/openenv/core/harness/sandbox/hf_backend.py
+++ b/src/openenv/core/harness/sandbox/hf_backend.py
@@ -231,15 +231,16 @@ def create(
         image: str | None = None,
     ) -> SandboxHandle:
         # `hf-sandbox` does not support metadata at create-time yet.
-        del metadata, image
+        del metadata
 
         timeout = self._timeout or _format_timeout(timeout_s)
+        effective_image = image or self._image
         last_error: Exception | None = None
 
         for attempt in range(self._create_retries):
             try:
                 sbx = Sandbox.create(
-                    image=self._image,
+                    image=effective_image,
                     flavor=self._flavor,
                     timeout=timeout,
                     forward_hf_token=self._forward_hf_token,

From 37e549d5dc52fa318b88aae16f2edc0a2c68e805 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 18 May 2026 14:18:52 +0530
Subject: [PATCH 24/35] fix: interception gate support in
 CodingAgentSessionFactory

---
 envs/coding_agent_env/harness.py | 33 +++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py
index 2355260f5..748dcb091 100644
--- a/envs/coding_agent_env/harness.py
+++ b/envs/coding_agent_env/harness.py
@@ -8,6 +8,8 @@
 
 from __future__ import annotations
 
+import asyncio
+import uuid
 from typing import Any, Literal
 
 from openenv.core.harness import ResourceSessionFactory
@@ -21,11 +23,7 @@
 from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
 
 from .config import CodingAgentConfig
-from .opencode_runtime import (
-    agent_log_path,
-    build_env_vars,
-    build_run_cmd,
-)
+from .opencode_runtime import agent_log_path, build_env_vars, build_run_cmd
 from .task import CodingAgentTask
 
 
@@ -124,12 +122,37 @@ def create(
             _log.error("factory.create: bootstrap failed: %r", exc)
             sandbox.kill()
             raise
+
+        # Wire up interception_gate if the driver is configured for it
+        base_url_override: str | None = None
+        interception_rollout_id: str | None = None
+        interception_queue: asyncio.Queue | None = None
+
+        if self._driver.mode == "interception_gate":
+            assert self._driver._interception_server is not None
+            assert self._driver._interception_base_url is not None
+            rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}"
+            interception_rollout_id = rollout_id
+            interception_queue = self._driver._interception_server.register_rollout(
+                rollout_id
+            )
+            base_url_override = (
+                f"{self._driver._interception_base_url.rstrip('/')}"
+                f"/rollout/{rollout_id}/v1"
+            )
+
         session = CodingAgentSession(
             sandbox=sandbox,
             config=self._config,
             task=oc_task,
             verifier=self._verifier,
+            base_url_override=base_url_override,
         )
+        # Pass interception fields to the parent CLIAgentSession
+        session._interception_server = self._driver._interception_server
+        session._interception_rollout_id = interception_rollout_id
+        session._interception_queue = interception_queue
+
         session.start_agent()
         return session
 

From 8aa9d18ce39a1e3d7a97d2ddde5c0572d60b3e8c Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 18 May 2026 14:28:35 +0530
Subject: [PATCH 25/35] fix: improve error handling and config propagation
 across agent pipeline

- Wire disable_thinking and max_tokens_cap through CodingAgentConfig
- Raise RuntimeError on mkdir/cat failures in docker backend
- Propagate QueueFull exceptions instead of silently swallowing
- Change CommandResult.exit_code to int | None for bootstrap clarity
---
 envs/coding_agent_env/config.py               |  4 ++++
 envs/coding_agent_env/models.py               |  9 ++++++--
 envs/coding_agent_env/opencode_runtime.py     | 15 +++++++++++--
 .../server/coding_environment.py              | 21 +++++++++++++++----
 .../harness/agents/interception_server.py     | 17 +++++----------
 .../core/harness/sandbox/docker_backend.py    | 14 +++++++++++--
 6 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/envs/coding_agent_env/config.py b/envs/coding_agent_env/config.py
index b3243253e..d70610542 100644
--- a/envs/coding_agent_env/config.py
+++ b/envs/coding_agent_env/config.py
@@ -45,6 +45,10 @@ class CodingAgentConfig(BaseModel):
     extra_env: dict[str, str] = Field(default_factory=dict)
     extra_setup_shell: str | None = None
 
+    # --- Model behavior --------------------------------------------------------
+    disable_thinking: bool = False
+    max_tokens_cap: int | None = None
+
     # --- Sandbox paths --------------------------------------------------------
     # Root directory inside the sandbox where the primitive writes config,
     # task files, and logs. E2B's default user is ``user`` with home
diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py
index 2111d84d5..e338a4867 100644
--- a/envs/coding_agent_env/models.py
+++ b/envs/coding_agent_env/models.py
@@ -35,10 +35,15 @@ class RolloutTurn(BaseModel):
 
 
 class CommandResult(BaseModel):
-    """Outcome of one bash command in setup/verify."""
+    """Outcome of one bash command in setup/verify.
+
+    When ``exit_code`` is ``None``, the command ran during sandbox bootstrap
+    and its individual exit code was not captured (bootstrap succeeds or fails
+    atomically).
+    """
 
     cmd: str
-    exit_code: int
+    exit_code: int | None = None
     stdout: str = ""
     stderr: str = ""
     duration_s: float = 0.0
diff --git a/envs/coding_agent_env/opencode_runtime.py b/envs/coding_agent_env/opencode_runtime.py
index 49855528b..31285556e 100644
--- a/envs/coding_agent_env/opencode_runtime.py
+++ b/envs/coding_agent_env/opencode_runtime.py
@@ -52,6 +52,12 @@ def build_opencode_json(config: CodingAgentConfig) -> str:
     """
 
     provider_name = "intercepted"
+    model_key = config.model.split("/", 1)[-1]
+
+    model_block: dict[str, Any] = {"name": "Intercepted Model"}
+    if config.max_tokens_cap is not None:
+        model_block["limit"] = {"output": config.max_tokens_cap}
+
     provider_block: dict[str, Any] = {
         "npm": provider_npm_package(config.provider),
         "name": "Intercepted",
@@ -61,16 +67,21 @@ def build_opencode_json(config: CodingAgentConfig) -> str:
             "timeout": config.request_timeout_ms,
         },
         "models": {
-            config.model.split("/", 1)[-1]: {"name": "Intercepted Model"},
+            model_key: model_block,
         },
     }
 
     doc: dict[str, Any] = {
         "$schema": "https://opencode.ai/config.json",
-        "model": f"{provider_name}/{config.model.split('/', 1)[-1]}",
+        "model": f"{provider_name}/{model_key}",
         "provider": {provider_name: provider_block},
     }
 
+    # Disable thinking/reasoning tokens when requested. AI SDK respects
+    # the top-level "reasoning" key to control reasoning token generation.
+    if config.disable_thinking:
+        doc["reasoning"] = "none"
+
     tools = _build_tools_block(config)
     if tools:
         doc["tools"] = tools
diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index b1e7f47ef..9000ed4e0 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -22,6 +22,7 @@
 from __future__ import annotations
 
 import json
+import logging
 import os
 import time
 from typing import Any, Optional
@@ -50,6 +51,8 @@
 HOME = "/home/user"
 WORKDIR = f"{HOME}/workdir"
 INSTRUCTION_PATH = f"{HOME}/task/instruction.md"
+_log = logging.getLogger(__name__)
+
 REWARD_FILE = f"{HOME}/logs/verifier/reward.txt"
 PROXY_LOG = f"{HOME}/logs/agent/proxy.log"
 AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"
@@ -83,21 +86,22 @@ def __init__(self) -> None:
         # Lazy imports so module import stays cheap and so tests can patch.
         try:
             from ..models import (
-                CommandResult,
                 CodingAgentState,
+                CommandResult,
                 RolloutResult,
                 RolloutTurn,
             )
         except ImportError:  # pragma: no cover
             from models import (  # type: ignore
-                CommandResult,
                 CodingAgentState,
+                CommandResult,
                 RolloutResult,
                 RolloutTurn,
             )
 
         from openenv.core.harness.agents import get_agent_spec
         from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
+
         from coding_agent_env.config import CodingAgentConfig
         from coding_agent_env.harness import CodingAgentSessionFactory
         from coding_agent_env.task import CodingAgentTask
@@ -374,8 +378,8 @@ def _emit(msg: str) -> None:
                 result.setup_results.append(
                     self._CommandResult(
                         cmd=cmd,
-                        exit_code=0,
-                        stdout="executed during bootstrap",
+                        exit_code=None,
+                        stdout="executed during bootstrap (individual exit code not captured)",
                         stderr="",
                         duration_s=0.0,
                     )
@@ -466,12 +470,21 @@ def _build_agent_config(
         max_tokens_cap: int,
     ) -> Any:
         if agent == "opencode":
+            if top_logprobs:
+                _log.warning(
+                    "top_logprobs=%d is not supported for agent='opencode' "
+                    "and will have no effect. Use interception_gate mode for "
+                    "logprob capture.",
+                    top_logprobs,
+                )
             return self._CodingAgentConfig(
                 provider="openai_compatible",
                 base_url=base_url.rstrip("/"),
                 api_key=api_key,
                 model=model,
                 agent_timeout_s=agent_timeout_s,
+                disable_thinking=disable_thinking,
+                max_tokens_cap=max_tokens_cap if max_tokens_cap != 4096 else None,
             )
 
         provider = self._infer_pi_provider(base_url)
diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index 7bc67fedc..5e541700d 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -458,9 +458,7 @@ async def _stream_response(
         return resp
 
 
-def _resolve_future_threadsafe(
-    future: asyncio.Future, value: Any
-) -> None:
+def _resolve_future_threadsafe(future: asyncio.Future, value: Any) -> None:
     """Set a future's result from any thread.
 
     ``asyncio.Future`` is not thread-safe: calling ``set_result`` from a
@@ -482,18 +480,13 @@ def _resolve_future_threadsafe(
         loop.call_soon_threadsafe(future.set_result, value)
 
 
-def _put_queue_threadsafe(
-    q: asyncio.Queue, item: Any
-) -> None:
+def _put_queue_threadsafe(q: asyncio.Queue, item: Any) -> None:
     """Put an item on an asyncio.Queue from any thread."""
     loop = getattr(q, "_loop", None)
     if loop is None:
-        # Fallback: try put_nowait which is simpler.
-        try:
-            q.put_nowait(item)
-            return
-        except asyncio.QueueFull:
-            pass
+        # Fallback: put_nowait which is simpler. Let QueueFull propagate —
+        # silently dropping items would cause hard-to-debug streaming issues.
+        q.put_nowait(item)
         return
     try:
         running = asyncio.get_running_loop()
diff --git a/src/openenv/core/harness/sandbox/docker_backend.py b/src/openenv/core/harness/sandbox/docker_backend.py
index a64070a46..120fb9a11 100644
--- a/src/openenv/core/harness/sandbox/docker_backend.py
+++ b/src/openenv/core/harness/sandbox/docker_backend.py
@@ -162,12 +162,17 @@ def start_bg(
     def write_text(self, path: str, content: str) -> None:
         parent = str(PurePosixPath(path).parent)
         if parent not in ("", "/"):
-            subprocess.run(
+            mkdir_result = subprocess.run(
                 ["docker", "exec", self._container_id, "mkdir", "-p", parent],
                 capture_output=True,
                 timeout=10,
             )
-        subprocess.run(
+            if mkdir_result.returncode != 0:
+                raise RuntimeError(
+                    f"Failed to create directory {parent!r} in container "
+                    f"{self._container_id}: {mkdir_result.stderr.decode(errors='replace')}"
+                )
+        write_result = subprocess.run(
             [
                 "docker",
                 "exec",
@@ -181,6 +186,11 @@ def write_text(self, path: str, content: str) -> None:
             capture_output=True,
             timeout=30,
         )
+        if write_result.returncode != 0:
+            raise RuntimeError(
+                f"Failed to write file {path!r} in container "
+                f"{self._container_id}: {write_result.stderr.decode(errors='replace')}"
+            )
 
     def read_text(self, path: str) -> str:
         result = subprocess.run(

From 61e5524762182be06d43b290895d33f0e3e66b09 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 18 May 2026 14:35:18 +0530
Subject: [PATCH 26/35] fix: whitespace secret validation + conditional /root/
 write

---
 src/openenv/core/harness/agents/cli_driver.py          | 5 ++++-
 src/openenv/core/harness/agents/interception_server.py | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 2ff07e4d2..587362746 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -529,7 +529,10 @@ def _write_pi_models_config(
             },
             indent=2,
         )
-        for path in {f"{home}/.pi/agent/models.json", "/root/.pi/agent/models.json"}:
+        paths = {f"{home}/.pi/agent/models.json"}
+        if home == "/root":
+            paths.add("/root/.pi/agent/models.json")
+        for path in paths:
             sandbox.write_text(path, content)
 
     def _resolve_env_vars(
diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index 5e541700d..19b05bb95 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -85,6 +85,8 @@ def __init__(
         self.port = port
         self.host = host
         self.secret = secret or secrets.token_urlsafe(32)
+        if not self.secret.strip():
+            raise ValueError("InterceptionServer secret must not be blank.")
         self._app: web.Application | None = None
         self._runner: web.AppRunner | None = None
         self._site: web.TCPSite | None = None

From a2b43887c0b207837fb67c1d5d96d8e9c3bdf0fb Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 18 May 2026 15:17:50 +0530
Subject: [PATCH 27/35] fix: cross-loop safe request queue via stdlib
 queue.Queue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace asyncio.Queue with the stdlib queue.Queue for the request
notification path (server → training loop). This makes both
directions of the InterceptionServer cross-loop/cross-thread safe:

- Request notifications: queue.Queue (inherently thread-safe)
- Response delivery: asyncio.Future via _resolve_future_threadsafe
  (already cross-loop safe)

The consumer (next_request) uses asyncio.to_thread(q.get, timeout=...)
to await without blocking the event loop. This follows the same
pattern used by OpenClaw-RL at scale.

chunk_queue (internal SSE streaming) remains an asyncio.Queue: it is
consumed on the server's own event loop, and deliver_response hands
items to it via _put_queue_threadsafe when called from another thread.
---
 src/openenv/core/harness/agents/cli_driver.py   | 17 +++++++----------
 .../core/harness/agents/interception_server.py  | 17 +++++++++--------
 tests/core/test_cli_agent_driver.py             | 12 ++++++------
 tests/core/test_interception_server.py          |  6 +++---
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 587362746..a2724162e 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -20,6 +20,7 @@
 import asyncio
 import json
 import logging
+import queue as _queue_mod
 import shlex
 import time
 import uuid
@@ -36,11 +37,7 @@
 from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
 
 from .base import CLIAgentSpec
-from .interception_server import (
-    deliver_response,
-    InterceptionServer,
-    ToolHandler,
-)
+from .interception_server import deliver_response, InterceptionServer, ToolHandler
 
 
 _log = logging.getLogger(__name__)
@@ -76,7 +73,7 @@ def __init__(
         agent_bg_job: BgJob | None = None,
         interception_server: InterceptionServer | None = None,
         interception_rollout_id: str | None = None,
-        interception_queue: asyncio.Queue | None = None,
+        interception_queue: _queue_mod.Queue[str] | None = None,
     ) -> None:
         self.spec = spec
         self.sandbox = sandbox
@@ -204,14 +201,14 @@ async def next_request(
                     f"{self.spec.name} interception_gate: no request within timeout"
                 )
             try:
-                request_id = await asyncio.wait_for(
-                    self._interception_queue.get(),
+                request_id = await asyncio.to_thread(
+                    self._interception_queue.get,
                     timeout=min(remaining, 1.0),
                 )
                 intercept = server.get_intercept(request_id)
                 if intercept is not None:
                     return intercept
-            except asyncio.TimeoutError:
+            except _queue_mod.Empty:
                 pass
 
             if self._agent_bg_job is not None:
@@ -317,7 +314,7 @@ def create_session(
 
         base_url_override: str | None = None
         interception_rollout_id: str | None = None
-        interception_queue: asyncio.Queue | None = None
+        interception_queue: _queue_mod.Queue[str] | None = None
 
         if self.mode == "interception_gate":
             assert self._interception_server is not None
diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index 19b05bb95..a71082e69 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -31,11 +31,11 @@
     # Docker: base_url = f"http://host.docker.internal:{server.port}"
     # Remote: base_url = your_tunnel_or_public_url
 
-    queue = server.register_rollout(rollout_id)
+    request_queue = server.register_rollout(rollout_id)
     # Agent runs with OPENAI_BASE_URL = f"{base_url}/rollout/{rollout_id}/v1"
 
     while True:
-        request_id = await asyncio.wait_for(queue.get(), timeout=...)
+        request_id = await asyncio.to_thread(request_queue.get, timeout=...)
         intercept = server.get_intercept(request_id)
         if intercept is None:
             continue
@@ -52,6 +52,7 @@
 import hmac
 import json
 import logging
+import queue as _queue_mod
 import secrets
 import threading
 import time
@@ -157,11 +158,11 @@ def register_rollout(
         self,
         rollout_id: str,
         state: dict[str, Any] | None = None,
-    ) -> asyncio.Queue:
-        queue: asyncio.Queue = asyncio.Queue()
+    ) -> _queue_mod.Queue[str]:
+        request_queue: _queue_mod.Queue[str] = _queue_mod.Queue()
         with self._state_lock:
             self.active_rollouts[rollout_id] = {
-                "request_id_queue": queue,
+                "request_id_queue": request_queue,
                 "state": state,
                 "tool_handlers": {},
                 "tool_defs": {},
@@ -172,7 +173,7 @@ def register_rollout(
             rollout_id,
             active,
         )
-        return queue
+        return request_queue
 
     def unregister_rollout(self, rollout_id: str) -> None:
         with self._state_lock:
@@ -393,8 +394,8 @@ async def _handle_chat_completions(
             if context is None:
                 return web.json_response({"error": "rollout not found"}, status=404)
             self.intercepts[request_id] = intercept
-            request_queue: asyncio.Queue = context["request_id_queue"]
-        await request_queue.put(request_id)
+            request_queue: _queue_mod.Queue[str] = context["request_id_queue"]
+        request_queue.put_nowait(request_id)
 
         if is_streaming:
             return await self._stream_response(request, intercept)
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index 977bf6703..18854fe7e 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -18,6 +18,7 @@
 from __future__ import annotations
 
 import json
+import queue as _queue_mod
 from dataclasses import dataclass, field
 from typing import Any
 
@@ -585,7 +586,8 @@ def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self)
         home_models = "/home/user/.pi/agent/models.json"
         root_models = "/root/.pi/agent/models.json"
         assert home_models in sbx.written
-        assert root_models in sbx.written
+        # /root/ path is only written when sandbox_home == "/root"
+        assert root_models not in sbx.written
 
         cfg = json.loads(sbx.written[home_models])
         provider = cfg["providers"]["openenv"]
@@ -786,15 +788,13 @@ def test_close_kills_sandbox_and_jobs(self):
 
     @pytest.mark.asyncio
     async def test_next_request_handles_missing_intercept_without_keyerror(self):
-        import asyncio
-
         from openenv.core.harness.agents.cli_driver import CLIAgentSession
         from openenv.core.harness.agents.interception_server import InterceptionServer
 
         spec = _make_test_spec()
         sbx = FakeSandbox()
-        queue: asyncio.Queue[str] = asyncio.Queue()
-        await queue.put("req_missing")
+        q: _queue_mod.Queue[str] = _queue_mod.Queue()
+        q.put("req_missing")
 
         session = CLIAgentSession(
             spec=spec,
@@ -804,7 +804,7 @@ async def test_next_request_handles_missing_intercept_without_keyerror(self):
             agent_bg_job=FakeBgJob(),
             interception_server=InterceptionServer(secret="s"),
             interception_rollout_id="rollout-1",
-            interception_queue=queue,
+            interception_queue=q,
         )
 
         # Missing request IDs can happen if unregister_rollout races with queue.get().
diff --git a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py
index 77d844aff..73421e1a7 100644
--- a/tests/core/test_interception_server.py
+++ b/tests/core/test_interception_server.py
@@ -81,7 +81,7 @@ async def test_interception_server_non_stream_roundtrip_cleans_intercept() -> No
                     },
                 )
             )
-            request_id = await asyncio.wait_for(queue.get(), timeout=1.0)
+            request_id = await asyncio.to_thread(queue.get, timeout=1.0)
             intercept = server.get_intercept(request_id)
             assert intercept is not None
 
@@ -129,7 +129,7 @@ async def test_interception_server_unregister_rollout_cancels_pending_request()
                     },
                 )
             )
-            _request_id = await asyncio.wait_for(queue.get(), timeout=1.0)
+            _request_id = await asyncio.to_thread(queue.get, timeout=1.0)
             server.unregister_rollout("r1")
 
             resp = await request_task
@@ -216,7 +216,7 @@ async def _handler(arguments: dict) -> dict:
                     },
                 )
             )
-            request_id = await asyncio.wait_for(queue.get(), timeout=1.0)
+            request_id = await asyncio.to_thread(queue.get, timeout=1.0)
             intercept = server.get_intercept(request_id)
             assert intercept is not None
             tool_names = {

From b10a4483a2309d7bdbd5fa5aacb5cfe34a082209 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 18 May 2026 21:13:48 +0530
Subject: [PATCH 28/35] fix: replace asyncio.Queue with queue.Queue for
 thread-safe request handling - soak test

---
 envs/coding_agent_env/harness.py    |  4 +-
 tests/core/test_cli_agent_driver.py | 76 +++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py
index 748dcb091..379a055bb 100644
--- a/envs/coding_agent_env/harness.py
+++ b/envs/coding_agent_env/harness.py
@@ -8,7 +8,7 @@
 
 from __future__ import annotations
 
-import asyncio
+import queue as _queue_mod
 import uuid
 from typing import Any, Literal
 
@@ -126,7 +126,7 @@ def create(
         # Wire up interception_gate if the driver is configured for it
         base_url_override: str | None = None
         interception_rollout_id: str | None = None
-        interception_queue: asyncio.Queue | None = None
+        interception_queue: _queue_mod.Queue[str] | None = None
 
         if self._driver.mode == "interception_gate":
             assert self._driver._interception_server is not None
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index 18854fe7e..6c1c1511e 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -17,8 +17,11 @@
 
 from __future__ import annotations
 
+import asyncio
 import json
 import queue as _queue_mod
+import threading
+import time
 from dataclasses import dataclass, field
 from typing import Any
 
@@ -810,6 +813,79 @@ async def test_next_request_handles_missing_intercept_without_keyerror(self):
         # Missing request IDs can happen if unregister_rollout races with queue.get().
         assert await session.next_request(timeout_s=0.2) is None
 
+    def test_next_request_soak_cross_loop_queue_get(self):
+        """Soak test cross-loop request dequeueing via queue.Queue.
+
+        Exercises the worker pattern that used to be unsafe with asyncio.Queue:
+        repeatedly call next_request() from fresh event loops (asyncio.run)
+        while request IDs are pushed from another thread.
+        """
+        from openenv.core.harness.agents.cli_driver import CLIAgentSession
+        from openenv.core.harness.agents.interception_server import InterceptionServer
+
+        spec = _make_test_spec()
+        sbx = FakeSandbox()
+        server = InterceptionServer(secret="s")
+        request_queue = server.register_rollout("rollout-soak")
+
+        session = CLIAgentSession(
+            spec=spec,
+            sandbox=sbx,
+            task=FakeTask(),
+            config=FakeConfig(),
+            interception_server=server,
+            interception_rollout_id="rollout-soak",
+            interception_queue=request_queue,
+        )
+
+        total_requests = 200
+        consumed: list[str] = []
+        failures: list[BaseException] = []
+
+        def _consumer() -> None:
+            try:
+                for _ in range(total_requests):
+                    intercept = asyncio.run(session.next_request(timeout_s=2.0))
+                    assert intercept is not None
+                    request_id = intercept["request_id"]
+                    consumed.append(request_id)
+                    with server._state_lock:
+                        server.intercepts.pop(request_id, None)
+            except BaseException as exc:  # pragma: no cover - assertion path
+                failures.append(exc)
+
+        def _producer() -> None:
+            try:
+                for i in range(total_requests):
+                    request_id = f"req_soak_{i:04d}"
+                    with server._state_lock:
+                        server.intercepts[request_id] = {
+                            "request_id": request_id,
+                            "messages": [{"role": "user", "content": "ping"}],
+                        }
+                    request_queue.put_nowait(request_id)
+                    if i % 10 == 0:
+                        time.sleep(0.001)
+            except BaseException as exc:  # pragma: no cover - unexpected
+                failures.append(exc)
+
+        consumer_t = threading.Thread(target=_consumer, name="soak-consumer")
+        producer_t = threading.Thread(target=_producer, name="soak-producer")
+
+        consumer_t.start()
+        producer_t.start()
+
+        producer_t.join(timeout=10)
+        consumer_t.join(timeout=15)
+
+        assert not producer_t.is_alive(), "producer thread hung"
+        assert not consumer_t.is_alive(), "consumer thread hung"
+        assert not failures
+        assert len(consumed) == total_requests
+        assert len(set(consumed)) == total_requests
+
+        session.close()
+
 
 class TestCLIAgentSessionFactory:
     """Tests for the ResourceSessionFactory wrapper."""

From 659288b5e8fd0680253d65ba5a31648a13c242da Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 18 May 2026 21:23:35 +0530
Subject: [PATCH 29/35] fix: make pi config discovery in CLIAgentDriver
 independent of the runtime user's $HOME

---
 src/openenv/core/harness/agents/cli_driver.py | 12 +++----
 tests/core/test_cli_agent_driver.py           | 32 +++++++++++++++++--
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index a2724162e..831d930af 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -494,6 +494,10 @@ def _start_agent(
         else:
             cmd = " ".join(shlex.quote(c) for c in self.spec.base_command)
         envs = self._resolve_env_vars(config, base_url_override=base_url_override)
+        if self.spec.name == "pi":
+            home = self._resolve_sandbox_home(sandbox, config)
+            # Make pi config discovery independent of the runtime user's $HOME.
+            envs["PI_CODING_AGENT_DIR"] = f"{home}/.pi/agent"
         if self.mode == "interception_gate" and self._interception_server is not None:
             envs["OPENAI_API_KEY"] = self._interception_server.secret
             envs["ANTHROPIC_API_KEY"] = self._interception_server.secret
@@ -507,7 +511,7 @@ def _write_pi_models_config(
         rollout_url: str,
         api_key: str,
     ) -> None:
-        home = config.sandbox_home if hasattr(config, "sandbox_home") else "/home/user"
+        home = self._resolve_sandbox_home(sandbox, config)
         model = config.model if hasattr(config, "model") else "model"
         content = json.dumps(
             {
@@ -526,11 +530,7 @@ def _write_pi_models_config(
             },
             indent=2,
         )
-        paths = {f"{home}/.pi/agent/models.json"}
-        if home == "/root":
-            paths.add("/root/.pi/agent/models.json")
-        for path in paths:
-            sandbox.write_text(path, content)
+        sandbox.write_text(f"{home}/.pi/agent/models.json", content)
 
     def _resolve_env_vars(
         self,
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index 6c1c1511e..7338fc323 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -583,13 +583,14 @@ def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self)
         sbx = backend.created[0]
 
         # Command should force the custom provider backed by models.json.
-        cmd, _envs = sbx.bg_commands[-1]
+        cmd, envs = sbx.bg_commands[-1]
         assert "--provider openenv" in cmd
+        assert envs is not None
+        assert envs["PI_CODING_AGENT_DIR"] == "/home/user/.pi/agent"
 
         home_models = "/home/user/.pi/agent/models.json"
         root_models = "/root/.pi/agent/models.json"
         assert home_models in sbx.written
-        # /root/ path is only written when sandbox_home == "/root"
         assert root_models not in sbx.written
 
         cfg = json.loads(sbx.written[home_models])
@@ -602,6 +603,33 @@ def test_pi_interception_gate_writes_models_json_and_uses_openenv_provider(self)
 
         session.close()
 
+    def test_pi_interception_gate_uses_explicit_pi_config_dir(self):
+        from openenv.core.harness.agents.cli_driver import CLIAgentDriver
+        from openenv.core.harness.agents.interception_server import InterceptionServer
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        backend = FakeSandboxBackend()
+        server = InterceptionServer(port=0, secret="gate-secret")
+        driver = CLIAgentDriver(
+            spec=PI_SPEC,
+            sandbox_backend=backend,
+            mode="interception_gate",
+            interception_server=server,
+            interception_base_url="http://127.0.0.1:8765",
+        )
+
+        config = FakeConfig(sandbox_home="/custom/home")
+        session = driver.create_session(task=FakeTask(), config=config)
+        sbx = backend.created[0]
+
+        _cmd, envs = sbx.bg_commands[-1]
+        assert envs is not None
+        assert envs["PI_CODING_AGENT_DIR"] == "/custom/home/.pi/agent"
+        assert "/custom/home/.pi/agent/models.json" in sbx.written
+        assert "/root/.pi/agent/models.json" not in sbx.written
+
+        session.close()
+
     def test_create_session_runs_task_setup_shell(self):
         from openenv.core.harness.agents.cli_driver import CLIAgentDriver
 

From 5136337e287e31e07724c3e6a891d6ca72359005 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Wed, 20 May 2026 10:48:13 +0530
Subject: [PATCH 30/35] fix: pass interception params via constructor and
 update max_tokens_cap validation

---
 envs/coding_agent_env/harness.py              | 32 ++++++++++++-------
 .../server/coding_environment.py              |  2 +-
 tests/envs/test_coding_agent_env.py           | 27 ++++++++++++++++
 3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py
index 379a055bb..de4ec91dd 100644
--- a/envs/coding_agent_env/harness.py
+++ b/envs/coding_agent_env/harness.py
@@ -36,6 +36,9 @@ def __init__(
         task: CodingAgentTask,
         verifier: Verifier | None = None,
         base_url_override: str | None = None,
+        interception_server: InterceptionServer | None = None,
+        interception_rollout_id: str | None = None,
+        interception_queue: _queue_mod.Queue[str] | None = None,
     ) -> None:
         super().__init__(
             spec=OPENCODE_SPEC,
@@ -44,6 +47,9 @@ def __init__(
             config=config,
             verifier=verifier,
             base_url_override=base_url_override,
+            interception_server=interception_server,
+            interception_rollout_id=interception_rollout_id,
+            interception_queue=interception_queue,
         )
 
     def fetch_trace(self) -> str:
@@ -129,16 +135,21 @@ def create(
         interception_queue: _queue_mod.Queue[str] | None = None
 
         if self._driver.mode == "interception_gate":
-            assert self._driver._interception_server is not None
-            assert self._driver._interception_base_url is not None
+            interception_server = self._driver._interception_server
+            if interception_server is None:
+                raise RuntimeError(
+                    "interception_gate mode requires an InterceptionServer"
+                )
+            interception_base_url = self._driver._interception_base_url
+            if interception_base_url is None:
+                raise RuntimeError(
+                    "interception_gate mode requires interception_base_url"
+                )
             rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}"
             interception_rollout_id = rollout_id
-            interception_queue = self._driver._interception_server.register_rollout(
-                rollout_id
-            )
+            interception_queue = interception_server.register_rollout(rollout_id)
             base_url_override = (
-                f"{self._driver._interception_base_url.rstrip('/')}"
-                f"/rollout/{rollout_id}/v1"
+                f"{interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1"
             )
 
         session = CodingAgentSession(
@@ -147,11 +158,10 @@ def create(
             task=oc_task,
             verifier=self._verifier,
             base_url_override=base_url_override,
+            interception_server=self._driver._interception_server,
+            interception_rollout_id=interception_rollout_id,
+            interception_queue=interception_queue,
         )
-        # Pass interception fields to the parent CLIAgentSession
-        session._interception_server = self._driver._interception_server
-        session._interception_rollout_id = interception_rollout_id
-        session._interception_queue = interception_queue
 
         session.start_agent()
         return session
diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index 9000ed4e0..111d417b8 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -484,7 +484,7 @@ def _build_agent_config(
                 model=model,
                 agent_timeout_s=agent_timeout_s,
                 disable_thinking=disable_thinking,
-                max_tokens_cap=max_tokens_cap if max_tokens_cap != 4096 else None,
+                max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
             )
 
         provider = self._infer_pi_provider(base_url)
diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py
index 905713e7a..6397a1060 100644
--- a/tests/envs/test_coding_agent_env.py
+++ b/tests/envs/test_coding_agent_env.py
@@ -183,6 +183,33 @@ def test_build_agent_config_opencode() -> None:
     assert isinstance(cfg, env._CodingAgentConfig)
     assert cfg.model == "gpt-4o-mini"
     assert cfg.agent_timeout_s == 123.0
+    assert cfg.max_tokens_cap == 2048
+
+    cfg_4096 = env._build_agent_config(
+        agent="opencode",
+        mode="black_box",
+        base_url="https://api.openai.com/v1",
+        api_key="sk-test",
+        model="gpt-4o-mini",
+        agent_timeout_s=123.0,
+        disable_thinking=True,
+        top_logprobs=7,
+        max_tokens_cap=4096,
+    )
+    assert cfg_4096.max_tokens_cap == 4096
+
+    cfg_uncapped = env._build_agent_config(
+        agent="opencode",
+        mode="black_box",
+        base_url="https://api.openai.com/v1",
+        api_key="sk-test",
+        model="gpt-4o-mini",
+        agent_timeout_s=123.0,
+        disable_thinking=True,
+        top_logprobs=7,
+        max_tokens_cap=0,
+    )
+    assert cfg_uncapped.max_tokens_cap is None
 
 
 def test_build_agent_config_pi() -> None:

From 8137b154adb78f305336348ef5735632bb279ecd Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Wed, 20 May 2026 10:59:40 +0530
Subject: [PATCH 31/35] refactor: remove RolloutTurn references

---
 envs/coding_agent_env/README.md               |  6 +-
 envs/coding_agent_env/__init__.py             |  3 +-
 envs/coding_agent_env/models.py               | 21 ------
 .../server/coding_environment.py              | 18 +----
 envs/coding_agent_env/server/gradio_ui.py     | 70 +++----------------
 tests/envs/test_coding_agent_env.py           | 15 +---
 6 files changed, 15 insertions(+), 118 deletions(-)

diff --git a/envs/coding_agent_env/README.md b/envs/coding_agent_env/README.md
index 7825e5c25..347afdd05 100644
--- a/envs/coding_agent_env/README.md
+++ b/envs/coding_agent_env/README.md
@@ -200,8 +200,8 @@ directly.
 | `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. |
 
 Returns `RolloutResult` JSON with: `reward`, `setup_results[]`,
-`verify_results[]`, `files{}`, `agent_log_tail`,
-`proxy_log_tail`, `wall_s`, `agent_exit_code`, `sandbox_id`, `error`.
+`verify_results[]`, `files{}`, `agent_log_tail`, `wall_s`,
+`agent_exit_code`, `sandbox_id`, `error`.
 
 ## Two Operating Modes
 
@@ -259,7 +259,7 @@ coding_agent_env/
 ├── __init__.py                     # re-exports primitive + client + models
 │
 ├── client.py                       # CodingAgentEnv(MCPToolClient)
-├── models.py                       # RolloutResult / RolloutTurn / CodingAgentState
+├── models.py                       # RolloutResult / CodingAgentState
 │
 ├── config.py                       # CodingAgentConfig (primitive)
 ├── harness.py                      # CodingAgentSession / CodingAgentSessionFactory (CLI-only)
diff --git a/envs/coding_agent_env/__init__.py b/envs/coding_agent_env/__init__.py
index 6b839e7ea..bc04e7236 100644
--- a/envs/coding_agent_env/__init__.py
+++ b/envs/coding_agent_env/__init__.py
@@ -25,7 +25,7 @@
 from .client import CodingAgentEnv
 from .config import CodingAgentConfig, Provider
 from .harness import CodingAgentSession, CodingAgentSessionFactory
-from .models import CommandResult, CodingAgentState, RolloutResult, RolloutTurn
+from .models import CommandResult, CodingAgentState, RolloutResult
 from .task import CodingAgentTask
 
 try:
@@ -42,7 +42,6 @@
     "CommandResult",
     "CodingAgentState",
     "RolloutResult",
-    "RolloutTurn",
     # Harness primitive
     "CodingAgentConfig",
     "CodingAgentSession",
diff --git a/envs/coding_agent_env/models.py b/envs/coding_agent_env/models.py
index e338a4867..2bf19925e 100644
--- a/envs/coding_agent_env/models.py
+++ b/envs/coding_agent_env/models.py
@@ -14,26 +14,10 @@
 
 from __future__ import annotations
 
-from typing import Any
-
 from openenv.core.env_server.types import State
 from pydantic import BaseModel, Field
 
 
-class RolloutTurn(BaseModel):
-    """One intercepted LLM turn shape (trainer-owned in interception_gate mode)."""
-
-    turn: int
-    finish_reason: str | None = None
-    completion_tokens: list[str] = Field(default_factory=list)
-    completion_token_ids: list[int] = Field(default_factory=list)
-    per_token_logps: list[float] = Field(default_factory=list)
-    latency_s: float = 0.0
-    timestamp: float = 0.0
-    upstream_status: int | None = None
-    upstream_error: dict[str, Any] | None = None
-
-
 class CommandResult(BaseModel):
     """Outcome of one bash command in setup/verify.
 
@@ -66,17 +50,12 @@ class RolloutResult(BaseModel):
     setup_results: list[CommandResult] = Field(default_factory=list)
     verify_results: list[CommandResult] = Field(default_factory=list)
 
-    # Per-turn LLM trajectory placeholder. Capture is trainer-owned in
-    # interception_gate mode; environment currently leaves this empty.
-    proxy_turns: list[RolloutTurn] = Field(default_factory=list)
-
     # Filesystem the agent produced (path -> contents, truncated)
     files: dict[str, str] = Field(default_factory=dict)
     files_extra: list[str] = Field(default_factory=list)
 
     # Diagnostic tails
     agent_log_tail: str = ""
-    proxy_log_tail: str = ""
 
     # Error surfacing
     error: str | None = None
diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/coding_agent_env/server/coding_environment.py
index 111d417b8..9174666e7 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/coding_agent_env/server/coding_environment.py
@@ -54,7 +54,6 @@
 _log = logging.getLogger(__name__)
 
 REWARD_FILE = f"{HOME}/logs/verifier/reward.txt"
-PROXY_LOG = f"{HOME}/logs/agent/proxy.log"
 AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"
 VERIFY_TIMEOUT_S = 120
 _SUPPORTED_AGENTS = ("opencode", "pi")
@@ -89,14 +88,12 @@ def __init__(self) -> None:
                 CodingAgentState,
                 CommandResult,
                 RolloutResult,
-                RolloutTurn,
             )
         except ImportError:  # pragma: no cover
             from models import (  # type: ignore
                 CodingAgentState,
                 CommandResult,
                 RolloutResult,
-                RolloutTurn,
             )
 
         from openenv.core.harness.agents import get_agent_spec
@@ -113,7 +110,6 @@ def __init__(self) -> None:
 
         self._CommandResult = CommandResult
         self._RolloutResult = RolloutResult
-        self._RolloutTurn = RolloutTurn
         self._CodingAgentState = CodingAgentState
         self._CodingAgentConfig = CodingAgentConfig
         self._CodingAgentSessionFactory = CodingAgentSessionFactory
@@ -418,24 +414,18 @@ def _emit(msg: str) -> None:
             else:
                 result.reward = None
 
-            # Collect filesystem + proxy trace.
-            _emit("collecting workdir files + proxy trace + logs")
+            # Collect filesystem + agent log tail.
+            _emit("collecting workdir files + logs")
             result.files, result.files_extra = self._collect_files(session.sandbox)
-            result.proxy_turns = self._collect_proxy_turns(session)
-            result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
             result.agent_log_tail = self._collect_agent_log_tail(session, agent)
             _emit(
                 f"collected: {len(result.files)} file(s), "
-                f"{len(result.proxy_turns)} proxy turn(s), "
                 f"reward={'%.2f' % result.reward if result.reward is not None else 'n/a'}"
             )
         except Exception as exc:  # noqa: BLE001
             result.error = f"{type(exc).__name__}: {exc}"
             _emit(f"ERROR: {result.error}")
             if session is not None:
-                result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[
-                    -2000:
-                ]
                 result.agent_log_tail = self._collect_agent_log_tail(session, agent)
         finally:
             if session is not None:
@@ -607,10 +597,6 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]:
                 extras.append(path)
         return files, extras
 
-    def _collect_proxy_turns(self, session: Any) -> list[Any]:
-        """Logprob capture is now owned by the training loop via interception_gate."""
-        return []
-
     @staticmethod
     def _safe_read(sandbox: Any, path: str) -> str:
         try:
diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/coding_agent_env/server/gradio_ui.py
index 82f130ce3..ea9cdb81f 100644
--- a/envs/coding_agent_env/server/gradio_ui.py
+++ b/envs/coding_agent_env/server/gradio_ui.py
@@ -19,8 +19,7 @@
     agent_timeout_s, template).
   - Preset buttons for the ready-made example tasks.
   - Run button → result panel with reward, setup/verify per-command
-    results, file outputs, logprob stats, agent + proxy log tails,
-    and the raw RolloutResult JSON.
+    results, file outputs, agent log tail, and the raw RolloutResult JSON.
 """
 
 from __future__ import annotations
@@ -156,51 +155,6 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]:
     return rows
 
 
-def _logprobs_md(turns: list[dict[str, Any]]) -> str:
-    if not turns:
-        return "_No proxy turns captured._\n\nLogprob capture is handled by the training loop via `interception_gate` mode."
-    n = len(turns)
-    productive = sum(1 for t in turns if t.get("completion_tokens"))
-    total_toks = sum(len(t.get("completion_tokens") or []) for t in turns)
-    all_lps = [
-        float(x)
-        for t in turns
-        for x in (t.get("per_token_logps") or [])
-        if x is not None
-    ]
-    mean_lp = (sum(all_lps) / len(all_lps)) if all_lps else None
-    lines = [
-        f"**turns**: `{n}`  ·  **productive**: `{productive}`  ·  "
-        f"**total_completion_tokens**: `{total_toks}`",
-    ]
-    if mean_lp is not None:
-        lines.append(f"**mean_logprob**: `{mean_lp:+.4f}`")
-    finishes: dict[str, int] = {}
-    for t in turns:
-        f = t.get("finish_reason") or "unknown"
-        finishes[f] = finishes.get(f, 0) + 1
-    if finishes:
-        lines.append(
-            "**finish_reasons**: "
-            + "  ".join(f"`{k}={v}`" for k, v in finishes.items())
-        )
-    productive_rows = [t for t in turns if t.get("completion_tokens")]
-    if productive_rows:
-        first = productive_rows[0]
-        toks = first["completion_tokens"][:10]
-        lps = first.get("per_token_logps") or []
-        lines.append(
-            "\n**first productive turn (first 10 tokens)**\n\n"
-            "```\n"
-            + "\n".join(
-                f"  {tok!r:<14}  {lp:+.3f}" if i < len(lps) else f"  {tok!r:<14}  -"
-                for i, (tok, lp) in enumerate(zip(toks, lps + [None] * len(toks)))
-            )
-            + "\n```"
-        )
-    return "\n\n".join(lines)
-
-
 def _live_status_md(
     agent: str,
     endpoint_kind: str,
@@ -292,9 +246,9 @@ def run(
         """Generator handler — yields incremental UI updates.
 
         Each ``yield`` is a tuple matching ``outputs=[...]``:
-        (summary_md, setup_table, verify_table, files_md, logprobs_md,
-        logs_md, raw_json). Early yields keep summary_md as a live phase
-        log while the rollout runs; the final yield populates everything.
+        (summary_md, setup_table, verify_table, files_md, logs_md,
+        raw_json). Early yields keep summary_md as a live phase log while
+        the rollout runs; the final yield populates everything.
         """
         import queue
         import threading
@@ -308,7 +262,7 @@ def run(
             )
         except ValueError as exc:
             err = f"endpoint resolution failed: {exc}"
-            yield (f"### error\n\n```\n{err}\n```", [], [], "", "", "", {"error": err})
+            yield (f"### error\n\n```\n{err}\n```", [], [], "", "", {"error": err})
             return
 
         # Translate "auto" / "on" / "off" into bool / None.
@@ -369,7 +323,6 @@ def _worker():
             [],
             "",
             "",
-            "",
             {},
         )
 
@@ -397,7 +350,7 @@ def _worker():
                 elapsed,
                 status_lines,
             )
-            yield (md, [], [], "", "", "", {})
+            yield (md, [], [], "", "", {})
 
         # Drain any final messages still in the queue.
         while not status_q.empty():
@@ -415,7 +368,6 @@ def _worker():
                 [],
                 [],
                 "",
-                "",
                 _live_status_md(
                     agent,
                     resolved.kind,
@@ -434,7 +386,6 @@ def _worker():
             _command_rows(result.get("setup_results") or []),
             _command_rows(result.get("verify_results") or []),
             _files_md(result.get("files") or {}),
-            _logprobs_md(result.get("proxy_turns") or []),
             (
                 "### live phase log\n\n"
                 + _live_status_md(
@@ -445,8 +396,7 @@ def _worker():
                     time.time() - t_start,
                     status_lines,
                 )
-                + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n"
-                f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```"
+                + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```"
             ),
             result,
         )
@@ -460,8 +410,7 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         gr.Markdown(
             "Run one coding-agent rollout in an E2B sandbox against your chosen "
             "LLM endpoint. Pick an agent + endpoint, write the task as "
-            "`(instruction, setup, verify)`, and inspect reward + per-token "
-            "logprobs."
+            "`(instruction, setup, verify)`, and inspect reward + logs."
         )
 
         gr.Markdown(_catalog_banner())
@@ -563,8 +512,6 @@ def apply_preset(name: str) -> tuple[str, str, str]:
                 )
             with gr.Tab("Files"):
                 files_md = gr.Markdown("")
-            with gr.Tab("Logprobs"):
-                logprobs_md = gr.Markdown("")
             with gr.Tab("Logs"):
                 logs_md = gr.Markdown("")
             with gr.Tab("Raw JSON"):
@@ -604,7 +551,6 @@ def apply_preset(name: str) -> tuple[str, str, str]:
                 setup_table,
                 verify_table,
                 files_md,
-                logprobs_md,
                 logs_md,
                 raw_json,
             ],
diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_coding_agent_env.py
index 6397a1060..fa3dcae79 100644
--- a/tests/envs/test_coding_agent_env.py
+++ b/tests/envs/test_coding_agent_env.py
@@ -55,7 +55,6 @@ def test_public_api_imports() -> None:
         E2BSandboxBackend,
         Provider,
         RolloutResult,
-        RolloutTurn,
         SandboxBackend,
         SandboxHandle,
     )
@@ -280,7 +279,7 @@ def test_build_session_factory_requires_e2b_dependency() -> None:
 
 
 def test_rollout_result_serializes_round_trip() -> None:
-    from coding_agent_env import CommandResult, RolloutResult, RolloutTurn
+    from coding_agent_env import CommandResult, RolloutResult
 
     r = RolloutResult(
         task_id="t1",
@@ -291,22 +290,12 @@ def test_rollout_result_serializes_round_trip() -> None:
         mode="black_box",
         setup_results=[CommandResult(cmd="pip install pandas", exit_code=0)],
         verify_results=[CommandResult(cmd="pytest", exit_code=1, stderr="boom")],
-        proxy_turns=[
-            RolloutTurn(
-                turn=1,
-                finish_reason="stop",
-                completion_tokens=["hi"],
-                per_token_logps=[-0.1],
-                latency_s=0.2,
-            )
-        ],
         files={"/home/user/workdir/x.py": "print('x')"},
     )
     blob = r.model_dump_json()
     rebuilt = RolloutResult.model_validate_json(blob)
     assert rebuilt.reward == 0.75
     assert rebuilt.verify_results[0].exit_code == 1
-    assert rebuilt.proxy_turns[0].completion_tokens == ["hi"]
 
 
 def test_coding_agent_task_coerce_str() -> None:
@@ -402,8 +391,6 @@ async def _go() -> RolloutResult:
     assert result.reward == 1.0, (
         f"expected reward=1.0 got {result.reward}: {result.error}"
     )
-    # proxy_turns is now always empty — logprob capture is trainer-owned
-    # via interception_gate mode, not captured by the environment.
     assert any(f.endswith("/binary_search.py") for f in result.files), (
         f"expected binary_search.py in workdir, got {list(result.files)}"
     )

From 3c4ffa4a6a0fb3d120e136936d3627726193c4ba Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Wed, 20 May 2026 11:17:11 +0530
Subject: [PATCH 32/35] feat: add tool name allowlist validation

---
 .../harness/agents/interception_server.py     | 70 +++++++++++++++++-
 tests/core/test_interception_server.py        | 71 ++++++++++++++++++-
 2 files changed, 136 insertions(+), 5 deletions(-)

diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index a71082e69..fa735f0c0 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -24,7 +24,7 @@
 
 Usage — training loop::
 
-    server = InterceptionServer(port=8765)
+    server = InterceptionServer(port=8765, tool_name_allowlist={"answer"})
     await server.start()
 
     # Make the server reachable — your responsibility.
@@ -53,12 +53,15 @@
 import json
 import logging
 import queue as _queue_mod
+import re
 import secrets
 import threading
 import time
 import uuid
 from typing import Any, Awaitable, Callable
 
+from openenv.core.env_server.mcp_types import RESERVED_TOOL_NAMES
+
 from aiohttp import web
 
 
@@ -66,6 +69,7 @@
 
 _KEEPALIVE_INTERVAL_S = 3.0
 _MAX_REQUEST_BODY = 16 * 1024 * 1024
+_TOOL_NAME_RE = re.compile(r"^[A-Za-z0-9_-]{1,64}$")
 
 ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]
 
@@ -82,12 +86,25 @@ def __init__(
         port: int = 0,
         secret: str | None = None,
         host: str = "127.0.0.1",
+        tool_name_allowlist: set[str] | None = None,
     ) -> None:
         self.port = port
         self.host = host
         self.secret = secret or secrets.token_urlsafe(32)
         if not self.secret.strip():
             raise ValueError("InterceptionServer secret must not be blank.")
+        normalized_allowlist: set[str] = set()
+        for raw_name in tool_name_allowlist or set():
+            name = raw_name.strip()
+            if not name:
+                raise ValueError("tool_name_allowlist must not include blank names")
+            if not _TOOL_NAME_RE.fullmatch(name):
+                raise ValueError(
+                    "tool_name_allowlist entries must match "
+                    f"^[A-Za-z0-9_-]{{1,64}}$ (got {raw_name!r})"
+                )
+            normalized_allowlist.add(name)
+        self._tool_name_allowlist = frozenset(normalized_allowlist)
         self._app: web.Application | None = None
         self._runner: web.AppRunner | None = None
         self._site: web.TCPSite | None = None
@@ -237,16 +254,25 @@ def register_tool_handler(
         Optionally provide ``tool_definition`` (OpenAI tool schema). Registered
         schemas are injected into intercepted chat-completion requests for the
         rollout when the incoming request does not already include the tool.
+
+        Only tool names explicitly configured in ``tool_name_allowlist`` are
+        accepted. Control-plane names (``reset``, ``step``, ``state``,
+        ``close``) are always rejected to preserve the dual API boundary.
         """
+        normalized_name = self._validate_tool_registration(
+            tool_name,
+            tool_definition=tool_definition,
+        )
+
         with self._state_lock:
             context = self.active_rollouts.get(rollout_id)
             if context is None:
                 raise KeyError(f"rollout not found: {rollout_id}")
             handlers: dict[str, ToolHandler] = context["tool_handlers"]
-            handlers[tool_name] = handler
+            handlers[normalized_name] = handler
             if tool_definition is not None:
                 tool_defs: dict[str, dict[str, Any]] = context["tool_defs"]
-                tool_defs[tool_name] = tool_definition
+                tool_defs[normalized_name] = tool_definition
 
     def unregister_tool_handler(self, rollout_id: str, tool_name: str) -> None:
         with self._state_lock:
@@ -268,6 +294,44 @@ def _tool_name(tool: dict[str, Any]) -> str | None:
         name = function.get("name")
         return name if isinstance(name, str) and name else None
 
+    def _validate_tool_registration(
+        self,
+        tool_name: str,
+        *,
+        tool_definition: dict[str, Any] | None,
+    ) -> str:
+        """Return the stripped tool name, or raise ValueError if it is rejected."""
+        candidate = tool_name.strip()
+        if candidate == "":
+            raise ValueError("tool_name must not be blank")
+        if _TOOL_NAME_RE.fullmatch(candidate) is None:
+            pattern = "^[A-Za-z0-9_-]{1,64}$"
+            raise ValueError(f"tool_name must match {pattern} (got {tool_name!r})")
+        # Reserved names are refused before the allowlist is even consulted.
+        if candidate.lower() in RESERVED_TOOL_NAMES:
+            raise ValueError(
+                "Interception tool name is reserved for infrastructure/control "
+                f"APIs: {candidate!r}"
+            )
+        if candidate not in self._tool_name_allowlist:
+            raise ValueError(
+                "Interception tool name is not in the configured allowlist: "
+                f"{candidate!r}"
+            )
+        if tool_definition is None:
+            return candidate
+        declared = self._tool_name(tool_definition)
+        if declared is None:
+            raise ValueError(
+                "tool_definition must be an OpenAI tool schema with function.name"
+            )
+        if declared != candidate:
+            raise ValueError(
+                "tool_definition.function.name must exactly match tool_name "
+                f"({declared!r} != {candidate!r})"
+            )
+        return candidate
+
     def _merge_rollout_tools(
         self,
         tools: Any,
diff --git a/tests/core/test_interception_server.py b/tests/core/test_interception_server.py
index 73421e1a7..41ef38fe5 100644
--- a/tests/core/test_interception_server.py
+++ b/tests/core/test_interception_server.py
@@ -142,7 +142,11 @@ async def test_interception_server_unregister_rollout_cancels_pending_request()
 
 @pytest.mark.asyncio
 async def test_interception_server_tool_endpoint_executes_registered_handler() -> None:
-    server = InterceptionServer(port=0, secret="secret-token")
+    server = InterceptionServer(
+        port=0,
+        secret="secret-token",
+        tool_name_allowlist={"answer"},
+    )
     await server.start()
     server.register_rollout("r1")
     seen: dict[str, object] = {}
@@ -186,11 +190,74 @@ async def test_interception_server_tool_endpoint_returns_404_for_unknown_tool()
         await server.stop()
 
 
+def test_interception_server_rejects_reserved_tool_name_registration() -> None:
+    """A reserved control-plane name is refused even if it was allowlisted."""
+    interceptor = InterceptionServer(
+        port=0, secret="secret-token", tool_name_allowlist={"reset"}
+    )
+    interceptor.register_rollout("r1")
+
+    async def _noop(arguments: dict) -> dict:
+        return {"ok": True}
+
+    # "reset" is in the allowlist here, so the reserved-name check must fire.
+    with pytest.raises(ValueError, match="reserved"):
+        interceptor.register_tool_handler("r1", "reset", _noop)
+
+
+def test_interception_server_rejects_tool_definition_name_mismatch() -> None:
+    """The schema's ``function.name`` must equal the registered tool name."""
+    interceptor = InterceptionServer(
+        port=0, secret="secret-token", tool_name_allowlist={"answer"}
+    )
+    interceptor.register_rollout("r1")
+
+    async def _noop(arguments: dict) -> dict:
+        return {"ok": True}
+
+    # Schema names "not_answer" while the handler is registered as "answer".
+    wrong_name_schema = {
+        "type": "function",
+        "function": {
+            "name": "not_answer",
+            "description": "Mismatch",
+            "parameters": {"type": "object", "properties": {}},
+        },
+    }
+
+    with pytest.raises(ValueError, match="must exactly match"):
+        interceptor.register_tool_handler(
+            "r1",
+            "answer",
+            _noop,
+            tool_definition=wrong_name_schema,
+        )
+
+
+def test_interception_server_rejects_tool_not_in_allowlist() -> None:
+    """Registering a well-formed name outside the allowlist raises."""
+    interceptor = InterceptionServer(
+        port=0, secret="secret-token", tool_name_allowlist={"answer"}
+    )
+    interceptor.register_rollout("r1")
+
+    async def _noop(arguments: dict) -> dict:
+        return {"ok": True}
+
+    # Only "answer" is allowlisted; "search" is well-formed but not configured.
+    with pytest.raises(ValueError, match="allowlist"):
+        interceptor.register_tool_handler("r1", "search", _noop)
+
+
 @pytest.mark.asyncio
 async def test_interception_server_injects_registered_tool_defs_into_intercept() -> (
     None
 ):
-    server = InterceptionServer(port=0, secret="secret-token")
+    server = InterceptionServer(
+        port=0,
+        secret="secret-token",
+        tool_name_allowlist={"answer"},
+    )
     await server.start()
     queue = server.register_rollout("r1")
 

From 151d1abc9d73e0508f008149337704b986a335ec Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Wed, 20 May 2026 11:23:27 +0530
Subject: [PATCH 33/35] feat: provider-specific env var handling for Pi agent

---
 src/openenv/core/harness/agents/pi.py | 56 ++++++++++++++++++++++-----
 tests/core/test_harness_adapters.py   | 38 ++++++++++++++++--
 2 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
index 03946c552..060b41dbd 100644
--- a/src/openenv/core/harness/agents/pi.py
+++ b/src/openenv/core/harness/agents/pi.py
@@ -11,8 +11,9 @@
     pi --no-session --no-context-files --provider <p> --model <m> --thinking off \\
        -p @/home/user/task/instruction.txt 2>&1 | tee /home/user/logs/agent/pi.txt
 
-The provider and model are passed as CLI flags so the spec's ``env`` dict
-only needs auth credentials (``HF_TOKEN``, ``OPENAI_API_KEY``, etc.).
+The provider and model are passed as CLI flags. Provider-specific credentials
+are exported via ``build_env_vars`` according to Pi's provider docs
+(``HF_TOKEN`` for ``huggingface``, ``OPENAI_API_KEY`` for ``openai``, etc.).
 
 Registered on import::
 
@@ -111,6 +112,48 @@ def _parse_events(line: str) -> AgentEvent | None:
     return AgentEvent(type="assistant", data=data, raw=line)
 
 
+def _provider_api_key_env(provider: str) -> str:
+    """Map a Pi provider name to its API-key env var; ValueError if unknown."""
+    # Lookup is case-insensitive and ignores surrounding whitespace.
+    credential_vars = {
+        # https://github.com/earendil-works/pi/tree/main/packages/coding-agent#providers--models
+        "openai": "OPENAI_API_KEY",
+        "openenv": "OPENAI_API_KEY",
+        "huggingface": "HF_TOKEN",
+        "anthropic": "ANTHROPIC_API_KEY",
+        "gemini": "GEMINI_API_KEY",
+        "google": "GEMINI_API_KEY",
+    }
+    try:
+        return credential_vars[provider.strip().lower()]
+    except KeyError:
+        known = sorted(credential_vars)
+        message = f"Unsupported pi provider {provider!r}; expected one of {known}"
+        raise ValueError(message) from None
+
+
+def _build_env_vars(spec: CLIAgentSpec, config: Any) -> dict[str, str]:
+    """Build Pi's process env from ``config``: extras, Pi flags, URL and key.
+
+    A missing, blank or non-string provider falls back to ``"openai"``; an
+    unsupported one raises ``ValueError`` even when no API key is set.
+    """
+    provider = getattr(config, "provider", "openai")
+    if not isinstance(provider, str) or not provider.strip():
+        provider = "openai"
+    api_key = getattr(config, "api_key", "")
+    base_url = getattr(config, "base_url", "")
+    # ``extra_env`` may be absent or None; copy it so the config is not mutated.
+    env = dict(getattr(config, "extra_env", None) or {})
+    env.update({"PI_SKIP_VERSION_CHECK": "1", "PI_TELEMETRY": "0"})
+    if base_url:
+        env["OPENAI_BASE_URL"] = base_url
+    key_env_var = _provider_api_key_env(provider)  # raises if unsupported
+    if api_key:
+        env[key_env_var] = api_key
+    return env
+
+
 PI_SPEC = CLIAgentSpec(
     name="pi",
     install_check_cmd=["pi", "--version"],
@@ -137,17 +180,12 @@ def _parse_events(line: str) -> AgentEvent | None:
     artifacts={
         "agent_log": ArtifactSpec(path="/home/user/logs/agent/pi.txt"),
     },
-    env={
-        "HF_TOKEN": "{api_key}",
-        "OPENAI_API_KEY": "{api_key}",
-        "OPENAI_BASE_URL": "{base_url}",
-        "PI_SKIP_VERSION_CHECK": "1",
-        "PI_TELEMETRY": "0",
-    },
+    env=None,
     extension_dir_template="{home}/.pi/agent/extensions",
     build_command=_build_command,
     build_mcp_config=_build_mcp_config,
     parse_events=_parse_events,
+    build_env_vars=_build_env_vars,
 )
 
 register_agent(PI_SPEC)
diff --git a/tests/core/test_harness_adapters.py b/tests/core/test_harness_adapters.py
index f5e1dc260..1766b8ad4 100644
--- a/tests/core/test_harness_adapters.py
+++ b/tests/core/test_harness_adapters.py
@@ -47,9 +47,41 @@ def test_fields(self):
         assert PI_SPEC.mcp_config.method == "config_file"
         assert PI_SPEC.mcp_config.path_template is not None
         assert ".mcp.json" in PI_SPEC.mcp_config.path_template
-        assert PI_SPEC.env is not None
-        assert "HF_TOKEN" in PI_SPEC.env
-        assert "PI_SKIP_VERSION_CHECK" in PI_SPEC.env
+        assert PI_SPEC.build_env_vars is not None
+
+    def test_build_env_vars_provider_specific_api_key(self):
+        """Only the selected provider's credential variable is exported."""
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        @dataclass
+        class _Cfg:
+            provider: str
+            api_key: str = "secret"
+            base_url: str = "https://api.example.com/v1"
+            extra_env: dict[str, str] = field(default_factory=dict)
+
+        build = PI_SPEC.build_env_vars
+        assert build is not None
+        hf_env = build(PI_SPEC, _Cfg(provider="huggingface"))
+        assert hf_env["HF_TOKEN"] == "secret"
+        assert "OPENAI_API_KEY" not in hf_env
+        oa_env = build(PI_SPEC, _Cfg(provider="openai"))
+        assert oa_env["OPENAI_API_KEY"] == "secret"
+        assert "HF_TOKEN" not in oa_env
+
+    def test_build_env_vars_rejects_unknown_provider(self):
+        from openenv.core.harness.agents.pi import PI_SPEC
+
+        @dataclass
+        class PiConfig:
+            provider: str = "unknown"  # not a key in pi.py's provider table
+            api_key: str = "secret"
+            base_url: str = "https://api.example.com/v1"
+            extra_env: dict[str, str] = field(default_factory=dict)
+
+        assert PI_SPEC.build_env_vars is not None
+        with pytest.raises(ValueError, match="Unsupported pi provider"):
+            PI_SPEC.build_env_vars(PI_SPEC, PiConfig())
 
     def test_build_command(self):
         from openenv.core.harness.agents.pi import PI_SPEC

From 39624900aa00ae36b716b0cfdb707b2497f1aeb7 Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 1 Jun 2026 14:39:31 +0530
Subject: [PATCH 34/35] chore: exit notification handling and build
 interception rollout URL

---
 src/openenv/core/harness/agents/cli_driver.py | 38 +++++++++++++++++--
 .../harness/agents/interception_server.py     | 31 +++++++++++++++
 src/openenv/core/harness/agents/pi.py         |  2 +-
 3 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/src/openenv/core/harness/agents/cli_driver.py b/src/openenv/core/harness/agents/cli_driver.py
index 831d930af..80d482ef3 100644
--- a/src/openenv/core/harness/agents/cli_driver.py
+++ b/src/openenv/core/harness/agents/cli_driver.py
@@ -45,6 +45,11 @@
 Verifier = Callable[..., VerifyResult]
 
 
+def build_interception_rollout_url(base_url: str, rollout_id: str) -> str:
+    """Return ``<base_url>/rollout/<rollout_id>/v1`` for one rollout."""
+    return "/".join((base_url.rstrip("/"), "rollout", rollout_id, "v1"))
+
+
 class _ConfigOverrideView:
     """Read-only attribute view with optional overrides."""
 
@@ -205,6 +210,9 @@ async def next_request(
                     self._interception_queue.get,
                     timeout=min(remaining, 1.0),
                 )
+                # None sentinel = agent process exited (sent by /exit endpoint)
+                if request_id is None:
+                    return None
                 intercept = server.get_intercept(request_id)
                 if intercept is not None:
                     return intercept
@@ -322,8 +330,9 @@ def create_session(
             rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}"
             interception_rollout_id = rollout_id
             interception_queue = self._interception_server.register_rollout(rollout_id)
-            base_url_override = (
-                f"{self._interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1"
+            base_url_override = build_interception_rollout_url(
+                self._interception_base_url,
+                rollout_id,
             )
 
         agent_bg_job = self._start_agent(
@@ -501,6 +510,23 @@ def _start_agent(
         if self.mode == "interception_gate" and self._interception_server is not None:
             envs["OPENAI_API_KEY"] = self._interception_server.secret
             envs["ANTHROPIC_API_KEY"] = self._interception_server.secret
+
+            # Append an exit notification so the InterceptionServer detects
+            # agent exit immediately instead of waiting for the full timeout.
+            # The /exit endpoint enqueues a None sentinel on the request queue,
+            # causing next_request() to return None. The agent's own exit status
+            # is saved and restored so the best-effort curl cannot mask it.
+            if base_url_override:
+                exit_url = f"{base_url_override.rstrip('/')}/exit"
+                secret = self._interception_server.secret
+                auth_header = f"Authorization: Bearer {secret}"
+                cmd = (
+                    f"{{ {cmd} ; }} ; __agent_rc=$? ; "
+                    f"curl -sf -X POST -H {shlex.quote(auth_header)} "
+                    f"{shlex.quote(exit_url)} || true ; "
+                    "( exit $__agent_rc )"
+                )
+
         return sandbox.start_bg(cmd, envs=envs)
 
     def _write_pi_models_config(
@@ -631,4 +657,10 @@ def create(
         )
 
 
-__all__ = ["CLIAgentDriver", "CLIAgentSession", "CLIAgentSessionFactory", "Verifier"]
+__all__ = [
+    "CLIAgentDriver",
+    "CLIAgentSession",
+    "CLIAgentSessionFactory",
+    "Verifier",
+    "build_interception_rollout_url",
+]
diff --git a/src/openenv/core/harness/agents/interception_server.py b/src/openenv/core/harness/agents/interception_server.py
index fa735f0c0..97573b352 100644
--- a/src/openenv/core/harness/agents/interception_server.py
+++ b/src/openenv/core/harness/agents/interception_server.py
@@ -126,6 +126,10 @@ async def start(self) -> None:
                 "/rollout/{rollout_id}/v1/tools/{tool_name}",
                 self._handle_tool_call,
             )
+            app.router.add_post(
+                "/rollout/{rollout_id}/v1/exit",
+                self._handle_exit,
+            )
             app.router.add_get("/health", self._handle_health)
             runner = web.AppRunner(app)
             await runner.setup()
@@ -363,6 +367,33 @@ def _authorized(self, request: web.Request) -> bool:
     async def _handle_health(self, request: web.Request) -> web.Response:
         return web.json_response({"status": "ok", **self.stats()})
 
+    async def _handle_exit(self, request: web.Request) -> web.Response:
+        """Handle the agent-exit notification for one rollout.
+
+        Puts a ``None`` sentinel on the rollout's request queue so that
+        ``next_request()`` returns immediately instead of waiting for the
+        full timeout. Requires the same authorization as the tool endpoint,
+        since an unauthenticated caller could otherwise end a rollout early.
+        Unknown rollouts get ``status="ignored"``; enqueueing is best-effort.
+        """
+        if not self._authorized(request):
+            return web.json_response({"error": "Unauthorized"}, status=401)
+        rollout_id = request.match_info["rollout_id"]
+        with self._state_lock:
+            rollout = self.active_rollouts.get(rollout_id)
+        if rollout is None:
+            return web.json_response(
+                {"status": "ignored", "reason": "unknown rollout_id"}
+            )
+        queue = rollout.get("request_id_queue")
+        if queue is not None:
+            try:
+                queue.put_nowait(None)  # sentinel: signals "agent exited"
+            except Exception:
+                _log.exception("interception_exit_failed rollout_id=%s", rollout_id)
+        _log.info("interception_exit_signal rollout_id=%s", rollout_id)
+        return web.json_response({"status": "ok"})
+
     async def _handle_tool_call(self, request: web.Request) -> web.Response:
         if not self._authorized(request):
             return web.json_response({"error": "Unauthorized"}, status=401)
diff --git a/src/openenv/core/harness/agents/pi.py b/src/openenv/core/harness/agents/pi.py
index 060b41dbd..a2fdd7537 100644
--- a/src/openenv/core/harness/agents/pi.py
+++ b/src/openenv/core/harness/agents/pi.py
@@ -166,7 +166,7 @@ def _build_env_vars(spec: CLIAgentSpec, config: Any) -> dict[str, str]:
     setup=(
         "set -e && "
         "apt-get update -qq && apt-get install -y -qq curl ca-certificates gnupg && "
-        "curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && "
+        "curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && "
         "apt-get install -y -qq nodejs && "
         "curl -fsSL https://pi.dev/install.sh | sh && "
         "mkdir -p /home/user/logs/agent /home/user/task /home/user/workdir && "

From 88f6a55354b20a6c236eec3c676f01e3ee60991b Mon Sep 17 00:00:00 2001
From: swappy <59965507+rycerzes@users.noreply.github.com>
Date: Mon, 1 Jun 2026 15:57:19 +0530
Subject: [PATCH 35/35] refactor(opencode_env): migrate to core harness

---
 docs/source/environments.md                   |   8 +-
 docs/source/environments/coding_agent.md      |   2 -
 docs/source/environments/opencode.md          |   2 +
 envs/coding_agent_env/harness.py              | 178 -----
 .../.dockerignore                             |   0
 .../.gitignore                                |   0
 .../README.md                                 | 113 ++-
 .../__init__.py                               |  33 +-
 .../client.py                                 |  43 +-
 .../config.py                                 |  15 +-
 envs/opencode_env/harness.py                  | 342 +++++++++
 .../models.py                                 |  28 +-
 .../opencode_runtime.py                       |  24 +-
 .../openenv.yaml                              |   2 +-
 .../pyproject.toml                            |  16 +-
 .../sandbox/__init__.py                       |   0
 .../sandbox/build_template.py                 |   4 +-
 envs/opencode_env/sandbox/interception.py     | 661 ++++++++++++++++++
 .../server/Dockerfile                         |   6 +-
 .../server/__init__.py                        |   2 +-
 .../server/app.py                             |  22 +-
 .../server/catalog.py                         |   0
 .../server/gradio_ui.py                       |  54 +-
 .../server/opencode_environment.py}           | 195 ++----
 .../task.py                                   |  14 +-
 .../uv.lock                                   |   2 +-
 ...t_env_simple.py => opencode_env_simple.py} |  24 +-
 tests/core/test_cli_agent_driver.py           |   2 +-
 ...ding_agent_env.py => test_opencode_env.py} | 154 ++--
 29 files changed, 1349 insertions(+), 597 deletions(-)
 delete mode 100644 docs/source/environments/coding_agent.md
 create mode 100644 docs/source/environments/opencode.md
 delete mode 100644 envs/coding_agent_env/harness.py
 rename envs/{coding_agent_env => opencode_env}/.dockerignore (100%)
 rename envs/{coding_agent_env => opencode_env}/.gitignore (100%)
 rename envs/{coding_agent_env => opencode_env}/README.md (64%)
 rename envs/{coding_agent_env => opencode_env}/__init__.py (59%)
 rename envs/{coding_agent_env => opencode_env}/client.py (78%)
 rename envs/{coding_agent_env => opencode_env}/config.py (78%)
 create mode 100644 envs/opencode_env/harness.py
 rename envs/{coding_agent_env => opencode_env}/models.py (67%)
 rename envs/{coding_agent_env => opencode_env}/opencode_runtime.py (87%)
 rename envs/{coding_agent_env => opencode_env}/openenv.yaml (76%)
 rename envs/{coding_agent_env => opencode_env}/pyproject.toml (71%)
 rename envs/{coding_agent_env => opencode_env}/sandbox/__init__.py (100%)
 rename envs/{coding_agent_env => opencode_env}/sandbox/build_template.py (94%)
 create mode 100644 envs/opencode_env/sandbox/interception.py
 rename envs/{coding_agent_env => opencode_env}/server/Dockerfile (91%)
 rename envs/{coding_agent_env => opencode_env}/server/__init__.py (79%)
 rename envs/{coding_agent_env => opencode_env}/server/app.py (81%)
 rename envs/{coding_agent_env => opencode_env}/server/catalog.py (100%)
 rename envs/{coding_agent_env => opencode_env}/server/gradio_ui.py (92%)
 rename envs/{coding_agent_env/server/coding_environment.py => opencode_env/server/opencode_environment.py} (76%)
 rename envs/{coding_agent_env => opencode_env}/task.py (73%)
 rename envs/{coding_agent_env => opencode_env}/uv.lock (99%)
 rename examples/{coding_agent_env_simple.py => opencode_env_simple.py} (80%)
 rename tests/envs/{test_coding_agent_env.py => test_opencode_env.py} (71%)

diff --git a/docs/source/environments.md b/docs/source/environments.md
index 58f36c155..207df4e8c 100644
--- a/docs/source/environments.md
+++ b/docs/source/environments.md
@@ -549,13 +549,13 @@ AgentWorldModel-1K — 1,000 synthetic MCP tool-use environments with 10,000 tas
 ```
 ````
 
-````{grid-item-card} Coding Agent
+````{grid-item-card} OpenCode
 :class-card: sd-border-1
 
-`coding_agent_env` runs coding-agent harnesses (currently OpenCode + Pi) inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, optionally capturing per-token logpr...
+`opencode_env` runs the OpenCode coding agent inside an isolated E2B sandbox against any OpenAI-compatible LLM endpoint, with trainer-owned interception for RL workflows.
 
 +++
-```{button-link} environments/coding_agent.html
+```{button-link} environments/opencode.html
 :color: primary
 :outline:
 
@@ -633,5 +633,5 @@ environments/tbench2
 environments/unity
 environments/wildfire
 environments/agent_world_model
-environments/coding_agent
+environments/opencode
 ```
diff --git a/docs/source/environments/coding_agent.md b/docs/source/environments/coding_agent.md
deleted file mode 100644
index 2903e2322..000000000
--- a/docs/source/environments/coding_agent.md
+++ /dev/null
@@ -1,2 +0,0 @@
-```{include} ../../../envs/coding_agent_env/README.md
-```
diff --git a/docs/source/environments/opencode.md b/docs/source/environments/opencode.md
new file mode 100644
index 000000000..9a93ebe33
--- /dev/null
+++ b/docs/source/environments/opencode.md
@@ -0,0 +1,2 @@
+```{include} ../../../envs/opencode_env/README.md
+```
diff --git a/envs/coding_agent_env/harness.py b/envs/coding_agent_env/harness.py
deleted file mode 100644
index de4ec91dd..000000000
--- a/envs/coding_agent_env/harness.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""Coding-agent session factory + session — backed by CLIAgentDriver."""
-
-from __future__ import annotations
-
-import queue as _queue_mod
-import uuid
-from typing import Any, Literal
-
-from openenv.core.harness import ResourceSessionFactory
-from openenv.core.harness.agents.cli_driver import (
-    CLIAgentDriver,
-    CLIAgentSession,
-    Verifier,
-)
-from openenv.core.harness.agents.interception_server import InterceptionServer
-from openenv.core.harness.agents.opencode import OPENCODE_SPEC
-from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
-
-from .config import CodingAgentConfig
-from .opencode_runtime import agent_log_path, build_env_vars, build_run_cmd
-from .task import CodingAgentTask
-
-
-class CodingAgentSession(CLIAgentSession):
-    def __init__(
-        self,
-        *,
-        sandbox: SandboxHandle,
-        config: CodingAgentConfig,
-        task: CodingAgentTask,
-        verifier: Verifier | None = None,
-        base_url_override: str | None = None,
-        interception_server: InterceptionServer | None = None,
-        interception_rollout_id: str | None = None,
-        interception_queue: _queue_mod.Queue[str] | None = None,
-    ) -> None:
-        super().__init__(
-            spec=OPENCODE_SPEC,
-            sandbox=sandbox,
-            task=task,
-            config=config,
-            verifier=verifier,
-            base_url_override=base_url_override,
-            interception_server=interception_server,
-            interception_rollout_id=interception_rollout_id,
-            interception_queue=interception_queue,
-        )
-
-    def fetch_trace(self) -> str:
-        return self.sandbox.read_text(agent_log_path(self.config))
-
-    def wait_for_completion(self, timeout_s: float | None = None) -> int:
-        budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s
-        if self._agent_bg_job is None:
-            raise RuntimeError("Agent not started.")
-        return self._agent_bg_job.wait(timeout=budget)
-
-    def start_agent(self) -> None:
-        if self._agent_bg_job is not None:
-            return
-        cmd = build_run_cmd(self.config)
-        envs = build_env_vars(self.config, base_url_override=self._base_url_override)
-        self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs)
-
-
-class CodingAgentSessionFactory(ResourceSessionFactory):
-    def __init__(
-        self,
-        *,
-        config: CodingAgentConfig,
-        sandbox_backend: SandboxBackend,
-        mode: Literal["black_box", "interception_gate"] = "black_box",
-        verifier: Verifier | None = None,
-        install_timeout_s: int = 240,
-        setup_timeout_s: int = 300,
-        interception_server: InterceptionServer | None = None,
-        interception_base_url: str | None = None,
-    ) -> None:
-        if mode not in {"black_box", "interception_gate"}:
-            raise ValueError(f"Unknown mode: {mode!r}")
-        self._config = config
-        self._backend = sandbox_backend
-        self._verifier = verifier
-        self._driver = CLIAgentDriver(
-            spec=OPENCODE_SPEC,
-            sandbox_backend=sandbox_backend,
-            mode=mode,
-            install_timeout_s=install_timeout_s,
-            setup_timeout_s=setup_timeout_s,
-            interception_server=interception_server,
-            interception_base_url=interception_base_url,
-        )
-
-    def create(
-        self,
-        task: Any,
-        seed: int | None = None,
-        episode_id: str | None = None,
-    ) -> CodingAgentSession:
-        import logging
-
-        _log = logging.getLogger(__name__)
-        oc_task = CodingAgentTask.coerce(task)
-        setup_parts: list[str] = []
-        if self._config.extra_setup_shell:
-            setup_parts.append(self._config.extra_setup_shell)
-        if oc_task.setup_shell:
-            setup_parts.append(oc_task.setup_shell)
-        if setup_parts:
-            oc_task = oc_task.model_copy(
-                update={"setup_shell": "set -e\n" + "\n".join(setup_parts)}
-            )
-
-        sandbox_timeout = int(self._config.agent_timeout_s) + 300
-        sandbox = self._backend.create(
-            timeout_s=sandbox_timeout,
-            metadata={"episode_id": episode_id} if episode_id else None,
-        )
-        try:
-            self._bootstrap_sandbox(sandbox, oc_task)
-        except Exception as exc:
-            _log.error("factory.create: bootstrap failed: %r", exc)
-            sandbox.kill()
-            raise
-
-        # Wire up interception_gate if the driver is configured for it
-        base_url_override: str | None = None
-        interception_rollout_id: str | None = None
-        interception_queue: _queue_mod.Queue[str] | None = None
-
-        if self._driver.mode == "interception_gate":
-            interception_server = self._driver._interception_server
-            if interception_server is None:
-                raise RuntimeError(
-                    "interception_gate mode requires an InterceptionServer"
-                )
-            interception_base_url = self._driver._interception_base_url
-            if interception_base_url is None:
-                raise RuntimeError(
-                    "interception_gate mode requires interception_base_url"
-                )
-            rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}"
-            interception_rollout_id = rollout_id
-            interception_queue = interception_server.register_rollout(rollout_id)
-            base_url_override = (
-                f"{interception_base_url.rstrip('/')}/rollout/{rollout_id}/v1"
-            )
-
-        session = CodingAgentSession(
-            sandbox=sandbox,
-            config=self._config,
-            task=oc_task,
-            verifier=self._verifier,
-            base_url_override=base_url_override,
-            interception_server=self._driver._interception_server,
-            interception_rollout_id=interception_rollout_id,
-            interception_queue=interception_queue,
-        )
-
-        session.start_agent()
-        return session
-
-    def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: CodingAgentTask) -> None:
-        self._driver.bootstrap_sandbox(sandbox, task, self._config)
-
-
-__all__ = [
-    "CodingAgentSession",
-    "CodingAgentSessionFactory",
-    "CodingAgentTask",
-    "Verifier",
-]
diff --git a/envs/coding_agent_env/.dockerignore b/envs/opencode_env/.dockerignore
similarity index 100%
rename from envs/coding_agent_env/.dockerignore
rename to envs/opencode_env/.dockerignore
diff --git a/envs/coding_agent_env/.gitignore b/envs/opencode_env/.gitignore
similarity index 100%
rename from envs/coding_agent_env/.gitignore
rename to envs/opencode_env/.gitignore
diff --git a/envs/coding_agent_env/README.md b/envs/opencode_env/README.md
similarity index 64%
rename from envs/coding_agent_env/README.md
rename to envs/opencode_env/README.md
index 347afdd05..6840bd3fd 100644
--- a/envs/coding_agent_env/README.md
+++ b/envs/opencode_env/README.md
@@ -1,5 +1,5 @@
 ---
-title: Coding Agent Environment Server
+title: OpenCode Environment Server
 emoji: 🛠️
 colorFrom: indigo
 colorTo: purple
@@ -9,33 +9,33 @@ app_port: 8000
 base_path: /web
 tags:
   - openenv
-short_description: Multi-harness coding-agent env (OpenCode + Pi) in E2B
+short_description: OpenCode coding agent in an E2B sandbox
 ---
 
-# Coding Agent Environment for OpenEnv
+# OpenCode Environment for OpenEnv
 
-`coding_agent_env` runs coding-agent harnesses (currently
-[OpenCode](https://opencode.ai) and [Pi](https://github.com/badlogic/pi-mono))
+`opencode_env` runs the [OpenCode](https://opencode.ai) coding agent
 inside an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible
-LLM endpoint with optional trainer-owned interception for RL training.
+LLM endpoint, optionally capturing per-token logprobs through a transparent
+in-sandbox proxy for RL training data.
 
-**🚀 Try it live**: [`AdithyaSK/coding-agent-env`](https://huggingface.co/spaces/AdithyaSK/coding-agent-env)
+**🚀 Try it live**: [`AdithyaSK/opencode-env`](https://huggingface.co/spaces/AdithyaSK/opencode-env)
 
 The deployed Space exposes:
 
-- **Web UI** at [`/web`](https://adithyask-coding-agent-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward.
-- **MCP tool API** at [`/mcp`](https://adithyask-coding-agent-env.hf.space/mcp) — programmatic `run_rollout` calls.
-- **OpenAPI docs** at [`/docs`](https://adithyask-coding-agent-env.hf.space/docs).
-- **Health** at [`/health`](https://adithyask-coding-agent-env.hf.space/health).
+- **Web UI** at [`/web`](https://adithyask-opencode-env.hf.space/web) — pick endpoint, write task, hit Run, watch live phase log + reward.
+- **MCP tool API** at [`/mcp`](https://adithyask-opencode-env.hf.space/mcp) — programmatic `run_rollout` calls.
+- **OpenAPI docs** at [`/docs`](https://adithyask-opencode-env.hf.space/docs).
+- **Health** at [`/health`](https://adithyask-opencode-env.hf.space/health).
 
 The env is **task-agnostic** — every rollout is configured at call-time
 with a uniform Task shape:
 
-  - **`instruction`** — prompt for the agent
-  - **`setup`** — list of bash commands run *before* the agent (pip
+  - **`instruction`** — prompt for OpenCode
+  - **`setup`** — list of bash commands run *before* OpenCode (pip
     install, git clone, file downloads — anything you need staged in the
     sandbox)
-  - **`verify`** — list of bash commands run *after* the agent (asserts,
+  - **`verify`** — list of bash commands run *after* OpenCode (asserts,
     pytest invocations, score-file writes)
 
 Reward = `passed_verify / total_verify` unless any `verify` command writes
@@ -48,21 +48,20 @@ a float to `/home/user/logs/verifier/reward.txt` (override).
 ```python
 import asyncio
 import os
-from coding_agent_env import CodingAgentEnv
-from coding_agent_env.client import _extract_text
-from coding_agent_env.models import RolloutResult
+from opencode_env import OpenCodeEnv
+from opencode_env.client import _extract_text
+from opencode_env.models import RolloutResult
 
 
 async def main():
-    SPACE = "https://adithyask-coding-agent-env.hf.space"
+    SPACE = "https://adithyask-opencode-env.hf.space"
 
-    async with CodingAgentEnv(base_url=SPACE) as env:
+    async with OpenCodeEnv(base_url=SPACE) as env:
         await env.reset()
 
         # The MCP tool returns JSON; deserialize via the typed model.
         raw = await env.call_tool(
             "run_rollout",
-            agent="opencode",                          # opencode | pi
             endpoint="openai",                          # vllm | openai | hf_router
             api_key=os.environ["OPENAI_API_KEY"],       # or set as a Space secret
             instruction=(
@@ -77,7 +76,7 @@ async def main():
                 "import binary_search; "
                 "assert binary_search.binary_search([1,2,3], 2) == 1; print('OK')\"",
             ],
-            template="coding-agent-rl",                     # prebaked E2B template
+            template="opencode-rl",                     # prebaked E2B template
             task_id="binary_search_v1",
         )
         result = RolloutResult.model_validate_json(_extract_text(raw))
@@ -102,10 +101,10 @@ wall: 19.8 s
 
 ```python
 import os
-from coding_agent_env import CodingAgentEnv
+from opencode_env import OpenCodeEnv
 
 # .sync() returns a synchronous wrapper around the async client.
-with CodingAgentEnv(base_url="https://adithyask-coding-agent-env.hf.space").sync() as env:
+with OpenCodeEnv(base_url="https://adithyask-opencode-env.hf.space").sync() as env:
     env.reset()
     # MCP tools are reachable via env.call_tool(...) / env.step(...) sync-wrapped.
     # See the async example above for the full run_rollout signature.
@@ -120,12 +119,12 @@ For trainers that want to drive a sandbox directly without an HTTP boundary:
 
 ```python
 import os
-from coding_agent_env import (
-    CodingAgentConfig, CodingAgentSessionFactory, CodingAgentTask, E2BSandboxBackend,
+from opencode_env import (
+    OpenCodeConfig, OpenCodeSessionFactory, OpenCodeTask, E2BSandboxBackend,
 )
 
-factory = CodingAgentSessionFactory(
-    config=CodingAgentConfig(
+factory = OpenCodeSessionFactory(
+    config=OpenCodeConfig(
         provider="openai_compatible",
         base_url="https://api.openai.com/v1",
         api_key=os.environ["OPENAI_API_KEY"],
@@ -134,7 +133,7 @@ factory = CodingAgentSessionFactory(
     sandbox_backend=E2BSandboxBackend(),
     mode="interception_gate",                  # trainer-owned interception mode
 )
-session = factory.create(task=CodingAgentTask(instruction="..."))
+session = factory.create(task=OpenCodeTask(instruction="..."))
 session.wait_for_completion()
 session.close()
 ```
@@ -145,22 +144,22 @@ The Dockerfile lives at `server/Dockerfile`. Use the `openenv` CLI from
 the env root:
 
 ```bash
-cd envs/coding_agent_env
+cd envs/opencode_env
 
 openenv validate               # check pyproject.toml + openenv.yaml + server/app.py + uv.lock
-openenv build -t coding-agent-env  # builds the image (uses server/Dockerfile)
+openenv build -t opencode-env  # builds the image (uses server/Dockerfile)
 
 # run locally with E2B credentials
-docker run -p 8000:8000 -e E2B_API_KEY=e2b_... coding-agent-env
+docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
 
 # push to HF Spaces (Docker variant)
-openenv push --repo-id <user>/coding-agent-env
+openenv push --repo-id <user>/opencode-env
 ```
 
 Or build directly without the CLI:
 
 ```bash
-docker build -t coding-agent-env -f envs/coding_agent_env/server/Dockerfile envs/coding_agent_env
+docker build -t opencode-env -f envs/opencode_env/server/Dockerfile envs/opencode_env
 ```
 
 The image:
@@ -173,7 +172,7 @@ The image:
 
 ## The MCP Tool: `run_rollout`
 
-Single tool, with an ``agent`` selector plus two ways to specify the LLM endpoint:
+Single tool, with two ways to specify the LLM endpoint:
 
 **Option A — endpoint shorthand (recommended)**: pass
 `endpoint="vllm"` (or `"openai"` / `"hf_router"`). The server resolves
@@ -185,30 +184,31 @@ directly.
 
 | Arg | Type | Default | Notes |
 |---|---|---|---|
-| `agent` | `str` | `"opencode"` | Harness to run: `"opencode"` or `"pi"`. |
 | `endpoint` | `str` | `""` | One of `"vllm"` / `"openai"` / `"hf_router"`. |
 | `base_url` / `api_key` / `model` | `str` | `""` | Override / supply explicitly. |
-| `instruction` | `str` | required | Prompt passed to the selected harness CLI. |
-| `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. |
-| `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. |
+| `instruction` | `str` | required | Prompt passed to OpenCode. |
+| `setup` | `list[str]` | `[]` | Bash commands run **before** OpenCode. |
+| `verify` | `list[str]` | `[]` | Bash commands run **after** OpenCode. |
 | `task_id` | `str` | `""` | Echoed back in result. |
-| `mode` | `str` | `"black_box"` | Or `"interception_gate"` for trainer-owned generation. |
+| `mode` | `str` | `"transparent_proxy"` | Or `"black_box"` for direct LLM calls. In-process trainers can also construct `OpenCodeSessionFactory(mode="interception_gate", ...)`. |
 | `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. |
 | `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. |
-| `top_logprobs` | `int` | `5` | Reserved for trainer-owned interception workflows. |
-| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for the selected harness. |
-| `template` | `str` | `""` | E2B template name; `"coding-agent-rl"` skips ~2 min of install per rollout. |
+| `top_logprobs` | `int` | `5` | Per-token top-k logprobs requested in `transparent_proxy` mode. |
+| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for OpenCode. |
+| `template` | `str` | `""` | E2B template name; `"opencode-rl"` skips ~2 min of install per rollout. |
 
 Returns `RolloutResult` JSON with: `reward`, `setup_results[]`,
-`verify_results[]`, `files{}`, `agent_log_tail`, `wall_s`,
+`verify_results[]`, `proxy_turns[]` (logprob records in transparent-proxy
+mode), `files{}`, `agent_log_tail`, `proxy_log_tail`, `wall_s`,
 `agent_exit_code`, `sandbox_id`, `error`.
 
 ## Two Operating Modes
 
 | Mode | What it does | Best for |
 |---|---|---|
-| **`black_box`** (default) | The selected harness talks directly to `base_url`. | Smoke tests, eval, SFT data collection. |
-| **`interception_gate`** | Agent calls are routed through trainer-host interception endpoints. Trainer owns forward pass + trajectory capture. | RL training with trainer-owned generation. |
+| **`transparent_proxy`** (default) | OpenCode talks to an in-sandbox proxy. The proxy forwards to `base_url`, requests logprobs, strips them before returning to OpenCode, and records `proxy_turns`. | RL data collection, GRPO-style traces. |
+| **`black_box`** | OpenCode talks directly to `base_url`. No logprob capture. | Smoke tests, eval, SFT data collection. |
+| **`interception_gate`** | Available through the in-process `OpenCodeSessionFactory`; OpenCode calls are routed through trainer-host interception endpoints. | Trainer-owned generation. |
 
 ## Environment Variables
 
@@ -237,20 +237,20 @@ sibling `.env` file; on HF Spaces, set them as **Space secrets**.
 ## Pre-baked E2B Template
 
 The first rollout in a fresh E2B sandbox spends ~2 min installing
-harness tooling. Build a one-time template that ships those pre-installed:
+OpenCode tooling. Build a one-time template that ships it pre-installed:
 
 ```bash
-.venv/bin/python envs/coding_agent_env/sandbox/build_template.py
-# → builds `coding-agent-rl` template in your E2B account (~1m20s, one-time)
+.venv/bin/python envs/opencode_env/sandbox/build_template.py
+# → builds `opencode-rl` template in your E2B account (~1m20s, one-time)
 ```
 
-After this, pass `template="coding-agent-rl"` on every `run_rollout` call —
+After this, pass `template="opencode-rl"` on every `run_rollout` call —
 each rollout drops to ~20–30s end-to-end.
 
 ## Project Structure
 
 ```
-coding_agent_env/
+opencode_env/
 ├── README.md                       # this file
 ├── openenv.yaml                    # OpenEnv space spec
 ├── pyproject.toml                  # deps + ``server`` entrypoint
@@ -258,18 +258,18 @@ coding_agent_env/
 ├── .gitignore / .dockerignore      # excludes .env / __pycache__
 ├── __init__.py                     # re-exports primitive + client + models
 │
-├── client.py                       # CodingAgentEnv(MCPToolClient)
-├── models.py                       # RolloutResult / CodingAgentState
+├── client.py                       # OpenCodeEnv(MCPToolClient)
+├── models.py                       # RolloutResult / OpenCodeState
 │
-├── config.py                       # CodingAgentConfig (primitive)
-├── harness.py                      # CodingAgentSession / CodingAgentSessionFactory (CLI-only)
+├── config.py                       # OpenCodeConfig (primitive)
+├── harness.py                      # OpenCodeSession / OpenCodeSessionFactory (CLI-only)
 ├── opencode_runtime.py             # opencode.json builder + cmds
-├── task.py                         # CodingAgentTask
+├── task.py                         # OpenCodeTask
 │
 ├── server/
 │   ├── __init__.py
 │   ├── app.py                      # FastAPI factory; mounts Gradio at /web
-│   ├── coding_environment.py      # MCPEnvironment with single ``run_rollout`` tool
+│   ├── opencode_environment.py    # MCPEnvironment with single ``run_rollout`` tool
 │   ├── gradio_ui.py                # the /web Gradio Blocks UI
 │   ├── catalog.py                  # endpoint shorthand resolver
 │   └── Dockerfile                  # multi-stage uv build (used by ``openenv build``)
@@ -291,6 +291,5 @@ src/openenv/core/harness/sandbox/
 
 - [OpenEnv docs](https://meta-pytorch.org/OpenEnv/)
 - [OpenCode CLI](https://opencode.ai/docs/cli/)
-- [Pi](https://github.com/badlogic/pi-mono)
 - [E2B Python SDK](https://e2b.dev/docs)
 
diff --git a/envs/coding_agent_env/__init__.py b/envs/opencode_env/__init__.py
similarity index 59%
rename from envs/coding_agent_env/__init__.py
rename to envs/opencode_env/__init__.py
index bc04e7236..ea72f4fe5 100644
--- a/envs/coding_agent_env/__init__.py
+++ b/envs/opencode_env/__init__.py
@@ -4,29 +4,29 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Coding-agent environment for OpenEnv.
+"""OpenCode environment for OpenEnv.
 
 Two layers in this package:
 
-1. **Harness primitive** -- :class:`CodingAgentSessionFactory` /
-   :class:`CodingAgentSession` / :class:`CodingAgentConfig` /
+1. **Harness primitive** -- :class:`OpenCodeSessionFactory` /
+   :class:`OpenCodeSession` / :class:`OpenCodeConfig` /
    :class:`E2BSandboxBackend`. Built on the generic
    :class:`CLIAgentDriver` from ``openenv.core.harness.agents``.
 
-2. **Deployable env** -- :class:`CodingAgentEnv` (MCP client) talks to the
+2. **Deployable env** -- :class:`OpenCodeEnv` (MCP client) talks to the
    FastAPI server at ``server/app.py`` over HTTP. Use this when the
-   sandbox + agent live behind an HTTP boundary (e.g. an HF Space).
+   sandbox + OpenCode live behind an HTTP boundary (e.g. an HF Space).
    See ``client.py`` and ``server/``.
 """
 
 from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
 from openenv.core.harness.sandbox import SandboxBackend, SandboxHandle
 
-from .client import CodingAgentEnv
-from .config import CodingAgentConfig, Provider
-from .harness import CodingAgentSession, CodingAgentSessionFactory
-from .models import CommandResult, CodingAgentState, RolloutResult
-from .task import CodingAgentTask
+from .client import OpenCodeEnv
+from .config import OpenCodeConfig, Provider
+from .harness import OpenCodeSession, OpenCodeSessionFactory
+from .models import CommandResult, OpenCodeState, RolloutResult, RolloutTurn
+from .task import OpenCodeTask
 
 try:
     from openenv.core.harness.sandbox import E2BSandboxBackend
@@ -35,18 +35,19 @@
 
 __all__ = [
     # Deployed-env client
-    "CodingAgentEnv",
+    "OpenCodeEnv",
     "CallToolAction",
     "ListToolsAction",
     # HTTP API models
     "CommandResult",
-    "CodingAgentState",
+    "OpenCodeState",
     "RolloutResult",
+    "RolloutTurn",
     # Harness primitive
-    "CodingAgentConfig",
-    "CodingAgentSession",
-    "CodingAgentSessionFactory",
-    "CodingAgentTask",
+    "OpenCodeConfig",
+    "OpenCodeSession",
+    "OpenCodeSessionFactory",
+    "OpenCodeTask",
     "Provider",
     # Sandbox backend
     "E2BSandboxBackend",
diff --git a/envs/coding_agent_env/client.py b/envs/opencode_env/client.py
similarity index 78%
rename from envs/coding_agent_env/client.py
rename to envs/opencode_env/client.py
index 492060a25..e11599b5e 100644
--- a/envs/coding_agent_env/client.py
+++ b/envs/opencode_env/client.py
@@ -4,17 +4,17 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Client for the deployed coding_agent_env server.
+"""Client for the deployed opencode_env server.
 
-The server exposes a single MCP tool ``run_rollout`` that runs one coding-agent
-rollout (OpenCode or Pi) in an E2B sandbox and returns a JSON-serialized
+The server exposes a single MCP tool ``run_rollout`` that runs one OpenCode
+rollout in an E2B sandbox and returns a JSON-serialized
 :class:`RolloutResult`.
 
 Example::
 
-    from coding_agent_env import CodingAgentEnv
+    from opencode_env import OpenCodeEnv
 
-    with CodingAgentEnv(base_url="https://your-space.hf.space") as env:
+    with OpenCodeEnv(base_url="https://your-space.hf.space") as env:
         env.reset()
         result = env.run_rollout(
             base_url="https://api.openai.com/v1",
@@ -41,8 +41,8 @@
     from models import RolloutResult  # type: ignore
 
 
-class CodingAgentEnv(MCPToolClient):
-    """Typed client for the coding_agent_env MCP server.
+class OpenCodeEnv(MCPToolClient):
+    """Typed client for the opencode_env MCP server.
 
     Inherits ``reset`` / ``call_tool`` / ``list_tools`` / ``from_docker_image``
     / context-manager semantics from :class:`MCPToolClient`.
@@ -51,8 +51,7 @@ class CodingAgentEnv(MCPToolClient):
     def run_rollout(
         self,
         *,
-        # Agent + endpoint — pass either shorthand endpoint or explicit fields.
-        agent: str = "opencode",  # "opencode" | "pi"
+        # Endpoint — pass either shorthand endpoint or explicit fields.
         endpoint: str = "",  # "vllm" | "openai" | "hf_router"
         base_url: str = "",
         api_key: str = "",
@@ -63,50 +62,48 @@ def run_rollout(
         verify: list[str] | None = None,
         # Bookkeeping / tunables
         task_id: str = "",
-        mode: str = "black_box",
+        mode: str = "transparent_proxy",
         disable_thinking: bool | None = None,
         max_tokens_cap: int = 4096,
         top_logprobs: int = 5,
         agent_timeout_s: float = 600.0,
         template: str = "",
     ) -> RolloutResult:
-        """Run one coding-agent rollout and return the typed result.
+        """Run one OpenCode rollout and return the typed result.
 
         Args:
-            agent: Harness CLI to run in sandbox (``"opencode"`` or ``"pi"``).
             base_url: OpenAI-compatible LLM endpoint (with trailing /v1).
             api_key: Bearer token for the LLM. Use ``"intercepted"`` for vLLM
                 if it doesn't enforce auth.
             model: Model id understood by the LLM endpoint
                 (e.g. ``"gpt-4o-mini"``, ``"Qwen/Qwen3.5-4B"``,
                 ``"Qwen/Qwen3-4B-Instruct-2507:nscale"``).
-            instruction: Prompt passed to the selected harness CLI.
-            setup: Bash commands run sequentially **before** the agent starts.
+            instruction: Prompt passed to OpenCode.
+            setup: Bash commands run sequentially **before** OpenCode starts.
                 Each command runs in the sandbox; non-zero exit aborts setup.
-            verify: Bash commands run sequentially **after** the agent exits.
+            verify: Bash commands run sequentially **after** OpenCode exits.
                 Reward = ``passed_count / total`` unless any command writes a
                 float to ``/home/user/logs/verifier/reward.txt`` (override).
             task_id: Echoed back in the result for traceability.
-            mode: ``"black_box"`` (agent talks directly to the LLM) or
-                ``"interception_gate"`` (LLM calls routed to trainer-side
-                InterceptionServer for trainer-owned generation).
+            mode: ``"transparent_proxy"`` (default, captures logprobs) or
+                ``"black_box"`` (OpenCode talks directly to the LLM).
             disable_thinking: Inject
                 ``chat_template_kwargs.enable_thinking=false`` on forwarded
                 requests. Needed for Qwen3.5 vLLM; harmless on Instruct
                 variants; rejected by OpenAI direct.
             max_tokens_cap: Clamp on per-turn ``max_tokens``.
-            top_logprobs: Reserved for trainer-owned interception workflows.
-            agent_timeout_s: Hard wall-clock budget for one agent run.
-            template: E2B template name (e.g. ``"coding-agent-rl"``). Empty
+            top_logprobs: Per-token top-k logprobs requested in
+                ``transparent_proxy`` mode.
+            agent_timeout_s: Hard wall-clock budget for one OpenCode run.
+            template: E2B template name (e.g. ``"opencode-rl"``). Empty
                 string uses the default (slow) base image.
 
         Returns:
-            A :class:`RolloutResult` with reward, file outputs,
+            A :class:`RolloutResult` with reward, proxy_turns, file outputs,
             setup/verify results, and diagnostic tails.
         """
         raw = self.call_tool(
             "run_rollout",
-            agent=agent,
             endpoint=endpoint,
             base_url=base_url,
             api_key=api_key,
diff --git a/envs/coding_agent_env/config.py b/envs/opencode_env/config.py
similarity index 78%
rename from envs/coding_agent_env/config.py
rename to envs/opencode_env/config.py
index d70610542..1e1a8b167 100644
--- a/envs/coding_agent_env/config.py
+++ b/envs/opencode_env/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Configuration model for the coding-agent harness primitive."""
+"""Configuration model for the OpenCode harness primitive."""
 
 from __future__ import annotations
 
@@ -16,8 +16,8 @@
 Provider = Literal["openai_compatible", "openai", "anthropic"]
 
 
-class CodingAgentConfig(BaseModel):
-    """All configuration required to launch one coding-agent rollout in a sandbox.
+class OpenCodeConfig(BaseModel):
+    """All configuration required to launch one OpenCode rollout in a sandbox.
 
     Field names are provider-agnostic. The primitive maps ``provider`` onto the
     correct ``opencode.json`` provider block (``@ai-sdk/openai-compatible``,
@@ -46,9 +46,18 @@ class CodingAgentConfig(BaseModel):
     extra_setup_shell: str | None = None
 
     # --- Model behavior --------------------------------------------------------
+    # Direct OpenCode config knobs (black_box / interception_gate).
     disable_thinking: bool = False
     max_tokens_cap: int | None = None
 
+    # --- Transparent-proxy logprob capture ------------------------------------
+    # Compatibility knobs for the HTTP env's logprob-capturing mode. The proxy
+    # requests OpenAI-compatible logprobs upstream, records them, and strips
+    # them before returning the response to OpenCode.
+    proxy_max_tokens_cap: int | None = 16384
+    proxy_top_logprobs: int = 5
+    proxy_disable_thinking: bool = False
+
     # --- Sandbox paths --------------------------------------------------------
     # Root directory inside the sandbox where the primitive writes config,
     # task files, and logs. E2B's default user is ``user`` with home
diff --git a/envs/opencode_env/harness.py b/envs/opencode_env/harness.py
new file mode 100644
index 000000000..ca5c294c2
--- /dev/null
+++ b/envs/opencode_env/harness.py
@@ -0,0 +1,342 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenCode session factory + session backed by CLIAgentDriver."""
+
+from __future__ import annotations
+
+import json
+import queue as _queue_mod
+import shlex
+import uuid
+from pathlib import Path
+from typing import Any, Literal
+
+from openenv.core.harness import ResourceSessionFactory
+from openenv.core.harness.agents.cli_driver import (
+    CLIAgentDriver,
+    CLIAgentSession,
+    Verifier,
+    build_interception_rollout_url,
+)
+from openenv.core.harness.agents.interception_server import InterceptionServer
+from openenv.core.harness.agents.opencode import OPENCODE_SPEC
+from openenv.core.harness.sandbox import BgJob, SandboxBackend, SandboxHandle
+
+from .config import OpenCodeConfig
+from .opencode_runtime import (
+    agent_log_path,
+    build_env_vars,
+    build_opencode_json,
+    build_run_cmd,
+    opencode_config_path,
+)
+from .task import OpenCodeTask
+
+
+# In-sandbox proxy port and trace/log paths, plus the host-side proxy source file.
+_PROXY_PORT = 7000
+_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl"
+_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log"
+_PROXY_SOURCE_PATH = Path(__file__).parent / "sandbox" / "interception.py"
+
+
+class OpenCodeSession(CLIAgentSession):
+    def __init__(
+        self,
+        *,
+        sandbox: SandboxHandle,
+        config: OpenCodeConfig,
+        task: OpenCodeTask,
+        verifier: Verifier | None = None,
+        base_url_override: str | None = None,
+        agent_bg_job: BgJob | None = None,
+        proxy_trace_path: str | None = None,
+        proxy_bg_job: BgJob | None = None,
+        interception_server: InterceptionServer | None = None,
+        interception_rollout_id: str | None = None,
+        interception_queue: _queue_mod.Queue[str | None] | None = None,
+    ) -> None:
+        super().__init__(
+            spec=OPENCODE_SPEC,
+            sandbox=sandbox,
+            task=task,
+            config=config,
+            verifier=verifier,
+            base_url_override=base_url_override,
+            agent_bg_job=agent_bg_job,
+            interception_server=interception_server,
+            interception_rollout_id=interception_rollout_id,
+            interception_queue=interception_queue,
+        )
+        self._proxy_trace_path = proxy_trace_path
+        self._proxy_bg_job = proxy_bg_job
+
+    def fetch_trace(self) -> str:
+        return self.sandbox.read_text(agent_log_path(self.config))
+
+    def fetch_proxy_trace(self) -> list[dict[str, Any]]:
+        """Return the proxy's per-turn JSONL records, or [] if unavailable."""
+        if self._proxy_trace_path is None:  # no proxy trace for this session
+            return []
+        try:
+            content = self.sandbox.read_text(self._proxy_trace_path)
+        except Exception:  # best-effort: treat an unreadable trace as empty
+            return []
+        records: list[dict[str, Any]] = []
+        for line in filter(None, map(str.strip, content.splitlines())):
+            try:
+                records.append(json.loads(line))
+            except json.JSONDecodeError:
+                continue  # e.g. a record the proxy is still writing; skip it
+        return records
+
+    def close(self) -> None:
+        if self._proxy_bg_job is not None:
+            try:
+                self._proxy_bg_job.kill()
+            except Exception:
+                pass
+            self._proxy_bg_job = None
+        super().close()
+
+    def wait_for_completion(self, timeout_s: float | None = None) -> int:
+        budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s
+        if self._agent_bg_job is None:
+            raise RuntimeError("Agent not started.")
+        return self._agent_bg_job.wait(timeout=budget)
+
+    def start_agent(self) -> None:
+        if self._agent_bg_job is not None:
+            return
+        cmd = build_run_cmd(self.config)
+        envs = build_env_vars(self.config, base_url_override=self._base_url_override)
+        self._agent_bg_job = self.sandbox.start_bg(cmd, envs=envs)
+
+
+class OpenCodeSessionFactory(ResourceSessionFactory):
+    def __init__(
+        self,
+        *,
+        config: OpenCodeConfig,
+        sandbox_backend: SandboxBackend,
+        mode: Literal[
+            "black_box", "transparent_proxy", "interception_gate"
+        ] = "transparent_proxy",
+        verifier: Verifier | None = None,
+        install_timeout_s: int = 240,
+        setup_timeout_s: int = 300,
+        interception_server: InterceptionServer | None = None,
+        interception_base_url: str | None = None,
+    ) -> None:
+        if mode not in {"black_box", "transparent_proxy", "interception_gate"}:
+            raise ValueError(f"Unknown mode: {mode!r}")
+        self._config = config
+        self._backend = sandbox_backend
+        self._mode = mode
+        self._verifier = verifier
+        driver_mode: Literal["black_box", "interception_gate"] = (
+            "black_box" if mode == "transparent_proxy" else mode
+        )
+        self._driver = CLIAgentDriver(
+            spec=OPENCODE_SPEC,
+            sandbox_backend=sandbox_backend,
+            mode=driver_mode,
+            install_timeout_s=install_timeout_s,
+            setup_timeout_s=setup_timeout_s,
+            interception_server=interception_server,
+            interception_base_url=interception_base_url,
+        )
+
+    def create(
+        self,
+        task: Any,
+        seed: int | None = None,
+        episode_id: str | None = None,
+    ) -> OpenCodeSession:
+        """Provision a sandbox, start OpenCode in it, and return the session.
+
+        ``seed`` is accepted but not used by this implementation. The sandbox is
+        killed when any step after its creation raises, so a failed call does
+        not leave it running.
+        """
+        import logging
+
+        _log = logging.getLogger(__name__)
+        oc_task = OpenCodeTask.coerce(task)
+        shells = (self._config.extra_setup_shell, oc_task.setup_shell)
+        setup_parts = [shell for shell in shells if shell]
+        if setup_parts:
+            oc_task = oc_task.model_copy(
+                update={"setup_shell": "set -e\n" + "\n".join(setup_parts)}
+            )
+
+        sandbox = self._backend.create(
+            timeout_s=int(self._config.agent_timeout_s) + 300,
+            metadata={"episode_id": episode_id} if episode_id else None,
+        )
+        base_url_override: str | None = None
+        interception_rollout_id: str | None = None
+        interception_queue: _queue_mod.Queue[str | None] | None = None
+        proxy_trace_path: str | None = None
+        proxy_bg_job: BgJob | None = None
+        try:
+            self._bootstrap_sandbox(sandbox, oc_task)
+            if self._mode == "interception_gate":
+                interception_server = self._driver._interception_server
+                if interception_server is None:
+                    raise RuntimeError(
+                        "interception_gate mode requires an InterceptionServer"
+                    )
+                interception_base_url = self._driver._interception_base_url
+                if interception_base_url is None:
+                    raise RuntimeError(
+                        "interception_gate mode requires interception_base_url"
+                    )
+                rollout_id = episode_id or f"rollout_{uuid.uuid4().hex[:8]}"
+                interception_rollout_id = rollout_id
+                interception_queue = interception_server.register_rollout(rollout_id)
+                base_url_override = build_interception_rollout_url(
+                    interception_base_url, rollout_id
+                )
+            elif self._mode == "transparent_proxy":
+                proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy(
+                    sandbox
+                )
+
+            run_config = self._config
+            if base_url_override is not None:
+                api_key = self._config.api_key
+                if self._mode == "interception_gate":
+                    assert self._driver._interception_server is not None
+                    api_key = self._driver._interception_server.secret
+                run_config = self._config.model_copy(
+                    update={
+                        "provider": "openai_compatible",
+                        "base_url": base_url_override,
+                        "api_key": api_key,
+                    }
+                )
+            sandbox.write_text(
+                opencode_config_path(self._config),
+                build_opencode_json(run_config),
+            )
+            agent_bg_job = self._driver._start_agent(
+                sandbox,
+                oc_task,
+                run_config,
+                base_url_override=base_url_override,
+            )
+        except Exception as exc:
+            # Nothing has been handed to the caller yet, so release the sandbox here.
+            _log.error("factory.create: failed after sandbox creation: %r", exc)
+            sandbox.kill()
+            raise
+
+        return OpenCodeSession(
+            sandbox=sandbox,
+            config=run_config,
+            task=oc_task,
+            verifier=self._verifier,
+            base_url_override=base_url_override,
+            agent_bg_job=agent_bg_job,
+            proxy_trace_path=proxy_trace_path,
+            proxy_bg_job=proxy_bg_job,
+            interception_server=self._driver._interception_server,
+            interception_rollout_id=interception_rollout_id,
+            interception_queue=interception_queue,
+        )
+
+    def _start_proxy(
+        self,
+        sandbox: SandboxHandle,
+    ) -> tuple[BgJob, str, str]:
+        """Start the in-sandbox logprob-capturing proxy."""
+        proxy_already_present = sandbox.exists("/home/user/proxy/interception.py")
+
+        if not proxy_already_present:
+            self._driver._exec_with_retry(
+                sandbox,
+                "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
+                "'httpx>=0.27' 2>&1 | tail -20",
+                timeout=180,
+                attempts=3,
+                backoff_s=2.0,
+                label="proxy deps install",
+            )
+            sandbox.write_text(
+                "/home/user/proxy/interception.py",
+                _PROXY_SOURCE_PATH.read_text(),
+            )
+            sandbox.write_text("/home/user/proxy/__init__.py", "")
+
+        proxy_args = [
+            "python",
+            "interception.py",
+            "--upstream-url",
+            self._config.base_url,
+            "--trace",
+            _PROXY_TRACE_PATH,
+            "--port",
+            str(_PROXY_PORT),
+            "--top-logprobs",
+            str(self._config.proxy_top_logprobs),
+        ]
+        if self._config.proxy_max_tokens_cap is not None:
+            proxy_args.extend(
+                ["--max-tokens-cap", str(self._config.proxy_max_tokens_cap)]
+            )
+        if self._config.proxy_disable_thinking:
+            proxy_args.append("--disable-thinking")
+        if self._config.model:
+            proxy_args.extend(["--model-override", self._config.model])
+
+        quoted_proxy_args = " ".join(shlex.quote(arg) for arg in proxy_args)
+        proxy_cmd = (
+            "cd /home/user/proxy && "
+            f"{quoted_proxy_args} "
+            f"> {shlex.quote(_PROXY_LOG_PATH)} 2>&1"
+        )
+        proxy_env = {"OPENCODE_UPSTREAM_API_KEY": self._config.api_key}
+        proxy_job = sandbox.start_bg(proxy_cmd, envs=proxy_env)
+
+        import time
+
+        attempts = 120
+        interval_s = 0.5
+        for _ in range(attempts):
+            r = sandbox.exec(
+                f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz",
+                timeout=5,
+            )
+            if r.exit_code == 0:
+                break
+            time.sleep(interval_s)
+        else:
+            log = ""
+            try:
+                log = sandbox.read_text(_PROXY_LOG_PATH)
+            except Exception:
+                pass
+            proxy_job.kill()
+            raise RuntimeError(
+                f"proxy did not start within {attempts * interval_s:.0f}s. "
+                f"log:\n{log[-2000:]}"
+            )
+
+        base_url_override = f"http://127.0.0.1:{_PROXY_PORT}/v1"
+        return proxy_job, base_url_override, _PROXY_TRACE_PATH
+
+    def _bootstrap_sandbox(self, sandbox: SandboxHandle, task: OpenCodeTask) -> None:
+        self._driver.bootstrap_sandbox(sandbox, task, self._config)
+
+
+__all__ = [
+    "OpenCodeSession",
+    "OpenCodeSessionFactory",
+    "OpenCodeTask",
+    "Verifier",
+]
diff --git a/envs/coding_agent_env/models.py b/envs/opencode_env/models.py
similarity index 67%
rename from envs/coding_agent_env/models.py
rename to envs/opencode_env/models.py
index 2bf19925e..d2b023839 100644
--- a/envs/coding_agent_env/models.py
+++ b/envs/opencode_env/models.py
@@ -4,20 +4,36 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Pydantic models for the deployed coding_agent_env HTTP server.
+"""Pydantic models for the deployed opencode_env HTTP server.
 
 The server exposes a single MCP tool ``run_rollout`` that takes a Task
 (instruction + setup commands + verify commands) plus an LLM endpoint
-config, runs one coding-agent rollout end-to-end inside an E2B sandbox, and
+config, runs one OpenCode rollout end-to-end inside an E2B sandbox, and
 returns a :class:`RolloutResult` JSON.
 """
 
 from __future__ import annotations
 
+from typing import Any
+
 from openenv.core.env_server.types import State
 from pydantic import BaseModel, Field
 
 
+class RolloutTurn(BaseModel):
+    """One intercepted LLM turn captured by transparent-proxy mode."""
+
+    turn: int  # the only required field; everything below has a default
+    finish_reason: str | None = None
+    completion_tokens: list[str] = Field(default_factory=list)
+    completion_token_ids: list[int] = Field(default_factory=list)
+    per_token_logps: list[float] = Field(default_factory=list)
+    latency_s: float = 0.0
+    timestamp: float = 0.0
+    upstream_status: int | None = None  # NOTE(review): presumably set only for failed upstream calls — confirm
+    upstream_error: dict[str, Any] | None = None
+
+
 class CommandResult(BaseModel):
     """Outcome of one bash command in setup/verify.
 
@@ -50,19 +66,23 @@ class RolloutResult(BaseModel):
     setup_results: list[CommandResult] = Field(default_factory=list)
     verify_results: list[CommandResult] = Field(default_factory=list)
 
+    # Per-turn LLM trajectory (empty outside transparent_proxy mode)
+    proxy_turns: list[RolloutTurn] = Field(default_factory=list)
+
     # Filesystem the agent produced (path -> contents, truncated)
     files: dict[str, str] = Field(default_factory=dict)
     files_extra: list[str] = Field(default_factory=list)
 
     # Diagnostic tails
     agent_log_tail: str = ""
+    proxy_log_tail: str = ""
 
     # Error surfacing
     error: str | None = None
 
 
-class CodingAgentState(State):
-    """Per-session env state across calls to one CodingAgentEnvironment instance.
+class OpenCodeState(State):
+    """Per-session env state across calls to one OpenCodeEnvironment instance.
 
     Each HTTP session gets its own env (``SUPPORTS_CONCURRENT_SESSIONS=True``
     on the server class), so this state is per-session.
diff --git a/envs/coding_agent_env/opencode_runtime.py b/envs/opencode_env/opencode_runtime.py
similarity index 87%
rename from envs/coding_agent_env/opencode_runtime.py
rename to envs/opencode_env/opencode_runtime.py
index 31285556e..0f1484e3a 100644
--- a/envs/coding_agent_env/opencode_runtime.py
+++ b/envs/opencode_env/opencode_runtime.py
@@ -16,34 +16,34 @@
 import json
 from typing import Any
 
-from .config import CodingAgentConfig, provider_npm_package
+from .config import OpenCodeConfig, provider_npm_package
 
 
-def opencode_config_path(config: CodingAgentConfig) -> str:
+def opencode_config_path(config: OpenCodeConfig) -> str:
     return f"{config.sandbox_home}/.config/opencode/opencode.json"
 
 
-def instruction_path(config: CodingAgentConfig) -> str:
+def instruction_path(config: OpenCodeConfig) -> str:
     return f"{config.sandbox_home}/task/instruction.md"
 
 
-def agent_log_path(config: CodingAgentConfig) -> str:
+def agent_log_path(config: OpenCodeConfig) -> str:
     return f"{config.sandbox_home}/logs/agent/opencode.jsonl"
 
 
-def system_prompt_path(config: CodingAgentConfig) -> str:
+def system_prompt_path(config: OpenCodeConfig) -> str:
     return f"{config.sandbox_home}/task/system.md"
 
 
-def verifier_reward_path(config: CodingAgentConfig) -> str:
+def verifier_reward_path(config: OpenCodeConfig) -> str:
     return f"{config.sandbox_home}/logs/verifier/reward.txt"
 
 
-def workdir_path(config: CodingAgentConfig) -> str:
+def workdir_path(config: OpenCodeConfig) -> str:
     return f"{config.sandbox_home}/workdir"
 
 
-def build_opencode_json(config: CodingAgentConfig) -> str:
+def build_opencode_json(config: OpenCodeConfig) -> str:
     """Return the serialized ``opencode.json`` the sandbox should install.
 
     Provider block is keyed by a stable internal name (``intercepted``) so the
@@ -90,7 +90,7 @@ def build_opencode_json(config: CodingAgentConfig) -> str:
     return json.dumps(doc, indent=2)
 
 
-def build_install_cmd(config: CodingAgentConfig) -> str:
+def build_install_cmd(config: OpenCodeConfig) -> str:
     """Return the shell command that installs OpenCode + ensures PATH.
 
     The upstream installer honors ``OPENCODE_VERSION=x.y.z`` for pinning;
@@ -110,7 +110,7 @@ def build_install_cmd(config: CodingAgentConfig) -> str:
     )
 
 
-def build_run_cmd(config: CodingAgentConfig) -> str:
+def build_run_cmd(config: OpenCodeConfig) -> str:
     """Return the shell command that launches OpenCode against a task."""
 
     format_flag = "--format json" if config.run_format == "json" else ""
@@ -123,7 +123,7 @@ def build_run_cmd(config: CodingAgentConfig) -> str:
 
 
 def build_env_vars(
-    config: CodingAgentConfig, *, base_url_override: str | None = None
+    config: OpenCodeConfig, *, base_url_override: str | None = None
 ) -> dict[str, str]:
     """Return env vars to set on the OpenCode process.
 
@@ -140,7 +140,7 @@ def build_env_vars(
     return env
 
 
-def _build_tools_block(config: CodingAgentConfig) -> dict[str, bool]:
+def _build_tools_block(config: OpenCodeConfig) -> dict[str, bool]:
     """Translate enabled/disabled lists into opencode's ``tools`` map."""
 
     if config.enabled_tools is not None:
diff --git a/envs/coding_agent_env/openenv.yaml b/envs/opencode_env/openenv.yaml
similarity index 76%
rename from envs/coding_agent_env/openenv.yaml
rename to envs/opencode_env/openenv.yaml
index be34c3a51..2a534a088 100644
--- a/envs/coding_agent_env/openenv.yaml
+++ b/envs/opencode_env/openenv.yaml
@@ -1,5 +1,5 @@
 spec_version: 1
-name: coding_agent_env
+name: opencode_env
 type: space
 runtime: fastapi
 app: server.app:app
diff --git a/envs/coding_agent_env/pyproject.toml b/envs/opencode_env/pyproject.toml
similarity index 71%
rename from envs/coding_agent_env/pyproject.toml
rename to envs/opencode_env/pyproject.toml
index d935a0bf5..a72ade07d 100644
--- a/envs/coding_agent_env/pyproject.toml
+++ b/envs/opencode_env/pyproject.toml
@@ -9,9 +9,9 @@ requires = ["setuptools>=45", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "openenv-coding-agent-env"
+name = "openenv-opencode-env"
 version = "0.1.0"
-description = "Coding-agent environment for OpenEnv — runs OpenCode/Pi harnesses in an E2B sandbox against OpenAI-compatible LLM endpoints."
+description = "OpenCode environment for OpenEnv — runs the OpenCode CLI in an E2B sandbox against OpenAI-compatible LLM endpoints."
 requires-python = ">=3.10"
 dependencies = [
     # Core OpenEnv (server + MCP). 0.3.0 ships the harness runtime.
@@ -40,16 +40,16 @@ dev = [
 
 [project.scripts]
 # Server entrypoint — enables ``uv run --project . server``.
-server = "coding_agent_env.server.app:main"
+server = "opencode_env.server.app:main"
 
 [tool.setuptools]
 include-package-data = true
 packages = [
-    "coding_agent_env",
-    "coding_agent_env.sandbox",
-    "coding_agent_env.server",
+    "opencode_env",
+    "opencode_env.sandbox",
+    "opencode_env.server",
 ]
-package-dir = { "coding_agent_env" = ".", "coding_agent_env.sandbox" = "sandbox", "coding_agent_env.server" = "server" }
+package-dir = { "opencode_env" = ".", "opencode_env.sandbox" = "sandbox", "opencode_env.server" = "server" }
 
 [tool.setuptools.package-data]
-coding_agent_env = ["**/*.md"]
+opencode_env = ["**/*.md"]
diff --git a/envs/coding_agent_env/sandbox/__init__.py b/envs/opencode_env/sandbox/__init__.py
similarity index 100%
rename from envs/coding_agent_env/sandbox/__init__.py
rename to envs/opencode_env/sandbox/__init__.py
diff --git a/envs/coding_agent_env/sandbox/build_template.py b/envs/opencode_env/sandbox/build_template.py
similarity index 94%
rename from envs/coding_agent_env/sandbox/build_template.py
rename to envs/opencode_env/sandbox/build_template.py
index 01978b520..67cf0756d 100644
--- a/envs/coding_agent_env/sandbox/build_template.py
+++ b/envs/opencode_env/sandbox/build_template.py
@@ -60,10 +60,10 @@ def build_template(name: str, *, skip_cache: bool = False) -> str:
 
 def main(argv: list[str] | None = None) -> int:
     p = argparse.ArgumentParser(prog="build_e2b_template")
-    p.add_argument("--name", default="coding-agent-rl")
+    p.add_argument("--name", default="opencode-rl")
     p.add_argument("--skip-cache", action="store_true")
     args = p.parse_args(argv)
-    _load_env(_REPO_ROOT / "envs" / "coding_agent_env" / "sandbox" / ".env")
+    _load_env(_REPO_ROOT / "envs" / "opencode_env" / "sandbox" / ".env")
     if not os.environ.get("E2B_API_KEY"):
         print("ERROR: E2B_API_KEY required.", file=sys.stderr)
         return 2
diff --git a/envs/opencode_env/sandbox/interception.py b/envs/opencode_env/sandbox/interception.py
new file mode 100644
index 000000000..131d41024
--- /dev/null
+++ b/envs/opencode_env/sandbox/interception.py
@@ -0,0 +1,661 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Transparent OpenAI-compatible forwarding proxy with logprob capture.
+
+The proxy is a small FastAPI app that OpenCode talks to instead of the upstream
+LLM endpoint. It:
+
+1. Forwards every ``POST /v1/chat/completions`` request to the real upstream
+   URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream
+   returns per-token logprobs.
+2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines
+   trace file.
+3. Returns the upstream response to OpenCode: non-streaming replies have the
+   ``logprobs`` field stripped; SSE streams are relayed line by line as-is.
+
+The proxy is stateless beyond the trace file. One proxy instance runs per
+session, normally inside the sandbox on ``localhost:7000``.
+
+Run standalone::
+
+    OPENCODE_UPSTREAM_API_KEY=... python -m opencode_env.sandbox.interception \\
+        --upstream-url https://vllm.example/v1 \\
+        --trace /tmp/trace.jsonl \\
+        --port 7000
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import copy
+import json
+import logging
+import os
+import socket
+import threading
+import time
+from contextlib import asynccontextmanager, closing
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import httpx
+import uvicorn
+from fastapi import FastAPI, Request, Response
+from fastapi.responses import JSONResponse, StreamingResponse
+
+
+CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
+_LOG = logging.getLogger(__name__)
+
+
+@dataclass
+class ProxyConfig:
+    """Runtime configuration for one :class:`InterceptionProxy`."""
+
+    upstream_url: str  # base URL of the real endpoint, with or without a trailing "/v1"
+    upstream_api_key: str = "intercepted"  # sent upstream as ``Authorization: Bearer <key>``
+    trace_path: str = "/tmp/opencode-proxy-trace.jsonl"  # JSON-lines file, opened in append mode
+    host: str = "127.0.0.1"  # listen address of the proxy
+    port: int = 7000  # listen port of the proxy
+    top_logprobs: int = 5  # injected as ``top_logprobs`` when the request sets none
+    request_timeout_s: float = 600.0  # timeout of the shared ``httpx.AsyncClient``
+    # Cap ``max_tokens`` / ``max_completion_tokens`` before forwarding (``None``
+    # disables the cap). OpenCode historically asks for very large values (e.g.
+    # 32000) that exceed gpt-4o-mini's 16384 cap; capping here avoids spurious
+    # upstream 400s without requiring the caller to know per-model limits.
+    max_tokens_cap: int | None = 16384
+    # Disable Qwen-style reasoning/thinking by injecting
+    # ``chat_template_kwargs.enable_thinking=false`` into forwarded requests.
+    disable_thinking: bool = False
+    # Override the ``model`` field on every forwarded request. Some opencode
+    # builds emit a stripped model id (e.g. ``Qwen3.5-4B`` instead of the
+    # ``Qwen/Qwen3.5-4B`` the upstream serves) for their internal
+    # title-generation call. Setting this to the exact upstream model id
+    # bypasses that mismatch.
+    model_override: str | None = None
+
+
+@dataclass
+class TurnRecord:
+    """One intercepted turn, written to the trace file as JSON-lines."""
+
+    turn: int  # per-proxy request counter; the first request is turn 1
+    request: dict[str, Any]  # the body as forwarded upstream, not the client's original
+    response: dict[str, Any]  # upstream JSON, an SSE-assembled response, or an error envelope
+    logprobs: list[dict[str, Any]] | None  # first choice's ``logprobs.content`` entries
+    completion_tokens: list[str]
+    completion_token_ids: list[int]  # filled only from entries that carry ``token_id``
+    per_token_logps: list[float]  # filled only from entries that carry ``logprob``
+    finish_reason: str | None
+    latency_s: float
+    timestamp: float = field(default_factory=time.time)  # epoch seconds at record creation
+
+    def to_json(self) -> str:
+        return json.dumps(self.__dict__, default=str)  # non-JSON-native values become strings
+
+
+def _build_app(cfg: ProxyConfig) -> FastAPI:
+    """Construct the FastAPI app that serves one proxy session."""
+
+    state: dict[str, Any] = {"turn": 0, "lock": asyncio.Lock()}  # turn counter + its guard
+
+    # HTTP client reused across requests. It carries no default auth header;
+    # each forwarded request sets ``Authorization`` from ``upstream_api_key``.
+    client = httpx.AsyncClient(timeout=cfg.request_timeout_s)
+    trace_file = open(cfg.trace_path, "a", buffering=1)  # append mode, line-buffered
+
+    @asynccontextmanager
+    async def lifespan(_: FastAPI) -> Any:
+        try:
+            yield
+        finally:
+            await client.aclose()  # release the client and trace file on app shutdown
+            trace_file.close()
+
+    app = FastAPI(title="opencode-interception-proxy", lifespan=lifespan)
+
+    @app.get("/healthz")
+    def healthz() -> dict[str, str]:
+        return {"status": "ok"}
+
+    @app.post(CHAT_COMPLETIONS_PATH)
+    async def chat_completions(request: Request) -> Response:
+        raw_body = await request.body()
+        try:
+            body = json.loads(raw_body)
+        except json.JSONDecodeError:
+            return JSONResponse(
+                status_code=400, content={"error": "invalid json body"}
+            )
+
+        forwarded_body = _prepare_forwarded_body(body, cfg)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {cfg.upstream_api_key}",
+        }
+        upstream_url = _resolve_upstream_url(cfg.upstream_url)
+
+        async with state["lock"]:
+            state["turn"] += 1  # counter starts at 0, so the first request is turn 1
+            turn_idx = state["turn"]
+
+        if forwarded_body.get("stream"):
+            return await _proxy_streaming(
+                client=client,
+                upstream_url=upstream_url,
+                headers=headers,
+                forwarded_body=forwarded_body,
+                original_body=body,
+                trace_file=trace_file,
+                turn_idx=turn_idx,
+            )
+        return await _proxy_unary(
+            client=client,
+            upstream_url=upstream_url,
+            headers=headers,
+            forwarded_body=forwarded_body,
+            original_body=body,
+            trace_file=trace_file,
+            turn_idx=turn_idx,
+        )
+
+    return app
+
+
+def _prepare_forwarded_body(body: dict[str, Any], cfg: ProxyConfig) -> dict[str, Any]:
+    """Return the body we actually send upstream (``body`` itself is not mutated).
+
+    - Injects ``logprobs=true`` + ``top_logprobs`` unless the caller set them.
+    - Applies ``model_override`` first so later checks see the upstream model.
+    - For models that reject ``max_tokens`` (e.g. gpt-5.x), translates to
+      ``max_completion_tokens``.
+    - Caps ``max_tokens`` / ``max_completion_tokens`` to ``max_tokens_cap``.
+    - Optionally defaults ``chat_template_kwargs.enable_thinking`` to false.
+    """
+    forwarded = copy.deepcopy(body)
+    forwarded.setdefault("logprobs", True)
+    forwarded.setdefault("top_logprobs", cfg.top_logprobs)
+
+    if cfg.model_override:
+        forwarded["model"] = cfg.model_override
+
+    # GPT-5.x and newer: ``max_tokens`` is rejected; must use
+    # ``max_completion_tokens``. Detect via model string so we don't break
+    # gpt-4.x or vLLM-hosted models that accept ``max_tokens``.
+    model = str(forwarded.get("model", ""))
+    if _model_uses_max_completion_tokens(model) and "max_tokens" in forwarded:
+        value = forwarded.pop("max_tokens")
+        forwarded.setdefault("max_completion_tokens", value)
+
+    if cfg.max_tokens_cap is not None:
+        for key in ("max_tokens", "max_completion_tokens"):
+            value = forwarded.get(key)
+            if isinstance(value, int) and value > cfg.max_tokens_cap:
+                forwarded[key] = cfg.max_tokens_cap
+
+    if cfg.disable_thinking:
+        # vLLM applies chat_template_kwargs to the tokenizer's chat template
+        # for Qwen3/Qwen3.5 models, turning off <think>...</think> generation.
+        extra = forwarded.setdefault("chat_template_kwargs", {})
+        extra.setdefault("enable_thinking", False)
+
+    return forwarded
+
+
+def _model_uses_max_completion_tokens(model: str) -> bool:
+    """Guess from the model id whether the upstream rejects ``max_tokens``."""
+    # Drop everything up to the first "/" (a provider prefix such as "intercepted/").
+    _, slash, tail = model.partition("/")
+    return (tail if slash else model).lower().startswith(("gpt-5", "o1", "o3", "o4"))
+
+
+def _resolve_upstream_url(upstream: str) -> str:
+    """Return the chat-completions endpoint URL for the given upstream base URL."""
+    trimmed = upstream.rstrip("/")
+    # A base that already ends in "/v1" only needs the "/chat/completions" tail.
+    suffix = "/chat/completions" if trimmed.endswith("/v1") else CHAT_COMPLETIONS_PATH
+    return f"{trimmed}{suffix}"
+
+
+async def _proxy_unary(
+    *,
+    client: httpx.AsyncClient,
+    upstream_url: str,
+    headers: dict[str, str],
+    forwarded_body: dict[str, Any],
+    original_body: dict[str, Any],
+    trace_file: Any,
+    turn_idx: int,
+) -> Response:
+    """Forward one non-streaming request; trace and sanitize JSON-object replies."""
+    start = time.time()
+    upstream_response = await client.post(
+        upstream_url, content=json.dumps(forwarded_body), headers=headers
+    )
+    latency = time.time() - start
+    try:
+        response_json = upstream_response.json()
+        if not isinstance(response_json, dict):  # bare JSON list/string: relay untouched
+            raise ValueError("upstream body is not a JSON object")
+    except Exception:
+        return Response(
+            content=upstream_response.content,
+            status_code=upstream_response.status_code,
+            media_type=upstream_response.headers.get("content-type", "application/json"),
+        )
+    record = _build_turn_record(
+        turn_idx=turn_idx,
+        request_body=forwarded_body,
+        response_json=response_json,
+        latency_s=latency,
+    )
+    trace_file.write(record.to_json() + "\n")
+    sanitized = _strip_logprobs(response_json)
+    return JSONResponse(content=sanitized, status_code=upstream_response.status_code)
+
+
+async def _proxy_streaming(
+    *,
+    client: httpx.AsyncClient,
+    upstream_url: str,
+    headers: dict[str, str],
+    forwarded_body: dict[str, Any],
+    original_body: dict[str, Any],
+    trace_file: Any,
+    turn_idx: int,
+) -> Response:
+    """Forward an SSE stream while accumulating the full response.
+
+    Opens the upstream stream and inspects the status. On a status >= 400 the
+    body (an error payload, not SSE) is read in full, traced together with
+    the latency, and returned as a regular JSON response instead of an empty
+    ``text/event-stream``. Otherwise every upstream line is relayed as-is
+    (``logprobs`` included) and the turn is traced once the upstream stream
+    has been read to the end.
+    """
+
+    start = time.time()
+
+    # Open the stream outside the generator so we can branch on status before
+    # committing to a streaming response shape.
+    upstream_cm = client.stream(
+        "POST",
+        upstream_url,
+        content=json.dumps(forwarded_body),
+        headers=headers,
+    )
+    upstream = await upstream_cm.__aenter__()
+
+    if upstream.status_code >= 400:
+        # Upstream responded with an error body (not SSE). Read it fully and
+        # return as a non-streaming JSON payload.
+        error_bytes = await upstream.aread()
+        await upstream_cm.__aexit__(None, None, None)
+        latency = time.time() - start
+        try:
+            error_json = json.loads(error_bytes.decode() or "{}")
+        except Exception:
+            error_json = {"error": error_bytes.decode(errors="replace")[:4000]}
+        record = _build_turn_record(
+            turn_idx=turn_idx,
+            request_body=forwarded_body,
+            response_json={
+                "choices": [],
+                "usage": None,
+                "upstream_status": upstream.status_code,
+                "upstream_error": error_json,
+            },
+            latency_s=latency,
+        )
+        trace_file.write(record.to_json() + "\n")
+        _LOG.warning(
+            "proxy turn %s: upstream %s: %s",
+            turn_idx,
+            upstream.status_code,
+            str(error_json)[:400],
+        )
+        return JSONResponse(content=error_json, status_code=upstream.status_code)
+
+    async def _stream() -> Any:
+        accumulated: dict[str, Any] = {
+            "content_by_idx": {},
+            "tool_calls_by_idx": {},
+            "finish_by_idx": {},
+            "logprobs_by_idx": {},
+        }
+        last_chunk: dict[str, Any] = {}
+        try:
+            async for line in upstream.aiter_lines():
+                if not line:
+                    yield "\n"  # relay blank lines too
+                    continue
+                yield line + "\n"  # forward first, then parse our own copy
+                if not line.startswith("data:"):
+                    continue
+                data = line[len("data:"):].strip()
+                if data == "[DONE]":
+                    continue
+                try:
+                    chunk = json.loads(data)
+                except json.JSONDecodeError:
+                    continue  # unparsable payloads are relayed but not accumulated
+                last_chunk = chunk
+                _accumulate_stream_chunk(chunk, accumulated)
+        finally:
+            await upstream_cm.__aexit__(None, None, None)
+
+        # Reached only when the loop above finished without raising.
+        latency = time.time() - start
+        response_json = _assemble_streamed_response(last_chunk, accumulated)
+        record = _build_turn_record(
+            turn_idx=turn_idx,
+            request_body=forwarded_body,
+            response_json=response_json,
+            latency_s=latency,
+        )
+        trace_file.write(record.to_json() + "\n")
+
+    return StreamingResponse(_stream(), media_type="text/event-stream")
+
+
+def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None:
+    """Fold one parsed SSE chunk into the per-choice accumulators held in ``acc``."""
+    for choice in chunk.get("choices") or []:
+        idx = choice.get("index", 0)
+        delta = choice.get("delta") or {}
+        text = delta.get("content")
+        if text:
+            acc["content_by_idx"].setdefault(idx, []).append(text)
+        # Some upstreams (HF-Router's Qwen thinking mode, per Together/Scaleway)
+        # stream the chain-of-thought under a separate ``reasoning`` field; keep
+        # it so the assembled response does not silently drop it.
+        thought = delta.get("reasoning")
+        if thought:
+            acc.setdefault("reasoning_by_idx", {}).setdefault(idx, []).append(thought)
+        # Tool-call fragments are keyed by (choice index, tool-call index).
+        for call in delta.get("tool_calls") or []:
+            slot = acc["tool_calls_by_idx"].setdefault(
+                (idx, call.get("index", 0)),
+                {"id": None, "type": "function", "function": {"name": "", "arguments": ""}},
+            )
+            if call.get("id"):
+                slot["id"] = call["id"]
+            fragment = call.get("function") or {}
+            for part in ("name", "arguments"):
+                if fragment.get(part):
+                    slot["function"][part] += fragment[part]
+        finish = choice.get("finish_reason")
+        if finish:
+            acc["finish_by_idx"][idx] = finish
+        # Per-token logprob entries arrive under ``logprobs.content``.
+        token_lps = (choice.get("logprobs") or {}).get("content")
+        if token_lps:
+            acc["logprobs_by_idx"].setdefault(idx, []).extend(token_lps)
+
+
+def _assemble_streamed_response(
+    last_chunk: dict[str, Any], acc: dict[str, Any]
+) -> dict[str, Any]:
+    """Build a non-streaming ``chat.completion`` dict from accumulated deltas.
+
+    ``id``, ``model`` and ``usage`` are taken from ``last_chunk``; choice 0 is
+    always emitted, even when nothing was accumulated for it.
+    """
+    content_parts = acc["content_by_idx"]
+    finish_reasons = acc["finish_by_idx"]
+    tool_buckets = acc["tool_calls_by_idx"]
+    token_logprobs = acc["logprobs_by_idx"]
+    reasoning_parts = acc.get("reasoning_by_idx") or {}
+
+    seen = {0, *content_parts, *finish_reasons, *token_logprobs}
+    seen.update(key[0] for key in tool_buckets)
+    choices: list[dict[str, Any]] = []
+    for idx in sorted(seen):
+        message: dict[str, Any] = {"role": "assistant"}
+        if text := "".join(content_parts.get(idx, [])):
+            message["content"] = text
+        if thoughts := "".join(reasoning_parts.get(idx, [])):
+            message["reasoning"] = thoughts
+        calls = [tool_buckets[key] for key in sorted(tool_buckets) if key[0] == idx]
+        if calls:
+            message["tool_calls"] = calls
+        entry: dict[str, Any] = {
+            "index": idx,
+            "message": message,
+            "finish_reason": finish_reasons.get(idx),
+        }
+        if token_logprobs.get(idx):
+            entry["logprobs"] = {"content": token_logprobs[idx]}
+        choices.append(entry)
+    return {
+        "id": last_chunk.get("id", ""),
+        "object": "chat.completion",
+        "model": last_chunk.get("model", ""),
+        "choices": choices,
+        "usage": last_chunk.get("usage"),
+    }
+
+
+def _build_turn_record(
+    *,
+    turn_idx: int,
+    request_body: dict[str, Any],
+    response_json: dict[str, Any],
+    latency_s: float,
+) -> TurnRecord:
+    """Condense a response into a :class:`TurnRecord` using its first choice only.
+
+    Token ids and logprobs are gathered just from entries that carry them, so
+    those two lists can be shorter than ``completion_tokens``.
+    """
+    choices = response_json.get("choices") or [{}]
+    first = choices[0]
+    logprobs_field = first.get("logprobs") or {}
+    entries = logprobs_field.get("content") or []
+
+    tokens = [entry.get("token", "") for entry in entries]
+    # OpenAI returns no raw token ids; vLLM returns them as ``token_id``.
+    token_ids = [
+        int(entry["token_id"]) for entry in entries if entry.get("token_id") is not None
+    ]
+    per_token_logps = [
+        float(entry["logprob"]) for entry in entries if entry.get("logprob") is not None
+    ]
+
+    return TurnRecord(
+        turn=turn_idx,
+        request=request_body,
+        response=response_json,
+        logprobs=entries,
+        completion_tokens=tokens,
+        completion_token_ids=token_ids,
+        per_token_logps=per_token_logps,
+        finish_reason=first.get("finish_reason"),
+        latency_s=latency_s,
+    )
+
+
+def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]:
+    """Shallow-copy the response, dropping the ``logprobs`` key of every choice."""
+    sanitized = dict(response_json)
+    original_choices = sanitized.get("choices")
+    if not isinstance(original_choices, list):
+        return sanitized
+    stripped: list[dict[str, Any]] = []
+    for entry in original_choices:
+        stripped.append({k: v for k, v in (entry or {}).items() if k != "logprobs"})
+    sanitized["choices"] = stripped
+    return sanitized
+
+
+# ---------------------------------------------------------------------------
+# Standalone runner (used inside the sandbox)
+# ---------------------------------------------------------------------------
+
+
+def serve(cfg: ProxyConfig) -> None:
+    """Run the proxy app under uvicorn on ``cfg.host:cfg.port``; blocks while serving."""
+    uvicorn.run(
+        _build_app(cfg), host=cfg.host, port=cfg.port, log_level="warning"
+    )
+
+
+class InterceptionProxy:
+    """Thread-backed controller for running the proxy locally.
+
+    Used by unit tests and by any in-process driver that wants a short-lived
+    proxy on the local machine. Inside a sandbox :func:`serve` is invoked
+    directly through the module's CLI entry point instead.
+    """
+
+    def __init__(self, cfg: ProxyConfig) -> None:
+        self._cfg = cfg
+        self._server: uvicorn.Server | None = None
+        self._thread: threading.Thread | None = None
+        self._ready = threading.Event()
+
+    @property
+    def url(self) -> str:
+        return f"http://{self._cfg.host}:{self._cfg.port}/v1"
+
+    @property
+    def config(self) -> ProxyConfig:
+        return self._cfg
+
+    def start(self) -> None:
+        """Run uvicorn on a daemon thread; raise ``RuntimeError`` if not up in 10s."""
+        app = _build_app(self._cfg)
+        config = uvicorn.Config(
+            app,
+            host=self._cfg.host,
+            port=self._cfg.port,
+            log_level="warning",
+            lifespan="on",
+        )
+        self._server = uvicorn.Server(config)
+        self._thread = threading.Thread(target=self._run_server, daemon=True)
+        self._thread.start()
+        # Wait for the server to accept connections.
+        deadline = time.time() + 10
+        while time.time() < deadline:
+            if _port_open(self._cfg.host, self._cfg.port):
+                self._ready.set()
+                return
+            time.sleep(0.05)
+        self.stop()  # __exit__ never runs if __enter__ raises, so clean up here
+        raise RuntimeError("InterceptionProxy failed to start within 10s")
+
+    def _run_server(self) -> None:
+        assert self._server is not None
+        self._server.run()
+
+    def stop(self) -> None:
+        if self._server is None:
+            return
+        self._server.should_exit = True
+        if self._thread is not None:
+            self._thread.join(timeout=5)
+        self._server = None
+        self._thread = None
+
+    def __enter__(self) -> "InterceptionProxy":
+        self.start()
+        return self
+
+    def __exit__(self, *exc) -> None:
+        self.stop()
+
+
+def _port_open(host: str, port: int) -> bool:
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as probe:
+        probe.settimeout(0.2)  # keep each connection probe short
+        return not probe.connect_ex((host, port))  # connect_ex returns 0 on success
+
+
+# ---------------------------------------------------------------------------
+# Trace reader (used by the session to pull captured turns back)
+# ---------------------------------------------------------------------------
+
+
+def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]:
+    """Parse a JSON-lines proxy trace into a list of dicts, one per non-blank line.
+
+    Returns ``[]`` when the file does not exist. ``json.JSONDecodeError``
+    propagates if a non-blank line is not valid JSON.
+    """
+    try:
+        # EAFP: a separate exists() check could race with the file being removed.
+        text = Path(path).read_text(encoding="utf-8")
+    except (FileNotFoundError, NotADirectoryError):
+        return []
+    # Blank and whitespace-only lines are skipped rather than parsed.
+    return [json.loads(row) for row in text.splitlines() if row.strip()]
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(prog="opencode_env.interception")
+    parser.add_argument("--upstream-url", required=True)
+    parser.add_argument(
+        "--upstream-api-key",
+        default=None,
+        help=(
+            "Upstream API key. Prefer OPENCODE_UPSTREAM_API_KEY so the key "
+            "does not appear in process argv."
+        ),
+    )
+    parser.add_argument("--trace", default="/tmp/opencode-proxy-trace.jsonl")
+    parser.add_argument("--host", default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=7000)
+    parser.add_argument("--top-logprobs", type=int, default=5)
+    parser.add_argument("--request-timeout", type=float, default=600.0)
+    parser.add_argument(
+        "--max-tokens-cap",
+        type=int,
+        default=None,  # no cap unless passed; ProxyConfig's own 16384 default is not used here
+        help="Clamp max_tokens/max_completion_tokens on forwarded requests.",
+    )
+    parser.add_argument(
+        "--disable-thinking",
+        action="store_true",
+        help="Inject chat_template_kwargs.enable_thinking=false (Qwen3/Qwen3.5).",
+    )
+    parser.add_argument(
+        "--model-override",
+        default=None,
+        help="Rewrite the `model` field on every forwarded request.",
+    )
+    args = parser.parse_args()
+    upstream_api_key = (  # precedence: CLI flag, then the two env vars, then a placeholder
+        args.upstream_api_key
+        or os.environ.get("OPENCODE_UPSTREAM_API_KEY")
+        or os.environ.get("UPSTREAM_API_KEY")
+        or "intercepted"
+    )
+
+    cfg = ProxyConfig(
+        upstream_url=args.upstream_url,
+        upstream_api_key=upstream_api_key,
+        trace_path=args.trace,
+        host=args.host,
+        port=args.port,
+        top_logprobs=args.top_logprobs,
+        request_timeout_s=args.request_timeout,
+        max_tokens_cap=args.max_tokens_cap,
+        disable_thinking=args.disable_thinking,
+        model_override=args.model_override,
+    )
+    serve(cfg)  # blocks until the server exits
+
+
+if __name__ == "__main__":
+    main()
diff --git a/envs/coding_agent_env/server/Dockerfile b/envs/opencode_env/server/Dockerfile
similarity index 91%
rename from envs/coding_agent_env/server/Dockerfile
rename to envs/opencode_env/server/Dockerfile
index 97e880343..ad8319423 100644
--- a/envs/coding_agent_env/server/Dockerfile
+++ b/envs/opencode_env/server/Dockerfile
@@ -4,14 +4,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 #
-# coding_agent_env Dockerfile — mirrors the standard OpenEnv multi-stage uv
+# opencode_env Dockerfile — mirrors the standard OpenEnv multi-stage uv
 # build used by echo_env / repl_env / jupyter_agent.
 #
 # Build:
-#   docker build -t coding-agent-env .
+#   docker build -t opencode-env .
 #
 # Run:
-#   docker run -p 8000:8000 -e E2B_API_KEY=e2b_... coding-agent-env
+#   docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
 
 ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
 FROM ${BASE_IMAGE} AS builder
diff --git a/envs/coding_agent_env/server/__init__.py b/envs/opencode_env/server/__init__.py
similarity index 79%
rename from envs/coding_agent_env/server/__init__.py
rename to envs/opencode_env/server/__init__.py
index 2eac4fb05..56363edaa 100644
--- a/envs/coding_agent_env/server/__init__.py
+++ b/envs/opencode_env/server/__init__.py
@@ -4,4 +4,4 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Server-side for the deployed coding_agent_env."""
+"""Server-side for the deployed opencode_env."""
diff --git a/envs/coding_agent_env/server/app.py b/envs/opencode_env/server/app.py
similarity index 81%
rename from envs/coding_agent_env/server/app.py
rename to envs/opencode_env/server/app.py
index df40b507f..0757ef229 100644
--- a/envs/coding_agent_env/server/app.py
+++ b/envs/opencode_env/server/app.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""FastAPI app for the coding_agent_env MCP server.
+"""FastAPI app for the opencode_env MCP server.
 
 Mirrors the standard OpenEnv pattern (echo_env / repl_env / jupyter_agent)
 plus the custom Gradio UI mounted at ``/web`` per the
@@ -16,7 +16,7 @@
     E2B_API_KEY=... uvicorn server.app:app --host 0.0.0.0 --port 8000
 
     # Docker:
-    docker run -p 8000:8000 -e E2B_API_KEY=... coding-agent-env
+    docker run -p 8000:8000 -e E2B_API_KEY=... opencode-env
 
     # HF Space: deploys via the root ``Dockerfile``.
 
@@ -58,13 +58,13 @@ def _load_env_file() -> None:
     from openenv.core.env_server.http_server import create_app
     from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
 
-    from .gradio_ui import coding_agent_gradio_builder
-    from .coding_environment import CodingAgentEnvironment
+    from .gradio_ui import opencode_gradio_builder
+    from .opencode_environment import OpenCodeEnvironment
 except ImportError:  # pragma: no cover
     from openenv.core.env_server.http_server import create_app
     from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
-    from server.gradio_ui import coding_agent_gradio_builder  # type: ignore
-    from server.coding_environment import CodingAgentEnvironment  # type: ignore
+    from server.gradio_ui import opencode_gradio_builder  # type: ignore
+    from server.opencode_environment import OpenCodeEnvironment  # type: ignore
 
 
 # Always expose the Gradio UI at /web. Set ENABLE_WEB_INTERFACE=false to
@@ -80,22 +80,22 @@ def _custom_gradio_builder(
     title,
     quick_start_md,
 ):
-    """Hand off to ``server.gradio_ui.coding_agent_gradio_builder``."""
-    return coding_agent_gradio_builder(
+    """Hand off to ``server.gradio_ui.opencode_gradio_builder``."""
+    return opencode_gradio_builder(
         web_manager,
         action_fields,
         metadata,
         is_chat_env,
-        title or "coding_agent_env",
+        title or "opencode_env",
         quick_start_md,
     )
 
 
 app = create_app(
-    CodingAgentEnvironment,
+    OpenCodeEnvironment,
     CallToolAction,
     CallToolObservation,
-    env_name="coding_agent_env",
+    env_name="opencode_env",
     max_concurrent_envs=int(os.getenv("MAX_CONCURRENT_ENVS", "4")),
     gradio_builder=_custom_gradio_builder,
 )
diff --git a/envs/coding_agent_env/server/catalog.py b/envs/opencode_env/server/catalog.py
similarity index 100%
rename from envs/coding_agent_env/server/catalog.py
rename to envs/opencode_env/server/catalog.py
diff --git a/envs/coding_agent_env/server/gradio_ui.py b/envs/opencode_env/server/gradio_ui.py
similarity index 92%
rename from envs/coding_agent_env/server/gradio_ui.py
rename to envs/opencode_env/server/gradio_ui.py
index ea9cdb81f..bb4340aef 100644
--- a/envs/coding_agent_env/server/gradio_ui.py
+++ b/envs/opencode_env/server/gradio_ui.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Minimal Gradio UI for coding_agent_env.
+"""Minimal Gradio UI for opencode_env.
 
 Mounts under the standard OpenEnv ``/web`` path via the
 ``gradio_builder=`` callback documented at
@@ -19,7 +19,7 @@
     agent_timeout_s, template).
   - Preset buttons for the ready-made example tasks.
   - Run button → result panel with reward, setup/verify per-command
-    results, file outputs, agent log tail, and the raw RolloutResult JSON.
+    results, file outputs, proxy/OpenCode log tails, and the raw RolloutResult JSON.
 """
 
 from __future__ import annotations
@@ -31,14 +31,14 @@
 
 try:
     from .catalog import catalog_summary, ENDPOINT_KINDS, resolve_endpoint
-    from .coding_environment import CodingAgentEnvironment
+    from .opencode_environment import OpenCodeEnvironment
 except ImportError:  # pragma: no cover
     from server.catalog import (  # type: ignore
         catalog_summary,
         ENDPOINT_KINDS,
         resolve_endpoint,
     )
-    from server.coding_environment import CodingAgentEnvironment  # type: ignore
+    from server.opencode_environment import OpenCodeEnvironment  # type: ignore
 
 
 # ────────────────────────────────────────────────────────────────────────────
@@ -156,7 +156,6 @@ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]:
 
 
 def _live_status_md(
-    agent: str,
     endpoint_kind: str,
     model: str,
     mode: str,
@@ -166,7 +165,7 @@ def _live_status_md(
     """Render a live phase log (latest at the bottom) with elapsed timestamps."""
     head = (
         f"### running…  `elapsed={elapsed_s:.1f}s`\n\n"
-        f"_agent=`{agent}`  endpoint=`{endpoint_kind}`  model=`{model}`  mode=`{mode}`_\n\n"
+        f"_endpoint=`{endpoint_kind}`  model=`{model}`  mode=`{mode}`_\n\n"
     )
     if not lines:
         body = "_(waiting for first phase update…)_"
@@ -210,7 +209,7 @@ def _catalog_banner() -> str:
 # ────────────────────────────────────────────────────────────────────────────
 
 
-def coding_agent_gradio_builder(
+def opencode_gradio_builder(
     web_manager,  # noqa: ARG001 (unused: we instantiate the env directly)
     action_fields,  # noqa: ARG001
     metadata,  # noqa: ARG001
@@ -218,17 +217,16 @@ def coding_agent_gradio_builder(
     title,
     quick_start_md,  # noqa: ARG001
 ) -> gr.Blocks:
-    """Build the coding_agent_env console.
+    """Build the opencode_env console.
 
     Compatible with ``create_app(..., gradio_builder=...)``. We ignore
-    ``web_manager`` and instantiate :class:`CodingAgentEnvironment` ourselves
-    inside the run handler — coding_agent_env's run_rollout doesn't need any
+    ``web_manager`` and instantiate :class:`OpenCodeEnvironment` ourselves
+    inside the run handler — opencode_env's run_rollout doesn't need any
     per-session state beyond the env's own bookkeeping, and instantiating
     is cheap (no sandbox is created until the tool fires).
     """
 
     def run(
-        agent: str,
         endpoint: str,
         model: str,
         base_url: str,
@@ -273,7 +271,7 @@ def run(
         else:
             dt = None
 
-        env = CodingAgentEnvironment()
+        env = OpenCodeEnvironment()
 
         # The worker fires _run_rollout_impl in a background thread and
         # streams progress messages into a queue; this generator polls the
@@ -287,7 +285,6 @@ def _cb(msg: str) -> None:
         def _worker():
             try:
                 payload = env._run_rollout_impl(
-                    agent=agent,
                     base_url=resolved.base_url,
                     api_key=resolved.api_key,
                     model=resolved.model,
@@ -318,7 +315,7 @@ def _worker():
 
         # First yield: announce we've started. Empty result panels.
         yield (
-            f"### running…\n\n_agent=`{agent}`  endpoint=`{resolved.kind}`  model=`{resolved.model}`  mode=`{mode}`_",
+            f"### running…\n\n_endpoint=`{resolved.kind}`  model=`{resolved.model}`  mode=`{mode}`_",
             [],
             [],
             "",
@@ -343,7 +340,6 @@ def _worker():
             # Render the live status pane.
             elapsed = time.time() - t_start
             md = _live_status_md(
-                agent,
                 resolved.kind,
                 resolved.model,
                 mode,
@@ -369,7 +365,6 @@ def _worker():
                 [],
                 "",
                 _live_status_md(
-                    agent,
                     resolved.kind,
                     resolved.model,
                     mode,
@@ -389,13 +384,13 @@ def _worker():
             (
                 "### live phase log\n\n"
                 + _live_status_md(
-                    agent,
                     resolved.kind,
                     resolved.model,
                     mode,
                     time.time() - t_start,
                     status_lines,
                 )
+                + f"\n\n### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:3000]}\n```"
                 + f"\n\n### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```"
             ),
             result,
@@ -405,23 +400,17 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         p = PRESETS.get(name) or {"instruction": "", "setup": "", "verify": ""}
         return p["instruction"], p["setup"], p["verify"]
 
-    with gr.Blocks(title=title or "coding_agent_env") as app:
-        gr.Markdown(f"# {title or 'coding_agent_env'}")
+    with gr.Blocks(title=title or "opencode_env") as app:
+        gr.Markdown(f"# {title or 'opencode_env'}")
         gr.Markdown(
-            "Run one coding-agent rollout in an E2B sandbox against your chosen "
-            "LLM endpoint. Pick an agent + endpoint, write the task as "
+            "Run one OpenCode rollout in an E2B sandbox against your chosen "
+            "LLM endpoint. Pick an endpoint, write the task as "
             "`(instruction, setup, verify)`, and inspect reward + logs."
         )
 
         gr.Markdown(_catalog_banner())
 
         with gr.Row():
-            agent = gr.Dropdown(
-                choices=["opencode", "pi"],
-                value="opencode",
-                label="Agent",
-                scale=1,
-            )
             endpoint = gr.Dropdown(
                 choices=list(ENDPOINT_KINDS),
                 value="openai",
@@ -447,19 +436,19 @@ def apply_preset(name: str) -> tuple[str, str, str]:
             )
 
         instruction = gr.Textbox(
-            label="Instruction (the prompt the selected agent runs)",
+            label="Instruction (the prompt OpenCode runs)",
             lines=4,
             value=PRESETS["binary_search"]["instruction"],
         )
 
         with gr.Row():
             setup_text = gr.Textbox(
-                label="Setup (one bash command per line — runs BEFORE the agent)",
+                label="Setup (one bash command per line — runs BEFORE OpenCode)",
                 lines=5,
                 value=PRESETS["binary_search"]["setup"],
             )
             verify_text = gr.Textbox(
-                label="Verify (one bash command per line — runs AFTER the agent)",
+                label="Verify (one bash command per line — runs AFTER OpenCode)",
                 lines=5,
                 value=PRESETS["binary_search"]["verify"],
             )
@@ -472,8 +461,8 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         with gr.Accordion("Tunables", open=False):
             with gr.Row():
                 mode = gr.Dropdown(
-                    choices=["black_box", "interception_gate"],
-                    value="black_box",
+                    choices=["transparent_proxy", "black_box"],
+                    value="transparent_proxy",
                     label="mode",
                 )
                 disable_thinking = gr.Dropdown(
@@ -531,7 +520,6 @@ def apply_preset(name: str) -> tuple[str, str, str]:
         run_btn.click(
             fn=run,
             inputs=[
-                agent,
                 endpoint,
                 model,
                 base_url,
diff --git a/envs/coding_agent_env/server/coding_environment.py b/envs/opencode_env/server/opencode_environment.py
similarity index 76%
rename from envs/coding_agent_env/server/coding_environment.py
rename to envs/opencode_env/server/opencode_environment.py
index 9174666e7..52ae27b4d 100644
--- a/envs/coding_agent_env/server/coding_environment.py
+++ b/envs/opencode_env/server/opencode_environment.py
@@ -4,19 +4,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Coding-agent MCP environment.
+"""OpenCode MCP environment.
 
 Single MCP tool ``run_rollout`` with a uniform task shape:
 
-  - ``instruction``  — prompt for the selected agent
-  - ``setup``        — bash commands run BEFORE the agent (in the sandbox)
-  - ``verify``       — bash commands run AFTER the agent
+  - ``instruction``  — prompt for OpenCode
+  - ``setup``        — bash commands run BEFORE OpenCode (in the sandbox)
+  - ``verify``       — bash commands run AFTER OpenCode
 
 Reward = ``passed_verify_commands / total`` unless a verify command writes
 a float to ``/home/user/logs/verifier/reward.txt`` (override).
 
 Returns a JSON-serialized :class:`RolloutResult` with reward,
-setup/verify command results, and file outputs.
+setup/verify command results, transparent-proxy logprob turns, and file outputs.
 """
 
 from __future__ import annotations
@@ -29,7 +29,6 @@
 from uuid import uuid4
 
 from fastmcp import FastMCP
-from pydantic import BaseModel, Field
 
 try:
     from openenv.core.env_server.mcp_environment import MCPEnvironment
@@ -42,7 +41,7 @@
     from server.catalog import ENDPOINT_KINDS, resolve_endpoint  # type: ignore
 
 
-# One rollout (sandbox cold start + harness install + agent run +
+# One rollout (sandbox cold start + OpenCode install + agent run +
 # verifier) typically takes 30-180s; can spike to ~600s under load. Override
 # OpenEnv's 30s MCP-tool default so the server doesn't cut us off.
 _RUN_ROLLOUT_TIMEOUT_S = 900.0
@@ -54,29 +53,12 @@
 _log = logging.getLogger(__name__)
 
 REWARD_FILE = f"{HOME}/logs/verifier/reward.txt"
+PROXY_LOG = f"{HOME}/logs/agent/proxy.log"
 AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"
 VERIFY_TIMEOUT_S = 120
-_SUPPORTED_AGENTS = ("opencode", "pi")
-_AGENT_LOG_BY_AGENT: dict[str, str] = {
-    "opencode": f"{HOME}/logs/agent/opencode.jsonl",
-    "pi": f"{HOME}/logs/agent/pi.txt",
-}
 
 
-class _GenericAgentConfig(BaseModel):
-    """Minimal config shape for CLIAgentSessionFactory-backed agents."""
-
-    base_url: str
-    api_key: str
-    model: str
-    agent_timeout_s: float = 600.0
-    sandbox_home: str = HOME
-    provider: str | None = None
-    thinking: str | None = "off"
-    extra_env: dict[str, str] = Field(default_factory=dict)
-
-
-class CodingAgentEnvironment(MCPEnvironment):
+class OpenCodeEnvironment(MCPEnvironment):
     """Per-session environment exposing a single ``run_rollout`` MCP tool."""
 
     SUPPORTS_CONCURRENT_SESSIONS = True
@@ -85,23 +67,22 @@ def __init__(self) -> None:
         # Lazy imports so module import stays cheap and so tests can patch.
         try:
             from ..models import (
-                CodingAgentState,
+                OpenCodeState,
                 CommandResult,
                 RolloutResult,
+                RolloutTurn,
             )
         except ImportError:  # pragma: no cover
             from models import (  # type: ignore
-                CodingAgentState,
+                OpenCodeState,
                 CommandResult,
                 RolloutResult,
+                RolloutTurn,
             )
 
-        from openenv.core.harness.agents import get_agent_spec
-        from openenv.core.harness.agents.cli_driver import CLIAgentSessionFactory
-
-        from coding_agent_env.config import CodingAgentConfig
-        from coding_agent_env.harness import CodingAgentSessionFactory
-        from coding_agent_env.task import CodingAgentTask
+        from opencode_env.config import OpenCodeConfig
+        from opencode_env.harness import OpenCodeSessionFactory
+        from opencode_env.task import OpenCodeTask
 
         try:
             from openenv.core.harness.sandbox import E2BSandboxBackend
@@ -110,13 +91,12 @@ def __init__(self) -> None:
 
         self._CommandResult = CommandResult
         self._RolloutResult = RolloutResult
-        self._CodingAgentState = CodingAgentState
-        self._CodingAgentConfig = CodingAgentConfig
-        self._CodingAgentSessionFactory = CodingAgentSessionFactory
-        self._CodingAgentTask = CodingAgentTask
+        self._RolloutTurn = RolloutTurn
+        self._OpenCodeState = OpenCodeState
+        self._OpenCodeConfig = OpenCodeConfig
+        self._OpenCodeSessionFactory = OpenCodeSessionFactory
+        self._OpenCodeTask = OpenCodeTask
         self._E2BSandboxBackend = E2BSandboxBackend
-        self._CLIAgentSessionFactory = CLIAgentSessionFactory
-        self._get_agent_spec = get_agent_spec
 
         # Don't raise on missing E2B_API_KEY here — OpenEnv's web-interface
         # layer instantiates the env at import time for schema introspection,
@@ -124,14 +104,12 @@ def __init__(self) -> None:
         # just exploring. The real check happens lazily in
         # ``_run_rollout_impl`` (any rollout without creds fails fast there
         # with a clear error in the result payload).
-        self._state = self._CodingAgentState(episode_id=str(uuid4()))
+        self._state = self._OpenCodeState(episode_id=str(uuid4()))
 
-        mcp = FastMCP("coding_agent_env")
+        mcp = FastMCP("opencode_env")
 
         @mcp.tool
         def run_rollout(
-            # Agent + endpoint.
-            agent: str = "opencode",
             # Endpoint — either a shorthand (resolved from env vars + catalog
             # defaults) OR explicit base_url+api_key+model. Explicit fields
             # always win over the catalog.
@@ -145,24 +123,21 @@ def run_rollout(
             verify: Optional[list[str]] = None,
             # Bookkeeping / tunables
             task_id: str = "",
-            mode: str = "black_box",
+            mode: str = "transparent_proxy",
             disable_thinking: Optional[bool] = None,
             max_tokens_cap: int = 4096,
             top_logprobs: int = 5,
             agent_timeout_s: float = 600.0,
             template: str = "",
         ) -> str:
-            """Run one coding-agent rollout end-to-end.
-
-            ``agent`` selects the harness CLI to run inside the sandbox.
-            Currently supported: ``"opencode"``, ``"pi"``.
+            """Run one OpenCode rollout end-to-end.
 
             ``endpoint`` is the shorthand selector (one of
             ``"vllm"`` / ``"openai"`` / ``"hf_router"``) — the server
             resolves base_url / api_key / model from env vars + catalog
             defaults. Pass any of those explicitly to override.
 
-            See ``coding_agent_env.client.CodingAgentEnv.run_rollout`` for full
+            See ``opencode_env.client.OpenCodeEnv.run_rollout`` for full
             arg docs. Returns a JSON-serialized ``RolloutResult``.
             """
             # Resolve via catalog when shorthand is provided.
@@ -179,11 +154,6 @@ def run_rollout(
             if disable_thinking_resolved is None:
                 disable_thinking_resolved = False
 
-            agent = (agent or "opencode").strip()
-            if agent not in _SUPPORTED_AGENTS:
-                raise ValueError(
-                    f"unsupported agent {agent!r}; supported agents: {_SUPPORTED_AGENTS}"
-                )
             if not (base_url and api_key and model):
                 raise ValueError(
                     "must provide either ``endpoint`` (one of "
@@ -193,7 +163,6 @@ def run_rollout(
                 raise ValueError("instruction is required")
 
             return self._run_rollout_impl(
-                agent=agent,
                 base_url=base_url,
                 api_key=api_key,
                 model=model,
@@ -219,14 +188,14 @@ def reset(
         episode_id: Optional[str] = None,
         **_: Any,
     ) -> Observation:
-        self._state = self._CodingAgentState(episode_id=episode_id or str(uuid4()))
+        self._state = self._OpenCodeState(episode_id=episode_id or str(uuid4()))
         return Observation(
             done=False,
             reward=None,
             metadata={
                 "status": "ready",
                 "message": (
-                    "coding_agent_env ready. Call run_rollout(agent=..., ...) with a task."
+                    "opencode_env ready. Call run_rollout(...) with a task."
                 ),
             },
         )
@@ -277,7 +246,6 @@ def state(self) -> Any:
     def _run_rollout_impl(
         self,
         *,
-        agent: str,
         base_url: str,
         api_key: str,
         model: str,
@@ -318,11 +286,9 @@ def _emit(msg: str) -> None:
             _emit("error: E2B_API_KEY missing on server")
             return result.model_dump_json()
 
-        _emit(f"resolving config (agent={agent}, model={model}, mode={mode})")
+        _emit(f"resolving config (model={model}, mode={mode})")
 
         config = self._build_agent_config(
-            agent=agent,
-            mode=mode,
             base_url=base_url,
             api_key=api_key,
             model=model,
@@ -341,22 +307,18 @@ def _emit(msg: str) -> None:
             # ``set -e`` makes the script abort on the first failing command.
             setup_shell = "set -e\n" + "\n".join(setup)
 
-        rollout_task = self._CodingAgentTask(
+        rollout_task = self._OpenCodeTask(
             instruction=instruction,
             setup_shell=setup_shell,
-            metadata={"task_id": task_id, "agent": agent},
+            metadata={"task_id": task_id},
         )
 
         session = None
         try:
             factory = self._build_session_factory(
-                agent=agent,
                 config=config,
                 mode=mode,
                 template=template,
-                disable_thinking=disable_thinking,
-                top_logprobs=top_logprobs,
-                max_tokens_cap=max_tokens_cap,
             )
             _emit(
                 f"creating E2B sandbox (template={template or 'default'}) — "
@@ -384,7 +346,7 @@ def _emit(msg: str) -> None:
             # Block until the agent is done.
             if result.error is None:
                 _emit(
-                    f"agent running — {agent} CLI in sandbox "
+                    "agent running — OpenCode CLI in sandbox "
                     f"(timeout {int(agent_timeout_s)}s)"
                 )
                 try:
@@ -414,19 +376,23 @@ def _emit(msg: str) -> None:
             else:
                 result.reward = None
 
-            # Collect filesystem + agent log tail.
+            # Collect filesystem + logs + transparent-proxy trace.
             _emit("collecting workdir files + logs")
             result.files, result.files_extra = self._collect_files(session.sandbox)
-            result.agent_log_tail = self._collect_agent_log_tail(session, agent)
+            result.proxy_turns = self._collect_proxy_turns(session)
+            result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
+            result.agent_log_tail = self._collect_agent_log_tail(session)
             _emit(
                 f"collected: {len(result.files)} file(s), "
+                f"{len(result.proxy_turns)} proxy turn(s), "
                 f"reward={'%.2f' % result.reward if result.reward is not None else 'n/a'}"
             )
         except Exception as exc:  # noqa: BLE001
             result.error = f"{type(exc).__name__}: {exc}"
             _emit(f"ERROR: {result.error}")
             if session is not None:
-                result.agent_log_tail = self._collect_agent_log_tail(session, agent)
+                result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
+                result.agent_log_tail = self._collect_agent_log_tail(session)
         finally:
             if session is not None:
                 try:
@@ -449,8 +415,6 @@ def _emit(msg: str) -> None:
     def _build_agent_config(
         self,
         *,
-        agent: str,
-        mode: str,
         base_url: str,
         api_key: str,
         model: str,
@@ -459,44 +423,26 @@ def _build_agent_config(
         top_logprobs: int,
         max_tokens_cap: int,
     ) -> Any:
-        if agent == "opencode":
-            if top_logprobs:
-                _log.warning(
-                    "top_logprobs=%d is not supported for agent='opencode' "
-                    "and will have no effect. Use interception_gate mode for "
-                    "logprob capture.",
-                    top_logprobs,
-                )
-            return self._CodingAgentConfig(
-                provider="openai_compatible",
-                base_url=base_url.rstrip("/"),
-                api_key=api_key,
-                model=model,
-                agent_timeout_s=agent_timeout_s,
-                disable_thinking=disable_thinking,
-                max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
-            )
-
-        provider = self._infer_pi_provider(base_url)
-        return _GenericAgentConfig(
+        cap = max_tokens_cap if max_tokens_cap > 0 else None
+        return self._OpenCodeConfig(
+            provider="openai_compatible",
             base_url=base_url.rstrip("/"),
             api_key=api_key,
             model=model,
             agent_timeout_s=agent_timeout_s,
-            provider=provider,
-            thinking="off" if disable_thinking else None,
+            disable_thinking=disable_thinking,
+            max_tokens_cap=cap,
+            proxy_disable_thinking=disable_thinking,
+            proxy_top_logprobs=max(0, int(top_logprobs)),
+            proxy_max_tokens_cap=cap,
         )
 
     def _build_session_factory(
         self,
         *,
-        agent: str,
         config: Any,
         mode: str,
         template: str,
-        disable_thinking: bool,
-        top_logprobs: int,
-        max_tokens_cap: int,
     ) -> Any:
         if self._E2BSandboxBackend is None:
             raise RuntimeError(
@@ -508,35 +454,14 @@ def _build_session_factory(
             backend_kwargs["template"] = template
         backend = self._E2BSandboxBackend(**backend_kwargs)
 
-        if agent == "opencode":
-            return self._CodingAgentSessionFactory(
-                config=config,
-                sandbox_backend=backend,
-                mode=mode,
-                verifier=None,
-            )
-
-        spec = self._get_agent_spec(agent)
-        return self._CLIAgentSessionFactory(
-            spec=spec,
+        return self._OpenCodeSessionFactory(
             config=config,
             sandbox_backend=backend,
             mode=mode,
             verifier=None,
         )
 
-    @staticmethod
-    def _infer_pi_provider(base_url: str) -> str:
-        url = (base_url or "").lower()
-        if "router.huggingface.co" in url:
-            return "huggingface"
-        if "anthropic" in url:
-            return "anthropic"
-        if "googleapis.com" in url or "generativelanguage" in url:
-            return "gemini"
-        return "openai"
-
-    def _collect_agent_log_tail(self, session: Any, agent: str) -> str:
+    def _collect_agent_log_tail(self, session: Any) -> str:
         if hasattr(session, "collect_artifacts"):
             try:
                 artifacts = session.collect_artifacts()
@@ -547,8 +472,7 @@ def _collect_agent_log_tail(self, session: Any, agent: str) -> str:
                     return json.dumps(val, default=str)[-2000:]
             except Exception:
                 pass
-        path = _AGENT_LOG_BY_AGENT.get(agent, AGENT_LOG)
-        return self._safe_read(session.sandbox, path)[-2000:]
+        return self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
 
     # ── Helpers ────────────────────────────────────────────────────────────
 
@@ -597,6 +521,27 @@ def _collect_files(self, sandbox: Any) -> tuple[dict[str, str], list[str]]:
                 extras.append(path)
         return files, extras
 
+    def _collect_proxy_turns(self, session: Any) -> list[Any]:
+        turns: list[Any] = []
+        if not hasattr(session, "fetch_proxy_trace"):
+            return turns
+        for rec in session.fetch_proxy_trace():
+            response = rec.get("response") or {}
+            turns.append(
+                self._RolloutTurn(
+                    turn=int(rec.get("turn") or 0),
+                    finish_reason=rec.get("finish_reason"),
+                    completion_tokens=list(rec.get("completion_tokens") or []),
+                    completion_token_ids=list(rec.get("completion_token_ids") or []),
+                    per_token_logps=list(rec.get("per_token_logps") or []),
+                    latency_s=float(rec.get("latency_s") or 0.0),
+                    timestamp=float(rec.get("timestamp") or 0.0),
+                    upstream_status=response.get("upstream_status"),
+                    upstream_error=response.get("upstream_error"),
+                )
+            )
+        return turns
+
     @staticmethod
     def _safe_read(sandbox: Any, path: str) -> str:
         try:
diff --git a/envs/coding_agent_env/task.py b/envs/opencode_env/task.py
similarity index 73%
rename from envs/coding_agent_env/task.py
rename to envs/opencode_env/task.py
index 8633eb7aa..f9d208d84 100644
--- a/envs/coding_agent_env/task.py
+++ b/envs/opencode_env/task.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Task payload accepted by :class:`CodingAgentSessionFactory`."""
+"""Task payload accepted by :class:`OpenCodeSessionFactory`."""
 
 from __future__ import annotations
 
@@ -13,8 +13,8 @@
 from pydantic import BaseModel, Field
 
 
-class CodingAgentTask(BaseModel):
-    """One task for a coding-agent rollout.
+class OpenCodeTask(BaseModel):
+    """One task for an OpenCode rollout.
 
     The primitive only needs ``instruction`` (the prompt handed to ``opencode
     run``). Callers may attach ``setup_shell`` (run once inside the sandbox
@@ -29,8 +29,8 @@ class CodingAgentTask(BaseModel):
     metadata: dict[str, Any] = Field(default_factory=dict)
 
     @classmethod
-    def coerce(cls, value: Any) -> "CodingAgentTask":
-        """Accept a bare string, a dict, or an existing ``CodingAgentTask``."""
+    def coerce(cls, value: Any) -> "OpenCodeTask":
+        """Accept a bare string, a dict, or an existing ``OpenCodeTask``."""
         if isinstance(value, cls):
             return value
         if isinstance(value, str):
@@ -38,6 +38,6 @@ def coerce(cls, value: Any) -> "CodingAgentTask":
         if isinstance(value, dict):
             return cls(**value)
         raise TypeError(
-            f"Cannot coerce {type(value).__name__} to CodingAgentTask; "
-            "pass a str, dict, or CodingAgentTask."
+            f"Cannot coerce {type(value).__name__} to OpenCodeTask; "
+            "pass a str, dict, or OpenCodeTask."
         )
diff --git a/envs/coding_agent_env/uv.lock b/envs/opencode_env/uv.lock
similarity index 99%
rename from envs/coding_agent_env/uv.lock
rename to envs/opencode_env/uv.lock
index aa35531cc..aa802ee9d 100644
--- a/envs/coding_agent_env/uv.lock
+++ b/envs/opencode_env/uv.lock
@@ -1664,7 +1664,7 @@ wheels = [
 ]
 
 [[package]]
-name = "openenv-coding-agent-env"
+name = "openenv-opencode-env"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
diff --git a/examples/coding_agent_env_simple.py b/examples/opencode_env_simple.py
similarity index 80%
rename from examples/coding_agent_env_simple.py
rename to examples/opencode_env_simple.py
index caf81bad8..660421fdd 100644
--- a/examples/coding_agent_env_simple.py
+++ b/examples/opencode_env_simple.py
@@ -5,16 +5,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""End-to-end coding_agent_env example: write binary_search.py and verify it.
+"""End-to-end opencode_env example: write binary_search.py and verify it.
 
-Hits the deployed HF Space ``AdithyaSK/coding-agent-env`` (override via
-``CODING_AGENT_ENV_SPACE`` env var to point at your own Space or a local
+Hits the deployed HF Space ``AdithyaSK/opencode-env`` (override via
+``OPENCODE_ENV_SPACE`` env var to point at your own Space or a local
 container). The single MCP tool ``run_rollout`` does:
 
-  1. Spawns a fresh E2B sandbox (using the prebaked ``coding-agent-rl``
+  1. Spawns a fresh E2B sandbox (using the prebaked ``opencode-rl``
      template — falls back to a cold install if the template isn't
      present in your E2B account).
-  2. Runs the selected harness CLI with the instruction.
+  2. Runs OpenCode with the instruction.
   3. Executes the verify bash commands; reward = passed / total.
   4. Returns a ``RolloutResult`` with reward + produced file contents.
 
@@ -26,7 +26,7 @@
 
 Usage::
 
-    PYTHONPATH=src:envs uv run python examples/coding_agent_env_simple.py
+    PYTHONPATH=src:envs uv run python examples/opencode_env_simple.py
 
 Expected output (~20s with the prebaked template)::
 
@@ -45,13 +45,13 @@
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "envs"))
 
-from coding_agent_env import CodingAgentEnv  # noqa: E402
-from coding_agent_env.client import _extract_text  # noqa: E402
-from coding_agent_env.models import RolloutResult  # noqa: E402
+from opencode_env import OpenCodeEnv  # noqa: E402
+from opencode_env.client import _extract_text  # noqa: E402
+from opencode_env.models import RolloutResult  # noqa: E402
 
 
 SPACE = os.environ.get(
-    "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space"
+    "OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space"
 )
 
 INSTRUCTION = (
@@ -89,7 +89,7 @@ async def main() -> int:
     print(f"Instruction:     {INSTRUCTION.splitlines()[0]} ...")
     print()
 
-    async with CodingAgentEnv(base_url=SPACE) as env:
+    async with OpenCodeEnv(base_url=SPACE) as env:
         await env.reset()
         raw = await env.call_tool(
             "run_rollout",
@@ -99,7 +99,7 @@ async def main() -> int:
             instruction=INSTRUCTION,
             setup=[],  # no setup commands
             verify=VERIFY,
-            template="coding-agent-rl",  # prebaked E2B template
+            template="opencode-rl",  # prebaked E2B template
             task_id="binary_search_simple",
             agent_timeout_s=600,
         )
diff --git a/tests/core/test_cli_agent_driver.py b/tests/core/test_cli_agent_driver.py
index 7338fc323..f174b6733 100644
--- a/tests/core/test_cli_agent_driver.py
+++ b/tests/core/test_cli_agent_driver.py
@@ -29,7 +29,7 @@
 from openenv.core.harness.sandbox.base import ExecResult, SandboxHandle
 
 
-# Fake sandbox infrastructure (mirrors test_coding_agent_env.py pattern)
+# Fake sandbox infrastructure (mirrors test_opencode_env.py pattern)
 
 
 @dataclass
diff --git a/tests/envs/test_coding_agent_env.py b/tests/envs/test_opencode_env.py
similarity index 71%
rename from tests/envs/test_coding_agent_env.py
rename to tests/envs/test_opencode_env.py
index fa3dcae79..701d562e9 100644
--- a/tests/envs/test_coding_agent_env.py
+++ b/tests/envs/test_opencode_env.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Smoke tests for ``coding_agent_env``.
+"""Smoke tests for ``opencode_env``.
 
 The default suite runs in CI without any external dependencies (no E2B,
 no LLM, no network). It covers:
@@ -13,7 +13,7 @@
   - The endpoint catalog (`vllm` / `openai` / `hf_router`) resolves
     explicit + env-var + default-value precedence correctly.
   - Pydantic models accept their expected shapes.
-  - The `CodingAgentTask` coercion helper handles str / dict / `CodingAgentTask`.
+  - The `OpenCodeTask` coercion helper handles str / dict / `OpenCodeTask`.
 
 A second class is marked ``@pytest.mark.integration`` and exercises the
 deployed Space end-to-end. It only runs when ``E2B_API_KEY`` and at least
@@ -44,17 +44,18 @@
 
 def test_public_api_imports() -> None:
     """Top-level package re-exports the documented surface."""
-    from coding_agent_env import (  # noqa: F401
-        CodingAgentConfig,
-        CodingAgentEnv,
-        CodingAgentSession,
-        CodingAgentSessionFactory,
-        CodingAgentState,
-        CodingAgentTask,
+    from opencode_env import (  # noqa: F401
+        OpenCodeConfig,
+        OpenCodeEnv,
+        OpenCodeSession,
+        OpenCodeSessionFactory,
+        OpenCodeState,
+        OpenCodeTask,
         CommandResult,
         E2BSandboxBackend,
         Provider,
         RolloutResult,
+        RolloutTurn,
         SandboxBackend,
         SandboxHandle,
     )
@@ -62,14 +63,14 @@ def test_public_api_imports() -> None:
 
 def test_server_modules_import() -> None:
     """Server-side modules (FastAPI app, MCP env, catalog) import cleanly."""
-    from coding_agent_env.server.app import app  # noqa: F401
-    from coding_agent_env.server.catalog import (  # noqa: F401
+    from opencode_env.server.app import app  # noqa: F401
+    from opencode_env.server.catalog import (  # noqa: F401
         catalog_summary,
         ENDPOINT_KINDS,
         resolve_endpoint,
     )
-    from coding_agent_env.server.coding_environment import (  # noqa: F401
-        CodingAgentEnvironment,
+    from opencode_env.server.opencode_environment import (  # noqa: F401
+        OpenCodeEnvironment,
     )
 
 
@@ -79,14 +80,14 @@ def test_server_modules_import() -> None:
 
 
 def test_catalog_kinds() -> None:
-    from coding_agent_env.server.catalog import ENDPOINT_KINDS
+    from opencode_env.server.catalog import ENDPOINT_KINDS
 
     assert ENDPOINT_KINDS == ("vllm", "openai", "hf_router")
 
 
 def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) -> None:
     """Explicit args beat env vars beat catalog defaults."""
-    from coding_agent_env.server.catalog import resolve_endpoint
+    from opencode_env.server.catalog import resolve_endpoint
 
     monkeypatch.setenv("OPENAI_API_KEY", "from-env")
     r = resolve_endpoint(
@@ -105,7 +106,7 @@ def test_resolve_endpoint_explicit_args_win(monkeypatch: pytest.MonkeyPatch) ->
 def test_resolve_endpoint_env_var_used_when_arg_missing(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    from coding_agent_env.server.catalog import resolve_endpoint
+    from opencode_env.server.catalog import resolve_endpoint
 
     monkeypatch.setenv("OPENAI_API_KEY", "key-from-env")
     monkeypatch.setenv("OPENAI_MODEL", "gpt-4o")
@@ -119,7 +120,7 @@ def test_resolve_endpoint_normalizes_v1_suffix(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """Base URL gets ``/v1`` appended if missing, otherwise left alone."""
-    from coding_agent_env.server.catalog import resolve_endpoint
+    from opencode_env.server.catalog import resolve_endpoint
 
     monkeypatch.setenv("VLLM_URL", "https://my-vllm.example/")
     monkeypatch.setenv("VLLM_API_KEY", "x")
@@ -132,7 +133,7 @@ def test_resolve_endpoint_normalizes_v1_suffix(
 
 
 def test_resolve_endpoint_unknown_kind_raises() -> None:
-    from coding_agent_env.server.catalog import resolve_endpoint
+    from opencode_env.server.catalog import resolve_endpoint
 
     with pytest.raises(ValueError, match="unknown endpoint kind"):
         resolve_endpoint("bogus", base_url="x", api_key="y", model="z")
@@ -141,7 +142,7 @@ def test_resolve_endpoint_unknown_kind_raises() -> None:
 def test_resolve_endpoint_missing_creds_raises(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    from coding_agent_env.server.catalog import resolve_endpoint
+    from opencode_env.server.catalog import resolve_endpoint
 
     # Strip any inherited env vars.
     for k in ("OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_MODEL"):
@@ -151,7 +152,7 @@ def test_resolve_endpoint_missing_creds_raises(
 
 
 def test_catalog_summary_shape() -> None:
-    from coding_agent_env.server.catalog import catalog_summary
+    from opencode_env.server.catalog import catalog_summary
 
     summary = catalog_summary()
     assert {entry["kind"] for entry in summary} == {"vllm", "openai", "hf_router"}
@@ -165,12 +166,10 @@ def test_catalog_summary_shape() -> None:
 
 
 def test_build_agent_config_opencode() -> None:
-    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
+    from opencode_env.server.opencode_environment import OpenCodeEnvironment
 
-    env = CodingAgentEnvironment()
+    env = OpenCodeEnvironment()
     cfg = env._build_agent_config(
-        agent="opencode",
-        mode="black_box",
         base_url="https://api.openai.com/v1",
         api_key="sk-test",
         model="gpt-4o-mini",
@@ -179,14 +178,15 @@ def test_build_agent_config_opencode() -> None:
         top_logprobs=7,
         max_tokens_cap=2048,
     )
-    assert isinstance(cfg, env._CodingAgentConfig)
+    assert isinstance(cfg, env._OpenCodeConfig)
     assert cfg.model == "gpt-4o-mini"
     assert cfg.agent_timeout_s == 123.0
     assert cfg.max_tokens_cap == 2048
+    assert cfg.proxy_max_tokens_cap == 2048
+    assert cfg.proxy_top_logprobs == 7
+    assert cfg.proxy_disable_thinking is True
 
     cfg_4096 = env._build_agent_config(
-        agent="opencode",
-        mode="black_box",
         base_url="https://api.openai.com/v1",
         api_key="sk-test",
         model="gpt-4o-mini",
@@ -198,8 +198,6 @@ def test_build_agent_config_opencode() -> None:
     assert cfg_4096.max_tokens_cap == 4096
 
     cfg_uncapped = env._build_agent_config(
-        agent="opencode",
-        mode="black_box",
         base_url="https://api.openai.com/v1",
         api_key="sk-test",
         model="gpt-4o-mini",
@@ -211,50 +209,15 @@ def test_build_agent_config_opencode() -> None:
     assert cfg_uncapped.max_tokens_cap is None
 
 
-def test_build_agent_config_pi() -> None:
-    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
-
-    env = CodingAgentEnvironment()
-    cfg = env._build_agent_config(
-        agent="pi",
-        mode="black_box",
-        base_url="https://router.huggingface.co/v1",
-        api_key="hf_xxx",
-        model="zai-org/GLM-5.1",
-        agent_timeout_s=180.0,
-        disable_thinking=True,
-        top_logprobs=5,
-        max_tokens_cap=4096,
-    )
-    assert cfg.provider == "huggingface"
-    assert cfg.thinking == "off"
-    assert cfg.model == "zai-org/GLM-5.1"
-
-    cfg_gate = env._build_agent_config(
-        agent="pi",
-        mode="interception_gate",
-        base_url="https://router.huggingface.co/v1",
-        api_key="hf_xxx",
-        model="zai-org/GLM-5.1",
-        agent_timeout_s=180.0,
-        disable_thinking=False,
-        top_logprobs=5,
-        max_tokens_cap=4096,
-    )
-    assert cfg_gate.provider == "huggingface"
-
-
 def test_build_session_factory_requires_e2b_dependency() -> None:
-    from coding_agent_env.server.coding_environment import CodingAgentEnvironment
+    from opencode_env.server.opencode_environment import OpenCodeEnvironment
 
-    env = CodingAgentEnvironment()
+    env = OpenCodeEnvironment()
     env._E2BSandboxBackend = None
     cfg = env._build_agent_config(
-        agent="pi",
-        mode="black_box",
-        base_url="https://router.huggingface.co/v1",
-        api_key="hf_xxx",
-        model="zai-org/GLM-5.1",
+        base_url="https://api.openai.com/v1",
+        api_key="sk-test",
+        model="gpt-4o-mini",
         agent_timeout_s=180.0,
         disable_thinking=False,
         top_logprobs=5,
@@ -263,13 +226,9 @@ def test_build_session_factory_requires_e2b_dependency() -> None:
 
     with pytest.raises(RuntimeError, match="E2BSandboxBackend unavailable"):
         env._build_session_factory(
-            agent="pi",
             config=cfg,
             mode="black_box",
             template="",
-            disable_thinking=False,
-            top_logprobs=5,
-            max_tokens_cap=4096,
         )
 
 
@@ -279,7 +238,7 @@ def test_build_session_factory_requires_e2b_dependency() -> None:
 
 
 def test_rollout_result_serializes_round_trip() -> None:
-    from coding_agent_env import CommandResult, RolloutResult
+    from opencode_env import CommandResult, RolloutResult, RolloutTurn
 
     r = RolloutResult(
         task_id="t1",
@@ -290,45 +249,54 @@ def test_rollout_result_serializes_round_trip() -> None:
         mode="black_box",
         setup_results=[CommandResult(cmd="pip install pandas", exit_code=0)],
         verify_results=[CommandResult(cmd="pytest", exit_code=1, stderr="boom")],
+        proxy_turns=[
+            RolloutTurn(
+                turn=1,
+                completion_tokens=["ok"],
+                completion_token_ids=[123],
+                per_token_logps=[-0.1],
+            )
+        ],
         files={"/home/user/workdir/x.py": "print('x')"},
     )
     blob = r.model_dump_json()
     rebuilt = RolloutResult.model_validate_json(blob)
     assert rebuilt.reward == 0.75
     assert rebuilt.verify_results[0].exit_code == 1
+    assert rebuilt.proxy_turns[0].per_token_logps == [-0.1]
 
 
-def test_coding_agent_task_coerce_str() -> None:
-    from coding_agent_env import CodingAgentTask
+def test_opencode_task_coerce_str() -> None:
+    from opencode_env import OpenCodeTask
 
-    t = CodingAgentTask.coerce("write fizzbuzz.py")
+    t = OpenCodeTask.coerce("write fizzbuzz.py")
     assert t.instruction == "write fizzbuzz.py"
     assert t.setup_shell is None
     assert t.upload_files == {}
 
 
-def test_coding_agent_task_coerce_dict() -> None:
-    from coding_agent_env import CodingAgentTask
+def test_opencode_task_coerce_dict() -> None:
+    from opencode_env import OpenCodeTask
 
-    t = CodingAgentTask.coerce(
+    t = OpenCodeTask.coerce(
         {"instruction": "x", "setup_shell": "pip install pandas"}
     )
     assert t.instruction == "x"
     assert t.setup_shell == "pip install pandas"
 
 
-def test_coding_agent_task_coerce_existing_passthrough() -> None:
-    from coding_agent_env import CodingAgentTask
+def test_opencode_task_coerce_existing_passthrough() -> None:
+    from opencode_env import OpenCodeTask
 
-    src = CodingAgentTask(instruction="y")
-    assert CodingAgentTask.coerce(src) is src
+    src = OpenCodeTask(instruction="y")
+    assert OpenCodeTask.coerce(src) is src
 
 
-def test_coding_agent_task_coerce_rejects_unknown_type() -> None:
-    from coding_agent_env import CodingAgentTask
+def test_opencode_task_coerce_rejects_unknown_type() -> None:
+    from opencode_env import OpenCodeTask
 
     with pytest.raises(TypeError, match="Cannot coerce"):
-        CodingAgentTask.coerce(42)  # type: ignore[arg-type]
+        OpenCodeTask.coerce(42)  # type: ignore[arg-type]
 
 
 # ---------------------------------------------------------------------------
@@ -354,16 +322,16 @@ def test_run_rollout_e2e_via_deployed_space() -> None:
 
     import asyncio
 
-    from coding_agent_env import CodingAgentEnv
-    from coding_agent_env.client import _extract_text
-    from coding_agent_env.models import RolloutResult
+    from opencode_env import OpenCodeEnv
+    from opencode_env.client import _extract_text
+    from opencode_env.models import RolloutResult
 
     SPACE = os.environ.get(
-        "CODING_AGENT_ENV_SPACE", "https://adithyask-coding-agent-env.hf.space"
+        "OPENCODE_ENV_SPACE", "https://adithyask-opencode-env.hf.space"
     )
 
     async def _go() -> RolloutResult:
-        async with CodingAgentEnv(base_url=SPACE) as env:
+        async with OpenCodeEnv(base_url=SPACE) as env:
             await env.reset()
             raw = await env.call_tool(
                 "run_rollout",
@@ -382,7 +350,7 @@ async def _go() -> RolloutResult:
                     "import binary_search; "
                     "assert binary_search.binary_search([1,2,3,4,5], 3) == 2; print('OK')\"",
                 ],
-                template="coding-agent-rl",
+                template="opencode-rl",
                 agent_timeout_s=600,
             )
             return RolloutResult.model_validate_json(_extract_text(raw))