From 0badc86c16c66497e73b46f1efd846b66572667d Mon Sep 17 00:00:00 2001 From: minpeter Date: Wed, 7 Jan 2026 21:51:48 +0900 Subject: [PATCH 1/7] feat: add Harbor Terminal-Bench integration for Sisyphus agent Add benchmark infrastructure to evaluate Sisyphus on Harbor Terminal-Bench: - SisyphusAgent: Custom Harbor InstalledAgent that runs OpenCode with Sisyphus - install-sisyphus.sh.j2: Jinja2 template for container setup - ATIF trajectory parsing for metrics collection Tested with hello-world@1.0 dataset (100% pass rate) --- .gitignore | 2 + benchmark/__init__.py | 0 benchmark/install-sisyphus.sh.j2 | 27 ++++ benchmark/sisyphus_agent.py | 248 +++++++++++++++++++++++++++++++ 4 files changed, 277 insertions(+) create mode 100644 benchmark/__init__.py create mode 100644 benchmark/install-sisyphus.sh.j2 create mode 100644 benchmark/sisyphus_agent.py diff --git a/.gitignore b/.gitignore index b43656d722..0016d0def2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Dependencies .sisyphus/ node_modules/ +__pycache__ # Build output dist/ @@ -18,6 +19,7 @@ Thumbs.db # Logs *.log npm-debug.log* +jobs # Lock files (use bun.lockb instead) package-lock.json diff --git a/benchmark/__init__.py b/benchmark/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmark/install-sisyphus.sh.j2 b/benchmark/install-sisyphus.sh.j2 new file mode 100644 index 0000000000..f41cc33900 --- /dev/null +++ b/benchmark/install-sisyphus.sh.j2 @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +apt-get update +apt-get install -y curl + +curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.2/install.sh | bash + +source "$HOME/.nvm/nvm.sh" + +nvm install 22 +npm -v + +{% if version %} +npm i -g opencode-ai@{{ version }} +{% else %} +npm i -g opencode-ai@latest +{% endif %} + +{% if omo_version %} +npx oh-my-opencode@{{ omo_version }} install --no-tui --claude=yes --chatgpt=no --gemini=no +{% else %} +npx oh-my-opencode@latest install --no-tui --claude=yes --chatgpt=no --gemini=no +{% endif %} + +opencode --version +echo "Sisyphus agent ready" diff --git a/benchmark/sisyphus_agent.py b/benchmark/sisyphus_agent.py new file mode 100644 index 0000000000..e5305b6fcc --- /dev/null +++ b/benchmark/sisyphus_agent.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +import json +import os +import shlex +from datetime import datetime, timezone +from pathlib import Path + +from harbor.agents.installed.base import BaseInstalledAgent, ExecInput +from harbor.models.agent.context import AgentContext +from harbor.models.trajectories import ( + Agent, + FinalMetrics, + Metrics, + Observation, + ObservationResult, + Step, + ToolCall, + Trajectory, +) + + +class SisyphusAgent(BaseInstalledAgent): + def __init__( + self, + logs_dir: Path, + prompt_template_path: Path | str | None = None, + version: str | None = None, + omo_version: str | None = None, + *args, + **kwargs, + ): + super().__init__(logs_dir, prompt_template_path, version, *args, **kwargs) + self._omo_version = omo_version or "latest" + + @staticmethod + def name() -> str: + return "sisyphus" + + @property + def _install_agent_template_path(self) -> Path: + return Path(__file__).parent / "install-sisyphus.sh.j2" + + @property + def _template_variables(self) -> dict[str, str]: + variables = super()._template_variables + variables["omo_version"] = self._omo_version + return variables + + def create_run_agent_commands(self, instruction: str) -> list[ExecInput]: + escaped_instruction = shlex.quote(instruction) + + if not self.model_name or "/" not in self.model_name: + raise ValueError("Model name must be in the format provider/model_name") + + provider, _ = self.model_name.split("/", 1) + + env = self._get_provider_env(provider) + env["OPENCODE_FAKE_VCS"] = "git" + + return [ + ExecInput( + command=( + f"opencode --model {self.model_name} run " + f"--agent Sisyphus --format=json {escaped_instruction} " + f"2>&1 | tee /logs/agent/sisyphus.txt" + ), + env=env, + ) + ] + + def _get_provider_env(self, provider: str) -> dict[str, str]: + env = {} + keys = [] + + provider_keys = { + "amazon-bedrock": [ + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_REGION", + ], + "anthropic": ["ANTHROPIC_API_KEY"], + "azure": ["AZURE_RESOURCE_NAME", "AZURE_API_KEY"], + "deepseek": ["DEEPSEEK_API_KEY"], + "github-copilot": ["GITHUB_TOKEN"], + "google": [ + "GEMINI_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", + "GOOGLE_APPLICATION_CREDENTIALS", + "GOOGLE_CLOUD_PROJECT", + "GOOGLE_CLOUD_LOCATION", + "GOOGLE_GENAI_USE_VERTEXAI", + "GOOGLE_API_KEY", + ], + "groq": ["GROQ_API_KEY"], + "huggingface": ["HF_TOKEN"], + "llama": ["LLAMA_API_KEY"], + "mistral": ["MISTRAL_API_KEY"], + "openai": ["OPENAI_API_KEY"], + "xai": ["XAI_API_KEY"], + } + + keys = provider_keys.get(provider, []) + if not keys: + raise ValueError(f"Unknown provider {provider}") + + for key in keys: + if key in os.environ: + env[key] = os.environ[key] + + return env + + def populate_context_post_run(self, context: AgentContext) -> None: + output_file = self.logs_dir / "command-0" / "stdout.txt" + if not output_file.exists(): + return + + trajectory = self._parse_opencode_output(output_file) + if trajectory: + trajectory_path = self.logs_dir / "trajectory.json" + with open(trajectory_path, "w") as f: + json.dump(trajectory.to_json_dict(), f, indent=2) + + if trajectory.final_metrics: + context.cost_usd = trajectory.final_metrics.total_cost_usd + context.n_input_tokens = trajectory.final_metrics.total_prompt_tokens + context.n_output_tokens = ( + trajectory.final_metrics.total_completion_tokens + ) + + def _parse_opencode_output(self, output_file: Path) -> Trajectory | None: + content = output_file.read_text() + events = [] + + for line in content.split("\n"): + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + events.append(event) + except json.JSONDecodeError: + continue + + if not events: + return None + + steps = [] + step_id = 1 + total_prompt_tokens = 0 + total_completion_tokens = 0 + total_cost = 0.0 + + for event in events: + event_type = event.get("type", "") + timestamp = event.get("timestamp", datetime.now(timezone.utc).isoformat()) + + if event_type == "user": + steps.append( + Step( + step_id=step_id, + timestamp=timestamp, + source="user", + message=event.get("content", ""), + ) + ) + step_id += 1 + + elif event_type == "assistant": + tool_calls = [] + observation = None + + if "tool_calls" in event: + for tc in event["tool_calls"]: + tool_calls.append( + ToolCall( + tool_call_id=tc.get("id", f"call_{step_id}"), + function_name=tc.get("name", ""), + arguments=tc.get("arguments", {}), + ) + ) + + if "tool_results" in event: + results = [] + for tr in event["tool_results"]: + results.append( + ObservationResult( + source_call_id=tr.get("call_id", ""), + content=tr.get("content", ""), + ) + ) + if results: + observation = Observation(results=results) + + metrics = None + if "usage" in event: + usage = event["usage"] + prompt_tokens = usage.get("prompt_tokens", 0) + completion_tokens = usage.get("completion_tokens", 0) + cost = usage.get("cost", 0.0) + + total_prompt_tokens += prompt_tokens + total_completion_tokens += completion_tokens + total_cost += cost + + metrics = Metrics( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + cost_usd=cost, + ) + + steps.append( + Step( + step_id=step_id, + timestamp=timestamp, + source="agent", + model_name=self.model_name, + message=event.get("content", ""), + reasoning_content=event.get("thinking", None), + tool_calls=tool_calls if tool_calls else None, + observation=observation, + metrics=metrics, + ) + ) + step_id += 1 + + if not steps: + return None + + return Trajectory( + schema_version="ATIF-v1.4", + session_id=f"sisyphus-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", + agent=Agent( + name="sisyphus", + version=self._omo_version, + model_name=self.model_name, + ), + steps=steps, + final_metrics=FinalMetrics( + total_prompt_tokens=total_prompt_tokens, + total_completion_tokens=total_completion_tokens, + total_cost_usd=total_cost, + total_steps=len(steps), + ), + ) + + +if __name__ == "__main__": + print(f"SisyphusAgent registered: {SisyphusAgent.name()}") From e2bfa7165c3476169bdfa81cbb8fc08ad1ee9712 Mon Sep 17 00:00:00 2001 From: minpeter Date: Wed, 7 Jan 2026 21:58:16 +0900 Subject: [PATCH 2/7] fix: add bun installation to container setup oh-my-opencode CLI requires bun runtime which was missing in Docker container, causing setup to fail with 'bun: No such file or directory' (exit code 127). --- benchmark/install-sisyphus.sh.j2 | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmark/install-sisyphus.sh.j2 b/benchmark/install-sisyphus.sh.j2 index f41cc33900..1432ad9c47 100644 --- a/benchmark/install-sisyphus.sh.j2 +++ b/benchmark/install-sisyphus.sh.j2 @@ -2,21 +2,28 @@ set -e apt-get update -apt-get install -y curl +apt-get install -y curl unzip -curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.2/install.sh | bash +# Install bun (required by oh-my-opencode) +curl -fsSL https://bun.sh/install | bash +export BUN_INSTALL="$HOME/.bun" +export PATH="$BUN_INSTALL/bin:$PATH" +bun --version +# Install nvm and Node.js +curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.2/install.sh | bash source "$HOME/.nvm/nvm.sh" - nvm install 22 npm -v +# Install OpenCode {% if version %} npm i -g opencode-ai@{{ version }} {% else %} npm i -g opencode-ai@latest {% endif %} +# Install oh-my-opencode plugin (provides Sisyphus agent) {% if omo_version %} npx oh-my-opencode@{{ omo_version }} install --no-tui --claude=yes --chatgpt=no --gemini=no {% else %} From 08c9e80b73e3e0554db524295ac3bc93d8d27958 Mon Sep 17 00:00:00 2001 From: minpeter Date: Wed, 7 Jan 2026 22:03:03 +0900 Subject: [PATCH 3/7] refactor: simplify container setup using bun only Remove nvm/node/npm - bun handles everything: - bun install -g opencode-ai - bunx oh-my-opencode install --- benchmark/install-sisyphus.sh.j2 | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/benchmark/install-sisyphus.sh.j2 b/benchmark/install-sisyphus.sh.j2 index 1432ad9c47..9f690ac70e 100644 --- a/benchmark/install-sisyphus.sh.j2 +++ b/benchmark/install-sisyphus.sh.j2 @@ -4,30 +4,24 @@ set -e apt-get update apt-get install -y curl unzip -# Install bun (required by oh-my-opencode) +# Install bun curl -fsSL https://bun.sh/install | bash export BUN_INSTALL="$HOME/.bun" export PATH="$BUN_INSTALL/bin:$PATH" bun --version -# Install nvm and Node.js -curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.2/install.sh | bash -source "$HOME/.nvm/nvm.sh" -nvm install 22 -npm -v - # Install OpenCode {% if version %} -npm i -g opencode-ai@{{ version }} +bun install -g opencode-ai@{{ version }} {% else %} -npm i -g opencode-ai@latest +bun install -g opencode-ai@latest {% endif %} # Install oh-my-opencode plugin (provides Sisyphus agent) {% if omo_version %} -npx oh-my-opencode@{{ omo_version }} install --no-tui --claude=yes --chatgpt=no --gemini=no +bunx oh-my-opencode@{{ omo_version }} install --no-tui --claude=yes --chatgpt=no --gemini=no {% else %} -npx oh-my-opencode@latest install --no-tui --claude=yes --chatgpt=no --gemini=no +bunx oh-my-opencode@latest install --no-tui --claude=yes --chatgpt=no --gemini=no {% endif %} opencode --version From d5e4f13c6e7a4db584687cafebc8f350e709f0e6 Mon Sep 17 00:00:00 2001 From: minpeter Date: Wed, 7 Jan 2026 22:23:00 +0900 Subject: [PATCH 4/7] refactor: simplify SisyphusAgent to minimal implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove ATIF trajectory parsing - keep only essential: - name(), install template path, run command - 263 lines → 74 lines --- benchmark/sisyphus_agent.py | 174 ++---------------------------------- 1 file changed, 5 insertions(+), 169 deletions(-) diff --git a/benchmark/sisyphus_agent.py b/benchmark/sisyphus_agent.py index e5305b6fcc..5ef52b0ed4 100644 --- a/benchmark/sisyphus_agent.py +++ b/benchmark/sisyphus_agent.py @@ -1,36 +1,15 @@ -#!/usr/bin/env python3 -import json import os import shlex -from datetime import datetime, timezone from pathlib import Path from harbor.agents.installed.base import BaseInstalledAgent, ExecInput from harbor.models.agent.context import AgentContext -from harbor.models.trajectories import ( - Agent, - FinalMetrics, - Metrics, - Observation, - ObservationResult, - Step, - ToolCall, - Trajectory, -) class SisyphusAgent(BaseInstalledAgent): - def __init__( - self, - logs_dir: Path, - prompt_template_path: Path | str | None = None, - version: str | None = None, - omo_version: str | None = None, - *args, - **kwargs, - ): - super().__init__(logs_dir, prompt_template_path, version, *args, **kwargs) - self._omo_version = omo_version or "latest" + """ + Sisyphus agent uses OpenCode with oh-my-opencode plugin. + """ @staticmethod def name() -> str: @@ -40,11 +19,8 @@ def name() -> str: def _install_agent_template_path(self) -> Path: return Path(__file__).parent / "install-sisyphus.sh.j2" - @property - def _template_variables(self) -> dict[str, str]: - variables = super()._template_variables - variables["omo_version"] = self._omo_version - return variables + def populate_context_post_run(self, context: AgentContext) -> None: + pass def create_run_agent_commands(self, instruction: str) -> list[ExecInput]: escaped_instruction = shlex.quote(instruction) @@ -70,8 +46,6 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]: def _get_provider_env(self, provider: str) -> dict[str, str]: env = {} - keys = [] - provider_keys = { "amazon-bedrock": [ "AWS_ACCESS_KEY_ID", @@ -108,141 +82,3 @@ def _get_provider_env(self, provider: str) -> dict[str, str]: env[key] = os.environ[key] return env - - def populate_context_post_run(self, context: AgentContext) -> None: - output_file = self.logs_dir / "command-0" / "stdout.txt" - if not output_file.exists(): - return - - trajectory = self._parse_opencode_output(output_file) - if trajectory: - trajectory_path = self.logs_dir / "trajectory.json" - with open(trajectory_path, "w") as f: - json.dump(trajectory.to_json_dict(), f, indent=2) - - if trajectory.final_metrics: - context.cost_usd = trajectory.final_metrics.total_cost_usd - context.n_input_tokens = trajectory.final_metrics.total_prompt_tokens - context.n_output_tokens = ( - trajectory.final_metrics.total_completion_tokens - ) - - def _parse_opencode_output(self, output_file: Path) -> Trajectory | None: - content = output_file.read_text() - events = [] - - for line in content.split("\n"): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - events.append(event) - except json.JSONDecodeError: - continue - - if not events: - return None - - steps = [] - step_id = 1 - total_prompt_tokens = 0 - total_completion_tokens = 0 - total_cost = 0.0 - - for event in events: - event_type = event.get("type", "") - timestamp = event.get("timestamp", datetime.now(timezone.utc).isoformat()) - - if event_type == "user": - steps.append( - Step( - step_id=step_id, - timestamp=timestamp, - source="user", - message=event.get("content", ""), - ) - ) - step_id += 1 - - elif event_type == "assistant": - tool_calls = [] - observation = None - - if "tool_calls" in event: - for tc in event["tool_calls"]: - tool_calls.append( - ToolCall( - tool_call_id=tc.get("id", f"call_{step_id}"), - function_name=tc.get("name", ""), - arguments=tc.get("arguments", {}), - ) - ) - - if "tool_results" in event: - results = [] - for tr in event["tool_results"]: - results.append( - ObservationResult( - source_call_id=tr.get("call_id", ""), - content=tr.get("content", ""), - ) - ) - if results: - observation = Observation(results=results) - - metrics = None - if "usage" in event: - usage = event["usage"] - prompt_tokens = usage.get("prompt_tokens", 0) - completion_tokens = usage.get("completion_tokens", 0) - cost = usage.get("cost", 0.0) - - total_prompt_tokens += prompt_tokens - total_completion_tokens += completion_tokens - total_cost += cost - - metrics = Metrics( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - cost_usd=cost, - ) - - steps.append( - Step( - step_id=step_id, - timestamp=timestamp, - source="agent", - model_name=self.model_name, - message=event.get("content", ""), - reasoning_content=event.get("thinking", None), - tool_calls=tool_calls if tool_calls else None, - observation=observation, - metrics=metrics, - ) - ) - step_id += 1 - - if not steps: - return None - - return Trajectory( - schema_version="ATIF-v1.4", - session_id=f"sisyphus-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", - agent=Agent( - name="sisyphus", - version=self._omo_version, - model_name=self.model_name, - ), - steps=steps, - final_metrics=FinalMetrics( - total_prompt_tokens=total_prompt_tokens, - total_completion_tokens=total_completion_tokens, - total_cost_usd=total_cost, - total_steps=len(steps), - ), - ) - - -if __name__ == "__main__": - print(f"SisyphusAgent registered: {SisyphusAgent.name()}") From 1d0dabbf345e8864c66ea5e6b5120e03de2637a4 Mon Sep 17 00:00:00 2001 From: minpeter Date: Wed, 7 Jan 2026 22:25:34 +0900 Subject: [PATCH 5/7] feat: use opencode/zen provider to reduce benchmark costs Switch from --claude=yes to --claude=no for oh-my-opencode install. This configures all subagents to use the free opencode/zen provider instead of paid Anthropic API. --- benchmark/install-sisyphus.sh.j2 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmark/install-sisyphus.sh.j2 b/benchmark/install-sisyphus.sh.j2 index 9f690ac70e..de7fa8b13e 100644 --- a/benchmark/install-sisyphus.sh.j2 +++ b/benchmark/install-sisyphus.sh.j2 @@ -18,10 +18,11 @@ bun install -g opencode-ai@latest {% endif %} # Install oh-my-opencode plugin (provides Sisyphus agent) +# --claude=no uses opencode/zen provider (free) instead of Anthropic {% if omo_version %} -bunx oh-my-opencode@{{ omo_version }} install --no-tui --claude=yes --chatgpt=no --gemini=no +bunx oh-my-opencode@{{ omo_version }} install --no-tui --claude=no --chatgpt=no --gemini=no {% else %} -bunx oh-my-opencode@latest install --no-tui --claude=yes --chatgpt=no --gemini=no +bunx oh-my-opencode@latest install --no-tui --claude=no --chatgpt=no --gemini=no {% endif %} opencode --version From 042a9b1d1476569ddbe4ce16568c2446f868938a Mon Sep 17 00:00:00 2001 From: minpeter Date: Wed, 7 Jan 2026 22:32:02 +0900 Subject: [PATCH 6/7] fix: add opencode provider support (no API key required) The opencode/zen provider doesn't require an API key, so add it to provider_keys with an empty list. --- benchmark/sisyphus_agent.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/sisyphus_agent.py b/benchmark/sisyphus_agent.py index 5ef52b0ed4..efbf42abf0 100644 --- a/benchmark/sisyphus_agent.py +++ b/benchmark/sisyphus_agent.py @@ -70,12 +70,11 @@ def _get_provider_env(self, provider: str) -> dict[str, str]: "llama": ["LLAMA_API_KEY"], "mistral": ["MISTRAL_API_KEY"], "openai": ["OPENAI_API_KEY"], + "opencode": [], # opencode/zen - no API key required "xai": ["XAI_API_KEY"], } keys = provider_keys.get(provider, []) - if not keys: - raise ValueError(f"Unknown provider {provider}") for key in keys: if key in os.environ: From 8eee7ad1d5108c83bd958f57896e88ea1e66c656 Mon Sep 17 00:00:00 2001 From: minpeter Date: Wed, 7 Jan 2026 22:58:52 +0900 Subject: [PATCH 7/7] Add pre-install config to disable noisy hooks in benchmarks ``` Pre-create oh-my-opencode config before plugin install to prevent hook initialization during load, which can cause rate limits or unnecessary overhead in benchmark environments. ``` --- benchmark/install-sisyphus.sh.j2 | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/benchmark/install-sisyphus.sh.j2 b/benchmark/install-sisyphus.sh.j2 index de7fa8b13e..f323353301 100644 --- a/benchmark/install-sisyphus.sh.j2 +++ b/benchmark/install-sisyphus.sh.j2 @@ -17,6 +17,23 @@ bun install -g opencode-ai@{{ version }} bun install -g opencode-ai@latest {% endif %} +# Pre-create oh-my-opencode config BEFORE install to disable problematic hooks +# This prevents hooks from initializing during plugin load +# - comment-checker: Downloads Go binary from GitHub (rate limiting with multiple containers) +# - auto-update-checker: Checks for updates (unnecessary in benchmarks) +# - session-notification: OS notifications (no display in containers) +# - background-notification: OS notifications (no display in containers) +mkdir -p ~/.config/opencode +cat > ~/.config/opencode/oh-my-opencode.json << 'EOF' +{ + "disabled_hooks": [ + "auto-update-checker", + "session-notification", + "background-notification" + ] +} +EOF + # Install oh-my-opencode plugin (provides Sisyphus agent) # --claude=no uses opencode/zen provider (free) instead of Anthropic {% if omo_version %}