diff --git a/OS_images/main.py b/OS_images/main.py index a5b207cef..154b66f32 100644 --- a/OS_images/main.py +++ b/OS_images/main.py @@ -346,16 +346,33 @@ def get_cursor(): pyautogui.moveTo(current_x, current_y) # ===================================== - cursor_obj = Xcursor() - imgarray = cursor_obj.getCursorImageArrayFast() - cursor_img = Image.fromarray(imgarray) - - # Taking screenshot after the wake-up - screenshot = pyautogui.screenshot() - - cursor_x, cursor_y = pyautogui.position() - screenshot.paste(cursor_img, (cursor_x, cursor_y), cursor_img) - screenshot.save(file_path) + max_screenshot_attempts = 3 + for _screenshot_attempt in range(max_screenshot_attempts): + try: + cursor_obj = Xcursor() + imgarray = cursor_obj.getCursorImageArrayFast() + cursor_img = Image.fromarray(imgarray) + + # Taking screenshot after the wake-up + screenshot = pyautogui.screenshot() + + cursor_x, cursor_y = pyautogui.position() + screenshot.paste(cursor_img, (cursor_x, cursor_y), cursor_img) + screenshot.save(file_path) + break # Success + except Exception as e: + logger.warning(f"Screenshot attempt {_screenshot_attempt + 1}/{max_screenshot_attempts} failed: {e}") + # Clean up stale temp files that may cause PIL errors + import glob + for tmp_png in glob.glob("/tmp/tmp*.png"): + try: + os.remove(tmp_png) + except OSError: + pass + if _screenshot_attempt == max_screenshot_attempts - 1: + logger.error(f"All {max_screenshot_attempts} screenshot attempts failed, returning error") + return jsonify({"status": "error", "message": f"Screenshot failed: {e}"}), 503 + time.sleep(0.5) elif user_platform == "Darwin": # (Mac OS) # Use the screencapture utility to capture the screen with the cursor subprocess.run(["screencapture", "-C", file_path]) @@ -3773,4 +3790,4 @@ def run_bash_script(): pass if __name__ == '__main__': - app.run(debug=True, host="0.0.0.0") + app.run(debug=False, host="0.0.0.0") diff --git a/cua/README.md b/cua/README.md index 8b337c5ec..d7ab238f4 100644 --- 
a/cua/README.md +++ b/cua/README.md @@ -1,51 +1,150 @@ # CUA Data Collection -## Running on Interactive Session -### 1. Boot Up vLLM Servers -We use two vLLM servers - 1 for Qwen3-VL-235B (goal generation policy, aka `planner model` in the code) and 1 for -UI-TARS-1.5-7B (action generation policy, aka `actor model` in the code). +Runs a **planner** model (Qwen3-VL-235B, tp=8) on a non-reserved GPU node and one or more **actor** nodes (UI-TARS-1.5-7B, tp=4) on reserved GPU nodes with KVM-accelerated Linux VMs for data collection. All nodes use the combined `cua-vllm-0.13.0.sqsh` container image (vLLM/CUDA + QEMU/KVM). + +**Why `enroot exec`?** The srun enroot container loses `/dev/kvm` write access. Running `enroot exec` from outside the container retains it. Actor nodes use the SSH+enroot holder-job pattern for this reason. + +## Automated Full Run -Run both servers on 2 nodes by ```bash cd scripts -sbatch run_model.sbatch +bash run.sh ``` -The logs will be shown at `scripts/logs/planner.out` and `scripts/logs/actor.out`. +This: +1. Submits a planner sbatch job (Qwen3-VL-235B, 8 GPUs, non-reserved) +2. Waits for the planner to start and write its hostname to a coordination file +3. Launches `NUM_ACTORS` actor instances, each submitting its own holder job on a reserved node +4. Each actor starts UI-TARS-1.5-7B vLLM + data collection VMs inside its container +5. Waits for all actors to finish, then cancels the planner + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `NUM_ACTORS` | 1 | Number of actor nodes to launch | +| `MAX_PARALLEL` | 16 | Parallel VMs per actor node | +| `MAX_TRAJECTORIES` | 10000 | Trajectories to collect per actor | -### 2. Run CPU node for data collection, with /dev/kvm write permission -We now run CPU interactive session, in which we boot up the linux virtual machine (VM) and call the vLLM servers to -collect the trajectories. 
Here, it's important we have a write access to `/dev/kvm`, as it allows us to accelerate VM. +Each actor independently collects up to `MAX_TRAJECTORIES`. With `NUM_ACTORS=3, MAX_TRAJECTORIES=500`, you get up to 1500 total. + +### Examples + +```bash +# Single actor, defaults +bash run.sh + +# 3 actors, 4 parallel VMs each, 500 trajectories each +NUM_ACTORS=3 MAX_PARALLEL=4 MAX_TRAJECTORIES=500 bash run.sh +``` + +### Logs + +All logs go to `scripts/logs_multi_thread/`: + +| File | Contents | +|------|----------| +| `planner-.out` | Planner vLLM server output | +| `actor_launcher_.log` | Actor N lifecycle (job submission, container polling, SSH exec) | +| `actor_-.out` | Actor N vLLM + data collection output | +| `vllm_actor_.log` | Actor N vLLM server detailed logs | + +### SLURM Configuration + +| Component | Account | Partition | Reservation | +|-----------|---------|-----------|-------------| +| Planner | `nvr_lacr_llm` | `interactive` | none | +| Actor | `llmservice_fm_vision` | `interactive` | `sla_res_osworld_agent_vlm` | + +## Interactive Debugging + +### 1. Start the Planner vLLM Server + +In a separate terminal, submit the planner on its own 8-GPU node: + +```bash +IMAGE="/lustre/fsw/portfolios/nvr/users/bcui/images/cua-vllm-0.13.0.sqsh" +PLANNER_MODEL="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/models/Qwen3-VL-235B-A22B-Thinking" + +srun --job-name=planner \ + --account=nvr_lacr_llm \ + --partition=interactive \ + --gpus-per-node=8 \ + --nodes=1 --ntasks-per-node=1 \ + --time=04:00:00 --exclusive \ + --container-image=$IMAGE \ + --container-mounts=/lustre:/lustre \ + bash -c "vllm serve $PLANNER_MODEL \ + --api-key gen \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --limit-mm-per-prompt.video 0 \ + --limit-mm-per-prompt.image 3 \ + --async-scheduling \ + --max-model-len 65536 \ + --gpu-memory-utilization 0.9" +``` + +Note which node it lands on via `squeue -u $USER` — you'll need this as `$PLANNER_NODE` later (e.g. 
`pool0-2838`). + +### 2. Get an Interactive Shell with KVM -Run the bash script to spin up the CPU interactive node by ```bash cd scripts bash debug_interactive.sh ``` -***What does `debug_interactive.sh` do?*** \ -(1) We first allocate 1 CPU interactive node (with `sleep infinity &` as the command). We assign one of the reserved nodes -for `/dev/kvm` access. \ -(2) When the node is ready with enroot container running, we ssh into the node, and fetch the enroot container ID. \ -(3) We run `enroot exec $CONTAINER_ID bash` in order to access a bash shell inside that enroot container. +This allocates a GPU interactive node (8 GPUs) with the combined container image. The script: +1. Submits a background `sleep infinity` job to reserve the node +2. Waits for the enroot container to be ready +3. SSHs into the node and runs `enroot exec` to enter the container + +When you exit the shell (`exit`), the script automatically cancels the SLURM job via a cleanup trap. -***Wait, why aren't we just directly using the CPU interactive node in step (1)?*** \ -This is a very finicky detail, but in step (1), the enroot container environment loses `/dev/kvm` access that the -CPU node originally had. The only way to retain `/dev/kvm` access inside enroot is to first boot up the container, -then running `enroot exec $CONTAINER_ID bash` from outside. +### 3. Start the Actor vLLM Server +Inside the container shell, start UI-TARS-1.5-7B in the background: -### 3-A. Run Data Collection Script for Debugging ```bash -python debug_collect_trajectories.py --planner_node $PLANNER_NODE --actor_node $ACTOR_NODE +vllm serve ByteDance-Seed/UI-TARS-1.5-7B \ + --api-key gen \ + --tensor-parallel-size 4 \ + --limit-mm-per-prompt.image 5 \ + --limit-mm-per-prompt.video 0 \ + --max-model-len 65536 & ``` -`$PLANNER_NODE` and `$ACTOR_NODE` should be manually set by the user (e.g., pool0-2838). -### 3-B. 
Run Data Collection Script for Parallel Processing +Wait for it to be healthy: ```bash -python parallel_collect_trajectories.py --planner_node $PLANNER_NODE --actor_node $ACTOR_NODE +curl http://localhost:8000/health ``` -## Running with SBATCH -TBD (we just need SBATCH script to run `parallel_collect_trajectories.py`) +### 4. Run Debug Data Collection + +Still inside the container: + +```bash +cd /lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server/cua +source cua_env_reqs/bin/activate +export PYTHONPATH=/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server:$PYTHONPATH + +python debug_collect_trajectories.py \ + --planner_node $PLANNER_NODE \ + --actor_node localhost +``` + +`$PLANNER_NODE` is the node from step 1. The actor is `localhost` since it's running on the same node. + +## Scripts Reference +| Script | Purpose | +|--------|---------| +| `run.sh` | Multi-actor launcher (1 planner + N actors) | +| `run_planner.sbatch` | Planner vLLM server sbatch job | +| `run_actor_and_vm.sh` | Single actor launcher (SSH+enroot pattern) | +| `run_all.sbatch` | Legacy consolidated 2-node sbatch (kept for reference) | +| `debug_interactive.sh` | Interactive GPU shell with KVM | +| `debug_check_kvm.sbatch` | Verify KVM works on GPU nodes (inside container) | +| `check_kvm_cpu.sbatch` | Verify KVM on CPU nodes (legacy) | +| `check_kvm_bash.sh` | Quick KVM write-permission test | +| `run_models.sbatch` | Start both model servers on 2 GPU nodes (standalone) | diff --git a/cua/cleanup_nvcf.py b/cua/cleanup_nvcf.py new file mode 100644 index 000000000..447298c9a --- /dev/null +++ b/cua/cleanup_nvcf.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""List and clean up NVCF functions. 
+ +Usage: + # List all functions in the org: + python cleanup_nvcf.py --list + + # Undeploy and delete only YOUR pool functions (nvcf-pool-*): + python cleanup_nvcf.py --cleanup + + # Undeploy and delete ALL functions (careful — includes other users'): + python cleanup_nvcf.py --cleanup --all + + # Undeploy and delete a specific function: + python cleanup_nvcf.py --delete FUNCTION_ID VERSION_ID +""" +import argparse +import os +import sys + +sys.path.insert(0, "/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server") + +from openhands.nvidia.os_world.nvcf import OSWorldDeployer + +POOL_NAME_PREFIX = "nvcf-pool-" + + +def main(): + parser = argparse.ArgumentParser(description="NVCF function cleanup utility") + parser.add_argument("--list", action="store_true", help="List all functions in the org") + parser.add_argument("--cleanup", action="store_true", + help="Undeploy and delete pool functions (nvcf-pool-* only, unless --all)") + parser.add_argument("--all", action="store_true", + help="With --cleanup: delete ALL functions, not just pool ones") + parser.add_argument("--delete", nargs=2, metavar=("FUNC_ID", "VER_ID"), + help="Delete a specific function") + args = parser.parse_args() + + api_key = os.environ.get("NGC_API_KEY") + org = os.environ.get("NGC_ORG") + if not api_key or not org: + print("ERROR: Set NGC_API_KEY and NGC_ORG environment variables") + sys.exit(1) + + deployer = OSWorldDeployer(api_key=api_key, org_name=org) + + if args.list or (not args.cleanup and not args.delete): + print("Listing all private NVCF functions in org...\n") + result = deployer.list_functions() + functions = result.get("functions", []) + if not functions: + print("No functions found.") + return + for fn in functions: + fn_id = fn.get("id", "?") + name = fn.get("name", "?") + status = fn.get("status", "?") + ver_id = fn.get("versionId", "?") + mine = " <-- pool" if name.startswith(POOL_NAME_PREFIX) else "" + print(f" {name:40s} status={status:10s} fn={fn_id} ver={ver_id}{mine}") 
+ print(f"\nTotal: {len(functions)} functions") + + if args.delete: + fn_id, ver_id = args.delete + print(f"Undeploying {fn_id}...") + try: + deployer.undeploy(fn_id, ver_id, graceful=True) + print("Undeployed. Deleting...") + except Exception as e: + print(f"Undeploy failed (may already be undeployed): {e}") + try: + deployer.delete_function(fn_id, ver_id) + print("Deleted.") + except Exception as e: + print(f"Delete failed: {e}") + + if args.cleanup: + result = deployer.list_functions() + functions = result.get("functions", []) + + if not args.all: + # Only clean up pool functions + functions = [f for f in functions if f.get("name", "").startswith(POOL_NAME_PREFIX)] + print(f"Cleaning up {len(functions)} pool functions (nvcf-pool-*)...\n") + else: + print(f"Cleaning up ALL {len(functions)} functions...\n") + + if not functions: + print("Nothing to clean up.") + return + + for fn in functions: + fn_id = fn.get("id", "?") + ver_id = fn.get("versionId", "?") + name = fn.get("name", "?") + status = fn.get("status", "?") + print(f" Cleaning up: {name} ({fn_id}) status={status}") + try: + deployer.undeploy(fn_id, ver_id, graceful=True) + print(f" Undeployed") + except Exception as e: + print(f" Undeploy skipped: {e}") + try: + deployer.delete_function(fn_id, ver_id) + print(f" Deleted") + except Exception as e: + print(f" Delete failed: {e}") + print(f"\nDone. 
Cleaned up {len(functions)} functions.") + + +if __name__ == "__main__": + main() diff --git a/cua/modules/debug_env_controller.py b/cua/modules/debug_env_controller.py index 71c113fb4..5bf9ab85e 100644 --- a/cua/modules/debug_env_controller.py +++ b/cua/modules/debug_env_controller.py @@ -1,123 +1,203 @@ import logging +import os +import sys import re -from typing import Dict, Tuple - -import ipdb - -from examples.setup import SetupController -from openhands.core.config import OpenHandsConfig -from openhands.events import EventStream -from openhands.events.action.os import OSWorldInteractiveAction -from openhands.events.observation import ErrorObservation -from openhands.runtime.impl.singularity.osworld_singularity_runtime import OSWorldSingularityRuntime -from openhands.storage import get_file_store +import uuid +import time +import threading +import requests +from typing import Dict, List, Optional, Tuple + +# Ensure OSWorld is importable +_osworld_path = "/lustre/fsw/portfolios/nvr/users/bcui/OSWorld" +if _osworld_path not in sys.path: + sys.path.insert(0, _osworld_path) + +from desktop_env.desktop_env import DesktopEnv from openhands.core.logger import openhands_logger # Create a child logger logger = openhands_logger.getChild('env_controller') logger.setLevel(logging.DEBUG) +# Semaphore to limit concurrent downloads (shared with setup.py via env var) +_DOWNLOAD_SEMAPHORE = threading.Semaphore(int(os.environ.get('OSWORLD_MAX_CONCURRENT_DOWNLOADS', '3'))) + class EnvController: """ - Static Wrapper class that interfaces with OSWorldSingularityRuntime. + Static wrapper class that interfaces with OSWorld's DesktopEnv. + Replaces the previous OpenHands runtime-based approach with OSWorld's + native DesktopEnv + NVCFProvider for NVCF deployments. 
""" + @staticmethod - async def initialize_runtime(job_id: str, vm_image_path: str, os_type: str, - osworld_setup: Dict) -> OSWorldSingularityRuntime: + def pre_download_setup_files(osworld_setup: Dict, cache_dir: str = "/tmp/osworld_cache") -> bool: """ - Initialize OSWorldSingularityRuntime. - Used by DataCollector._init_worker to boot up the VM. + Pre-download all setup files to local cache BEFORE deploying NVCF. + This avoids wasting NVCF resources if downloads fail (e.g., HF 429 errors). + + Returns True if all downloads succeeded, False otherwise. """ - config = OpenHandsConfig() - config.runtime = "osworld" - config.sandbox.base_container_image = "ubuntu:24.04" - config.sandbox.run_as_fakeroot = False - config.sandbox.runtime_container_image = None # Trigger auto-build - - # Unique event stream per trajectory - file_store = get_file_store('local', f'/tmp/synthetic_data_gen_{job_id}') - event_stream = EventStream(sid=job_id, file_store=file_store) - - logger.debug(f"[initialize_runtime] Creating runtime for {job_id}") - logger.debug(f"[initialize_runtime] VM image: {vm_image_path}") - logger.debug(f"[initialize_runtime] Base image: {config.sandbox.base_container_image}") - - runtime = OSWorldSingularityRuntime( - config=config, - event_stream=event_stream, - sid=job_id, - os_type=os_type, - vm_image_path=vm_image_path, - attach_to_existing=False, - ) + config_list = osworld_setup.get("config", []) + if not config_list: + return True + + os.makedirs(cache_dir, exist_ok=True) + + # Build headers with HF token if available + dl_headers = {} + hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') + if hf_token: + dl_headers['Authorization'] = f'Bearer {hf_token}' + + for cfg in config_list: + if cfg.get("type") != "download": + continue + + files = cfg.get("parameters", {}).get("files", []) + for f in files: + url = f.get("url", "") + path = f.get("path", "") + if not url or not path: + continue + + cache_path = os.path.join(cache_dir, 
"{:}_{:}".format( + uuid.uuid5(uuid.NAMESPACE_URL, url), + os.path.basename(path))) + + if os.path.exists(cache_path): + logger.info(f"[pre_download] Cache hit: {cache_path}") + continue + + logger.info(f"[pre_download] Downloading {url} to cache...") + max_retries = 8 + downloaded = False + last_error = None + + with _DOWNLOAD_SEMAPHORE: + for i in range(max_retries): + try: + backoff = min(2 ** i + 1, 60) + if i > 0: + logger.info(f"[pre_download] Waiting {backoff}s before retry {i+1}/{max_retries}") + time.sleep(backoff) + + response = requests.get(url, stream=True, timeout=300, headers=dl_headers) + response.raise_for_status() + + downloaded_size = 0 + with open(cache_path, 'wb') as fh: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + fh.write(chunk) + downloaded_size += len(chunk) + + logger.info(f"[pre_download] Downloaded {downloaded_size / (1024*1024):.2f} MB to {cache_path}") + downloaded = True + break + + except requests.RequestException as e: + last_error = e + logger.warning(f"[pre_download] Failed {url}: {e} ({max_retries - i - 1} retries left)") + if os.path.exists(cache_path): + os.remove(cache_path) + + if not downloaded: + logger.error(f"[pre_download] All retries exhausted for {url}. 
Last error: {last_error}") + return False + + return True - logger.debug(f"[initialize_runtime] Runtime object created, connecting to VM...") - - await runtime.connect() - logger.debug(f"[initialize_runtime] ✓ Runtime initialized and connected for {job_id}") - logger.debug(f"[initialize_runtime] VM URL: {runtime.osworld_vm_url if hasattr(runtime, 'osworld_vm_url') else 'N/A'}") - - if osworld_setup and os_type == "linux": - logger.debug(f"[initialize_runtime] Setting up OSWorld...") - logger.debug(f"[initialize_runtime OSWorld Setup: {osworld_setup}") - setup_controller = SetupController( - vm_ip="127.0.0.1", - server_port=runtime._vm_server_port, - chromium_port=runtime._chromium_port, - cache_dir="/tmp/osworld_example", # might need to be changed to a unique directory for each job - client_password="password", - runtime=runtime - ) - await setup_controller.setup(osworld_setup['config']) - logger.debug(f"[initialize_runtime] ✓ OSWorld setup completed") - else: - logger.debug(f"[initialize_runtime] No OSWorld setup provided") + @staticmethod + async def initialize_runtime( + job_id: str, + vm_image_path: str, + os_type: str, + osworld_setup: Dict, + runtime_type: str = "singularity", + nvcf_function_id: Optional[str] = None, + nvcf_version_id: Optional[str] = None, + nvcf_api_key: Optional[str] = None, + nvcf_org: Optional[str] = None, + ): + """ + Initialize runtime using OSWorld's DesktopEnv. - return runtime + For NVCF runtime: creates DesktopEnv(provider_name='nvcf') which + auto-deploys an NVCF function and starts a local proxy. - @staticmethod - def execute_pyautogui_command(runtime: OSWorldSingularityRuntime, pyautogui_command: str): - pyautogui_action = OSWorldInteractiveAction( - method="execute_python_command", - params={ - "command": pyautogui_command, - } + For singularity runtime: creates DesktopEnv(provider_name='singularity') + which uses the local KVM-based approach. 
+ """ + logger.debug(f"[initialize_runtime] Creating {runtime_type} DesktopEnv for {job_id}") + + if runtime_type == "nvcf": + # Set env vars that OSWorld's NVCFProvider reads + if nvcf_api_key: + os.environ.setdefault("NGC_API_KEY", nvcf_api_key) + if nvcf_org: + os.environ.setdefault("NGC_ORG", nvcf_org) + if nvcf_function_id: + os.environ["NVCF_FUNCTION_ID"] = nvcf_function_id + if nvcf_version_id: + os.environ["NVCF_VERSION_ID"] = nvcf_version_id + + provider_name = "nvcf" + else: + provider_name = "singularity" + + env = DesktopEnv( + provider_name=provider_name, + path_to_vm=vm_image_path if runtime_type != "nvcf" else "", + action_space="pyautogui", + headless=True, + os_type="Ubuntu" if os_type == "linux" else os_type, + require_a11y_tree=False, ) - result = runtime.run_action(pyautogui_action) - if not isinstance(result, ErrorObservation): - logger.debug("[execute_pyautogui_command] ✓ Action complete") - else: - logger.debug(f"[execute_pyautogui_command] Error in Action: {result}") + logger.debug(f"[initialize_runtime] DesktopEnv created, resetting with OSWorld setup...") - @staticmethod - def get_screen_size(runtime: OSWorldSingularityRuntime) -> Tuple[int, int]: - observation = runtime.run_action(OSWorldInteractiveAction( - method="get_vm_screen_size", - params={}, - thought="" - )) + # DesktopEnv.reset() handles: start emulator, NVCF deploy, proxy, snapshot revert, setup + env.reset(task_config=osworld_setup) - assert hasattr(observation, "content"), "get_screen_size failed." 
+ logger.debug(f"[initialize_runtime] DesktopEnv reset complete for {job_id}") - match = re.search(r"Width: (\d+), Height: (\d+)", observation.content) - width, height = int(match.group(1)), int(match.group(2)) + return env + + @staticmethod + def execute_pyautogui_command(env, pyautogui_command: str): + """Execute a pyautogui command on the remote VM via OSWorld's PythonController.""" + try: + env.controller.execute_python_command(pyautogui_command) + logger.debug("[execute_pyautogui_command] Action complete") + except Exception as e: + logger.debug(f"[execute_pyautogui_command] Error in Action: {e}") - return width, height + @staticmethod + def get_screen_size(env) -> Tuple[int, int]: + """Get the screen size of the remote VM.""" + try: + size = env.controller.get_vm_screen_size() + if isinstance(size, tuple) and len(size) == 2: + return size + # Fallback: parse from string if needed + if isinstance(size, str): + match = re.search(r"(\d+)\D+(\d+)", size) + if match: + return int(match.group(1)), int(match.group(2)) + except Exception as e: + logger.warning(f"[get_screen_size] Failed: {e}, using defaults") + + return env.screen_width, env.screen_height @staticmethod - def get_screenshot(runtime: OSWorldSingularityRuntime) -> bytes: + def get_screenshot(env) -> bytes: """ - Returns the current screenshot from the runtime, in base64 format. - If screenshot_path is set, save the screenshot as png. + Returns the current screenshot from the DesktopEnv as bytes. 
""" - screenshot = runtime.get_vm_screenshot() + screenshot = env.controller.get_screenshot() if not screenshot: - logger.debug("✗ Failed to get screenshot from runtime.") - raise RuntimeError("Failed to get screenshot from runtime.") - + logger.debug("Failed to get screenshot from DesktopEnv.") + raise RuntimeError("Failed to get screenshot from DesktopEnv.") return screenshot - - - diff --git a/cua/modules/module_data_collector.py b/cua/modules/module_data_collector.py index eb9233493..2c3d75772 100644 --- a/cua/modules/module_data_collector.py +++ b/cua/modules/module_data_collector.py @@ -12,8 +12,6 @@ from pathlib import Path from typing import Optional, Dict, Any, Tuple -import ipdb - from modules.actors.debug_uitars_actor import UITarsActor from modules.debug_planner import Planner from modules.debug_env_controller import EnvController @@ -42,6 +40,13 @@ def __init__(self, args: Namespace): self.vm_image_path = args.vm_image_path self.os_type = 'linux' if 'Ubuntu' in self.vm_image_path else 'windows' + # Runtime type: "singularity" (local KVM) or "nvcf" (NVCF via OSWorld DesktopEnv) + self.runtime_type = getattr(args, 'runtime', 'singularity') + + # NVCF credentials (passed via env vars to OSWorld's NVCFProvider) + self.nvcf_api_key = getattr(args, 'nvcf_api_key', None) + self.nvcf_org = getattr(args, 'nvcf_org', None) + self.max_steps_per_trajectory = args.max_steps_per_trajectory self.max_steps_per_goal = args.max_steps_per_goal @@ -117,10 +122,15 @@ def save_trajectory(trajectory: Dict, trajectory_save_dir: Path): logger.debug(f"✓ [save_trajectory] Saved to {str(trajectory_save_dir / 'trajectory.json')}") - async def init_runtime_for_job(self, trajectory_idx: int) -> Tuple: + async def init_runtime_for_job(self, trajectory_idx: int, + nvcf_function_id: str = None, + nvcf_version_id: str = None) -> Tuple: """ Stage 1: Initialize the VM and OSWorld setup. 
- Returns: (runtime, trajectory, trajectory_save_dir, trajectory_id, osworld_setup) + Returns: (env, trajectory, trajectory_save_dir, trajectory_id, osworld_setup) + + Uses OSWorld's DesktopEnv which handles NVCF deploy, local proxy, + and environment setup internally. """ # Create unique IDs job_id = f"job_{trajectory_idx:04d}" @@ -140,13 +150,35 @@ async def init_runtime_for_job(self, trajectory_idx: int) -> Tuple: else: osworld_setup_ready = True - # Initialize Runtime (Async) - runtime = await EnvController.initialize_runtime( - job_id, self.vm_image_path, self.os_type, osworld_setup + logger.info(f"[job {trajectory_idx:04d}] Sampled OSWorld config: id={osworld_setup.get('id', 'unknown')}, " + f"snapshot={osworld_setup.get('snapshot', 'unknown')}, " + f"apps={osworld_setup.get('related_apps', [])}, " + f"instruction={osworld_setup.get('instruction', '')[:80]}") + + # Pre-download setup files to local cache BEFORE deploying NVCF. + # This avoids wasting expensive NVCF GPU resources if downloads fail. + if self.runtime_type == "nvcf": + logger.info(f"[job {trajectory_idx:04d}] Pre-downloading setup files before NVCF deploy...") + download_ok = EnvController.pre_download_setup_files(osworld_setup) + if not download_ok: + raise RuntimeError( + f"[job {trajectory_idx:04d}] Setup file pre-download failed. " + f"Skipping NVCF deploy to avoid wasting resources." 
+ ) + logger.info(f"[job {trajectory_idx:04d}] Pre-download complete, proceeding with NVCF deploy.") + + # Initialize DesktopEnv (handles NVCF deploy + proxy + setup internally) + env = await EnvController.initialize_runtime( + job_id, self.vm_image_path, self.os_type, osworld_setup, + runtime_type=self.runtime_type, + nvcf_function_id=nvcf_function_id, + nvcf_version_id=nvcf_version_id, + nvcf_api_key=self.nvcf_api_key, + nvcf_org=self.nvcf_org, ) # Get screen size - width, height = EnvController.get_screen_size(runtime) + width, height = EnvController.get_screen_size(env) # Prepare Metadata trajectory = { @@ -160,25 +192,24 @@ async def init_runtime_for_job(self, trajectory_idx: int) -> Tuple: 'steps': [], } - return runtime, trajectory, trajectory_save_dir, trajectory_id, osworld_setup + return env, trajectory, trajectory_save_dir, trajectory_id, osworld_setup - async def collect_trajectory(self, runtime, trajectory: Dict, trajectory_save_dir: Path, osworld_setup: Dict): + async def collect_trajectory(self, env, trajectory: Dict, trajectory_save_dir: Path, osworld_setup: Dict): """ Stage 2: Run the Agent Loop (Goal Generation -> Action Execution). + `env` is an OSWorld DesktopEnv instance. """ # Wait for UI initialization time.sleep(3.0) # Initial Screenshot - screenshot_bytes = EnvController.get_screenshot(runtime) + screenshot_bytes = EnvController.get_screenshot(env) image_filename = trajectory_save_dir / f"0-0.png" save_image(screenshot_bytes, image_filename, logger) # --- 1. Generate High Level Goal --- # - # todo implement the verification mechanism for goal achievability using requirements - # generate goal in a separate loop - prev_requirements = [] # will be a list of tuple [("condition 1", "verdict 1"), ...] 
- example_goals = random.sample(self.example_instructions, 1) # for now, we sample 1 example goal + prev_requirements = [] + example_goals = random.sample(self.example_instructions, 1) goal, requirements = self.planner.generate_goal_with_long_horizon( screenshot_bytes, osworld_setup["config"], example_goals, prev_requirements, ) @@ -231,21 +262,19 @@ async def collect_trajectory(self, runtime, trajectory: Dict, trajectory_save_di ) if action_result is None: - # UI-TARS action generation failed (failed to meet the requirement) - # in this case, save only up to the current trajectory break pyautogui_command = action_result["pyautogui_command"] action_generation = action_result["action_generation"] # Execute - EnvController.execute_pyautogui_command(runtime, pyautogui_command) + EnvController.execute_pyautogui_command(env, pyautogui_command) # Wait & Observe time.sleep(3.0) # Capture new state - screenshot_bytes = EnvController.get_screenshot(runtime) + screenshot_bytes = EnvController.get_screenshot(env) # Save step info action_idx = len(step_for_this_subgoal['actions']) @@ -276,11 +305,11 @@ async def single_trajectory_job(self, trajectory_idx: int): Simply chains the two stages sequentially in the main thread. """ # 1. Init - runtime, trajectory_data, save_dir, t_id, setup = await self.init_runtime_for_job(trajectory_idx) + env, trajectory_data, save_dir, t_id, setup = await self.init_runtime_for_job(trajectory_idx) try: # 2. 
Collect - await self.collect_trajectory(runtime, trajectory_data, save_dir, setup) + await self.collect_trajectory(env, trajectory_data, save_dir, setup) finally: - # Cleanup for debug mode - runtime.close() + # Cleanup + env.close() diff --git a/cua/modules/nvcf_pool.py b/cua/modules/nvcf_pool.py new file mode 100644 index 000000000..4f4cc7167 --- /dev/null +++ b/cua/modules/nvcf_pool.py @@ -0,0 +1,230 @@ +"""NVCF Function Pool: manages a warm pool of pre-deployed NVCF functions for parallel data collection.""" + +import logging +import math +import queue +import threading +from concurrent.futures import ThreadPoolExecutor +from typing import List, Optional, Tuple + +from openhands.core.logger import openhands_logger +from openhands.nvidia.os_world.nvcf import ( + OSWorldDeployer, + OSWorldDeploymentConfig, + OSWorldFunctionConfig, +) + +logger = openhands_logger.getChild('nvcf_pool') +logger.setLevel(logging.INFO) + + +class NVCFPool: + """Thread-safe pool of pre-deployed NVCF functions. + + Deploys NVCF functions at startup and provides acquire/release semantics + so workers can check out a warm VM, use it for a trajectory, and return it. + + When num_vms_per_instance > 1, fewer functions are deployed, each with + multiple VM instances on the same machine, reducing resource overhead. 
+ """ + + def __init__( + self, + pool_size: int, + num_vms_per_instance: int = 1, + nvcf_api_key: Optional[str] = None, + nvcf_org: Optional[str] = None, + ): + self.pool_size = pool_size + self.num_vms_per_instance = num_vms_per_instance + self._deployer = OSWorldDeployer(api_key=nvcf_api_key, org_name=nvcf_org) + self._nvcf_api_key = nvcf_api_key + self._nvcf_org = nvcf_org + + # Number of NVCF functions to deploy + self._num_functions = math.ceil(pool_size / num_vms_per_instance) + + # Each entry is (function_id, version_id) + self._all_functions: List[Tuple[str, str]] = [] + self._available: queue.Queue[Tuple[str, str]] = queue.Queue() + self._lock = threading.Lock() + + def _deploy_one(self, index: int) -> Tuple[str, str]: + """Deploy a single NVCF function and wait for it to become ACTIVE.""" + func_config = OSWorldFunctionConfig( + name=f"nvcf-pool-{index}", + description=f"Warm pool function {index} ({self.num_vms_per_instance} VMs)", + ) + deploy_config = OSWorldDeploymentConfig( + gpu="L40S", + min_instances=self.num_vms_per_instance, + max_instances=self.num_vms_per_instance, + ) + + logger.info(f"[pool-{index}] Creating function ({self.num_vms_per_instance} VMs)...") + result = self._deployer.create_function(func_config) + function = result.get("function", {}) + function_id = function.get("id") + version_id = function.get("versionId") + if not function_id or not version_id: + raise RuntimeError(f"[pool-{index}] create_function failed: {result}") + + logger.info(f"[pool-{index}] Deploying {function_id}...") + self._deployer.deploy(function_id, version_id, deploy_config) + + logger.info(f"[pool-{index}] Waiting for ACTIVE...") + self._deployer.wait_for_active( + function_id, version_id, timeout=1800, poll_interval=30 + ) + logger.info(f"[pool-{index}] ACTIVE: {function_id} with {self.num_vms_per_instance} instances") + return function_id, version_id + + def _undeploy_one(self, function_id: str, version_id: str) -> None: + """Undeploy and delete a single 
NVCF function.""" + try: + logger.info(f"Undeploying {function_id}...") + self._deployer.undeploy(function_id, version_id, graceful=True) + except Exception as e: + logger.warning(f"Failed to undeploy {function_id}: {e}") + try: + self._deployer.delete_function(function_id, version_id) + except Exception as e: + logger.warning(f"Failed to delete function {function_id}: {e}") + + def deploy_all(self, max_workers: int = 8) -> None: + """Deploy NVCF functions in parallel and wait for all to become ACTIVE. + + With num_vms_per_instance > 1, deploys fewer functions (each with + multiple instances) to reach the desired pool_size. + """ + logger.info( + f"Deploying {self._num_functions} NVCF function(s) " + f"x {self.num_vms_per_instance} VMs each = {self.pool_size} total slots..." + ) + + workers = min(max_workers, self._num_functions) + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = [executor.submit(self._deploy_one, i) for i in range(self._num_functions)] + for future in futures: + fn_id, ver_id = future.result() # raises if deploy failed + self._all_functions.append((fn_id, ver_id)) + # Add one entry per VM instance so acquire/release works correctly + for _ in range(self.num_vms_per_instance): + self._available.put((fn_id, ver_id)) + + logger.info( + f"All {self._num_functions} function(s) deployed. " + f"{self._available.qsize()} VM slots ready." 
+ ) + + def deploy_all_from_ids(self, function_ids: List[Tuple[str, str]], vms_per_function: int = 1) -> None: + """Use pre-existing function IDs instead of deploying new ones.""" + for fn_id, ver_id in function_ids: + self._all_functions.append((fn_id, ver_id)) + for _ in range(vms_per_function): + self._available.put((fn_id, ver_id)) + self.pool_size = len(function_ids) * vms_per_function + self.num_vms_per_instance = vms_per_function + logger.info(f"Pool initialized with {len(function_ids)} pre-existing function(s), {self.pool_size} total slots.") + + def acquire(self, timeout: Optional[float] = None) -> Tuple[str, str]: + """Acquire a function from the pool. Blocks until one is available. + + Returns: + (function_id, version_id) tuple + """ + try: + return self._available.get(block=True, timeout=timeout) + except queue.Empty: + raise TimeoutError(f"No NVCF function available within {timeout}s") + + def release(self, function_id: str, version_id: str) -> None: + """Return a function to the pool for reuse.""" + self._available.put((function_id, version_id)) + + def health_check(self, function_id: str) -> bool: + """Check if an NVCF function is still healthy by pinging /platform.""" + try: + import requests + headers = { + "Authorization": f"Bearer {self._nvcf_api_key}", + "Function-ID": function_id, + } + r = requests.get( + "https://grpc.nvcf.nvidia.com/api/platform", + headers=headers, + timeout=10.0, + ) + return r.status_code == 200 + except Exception: + return False + + def _is_function_gone(self, function_id: str, version_id: str) -> bool: + """Check if a function has been completely deleted/evicted (404).""" + try: + self._deployer.get_function_info(function_id, version_id) + return False + except Exception as e: + if '404' in str(e) or 'Not found' in str(e): + return True + return False + + def release_or_replace(self, function_id: str, version_id: str) -> None: + """Release a function back to pool, replacing it if unhealthy. 
+ + For multi-instance functions, individual instance failures are handled + by NVCF internally (it maintains min_instances). We only deploy a + full replacement if the entire function is gone (404). + """ + if self.health_check(function_id): + self._available.put((function_id, version_id)) + return + + # For multi-instance functions: check if the function itself is gone + # vs just a transient instance failure that NVCF will self-heal. + if self.num_vms_per_instance > 1 and not self._is_function_gone(function_id, version_id): + logger.warning( + f"Function {function_id} health check failed but function still exists. " + f"NVCF should self-heal the instance. Releasing slot back to pool." + ) + self._available.put((function_id, version_id)) + return + + logger.warning(f"Function {function_id} is gone (404), deploying replacement...") + # Undeploy broken function in background (best-effort cleanup) + threading.Thread( + target=self._undeploy_one, args=(function_id, version_id), daemon=True + ).start() + # Deploy replacement + try: + new_fn_id, new_ver_id = self._deploy_one(len(self._all_functions)) + with self._lock: + self._all_functions.append((new_fn_id, new_ver_id)) + # Add slots for all VMs on the replacement function + for _ in range(self.num_vms_per_instance): + self._available.put((new_fn_id, new_ver_id)) + logger.info(f"Replacement function {new_fn_id} deployed with {self.num_vms_per_instance} VM slots.") + except Exception as e: + logger.error(f"Failed to deploy replacement: {e}. 
Pool size reduced.") + + def undeploy_all(self) -> None: + """Undeploy and delete all functions in the pool.""" + logger.info(f"Undeploying {len(self._all_functions)} NVCF function(s)...") + for fn_id, ver_id in self._all_functions: + self._undeploy_one(fn_id, ver_id) + self._all_functions.clear() + # Drain the queue + while not self._available.empty(): + try: + self._available.get_nowait() + except queue.Empty: + break + logger.info("All NVCF functions undeployed.") + + @property + def nvcf_api_key(self) -> Optional[str]: + return self._nvcf_api_key + + @property + def nvcf_org(self) -> Optional[str]: + return self._nvcf_org diff --git a/cua/parallel_collect_trajectories.py b/cua/parallel_collect_trajectories.py index 53f91753f..da5a0de87 100644 --- a/cua/parallel_collect_trajectories.py +++ b/cua/parallel_collect_trajectories.py @@ -11,7 +11,7 @@ from openhands.core.logger import openhands_logger # Configure logging -openhands_logger.setLevel(logging.WARNING) +openhands_logger.setLevel(logging.DEBUG) logger = openhands_logger.getChild('parallel_collector') logger.setLevel(logging.INFO) @@ -35,7 +35,7 @@ def __init__(self, index: int): self.error: Optional[str] = None # Runtime objects (populated during execution) - self.runtime: Any = None + self.env: Any = None # OSWorld DesktopEnv instance self.trajectory_data: Optional[Dict] = None self.save_dir: Any = None self.osworld_setup: Any = None @@ -46,6 +46,7 @@ def __init__(self, args, data_collector: DataCollector): self.data_collector = data_collector self.max_parallel = args.max_parallel self.max_trajectories = args.max_trajectories + self.runtime_type = getattr(args, 'runtime', 'singularity') # Queues self.init_queue: queue.Queue = queue.Queue() @@ -65,9 +66,10 @@ def __init__(self, args, data_collector: DataCollector): self._server_running = False # For sequential VM start-ups to mitigate boot storm + # NVCF uses pre-deployed VMs so no boot storm delay needed self._launch_lock = threading.Lock() 
self._last_launch_time = 0 - self._launch_delay_seconds = 15.0 # Wait 15s between starts + self._launch_delay_seconds = 0.0 if self.runtime_type == "nvcf" else 15.0 def start_workers(self): self._server_running = True @@ -112,23 +114,16 @@ async def _init_worker(self, worker_id: int): job_details = self.jobs[job_idx] # Wait for available runtime slot - # logger.debug(f"[init-{worker_id}] Waiting for slot for job {job_idx}") await asyncio.to_thread(self._runtime_semaphore.acquire) # Rate Limit Logic: Prevent Boot Storm wait_time = 0.0 with self._launch_lock: now = time.time() - # The earliest this worker can start is either NOW, - # or 15s after the last scheduled launch. target_start_time = max(now, self._last_launch_time + self._launch_delay_seconds) - wait_time = target_start_time - now - - # Reserve this slot by updating the global timestamp immediately self._last_launch_time = target_start_time - # Perform the wait asynchronously (outside the lock) if wait_time > 0: if wait_time > 1.0: logger.info(f"[init-{worker_id}] Delayed boot-up: waiting {wait_time:.1f}s...") @@ -141,14 +136,14 @@ async def _init_worker(self, worker_id: int): try: # --- call init_runtime_for_job --- # - # This creates the runtime and runs setup - runtime, traj_data, save_dir, traj_id, setup = \ + # This creates the DesktopEnv, deploys NVCF (if needed), and runs setup + env, traj_data, save_dir, traj_id, setup = \ await self.data_collector.init_runtime_for_job(job_idx) # Store details in the pre-allocated object - job_details.job_id = traj_id # Using traj_id as primary ID + job_details.job_id = traj_id job_details.trajectory_id = traj_id - job_details.runtime = runtime + job_details.env = env job_details.trajectory_data = traj_data job_details.save_dir = save_dir job_details.osworld_setup = setup @@ -159,8 +154,15 @@ async def _init_worker(self, worker_id: int): except Exception as e: logger.error(f"[init-{worker_id}] Failed setup for job {job_idx}: {e}") job_details.error = str(e) - 
job_details.completed = False # Failed - job_details.event.set() # Signal main thread we are done (failed) + job_details.completed = False + job_details.event.set() + + # Close DesktopEnv if it was created + if job_details.env: + try: + job_details.env.close() + except Exception: + pass # Release semaphore immediately on failure self._runtime_semaphore.release() @@ -184,7 +186,7 @@ async def _collect_worker(self, worker_id: int): # --- call collect_trajectory --- # This runs the Planner/Actor loop await self.data_collector.collect_trajectory( - job_details.runtime, + job_details.env, job_details.trajectory_data, job_details.save_dir, job_details.osworld_setup @@ -196,10 +198,9 @@ async def _collect_worker(self, worker_id: int): logger.error(f"[collect-{worker_id}] Error in {traj_id}: {e}") job_details.error = str(e) finally: - # Cleanup Runtime - if job_details.runtime: - # Run close in background thread to not block loop - threading.Thread(target=job_details.runtime.close, daemon=True).start() + # Cleanup DesktopEnv (closes NVCF proxy, undeploys function) + if job_details.env: + threading.Thread(target=job_details.env.close, daemon=True).start() # Release semaphore (allows new Init worker to proceed) self._runtime_semaphore.release() @@ -269,6 +270,10 @@ def parse_args(): parser.add_argument("--planner_node", type=str, required=True) parser.add_argument("--actor_node", type=str, required=True) + # Runtime selection + parser.add_argument("--runtime", type=str, choices=["singularity", "nvcf"], default="singularity", + help="Runtime backend: 'singularity' (local KVM) or 'nvcf' (NVCF via OSWorld DesktopEnv)") + # Environment & Setup parser.add_argument("--vm_image_path", type=str, default="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/cua/prorl-agent-server/OS_images/Ubuntu.qcow2") @@ -295,8 +300,7 @@ def parse_args(): # Parallel specific args parser.add_argument("--max_parallel", type=int, default=24, help="Max concurrent VMs") parser.add_argument( - 
"--max_trajectories", type=int, default=10000, help="Total trajectories to generate" - ) + "--max_trajectories", type=int, default=10000, help="Total trajectories to generate") return parser.parse_args() @@ -309,6 +313,9 @@ async def main(): logger.info("DataCollector initialized (datasets loaded)") # 2. Start Parallel Generator + # Each worker's DesktopEnv manages its own NVCF function lifecycle + # (deploy, local proxy, health monitoring, undeploy on close) + # No centralized NVCFPool needed - OSWorld's NVCFProvider handles everything. generator = ParallelTrajectoryGenerator(args, data_collector) await generator.run() diff --git a/cua/scripts/collect_trajectories_sbatch.sh b/cua/scripts/collect_trajectories_sbatch.sh new file mode 100755 index 000000000..4aeaecf9c --- /dev/null +++ b/cua/scripts/collect_trajectories_sbatch.sh @@ -0,0 +1,251 @@ +#!/bin/bash +# ============================================================================ +# 2-Node Batch Trajectory Collection +# ============================================================================ +# Node 0 (Planner): Runs Qwen3-VL-235B vLLM server (tp=8, all 8 GPUs) +# Node 1 (Actor): 2x UI-TARS-1.5-7B vLLM servers (tp=4 each, GPUs 0-3 + 4-7) +# + round-robin load balancer on port 8000 +# + trajectory collection +# +# Usage: sbatch collect_trajectories_sbatch.sh +# ============================================================================ + +#SBATCH --job-name=traj_collect +#SBATCH --account=nvr_lacr_llm +#SBATCH --partition=batch_block1 +#SBATCH --gpus-per-node=8 +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --mem=0 +#SBATCH --time=04:00:00 +#SBATCH --exclusive +#SBATCH --output=/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server/cua/scripts/logs/traj_collect_%j.out +#SBATCH --error=/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server/cua/scripts/logs/traj_collect_%j.err + +set -euo pipefail + +# ============================================================================ +# 
Configuration +# ============================================================================ +IMAGE="/lustre/fsw/portfolios/nvr/users/bcui/images/cua-vllm-0.13.0.sqsh" +PROJECT_ROOT="/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server" +PROJECT_DIR="$PROJECT_ROOT/cua" + +PLANNER_MODEL="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/models/Qwen3-VL-235B-A22B-Thinking" +ACTOR_MODEL="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/bcui/huggingface_models/UI-TARS-1.5-7B" + +PLANNER_PORT=8000 +ACTOR_PORT=8000 # Load balancer port (what the code talks to) +ACTOR_PORT_1=8001 # Actor replica 1 (GPUs 0-3) +ACTOR_PORT_2=8002 # Actor replica 2 (GPUs 4-7) + +# Trajectory collection settings +MAX_PARALLEL=16 +MAX_TRAJECTORIES=1024 + +# Log file name (timestamped) +TIMESTAMP=$(date +%m-%d-%H%M) +LOG_FILE="$PROJECT_DIR/${TIMESTAMP}-logs.log" + +# ============================================================================ +# Resolve node assignments +# ============================================================================ +ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +PLANNER_NODE=$(echo "$ALL_NODES" | head -n 1) +ACTOR_NODE=$(echo "$ALL_NODES" | tail -n 1) + +echo "[sbatch] Job ID: $SLURM_JOB_ID" +echo "[sbatch] Planner Node: $PLANNER_NODE" +echo "[sbatch] Actor Node: $ACTOR_NODE" +echo "[sbatch] Log file: $LOG_FILE" + +mkdir -p "$PROJECT_DIR/scripts/logs" + +# ============================================================================ +# Launch Planner vLLM server on Node 0 +# ============================================================================ +echo "[sbatch] Starting Planner vLLM server on $PLANNER_NODE..." 
+srun --nodes=1 --ntasks=1 --nodelist="$PLANNER_NODE" \ + --container-image="$IMAGE" \ + --container-mounts=/lustre:/lustre \ + --container-writable \ + bash -c " + vllm serve $PLANNER_MODEL \ + --api-key gen \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --limit-mm-per-prompt.video 0 \ + --limit-mm-per-prompt.image 3 \ + --async-scheduling \ + --max-model-len 65536 \ + --gpu-memory-utilization 0.9 \ + > $PROJECT_DIR/scripts/logs/planner_${SLURM_JOB_ID}.log 2>&1 + " & +PLANNER_SRUN_PID=$! + +# ============================================================================ +# Launch Actor vLLM server + trajectory collection on Node 1 +# ============================================================================ +echo "[sbatch] Starting 2x Actor vLLM servers + collection on $ACTOR_NODE..." +srun --nodes=1 --ntasks=1 --nodelist="$ACTOR_NODE" \ + --container-image="$IMAGE" \ + --container-mounts=/lustre:/lustre \ + --container-writable \ + bash -c " + # --- Start Actor vLLM replica 1 (GPUs 0-3) --- + CUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve $ACTOR_MODEL \ + --served-model-name ByteDance-Seed/UI-TARS-1.5-7B \ + --api-key gen \ + --port $ACTOR_PORT_1 \ + --tensor-parallel-size 4 \ + --limit-mm-per-prompt.image 5 \ + --limit-mm-per-prompt.video 0 \ + --max-model-len 65536 \ + --disable-log-requests \ + --disable-log-stats \ + > $PROJECT_DIR/scripts/logs/actor1_${SLURM_JOB_ID}.log 2>&1 & + ACTOR1_PID=\$! + + # --- Start Actor vLLM replica 2 (GPUs 4-7) --- + CUDA_VISIBLE_DEVICES=4,5,6,7 vllm serve $ACTOR_MODEL \ + --served-model-name ByteDance-Seed/UI-TARS-1.5-7B \ + --api-key gen \ + --port $ACTOR_PORT_2 \ + --tensor-parallel-size 4 \ + --limit-mm-per-prompt.image 5 \ + --limit-mm-per-prompt.video 0 \ + --max-model-len 65536 \ + --disable-log-requests \ + --disable-log-stats \ + > $PROJECT_DIR/scripts/logs/actor2_${SLURM_JOB_ID}.log 2>&1 & + ACTOR2_PID=\$! 
+ + # --- Start round-robin load balancer on port $ACTOR_PORT --- + python3 -c ' +import http.server, http.client, threading, sys, io + +backends = [(\"localhost\", $ACTOR_PORT_1), (\"localhost\", $ACTOR_PORT_2)] +counter = 0 +lock = threading.Lock() + +class LBHandler(http.server.BaseHTTPRequestHandler): + def do_ANY(self, method): + global counter + with lock: + host, port = backends[counter % len(backends)] + counter += 1 + + content_length = int(self.headers.get(\"Content-Length\", 0)) + body = self.rfile.read(content_length) if content_length > 0 else None + + try: + conn = http.client.HTTPConnection(host, port, timeout=300) + conn.request(method, self.path, body=body, headers=dict(self.headers)) + resp = conn.getresponse() + resp_body = resp.read() + + self.send_response(resp.status) + for k, v in resp.getheaders(): + if k.lower() not in (\"transfer-encoding\",): + self.send_header(k, v) + self.end_headers() + self.wfile.write(resp_body) + conn.close() + except Exception as e: + self.send_response(502) + self.end_headers() + self.wfile.write(f\"LB error: {e}\".encode()) + + def do_GET(self): self.do_ANY(\"GET\") + def do_POST(self): self.do_ANY(\"POST\") + def do_PUT(self): self.do_ANY(\"PUT\") + def do_DELETE(self): self.do_ANY(\"DELETE\") + def log_message(self, format, *args): pass # silence logs + +server = http.server.ThreadingHTTPServer((\"0.0.0.0\", $ACTOR_PORT), LBHandler) +print(f\"[LB] Round-robin load balancer on port $ACTOR_PORT -> {backends}\", flush=True) +server.serve_forever() +' > $PROJECT_DIR/scripts/logs/actor_lb_${SLURM_JOB_ID}.log 2>&1 & + LB_PID=\$! + + # --- Wait for all servers to be healthy --- + echo '[actor-node] Waiting for vLLM servers to become healthy...' 
+ + wait_for_server() { + local host=\$1 + local port=\$2 + local name=\$3 + local max_wait=600 + local elapsed=0 + + while [ \$elapsed -lt \$max_wait ]; do + if curl -sf http://\${host}:\${port}/health > /dev/null 2>&1; then + echo \"[actor-node] \$name server healthy (\${elapsed}s)\" + return 0 + fi + sleep 10 + elapsed=\$((elapsed + 10)) + if [ \$((elapsed % 60)) -eq 0 ]; then + echo \"[actor-node] Still waiting for \$name (\${elapsed}s)...\" + fi + done + echo \"[actor-node] ERROR: \$name server did not start within \${max_wait}s\" + return 1 + } + + # Wait for both actor replicas and planner + wait_for_server localhost $ACTOR_PORT_1 'Actor-1 (GPU 0-3)' + ACTOR1_OK=\$? + + wait_for_server localhost $ACTOR_PORT_2 'Actor-2 (GPU 4-7)' + ACTOR2_OK=\$? + + wait_for_server $PLANNER_NODE $PLANNER_PORT Planner + PLANNER_OK=\$? + + if [ \$ACTOR1_OK -ne 0 ] || [ \$ACTOR2_OK -ne 0 ] || [ \$PLANNER_OK -ne 0 ]; then + echo '[actor-node] ERROR: One or more servers failed to start.' + echo 'Planner log:' && tail -20 $PROJECT_DIR/scripts/logs/planner_${SLURM_JOB_ID}.log 2>/dev/null + echo 'Actor-1 log:' && tail -20 $PROJECT_DIR/scripts/logs/actor1_${SLURM_JOB_ID}.log 2>/dev/null + echo 'Actor-2 log:' && tail -20 $PROJECT_DIR/scripts/logs/actor2_${SLURM_JOB_ID}.log 2>/dev/null + kill \$ACTOR1_PID \$ACTOR2_PID \$LB_PID 2>/dev/null + exit 1 + fi + + echo '[actor-node] All servers healthy. Starting trajectory collection...' + + # --- Run trajectory collection --- + cd $PROJECT_DIR + source cua_env_reqs/bin/activate + export PYTHONPATH=$PROJECT_ROOT:\$PYTHONPATH + + python parallel_collect_trajectories.py \ + --planner_node $PLANNER_NODE \ + --actor_node $ACTOR_NODE \ + --runtime nvcf \ + --max_parallel $MAX_PARALLEL \ + --max_trajectories $MAX_TRAJECTORIES \ + 2>&1 | tee $LOG_FILE + + COLLECT_EXIT=\$? 
+ + # Cleanup + kill \$ACTOR1_PID \$ACTOR2_PID \$LB_PID 2>/dev/null + echo \"[actor-node] Collection finished (exit code: \$COLLECT_EXIT)\" + exit \$COLLECT_EXIT + " & +ACTOR_SRUN_PID=$! + +# ============================================================================ +# Wait for completion +# ============================================================================ +# Wait for the actor srun (which runs collection). When it finishes, kill planner. +wait $ACTOR_SRUN_PID +COLLECT_EXIT=$? + +echo "[sbatch] Actor node finished (exit: $COLLECT_EXIT). Stopping planner..." +kill $PLANNER_SRUN_PID 2>/dev/null +wait $PLANNER_SRUN_PID 2>/dev/null + +echo "[sbatch] Done. Log: $LOG_FILE" +exit $COLLECT_EXIT diff --git a/cua/scripts/debug_check_kvm.sbatch b/cua/scripts/debug_check_kvm.sbatch index 27f2719da..650b8671e 100644 --- a/cua/scripts/debug_check_kvm.sbatch +++ b/cua/scripts/debug_check_kvm.sbatch @@ -1,29 +1,53 @@ #!/bin/bash -#SBATCH --array=0%1 # todo set the number of runs here -#SBATCH --partition=cpu_interactive -#SBATCH --reservation=sla_res_osworld_agent_vlm_cpu_only -#SBATCH --account=nvr_lpr_agentic -#SBATCH --job-name=cua-check_kvm +#SBATCH --array=0%1 +#SBATCH --partition=interactive +#SBATCH --account=llmservice_fm_vision +#SBATCH --reservation=sla_res_osworld_agent_vlm +#SBATCH --gpus-per-node=8 +#SBATCH --job-name=cua-check_kvm_gpu #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --time=04:00:00 -#SBATCH --output=logs/slurm-%A.out -#SBATCH --error=logs/slurm-%A.out +#SBATCH --time=00:30:00 +#SBATCH --exclusive +#SBATCH --output=logs/slurm-kvm-gpu-%A.out +#SBATCH --error=logs/slurm-kvm-gpu-%A.out +# ============================================================================ +# Test KVM availability on GPU partition nodes +# This validates that /dev/kvm is accessible inside a container on GPU nodes, +# which is required for the consolidated 2-node setup. 
+# ============================================================================ -CONTAINER_IMAGE=/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/cua_cpu.sqsh +CONTAINER_IMAGE=/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/cua_vllm.sqsh # 1. Identify the Compute Node -# In case this script runs on a head node, we grab the assigned node name. TARGET_NODE=$(scontrol show hostnames "$SLURM_NODELIST" | head -n 1) -echo "[Script] Job assigned to node: $TARGET_NODE" +echo "[Script] Job assigned to GPU node: $TARGET_NODE" + +# 1.5. Check if /dev/kvm exists on the node (before container) +echo "[Script] Checking if /dev/kvm exists on $TARGET_NODE (host level)..." +ssh -q -o StrictHostKeyChecking=no "$TARGET_NODE" " + echo '--- Host-level KVM check ---' + if [ -e /dev/kvm ]; then + echo '/dev/kvm EXISTS on host' + ls -la /dev/kvm + stat /dev/kvm + getent group kvm 2>/dev/null || echo 'kvm group not found' + else + echo 'WARNING: /dev/kvm does NOT exist on this GPU node!' + echo 'KVM may not be enabled on GPU partition nodes.' + echo 'This is a HARD BLOCKER for the consolidated setup.' + fi + echo '--- End host-level check ---' +" -# 2. Launch Container on that Node +# 2. Launch Container on that Node (with /dev/kvm mount) echo "[Script] Launching container (sleep infinity) on $TARGET_NODE..." srun --nodelist="$TARGET_NODE" \ --container-image="$CONTAINER_IMAGE" \ - --container-mounts=/lustre:/lustre \ + --container-mounts=/lustre:/lustre,/dev/kvm:/dev/kvm \ + --container-writable \ sleep infinity & SRUN_PID=$! @@ -35,34 +59,54 @@ CONTAINER_PID="" while [ -z "$CONTAINER_PID" ]; do sleep 2 - # explanation of the command sent via ssh: - # 1. enroot list -f -> Lists processes - # 2. grep pyxis -> Filters for your container name - # 3. grep sleep -> Ensures the command column shows 'sleep' - # 4. 
awk print $2 -> Grabs the PID CONTAINER_PID=$(ssh -q -o StrictHostKeyChecking=no "$TARGET_NODE" \ "enroot list -f | grep 'pyxis' | grep 'sleep' | awk '{print \$2}' | head -n 1") if [ -z "$CONTAINER_PID" ]; then - echo "[Script] Container registered, but 'sleep' command not yet visible. Retrying..." + echo "[Script] Container not yet visible. Retrying..." fi done echo "[Script] Found PID: $CONTAINER_PID on $TARGET_NODE" -# 4. Execute the Test (via SSH -> Enroot Exec) -echo "[Script] Checking kvm write permission..." +# 4. Execute KVM checks inside container +echo "[Script] Checking KVM inside container on GPU node..." ssh -o StrictHostKeyChecking=no "$TARGET_NODE" "enroot exec $CONTAINER_PID bash -c ' - if touch /dev/kvm 2>/dev/null; then - echo \"SUCCESS: /dev/kvm is writable!\" + echo \"--- Container-level KVM check (GPU node) ---\" + echo \"Running as user: \$(whoami) (uid=\$(id -u), gid=\$(id -g))\" + echo \"User groups: \$(groups)\" + echo \"\" + + if [ -e /dev/kvm ]; then + echo \"/dev/kvm EXISTS in container\" + ls -la /dev/kvm + + if [ -r /dev/kvm ]; then + echo \"READ permission: YES\" + else + echo \"READ permission: NO\" + fi + + if [ -w /dev/kvm ]; then + echo \"WRITE permission: YES\" + echo \"\" + echo \"SUCCESS: /dev/kvm is writable on GPU node!\" + echo \"The consolidated 2-node setup should work.\" + else + echo \"WRITE permission: NO\" + echo \"\" + echo \"FAILURE: Cannot write to /dev/kvm on GPU node\" + echo \"Admin intervention may be needed.\" + exit 1 + fi else - echo \"FAILURE: Permission denied.\" + echo \"FAILURE: /dev/kvm does NOT exist in container\" + echo \"The /dev/kvm mount may have failed or KVM is not available on this GPU node.\" exit 1 fi -'" - - - - - + echo \"\" + echo \"--- GPU check ---\" + nvidia-smi -L 2>/dev/null || echo \"nvidia-smi not available (expected in non-CUDA container)\" + echo \"--- End checks ---\" +'" diff --git a/cua/scripts/debug_interactive.sh b/cua/scripts/debug_interactive.sh index 0f7299f0a..9d43e850d 
100644 --- a/cua/scripts/debug_interactive.sh +++ b/cua/scripts/debug_interactive.sh @@ -3,13 +3,14 @@ # --- 1. Submit the "Holder" Job --- echo "[Local] Submitting background job to reserve node..." -# for CPU reservation -IMAGE="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/cua_cpu.sqsh" +# GPU reservation (consolidated: runs VMs on GPU node) +# IMAGE="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/cua_vllm.sqsh" +IMAGE="/lustre/fsw/portfolios/nvr/users/bcui/images/cua-vllm-0.13.0.sqsh" JOB_ID=$(sbatch --parsable \ --job-name=kvm_interactive \ - --account=nvr_lpr_agentic \ - --partition=cpu_interactive \ - --reservation=sla_res_osworld_agent_vlm_cpu_only \ + --account=nvr_lacr_llm \ + --partition=interactive \ + --gpus-per-node=8 \ --nodes=1 \ --ntasks-per-node=1 \ --time=04:00:00 \ @@ -18,14 +19,15 @@ JOB_ID=$(sbatch --parsable \ --error=/dev/null \ --wrap="srun --container-image=$IMAGE --container-mounts=/lustre:/lustre sleep infinity") -# for GPU reservation - note: the container image for GPU node is not ready yet -#IMAGE="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/cua_vllm.sqsh" +# Old CPU-only reservation (kept for reference) +#IMAGE="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/cua_cpu.sqsh" #JOB_ID=$(sbatch --parsable \ # --job-name=kvm_interactive \ -# --account=llmservice_fm_vision \ -# --partition=interactive \ -# --gpus-per-node=8 \ -# --reservation=sla_res_osworld_agent_vlm \ +# --account=nvr_lpr_agentic \ +# --partition=cpu_interactive \ +# --reservation=sla_res_osworld_agent_vlm_cpu_only \ +# --nodes=1 \ +# --ntasks-per-node=1 \ # --time=04:00:00 \ # --exclusive \ # --output=/dev/null \ @@ -87,7 +89,10 @@ echo "[Local] Found Container PID: $CONTAINER_PID" # --- 5. 
Launch Interactive Session --- echo "==========================================================" -echo " KVM-enabled shell on $NODE " +echo " KVM-enabled GPU shell on $NODE " +echo "==========================================================" +echo " Container: $(basename $IMAGE)" +echo " /dev/kvm mounted for VM support" echo "==========================================================" # -t forces pseudo-terminal allocation so you get an interactive shell diff --git a/cua/scripts/debug_interactive_2node.sh b/cua/scripts/debug_interactive_2node.sh new file mode 100755 index 000000000..2af498c94 --- /dev/null +++ b/cua/scripts/debug_interactive_2node.sh @@ -0,0 +1,218 @@ +#!/bin/bash +# ============================================================================ +# 2-Node Interactive GPU Debug Script +# ============================================================================ +# Node 1 (Planner): Runs Qwen3-VL-235B vLLM server (tp=8, all 8 GPUs) +# Node 2 (Actor): Runs UI-TARS-1.5-7B vLLM server (tp=4) + KVM for VMs +# +# Both vLLM servers auto-start, then you get an interactive shell on the +# Actor node for manual debugging / data collection. +# ============================================================================ + +IMAGE="/lustre/fsw/portfolios/nvr/users/bcui/images/cua-vllm-0.13.0.sqsh" + +PROJECT_ROOT="/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server" +PROJECT_DIR="$PROJECT_ROOT/cua" + +PLANNER_MODEL="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/models/Qwen3-VL-235B-A22B-Thinking" +# ACTOR_MODEL="ByteDance-Seed/UI-TARS-1.5-7B" +ACTOR_MODEL="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/bcui/huggingface_models/UI-TARS-1.5-7B" + +PLANNER_PORT=8000 +ACTOR_PORT=8000 + +# --- 1. Submit Holder Job (2 GPU nodes) --- +echo "[Local] Submitting 2-node GPU job..." 
+ +JOB_ID=$(sbatch --parsable \ + --job-name=debug_2node \ + --account=llmservice_fm_vision \ + --partition=interactive \ + --reservation=sla_res_osworld_agent_vlm \ + --gpus-per-node=8 \ + --nodes=2 \ + --ntasks-per-node=1 \ + --mem=0 \ + --time=04:00:00 \ + --exclusive \ + --output=$PROJECT_DIR/scripts/logs/debug_2node_holder_%j.out \ + --error=$PROJECT_DIR/scripts/logs/debug_2node_holder_%j.err \ + --wrap="srun --container-image=$IMAGE --container-mounts=/lustre:/lustre --container-writable sleep infinity") + +if [ -z "$JOB_ID" ]; then + echo "Error: Job submission failed." + exit 1 +fi + +echo "[Local] Job submitted. ID: $JOB_ID" + +# --- 2. Cleanup Trap --- +cleanup() { + echo "" + echo "[Local] Cleaning up... Cancelling Job $JOB_ID" + scancel "$JOB_ID" +} +trap cleanup EXIT + +# --- 3. Wait for Job to Start --- +echo "[Local] Waiting for job to start..." +NODES="" +while [ -z "$NODES" ]; do + JOB_STATE=$(squeue -j "$JOB_ID" -h -o %T) + if [ "$JOB_STATE" == "RUNNING" ]; then + NODES=$(squeue -j "$JOB_ID" -h -o %N) + elif [ -z "$JOB_STATE" ]; then + echo "Error: Job disappeared from queue!" + exit 1 + fi + sleep 2 +done + +# Expand nodelist to individual hostnames +ALL_NODES=$(scontrol show hostnames "$NODES") +PLANNER_NODE=$(echo "$ALL_NODES" | head -n 1) +ACTOR_NODE=$(echo "$ALL_NODES" | tail -n 1) + +echo "[Local] Job is RUNNING" +echo "[Local] Planner Node: $PLANNER_NODE" +echo "[Local] Actor Node: $ACTOR_NODE" + +# --- 4. Wait for Containers on Both Nodes --- +wait_for_container() { + local node=$1 + local name=$2 + local pid="" + + echo "[Local] Waiting for container on $name ($node)..." >&2 + while [ -z "$pid" ]; do + sleep 2 + pid=$(ssh -q -o StrictHostKeyChecking=no "$node" \ + "enroot list -f | grep 'pyxis' | grep 'sleep' | awk '{print \$2}' | head -n 1") + if [ -z "$pid" ]; then + printf "." 
>&2 + fi + done + echo "" >&2 + echo "[Local] $name container ready (PID: $pid)" >&2 + echo "$pid" +} + +PLANNER_PID=$(wait_for_container "$PLANNER_NODE" "Planner") +ACTOR_PID=$(wait_for_container "$ACTOR_NODE" "Actor") + +# --- 5. Launch Planner vLLM Server (background) --- +echo "[Local] Starting Planner vLLM server on $PLANNER_NODE..." +ssh -q -o StrictHostKeyChecking=no "$PLANNER_NODE" \ + "enroot exec $PLANNER_PID bash -c ' + mkdir -p $PROJECT_DIR/scripts/logs + nohup vllm serve $PLANNER_MODEL \ + --api-key gen \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --limit-mm-per-prompt.video 0 \ + --limit-mm-per-prompt.image 3 \ + --async-scheduling \ + --max-model-len 65536 \ + --gpu-memory-utilization 0.9 \ + > $PROJECT_DIR/scripts/logs/planner_debug.log 2>&1 & + echo \"[Planner] vLLM server launched (PID: \$!)\" + '" & + +# --- 6. Launch Actor vLLM Server (background) --- +echo "[Local] Starting Actor vLLM server on $ACTOR_NODE..." +ssh -q -o StrictHostKeyChecking=no "$ACTOR_NODE" \ + "enroot exec $ACTOR_PID bash -c ' + mkdir -p $PROJECT_DIR/scripts/logs + nohup vllm serve $ACTOR_MODEL \ + --served-model-name ByteDance-Seed/UI-TARS-1.5-7B \ + --api-key gen \ + --tensor-parallel-size 4 \ + --limit-mm-per-prompt.image 5 \ + --limit-mm-per-prompt.video 0 \ + --max-model-len 65536 \ + --disable-log-requests \ + --disable-log-stats \ + > $PROJECT_DIR/scripts/logs/actor_debug.log 2>&1 & + echo \"[Actor] vLLM server launched (PID: \$!)\" + '" & + +wait # wait for both SSH commands to return + +# --- 7. Wait for Both Servers to Be Healthy --- +echo "[Local] Waiting for vLLM servers to become healthy..." 
+ +wait_for_server() { + local node=$1 + local container_pid=$2 + local port=$3 + local name=$4 + local max_wait=600 + local elapsed=0 + + while [ $elapsed -lt $max_wait ]; do + if ssh -q -o StrictHostKeyChecking=no "$node" \ + "enroot exec $container_pid curl -sf http://localhost:$port/health" > /dev/null 2>&1; then + echo "[Local] $name server healthy on $node:$port" + return 0 + fi + sleep 10 + elapsed=$((elapsed + 10)) + if [ $((elapsed % 60)) -eq 0 ]; then + echo "[Local] Still waiting for $name (${elapsed}s)..." + fi + done + + echo "[Local] ERROR: $name server did not start within ${max_wait}s" + return 1 +} + +wait_for_server "$PLANNER_NODE" "$PLANNER_PID" "$PLANNER_PORT" "Planner" & +WAIT_PLANNER_PID=$! + +wait_for_server "$ACTOR_NODE" "$ACTOR_PID" "$ACTOR_PORT" "Actor" & +WAIT_ACTOR_PID=$! + +wait $WAIT_PLANNER_PID +PLANNER_OK=$? + +wait $WAIT_ACTOR_PID +ACTOR_OK=$? + +if [ $PLANNER_OK -ne 0 ] || [ $ACTOR_OK -ne 0 ]; then + echo "[Local] ERROR: One or both servers failed to start." + echo "[Local] Check logs:" + echo " Planner: $PROJECT_DIR/scripts/logs/planner_debug.log" + echo " Actor: $PROJECT_DIR/scripts/logs/actor_debug.log" + exit 1 +fi + +# --- 8. 
Launch Interactive Shell on Actor Node --- +echo "" +echo "==========================================================" +echo " 2-Node Interactive Debug Session" +echo "==========================================================" +echo " Planner: $PLANNER_NODE (Qwen3-VL-235B, port $PLANNER_PORT)" +echo " Actor: $ACTOR_NODE (UI-TARS-1.5-7B, port $ACTOR_PORT)" +echo " KVM: available via reservation" +echo "" +echo " Planner API: http://$PLANNER_NODE:$PLANNER_PORT" +echo " Actor API: http://localhost:$ACTOR_PORT" +echo "" +echo " Logs:" +echo " tail -f $PROJECT_DIR/scripts/logs/planner_debug.log" +echo " tail -f $PROJECT_DIR/scripts/logs/actor_debug.log" +echo "==========================================================" + +ssh -t -q -o StrictHostKeyChecking=no "$ACTOR_NODE" \ + "enroot exec $ACTOR_PID bash -c ' + cd $PROJECT_DIR + source cua_env_reqs/bin/activate + export PYTHONPATH=$PROJECT_ROOT:\$PYTHONPATH + export PLANNER_NODE=$PLANNER_NODE + export PLANNER_PORT=$PLANNER_PORT + export ACTOR_PORT=$ACTOR_PORT + exec /bin/bash -l + '" + +# --- 9. End --- +echo "[Local] Session ended." diff --git a/cua/scripts/run.sh b/cua/scripts/run.sh new file mode 100755 index 000000000..98cd852a2 --- /dev/null +++ b/cua/scripts/run.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# ============================================================================ +# CUA Data Collection - Multi-Actor Launcher +# ============================================================================ +# Orchestrates 1 planner + N actor nodes: +# 1. Submits planner sbatch job (Qwen3-VL-235B vLLM) +# 2. Waits for planner to write its hostname to a coordination file +# 3. Launches NUM_ACTORS instances of run_actor_and_vm.sh in parallel +# 4. Each actor submits its own holder job on a reserved node +# 5. 
Waits for all actors to finish, then cancels planner +# +# Usage: +# bash run.sh # 1 actor (default) +# NUM_ACTORS=3 MAX_PARALLEL=4 MAX_TRAJECTORIES=500 bash run.sh +# ============================================================================ + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +export LOG_DIR="${LOG_DIR:-$SCRIPT_DIR/logs_multi_thread}" + +# Configurable parameters +NUM_ACTORS="${NUM_ACTORS:-1}" +MAX_PARALLEL="${MAX_PARALLEL:-16}" +MAX_TRAJECTORIES="${MAX_TRAJECTORIES:-10000}" + +# Create logs directory +mkdir -p "$LOG_DIR" + +# Generate unique coordination file +COORD_ID="$(date +%Y%m%d_%H%M%S)_$$" +COORD_FILE="$LOG_DIR/.planner_host_${COORD_ID}" + +PLANNER_JOB_ID="" + +echo "============================================" +echo "CUA Data Collection - Multi-Actor Launcher" +echo "============================================" +echo "NUM_ACTORS: $NUM_ACTORS" +echo "MAX_PARALLEL: $MAX_PARALLEL (per actor)" +echo "MAX_TRAJECTORIES: $MAX_TRAJECTORIES (per actor)" +echo "Coordination file: $COORD_FILE" +echo "" + +# --- Cleanup: cancel planner on exit --- +cleanup() { + echo "" + echo "[run.sh] Cleaning up..." + if [ -n "$PLANNER_JOB_ID" ]; then + echo "[run.sh] Cancelling planner job $PLANNER_JOB_ID" + scancel "$PLANNER_JOB_ID" 2>/dev/null + fi + rm -f "$COORD_FILE" +} +trap cleanup EXIT + +# --- 1. Submit Planner Job --- +echo "[run.sh] Submitting planner job..." +PLANNER_JOB_ID=$(sbatch \ + --output="$LOG_DIR/planner-%j.out" \ + --export=ALL,COORD_FILE="$COORD_FILE" \ + --parsable \ + "$SCRIPT_DIR/run_planner.sbatch") +echo "[run.sh] Planner job submitted: $PLANNER_JOB_ID" + +# --- 2. Wait for planner hostname --- +echo "[run.sh] Waiting for planner to start and write hostname..." 
+PLANNER_NODE="" +COORD_ELAPSED=0 +MAX_COORD_WAIT=900 # 15 minutes + +while [ $COORD_ELAPSED -lt $MAX_COORD_WAIT ]; do + if [ -f "$COORD_FILE" ]; then + PLANNER_NODE=$(cat "$COORD_FILE" | tr -d '[:space:]') + if [ -n "$PLANNER_NODE" ]; then + echo "[run.sh] Planner node discovered: $PLANNER_NODE" + break + fi + fi + sleep 10 + COORD_ELAPSED=$((COORD_ELAPSED + 10)) + if [ $((COORD_ELAPSED % 60)) -eq 0 ]; then + echo "[run.sh] Still waiting for planner (${COORD_ELAPSED}s)..." + fi +done + +if [ -z "$PLANNER_NODE" ]; then + echo "[run.sh] ERROR: Planner did not start within ${MAX_COORD_WAIT}s." + exit 1 +fi + +# --- 3. Launch N Actor Instances --- +echo "[run.sh] Launching $NUM_ACTORS actor(s)..." +ACTOR_PIDS=() + +for i in $(seq 1 "$NUM_ACTORS"); do + echo "[run.sh] Starting actor $i..." + PLANNER_NODE="$PLANNER_NODE" \ + MAX_PARALLEL="$MAX_PARALLEL" \ + MAX_TRAJECTORIES="$MAX_TRAJECTORIES" \ + bash "$SCRIPT_DIR/run_actor_and_vm.sh" "$i" \ + > "$LOG_DIR/actor_launcher_${i}.log" 2>&1 & + ACTOR_PIDS+=($!) + echo "[run.sh] Actor $i launched (PID ${ACTOR_PIDS[-1]}, log: $LOG_DIR/actor_launcher_${i}.log)" +done + +# --- 4. Wait for all actors --- +echo "" +echo "[run.sh] All actors launched. Waiting for completion..." +echo "" +echo "Monitor with:" +echo " squeue -u \$USER" +echo " tail -f $LOG_DIR/planner-*.out" +for i in $(seq 1 "$NUM_ACTORS"); do + echo " tail -f $LOG_DIR/actor_launcher_${i}.log" +done +echo "" + +FAILED=0 +for i in "${!ACTOR_PIDS[@]}"; do + ACTOR_NUM=$((i + 1)) + wait "${ACTOR_PIDS[$i]}" 2>/dev/null + EXIT_CODE=$? + if [ $EXIT_CODE -eq 0 ]; then + echo "[run.sh] Actor $ACTOR_NUM finished successfully." + else + echo "[run.sh] Actor $ACTOR_NUM failed (exit code $EXIT_CODE)." + FAILED=$((FAILED + 1)) + fi +done + +echo "" +echo "============================================" +echo "[run.sh] All actors finished. $FAILED/$NUM_ACTORS failed." +echo "[run.sh] Planner will be cancelled by cleanup trap." 
+echo "============================================" + +if [ $FAILED -gt 0 ]; then + exit 1 +fi diff --git a/cua/scripts/run_actor_and_vm.sh b/cua/scripts/run_actor_and_vm.sh new file mode 100755 index 000000000..81b20a89a --- /dev/null +++ b/cua/scripts/run_actor_and_vm.sh @@ -0,0 +1,191 @@ +#!/bin/bash +# ============================================================================ +# Actor + VM Launcher (SSH+Enroot Pattern) +# ============================================================================ +# Submits a holder "sleep infinity" job on a reserved GPU node, waits for +# the container to be ready, then SSH+enroot execs into it to run: +# 1. UI-TARS-1.5-7B vLLM server (background, TP=4) +# 2. Data collection via parallel_collect_trajectories.py +# +# Required env vars: +# PLANNER_NODE - hostname of the planner vLLM server +# +# Optional env vars: +# MAX_PARALLEL - parallel VMs per actor (default: 1) +# MAX_TRAJECTORIES - trajectories to collect (default: 10000) +# +# Optional arg: +# $1 = actor index (for log naming, default: 0) +# +# Usage: +# PLANNER_NODE=pool0-12345 bash run_actor_and_vm.sh 1 +# ============================================================================ + +ACTOR_IDX="${1:-0}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOG_DIR="${LOG_DIR:-$SCRIPT_DIR/logs_single}" + +# Project paths +PROJECT_ROOT="/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server" +PROJECT_DIR="$PROJECT_ROOT/cua" + +ACTOR_IMAGE="/lustre/fsw/portfolios/nvr/users/bcui/images/cua-vllm-0.13.0.sqsh" + +# Model +ACTOR_MODEL="ByteDance-Seed/UI-TARS-1.5-7B" + +# Data collection params +MAX_PARALLEL=${MAX_PARALLEL:-1} +MAX_TRAJECTORIES=${MAX_TRAJECTORIES:-10000} + +PLANNER_PORT=8000 +ACTOR_PORT=8000 + +# Validate +if [ -z "$PLANNER_NODE" ]; then + echo "[Actor $ACTOR_IDX] ERROR: PLANNER_NODE not set." 
+ exit 1 +fi + +mkdir -p "$LOG_DIR" + +echo "[Actor $ACTOR_IDX] PLANNER_NODE=$PLANNER_NODE" +echo "[Actor $ACTOR_IDX] MAX_PARALLEL=$MAX_PARALLEL MAX_TRAJECTORIES=$MAX_TRAJECTORIES" + +# --- 1. Submit holder job on reserved node --- +echo "[Actor $ACTOR_IDX] Submitting holder job..." +ACTOR_JOB_ID=$(sbatch --parsable \ + --job-name="cua_actor_${ACTOR_IDX}" \ + --account=llmservice_fm_vision \ + --partition=interactive \ + --reservation=sla_res_osworld_agent_vlm \ + --gpus-per-node=8 \ + --mem=0 \ + --time=04:00:00 \ + --exclusive \ + --output="$LOG_DIR/actor_holder_${ACTOR_IDX}-%j.out" \ + --wrap="srun --container-image=$ACTOR_IMAGE --container-mounts=/lustre:/lustre sleep infinity") + +if [ -z "$ACTOR_JOB_ID" ]; then + echo "[Actor $ACTOR_IDX] ERROR: Job submission failed." + exit 1 +fi +echo "[Actor $ACTOR_IDX] Holder job submitted: $ACTOR_JOB_ID" + +# --- 2. Cleanup trap --- +cleanup() { + echo "" + echo "[Actor $ACTOR_IDX] Cleaning up... Cancelling holder job $ACTOR_JOB_ID" + scancel "$ACTOR_JOB_ID" 2>/dev/null +} +trap cleanup EXIT + +# --- 3. Wait for job to start --- +echo "[Actor $ACTOR_IDX] Waiting for holder job to start..." +ACTOR_NODE="" +while [ -z "$ACTOR_NODE" ]; do + JOB_STATE=$(squeue -j "$ACTOR_JOB_ID" -h -o %T 2>/dev/null) + + if [ "$JOB_STATE" == "RUNNING" ]; then + ACTOR_NODE=$(squeue -j "$ACTOR_JOB_ID" -h -o %N) + elif [ -z "$JOB_STATE" ]; then + echo "[Actor $ACTOR_IDX] ERROR: Job $ACTOR_JOB_ID disappeared from queue!" + exit 1 + fi + sleep 2 +done +echo "[Actor $ACTOR_IDX] Job RUNNING on node: $ACTOR_NODE" + +# --- 4. Wait for container readiness --- +echo "[Actor $ACTOR_IDX] Polling for container readiness on $ACTOR_NODE..." +CONTAINER_PID="" +while [ -z "$CONTAINER_PID" ]; do + sleep 2 + CONTAINER_PID=$(ssh -q -o StrictHostKeyChecking=no "$ACTOR_NODE" \ + "enroot list -f | grep 'pyxis' | grep 'sleep' | awk '{print \$2}' | head -n 1" 2>/dev/null) + + if [ -z "$CONTAINER_PID" ]; then + printf "." 
+ fi +done +echo "" +echo "[Actor $ACTOR_IDX] Container ready, PID: $CONTAINER_PID" + +# --- 5. Wait for planner to be ready --- +echo "[Actor $ACTOR_IDX] Waiting for planner at $PLANNER_NODE:$PLANNER_PORT..." +PLANNER_WAIT=0 +PLANNER_MAX_WAIT=900 # 15 minutes +while ! nc -z "$PLANNER_NODE" "$PLANNER_PORT" 2>/dev/null; do + sleep 10 + PLANNER_WAIT=$((PLANNER_WAIT + 10)) + if [ $((PLANNER_WAIT % 60)) -eq 0 ]; then + echo "[Actor $ACTOR_IDX] Still waiting for planner (${PLANNER_WAIT}s)..." + fi + if [ $PLANNER_WAIT -ge $PLANNER_MAX_WAIT ]; then + echo "[Actor $ACTOR_IDX] ERROR: Planner not ready within ${PLANNER_MAX_WAIT}s." + exit 1 + fi +done +echo "[Actor $ACTOR_IDX] Planner is accepting connections!" + +# --- 6. SSH+enroot exec: launch actor vLLM + data collection --- +ACTOR_LOG_FILE="$LOG_DIR/actor_${ACTOR_IDX}-${ACTOR_JOB_ID}.out" +echo "==========================================================" +echo "[Actor $ACTOR_IDX] Executing on $ACTOR_NODE (job $ACTOR_JOB_ID)" +echo "[Actor $ACTOR_IDX] Logging to: $ACTOR_LOG_FILE" +echo "==========================================================" + +ssh -t -q -o StrictHostKeyChecking=no "$ACTOR_NODE" \ + "enroot exec $CONTAINER_PID /bin/bash -c ' + set -e + + echo \"[Actor $ACTOR_IDX] Launching UI-TARS-1.5-7B vLLM...\" + vllm serve $ACTOR_MODEL \ + --api-key gen \ + --tensor-parallel-size 4 \ + --limit-mm-per-prompt.image 5 \ + --limit-mm-per-prompt.video 0 \ + --max-model-len 65536 \ + --disable-log-requests \ + --disable-log-stats \ + > $LOG_DIR/vllm_actor_${ACTOR_IDX}.log 2>&1 & + VLLM_PID=\$! 
+ + # Wait for local actor vLLM to be healthy + echo \"[Actor $ACTOR_IDX] Waiting for actor vLLM health...\" + ELAPSED=0 + MAX_WAIT=600 + while [ \$ELAPSED -lt \$MAX_WAIT ]; do + if curl -sf http://localhost:$ACTOR_PORT/health > /dev/null 2>&1; then + echo \"[Actor $ACTOR_IDX] Actor vLLM healthy!\" + break + fi + sleep 10 + ELAPSED=\$((ELAPSED + 10)) + if [ \$((ELAPSED % 60)) -eq 0 ]; then + echo \"[Actor $ACTOR_IDX] Still waiting for actor vLLM (\${ELAPSED}s)...\" + fi + done + + if [ \$ELAPSED -ge \$MAX_WAIT ]; then + echo \"[Actor $ACTOR_IDX] ERROR: Actor vLLM did not start within \${MAX_WAIT}s\" + kill \$VLLM_PID 2>/dev/null + exit 1 + fi + + echo \"[Actor $ACTOR_IDX] Starting data collection...\" + cd $PROJECT_DIR + source cua_env_reqs/bin/activate + export PYTHONPATH=$PROJECT_ROOT:\$PYTHONPATH + + python parallel_collect_trajectories.py \ + --planner_node $PLANNER_NODE \ + --actor_node localhost \ + --max_parallel $MAX_PARALLEL \ + --max_trajectories $MAX_TRAJECTORIES + + COLLECT_EXIT=\$? 
+ echo \"[Actor $ACTOR_IDX] Data collection finished with exit code \$COLLECT_EXIT\" + kill \$VLLM_PID 2>/dev/null + exit \$COLLECT_EXIT + '" 2>&1 | tee "$ACTOR_LOG_FILE" diff --git a/cua/scripts/run_all.sbatch b/cua/scripts/run_all.sbatch new file mode 100644 index 000000000..baacd93b4 --- /dev/null +++ b/cua/scripts/run_all.sbatch @@ -0,0 +1,181 @@ +#!/bin/bash +#SBATCH --job-name=cua_all_single_thread +#SBATCH --account=nvr_lacr_llm +#SBATCH --partition=interactive +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node=8 +#SBATCH --mem=0 +#SBATCH --time=04:00:00 +#SBATCH --exclusive +#SBATCH --output=logs_single/run_all-%j.out + +# ============================================================================ +# Consolidated 2-Node CUA Data Collection Script +# ============================================================================ +# Node 1 (Planner): Runs Qwen3-VL-235B vLLM server (tp=8, all 8 GPUs) +# Node 2 (Actor): Runs UI-TARS-1.5-7B vLLM server (tp=4) + data collection VMs +# +# This replaces the previous 3-node setup (2 GPU + 1 CPU) by co-locating +# VMs on the Actor node which has spare GPU/CPU capacity. 
+# ============================================================================

+# Combined container with vLLM/CUDA + QEMU/KVM dependencies
+# CONTAINER_IMAGE="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/cua_vllm.sqsh"
+# CONTAINER_IMAGE="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/images/vllm-0.13.0.sqsh"
+CONTAINER_IMAGE="/lustre/fsw/portfolios/nvr/users/bcui/images/cua-vllm-0.13.0.sqsh"
+
+# Project paths
+PROJECT_ROOT="/lustre/fsw/portfolios/nvr/users/bcui/ProRL-Agent-Server"
+PROJECT_DIR="$PROJECT_ROOT/cua"
+
+# Model paths
+PLANNER_MODEL="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/models/Qwen3-VL-235B-A22B-Thinking"
+# PLANNER_MODEL="/lustre/fsw/portfolios/nvr/users/bcui/models/qwen-3-vl-4b-thinking"
+ACTOR_MODEL="ByteDance-Seed/UI-TARS-1.5-7B"
+
+PLANNER_PORT=8000
+ACTOR_PORT=8000
+
+# Data collection parameters: parallel VM count and trajectory budget,
+# forwarded to parallel_collect_trajectories.py on the Actor node
+MAX_PARALLEL=${MAX_PARALLEL:-1}
+MAX_TRAJECTORIES=${MAX_TRAJECTORIES:-10000}
+
+# --- 1. Get Node List and Assign Roles ---
+ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+PLANNER_NODE=$(echo "$ALL_NODES" | head -n 1)
+ACTOR_NODE=$(echo "$ALL_NODES" | head -n 2 | tail -n 1)
+
+echo "[run_all] Job ID: $SLURM_JOB_ID"
+echo "[run_all] Planner Node: $PLANNER_NODE"
+echo "[run_all] Actor Node: $ACTOR_NODE"
+echo "[run_all] Max Parallel: $MAX_PARALLEL"
+echo "[run_all] Max Trajs: $MAX_TRAJECTORIES"
+
+# TODO: add back --enable-expert-parallel \
+# --enable-expert-parallel \
+# --- 2. 
Launch Planner vLLM Server (Node 1, all 8 GPUs) --- +srun -w "$PLANNER_NODE" \ + --nodes=1 \ + --ntasks=1 \ + --container-image="$CONTAINER_IMAGE" \ + --container-mounts=/lustre:/lustre \ + --output=logs_single/planner.out \ + bash -c " + echo '[Planner] Launching Qwen3-VL-235B on $PLANNER_NODE' + vllm serve $PLANNER_MODEL \ + --api-key gen \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --limit-mm-per-prompt.video 0 \ + --limit-mm-per-prompt.image 3 \ + --async-scheduling \ + --max-model-len 65536 \ + --gpu-memory-utilization 0.9 + " & +PLANNER_SRUN_PID=$! + +# --- 3. Launch Actor vLLM + Data Collection (Node 2) --- +# The Actor node runs both the vLLM server AND the data collection VMs. +# /dev/kvm is mounted for QEMU/KVM hardware virtualization. +srun -w "$ACTOR_NODE" \ + --nodes=1 \ + --ntasks=1 \ + --container-image="$CONTAINER_IMAGE" \ + --container-mounts=/lustre:/lustre,/dev/kvm:/dev/kvm \ + --container-writable \ + --output=logs_single/actor_and_collect.out \ + bash -c " + set -e + + # Suppress vLLM noise: only show warnings and above + # export VLLM_LOGGING_LEVEL=WARNING + + echo '[Actor] Launching UI-TARS-1.5-7B on $ACTOR_NODE' + # Redirect vLLM output to separate log to keep collection logs_single clean + vllm serve $ACTOR_MODEL \ + --api-key gen \ + --tensor-parallel-size 4 \ + --limit-mm-per-prompt.image 5 \ + --limit-mm-per-prompt.video 0 \ + --max-model-len 65536 \ + --disable-log-requests \ + --disable-log-stats \ + > $PROJECT_DIR/scripts/logs_single/vllm_actor.log 2>&1 & + VLLM_PID=\$! + + # --- Health Check: Wait for both vLLM servers --- + echo '[Actor] Waiting for vLLM servers to become healthy...' 
+ + wait_for_server() { + local host=\$1 + local port=\$2 + local name=\$3 + local max_wait=600 # 10 minutes + local elapsed=0 + + while [ \$elapsed -lt \$max_wait ]; do + if curl -sf http://\${host}:\${port}/health > /dev/null 2>&1; then + echo \"[Actor] \$name server healthy at \${host}:\${port}\" + return 0 + fi + sleep 10 + elapsed=\$((elapsed + 10)) + if [ \$((elapsed % 60)) -eq 0 ]; then + echo \"[Actor] Still waiting for \$name (\${elapsed}s)...\" + fi + done + + echo \"[Actor] ERROR: \$name server did not start within \${max_wait}s\" + return 1 + } + + # Wait for local Actor server + wait_for_server localhost $ACTOR_PORT Actor + ACTOR_OK=\$? + + # Wait for remote Planner server + wait_for_server $PLANNER_NODE $PLANNER_PORT Planner + PLANNER_OK=\$? + + if [ \$ACTOR_OK -ne 0 ] || [ \$PLANNER_OK -ne 0 ]; then + echo '[Actor] ERROR: Server health check failed. Aborting.' + kill \$VLLM_PID 2>/dev/null + exit 1 + fi + + echo '[Actor] Both servers healthy. Starting data collection...' + + # --- Launch Data Collection --- + cd $PROJECT_DIR + source cua_env_reqs/bin/activate + export PYTHONPATH=$PROJECT_ROOT:\$PYTHONPATH + + python parallel_collect_trajectories.py \ + --planner_node $PLANNER_NODE \ + --actor_node localhost \ + --max_parallel $MAX_PARALLEL \ + --max_trajectories $MAX_TRAJECTORIES + + COLLECT_EXIT=\$? + echo \"[Actor] Data collection finished with exit code \$COLLECT_EXIT\" + + # Cleanup + kill \$VLLM_PID 2>/dev/null + exit \$COLLECT_EXIT + " & +ACTOR_SRUN_PID=$! + +# --- 4. Wait for completion --- +# The Actor srun will finish when data collection completes (or fails). +# The Planner srun runs indefinitely until we kill it. +echo "[run_all] Waiting for Actor node (data collection) to finish..." +wait $ACTOR_SRUN_PID +ACTOR_EXIT=$? + +echo "[run_all] Actor node exited with code $ACTOR_EXIT. Stopping Planner..." +kill $PLANNER_SRUN_PID 2>/dev/null +wait $PLANNER_SRUN_PID 2>/dev/null + +echo "[run_all] All done. 
Exit code: $ACTOR_EXIT" +exit $ACTOR_EXIT diff --git a/cua/scripts/run_planner.sbatch b/cua/scripts/run_planner.sbatch new file mode 100644 index 000000000..cc1202d12 --- /dev/null +++ b/cua/scripts/run_planner.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=cua_planner +#SBATCH --account=nvr_lacr_llm +#SBATCH --partition=interactive +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node=8 +#SBATCH --mem=0 +#SBATCH --time=04:00:00 +#SBATCH --exclusive +#SBATCH --output=logs_single/planner-%j.out + +# ============================================================================ +# Planner Node: Runs Qwen3-VL-235B vLLM server (tp=8, all 8 GPUs) +# ============================================================================ +# Requires COORD_FILE env var (set by run.sh wrapper) +# Writes hostname to COORD_FILE so the actor job can discover this node. +# ============================================================================ + +if [ -z "$COORD_FILE" ]; then + echo "[Planner] ERROR: COORD_FILE not set. Use run.sh or pass via --export." + exit 1 +fi + +CONTAINER_IMAGE="/lustre/fsw/portfolios/nvr/users/bcui/images/cua-vllm-0.13.0.sqsh" +PLANNER_MODEL="/lustre/fs1/portfolios/nvr/projects/nvr_lacr_llm/users/jaehunj/models/Qwen3-VL-235B-A22B-Thinking" + +# Write hostname for actor discovery +PLANNER_HOST=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -1) +echo "$PLANNER_HOST" > "$COORD_FILE" +echo "[Planner] Wrote hostname '$PLANNER_HOST' to $COORD_FILE" + +# Cleanup coordination file on exit +cleanup() { + echo "[Planner] Cleaning up coordination file..." 
+ rm -f "$COORD_FILE" +} +trap cleanup EXIT + +echo "[Planner] Job ID: $SLURM_JOB_ID" +echo "[Planner] Node: $PLANNER_HOST" + +# Launch Planner vLLM server (runs in foreground until cancelled or time limit) +srun --nodes=1 \ + --ntasks=1 \ + --container-image="$CONTAINER_IMAGE" \ + --container-mounts=/lustre:/lustre \ + bash -c " + echo '[Planner] Launching Qwen3-VL-235B on $(hostname)' + vllm serve $PLANNER_MODEL \ + --api-key gen \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --limit-mm-per-prompt.video 0 \ + --limit-mm-per-prompt.image 3 \ + --async-scheduling \ + --max-model-len 65536 \ + --gpu-memory-utilization 0.9 + " diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/osworld_nvcf_example.py b/examples/osworld_nvcf_example.py new file mode 100644 index 000000000..21f0e33b4 --- /dev/null +++ b/examples/osworld_nvcf_example.py @@ -0,0 +1,533 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating OSWorld NVCF runtime usage with OSWorldInteractiveAction. + +This script mirrors examples/osworld_example.py (Singularity) so that the NVCF runtime +achieves exactly similar results: same sections (1–14), same actions and checks. +The runtime deploys an OSWorld function at connect() if NVCF_FUNCTION_ID is not set, +then runs the same flow as the Singularity example. + +Prerequisites: +- NGC_API_KEY and NGC_ORG in environment (e.g. ~/.bashrc) +- Optional: NVCF_FUNCTION_ID set to use an existing deployed function (no deploy on connect) +- Optional: pip install ngcsdk (required for deploy-on-connect) +- Container image is hardcoded in openhands.nvidia.os_world.nvcf.config (DEFAULT_CONTAINER_IMAGE) + +Run from project root: + PYTHONPATH=. 
python examples/osworld_nvcf_example.py +""" + +import asyncio +import sys +import os + +# Ensure openhands is importable +if __name__ == "__main__": + _root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + if _root not in sys.path: + sys.path.insert(0, _root) + +from openhands.core.config import OpenHandsConfig +from openhands.core.logger import openhands_logger as logger +from openhands.events import EventStream +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.events.observation import ErrorObservation +from openhands.runtime.impl.nvcf import OSWorldNVCFRuntime +from openhands.storage import get_file_store + +# Use /tmp paths with nvcf prefix so we don't overwrite Singularity example outputs +PREFIX = "osworld_nvcf" + + +async def main(): + """Main example function: same flow as osworld_example.py.""" + print("=" * 80) + print("OSWorld NVCF Runtime Example") + print("=" * 80) + print() + + # 1. Create configuration + print("1. Creating configuration...") + config = OpenHandsConfig() + config.runtime = 'osworld_nvcf' + config.sandbox.base_container_image = 'ubuntu:24.04' + if os.environ.get("NVCF_FUNCTION_ID"): + print(f"✓ Using existing NVCF function (NVCF_FUNCTION_ID set)") + else: + print(f"✓ Will deploy OSWorld function on connect (NGC_ORG required)") + print(f" Runtime: {config.runtime}") + print() + + # 2. Create event stream + print("2. Creating event stream...") + file_store = get_file_store('local', f'/tmp/{PREFIX}_example') + event_stream = EventStream(sid=f'{PREFIX}-example', file_store=file_store) + print("✓ Event stream created") + print() + + # 3. Create OSWorld NVCF runtime (deploy on connect if no NVCF_FUNCTION_ID) + print("3. 
Creating OSWorld NVCF runtime...") + runtime = OSWorldNVCFRuntime( + config=config, + event_stream=event_stream, + sid=f'{PREFIX}-example', + os_type='linux', + nvcf_api_key=os.environ.get("NGC_API_KEY"), + nvcf_org=os.environ.get("NGC_ORG"), + undeploy_on_close=True, + #nvcf_function_id="", + #nvcf_version_id="" + ) + print("✓ Runtime created") + print(f" OS Type: {runtime.os_type}") + print() + + try: + # 4. Connect to runtime (deploys if needed, then verifies) + print("4. Connecting to runtime (this may take several minutes if deploying)...") + print(" - Deploying NVCF function (if not using existing)") + print(" - Waiting for function to become ACTIVE") + print(" - Verifying OSWorld server is reachable") + await runtime.connect() + print("✓ Runtime connected and VM is ready!") + print(f" VM Server URL: {runtime.osworld_vm_url}") + print(f" VNC URL: {runtime.vnc_url}") + print() + + # 5. Check if VM is alive + print("5. Checking VM health...") + runtime.check_if_alive() + print("✓ VM is alive and responding") + print() + + await asyncio.sleep(10) + + # 6. Get VM screenshot using OSWorldInteractiveAction + print("6. Taking VM screenshot...") + action = OSWorldInteractiveAction( + method='get_screenshot', + params={}, + thought='Taking initial screenshot of the VM desktop' + ) + observation = runtime.run_action(action) + print(f" Observation: {observation.content[:100]}..." if observation.content else " Observation: —") + + screenshot = runtime.get_vm_screenshot() + if screenshot: + screenshot_path = f'/tmp/{PREFIX}_screenshot.png' + with open(screenshot_path, 'wb') as f: + f.write(screenshot) + print(f"✓ Screenshot saved to {screenshot_path}") + print(f" Size: {len(screenshot)} bytes") + else: + print("✗ Failed to get screenshot") + print() + + # 7. Execute some actions using OSWorldInteractiveAction + print("7. Executing VM actions...") + print(" a. 
Clicking at position (10, 10)...") + action = OSWorldInteractiveAction( + method='execute_action', + params={ + 'action': { + 'action_type': 'CLICK', + 'parameters': {'x': 10, 'y': 10, 'button': 'left'} + } + }, + thought='Clicking at center-ish position on the screen' + ) + observation = runtime.run_action(action) + print(f" Result: {observation.content}") + print(f" Exit code: {observation.exit_code}") + await asyncio.sleep(1) + + print(" b. Typing 'Hello OSWorld'...") + action = OSWorldInteractiveAction( + method='execute_action', + params={ + 'action': { + 'action_type': 'TYPING', + 'parameters': {'text': 'Hello OSWorld'} + } + }, + thought='Typing a greeting message' + ) + observation = runtime.run_action(action) + print(f" Result: {observation.content}") + print(f" Exit code: {observation.exit_code}") + await asyncio.sleep(1) + + print(" c. Pressing Enter key...") + action = OSWorldInteractiveAction( + method='execute_action', + params={ + 'action': { + 'action_type': 'PRESS', + 'parameters': {'key': 'enter'} + } + }, + thought='Pressing Enter to confirm' + ) + observation = runtime.run_action(action) + print(f" Result: {observation.content}") + print(f" Exit code: {observation.exit_code}") + print("✓ Actions executed successfully") + print() + + # 8. Get VM information + print("8. Getting VM information...") + print(" a. Getting VM platform...") + action = OSWorldInteractiveAction( + method='get_vm_platform', + params={}, + thought='Getting the operating system platform' + ) + observation = runtime.run_action(action) + print(f" Platform: {observation.content}") + print(" b. Getting screen size...") + action = OSWorldInteractiveAction( + method='get_vm_screen_size', + params={}, + thought='Getting the screen dimensions' + ) + observation = runtime.run_action(action) + print(f" Screen size: {observation.content}") + print("✓ VM information retrieved") + print() + + # 9. Test advanced OSWorld methods + print("9. 
Testing advanced OSWorld methods...") + print(" a. Getting accessibility tree...") + action = OSWorldInteractiveAction( + method='get_accessibility_tree', + params={}, + thought='Getting UI accessibility tree for element inspection' + ) + observation = runtime.run_action(action) + axltree = observation.content[0] if isinstance(observation.content, (list, tuple)) and observation.content else observation.content + if axltree and len(axltree) > 0: + print(f" Accessibility tree retrieved ({len(axltree)} chars)") + print(f" Preview: {axltree[:200]}...") + with open(f'/tmp/{PREFIX}_accessibility_tree.xml', 'w') as f: + f.write(axltree) + print(f" Accessibility tree saved to /tmp/{PREFIX}_accessibility_tree.xml") + else: + print(" Note: Accessibility tree not available or empty") + await asyncio.sleep(1) + + print(" b. Getting terminal output...") + action = OSWorldInteractiveAction( + method='get_terminal_output', + params={}, + thought='Getting terminal output from the VM' + ) + observation = runtime.run_action(action) + if observation.content and len(observation.content) > 0: + print(f" Terminal output retrieved ({len(observation.content)} chars)") + print(f" Preview: {observation.content[:200]}...") + else: + print(" Note: No terminal output available") + await asyncio.sleep(1) + + print(" c. Executing Python command...") + action = OSWorldInteractiveAction( + method='execute_python_command', + params={ + 'command': "print('Hello from Python!'); import sys; print(f'Python version: {sys.version}')" + }, + thought='Running a simple Python command in the VM' + ) + observation = runtime.run_action(action) + print(f" Python output: {observation.content}") + print(f" Exit code: {observation.exit_code}") + await asyncio.sleep(1) + + print(" d. 
Running Python script...") + python_script = """ +import os +import platform + +print(f"Hostname: {platform.node()}") +print(f"Python: {platform.python_version()}") +print(f"OS: {platform.system()} {platform.release()}") +print(f"Current directory: {os.getcwd()}") +print(f"Home directory: {os.path.expanduser('~')}") +""" + action = OSWorldInteractiveAction( + method='run_python_script', + params={'script': python_script}, + thought='Running a multi-line Python script to get system info' + ) + observation = runtime.run_action(action) + print(f" Script output:") + for line in (observation.content or "").split('\n')[:10]: + if line.strip(): + print(f" {line}") + print(f" Exit code: {observation.exit_code}") + await asyncio.sleep(1) + + print(" e. Running bash script...") + bash_script = """echo "Hello from Bash!" +echo "Current user: $(whoami)" +echo "Current directory: $(pwd)" +echo "Date: $(date)" +""" + action = OSWorldInteractiveAction( + method='run_bash_script', + params={'script': bash_script, 'timeout': 30}, + thought='Running a simple bash script' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" ⚠ Bash script error: {observation.content}") + else: + print(f" Bash output:") + for line in (observation.content or "").split('\n'): + if line.strip(): + print(f" {line}") + if hasattr(observation, 'exit_code'): + print(f" Exit code: {observation.exit_code}") + print("✓ Advanced methods tested") + print() + + # 10. Test file download with get_file + print("10. Testing file download (get_file)...") + print(" a. 
Creating test file in VM...") + test_content = "Hello from OSWorld VM!\nThis is a test file.\nCreated at: $(date)" + action = OSWorldInteractiveAction( + method='run_bash_script', + params={ + 'script': f'echo "{test_content}" > /tmp/test_file.txt && cat /tmp/test_file.txt', + 'timeout': 10 + }, + thought='Creating a test file for download' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" ⚠ Could not create test file: {observation.content}") + else: + print(f" Test file created") + await asyncio.sleep(1) + print(" b. Downloading test file using get_file...") + action = OSWorldInteractiveAction( + method='get_file', + params={'file_path': '/tmp/test_file.txt'}, + thought='Downloading test file from VM' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" ⚠ Failed to download: {observation.content}") + else: + try: + import base64 + if observation.content.startswith('base64:'): + content_b64 = observation.content[7:] + file_data = base64.b64decode(content_b64) + download_path = f'/tmp/{PREFIX}_downloaded_file.txt' + with open(download_path, 'wb') as f: + f.write(file_data) + print(f" ✓ File downloaded to {download_path} ({len(file_data)} bytes)") + print(f" Content preview: {file_data.decode('utf-8')[:100]}") + else: + print(f" Unexpected format: {observation.content[:100]}") + except Exception as e: + print(f" Could not save file: {e}") + print("✓ File download tested") + print() + + # 11. Test VM information methods + print("11. Testing VM information methods...") + print(" a. 
Getting VM window size...") + action = OSWorldInteractiveAction( + method='get_vm_window_size', + params={'app_class_name': 'gnome-terminal-server'}, + thought='Getting window size for a specific application' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" Note: {observation.content}") + else: + print(f" Window size: {observation.content}") + await asyncio.sleep(1) + print(" b. Getting VM wallpaper...") + action = OSWorldInteractiveAction( + method='get_vm_wallpaper', + params={}, + thought='Getting the desktop wallpaper image' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" Note: {observation.content}") + else: + try: + import base64 + if observation.content.startswith('base64:'): + content_b64 = observation.content[7:] + wallpaper_data = base64.b64decode(content_b64) + wallpaper_path = f'/tmp/{PREFIX}_wallpaper.png' + with open(wallpaper_path, 'wb') as f: + f.write(wallpaper_data) + print(f" ✓ Wallpaper saved to {wallpaper_path} ({len(wallpaper_data)} bytes)") + else: + print(f" Unexpected format: {observation.content[:100]}") + except Exception as e: + print(f" Could not save wallpaper: {e}") + await asyncio.sleep(1) + print(" c. Getting VM desktop path...") + action = OSWorldInteractiveAction( + method='get_vm_desktop_path', + params={}, + thought='Getting the desktop directory path' + ) + observation = runtime.run_action(action) + desktop_path = observation.content + print(f" Desktop path: {desktop_path}") + await asyncio.sleep(1) + print(" d. 
Getting VM directory tree...") + action = OSWorldInteractiveAction( + method='get_vm_directory_tree', + params={'path': desktop_path if desktop_path else '/home'}, + thought='Listing directory contents' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" Error: {observation.content}") + else: + print(f" Directory tree:") + for i, line in enumerate((observation.content or "").split('\n')[:10]): + if line.strip(): + print(f" {line}") + lines = (observation.content or "").split('\n') + if len(lines) > 10: + print(f" ... ({len(lines) - 10} more lines)") + print("✓ VM information methods tested") + print() + + # 12. Test screen recording + print("12. Testing screen recording...") + print(" a. Starting screen recording...") + action = OSWorldInteractiveAction( + method='start_recording', + params={}, + thought='Starting to record the VM screen' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" ⚠ Recording not available: {observation.content}") + else: + print(f" Recording started: {observation.content}") + print(" b. Recording for 3 seconds...") + await asyncio.sleep(3) + print(" c. Performing actions while recording...") + action = OSWorldInteractiveAction( + method='execute_action', + params={ + 'action': { + 'action_type': 'MOVE_TO', + 'parameters': {'x': 100, 'y': 100} + } + }, + thought='Moving mouse to top-left during recording' + ) + runtime.run_action(action) + await asyncio.sleep(1) + print(" d. Moving to center...") + action = OSWorldInteractiveAction( + method='execute_action', + params={ + 'action': { + 'action_type': 'MOVE_TO', + 'parameters': {'x': 512, 'y': 384} + } + }, + thought='Moving mouse to center during recording' + ) + runtime.run_action(action) + await asyncio.sleep(1) + print(" e. 
Clicking at center...") + action = OSWorldInteractiveAction( + method='execute_action', + params={ + 'action': { + 'action_type': 'CLICK', + 'parameters': {'x': 512, 'y': 384} + } + }, + thought='Clicking at center during recording' + ) + runtime.run_action(action) + await asyncio.sleep(1) + print(" f. Stopping recording and downloading...") + action = OSWorldInteractiveAction( + method='end_recording', + params={}, + thought='Stopping the screen recording' + ) + observation = runtime.run_action(action) + if isinstance(observation, ErrorObservation): + print(f" ⚠ Failed to stop recording: {observation.content}") + else: + try: + import base64 + if observation.content.startswith('base64:'): + content_b64 = observation.content[7:] + video_data = base64.b64decode(content_b64) + video_path = f'/tmp/{PREFIX}_recording.mp4' + with open(video_path, 'wb') as f: + f.write(video_data) + print(f" ✓ Recording saved to {video_path} ({len(video_data)} bytes)") + else: + print(f" Unexpected format: {observation.content[:100]}") + except Exception as e: + print(f" Could not save recording: {e}") + print("✓ Screen recording tested") + print() + + # 13. Final screenshot + print("13. Taking final screenshot...") + action = OSWorldInteractiveAction( + method='get_screenshot', + params={}, + thought='Taking final screenshot after interactions' + ) + runtime.run_action(action) + screenshot = runtime.get_vm_screenshot() + if screenshot: + screenshot_path = f'/tmp/{PREFIX}_screenshot_after.png' + with open(screenshot_path, 'wb') as f: + f.write(screenshot) + print(f"✓ Final screenshot saved to {screenshot_path}") + await asyncio.sleep(2) + print() + print("=" * 80) + print("Example completed successfully!") + print("=" * 80) + print() + print("You can:") + print(f"1. View screenshots: open /tmp/{PREFIX}_screenshot*.png") + print(f"2. View wallpaper: open /tmp/{PREFIX}_wallpaper.png") + print(f"3. View recording: vlc /tmp/{PREFIX}_recording.mp4") + print(f"4. 
View downloaded file: cat /tmp/{PREFIX}_downloaded_file.txt") + print() + print("VM Service URLs (NVCF):") + print(f" • OSWorld API: {runtime.osworld_vm_url}") + print(f" • VNC: {runtime.vnc_url}") + print(f" • Chrome DevTools: {runtime.chromium_devtools_url}") + print(f" • VLC Web Interface: {runtime.vlc_url}") + print() + return 0 + + except Exception as e: + print(f"✗ Error: {e}") + logger.exception("Failed to run NVCF example") + return 1 + finally: + print("14. Cleaning up...") + runtime.close() + print("✓ Runtime closed") + print() + + +if __name__ == '__main__': + exit_code = asyncio.run(main()) + sys.exit(exit_code) diff --git a/examples/setup.py b/examples/setup.py index a93297530..1a23c0f20 100644 --- a/examples/setup.py +++ b/examples/setup.py @@ -35,7 +35,7 @@ FILE_PATH = os.path.dirname(os.path.abspath(__file__)) -MAX_RETRIES = 20 +MAX_RETRIES = 5 from openhands.nvidia.os_world import metrics, getters @@ -102,7 +102,7 @@ def normalize_url(url): return norm_url1 == norm_url2 class SetupController: - def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080, runtime=None): + def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080, runtime=None, http_client=None): self.vm_ip: str = vm_ip self.server_port: int = server_port self.chromium_port: int = chromium_port @@ -114,8 +114,72 @@ def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 922 self.screen_width: int = screen_width self.screen_height: int = screen_height self.runtime = runtime # Runtime object for interacting with the environment + self.http_client = http_client # Optional HTTP client (e.g. 
NVCFHttpClient) for routing VM requests self.additional_wait_time = 3 + def _vm_post(self, endpoint: str, **kwargs) -> requests.Response: + """POST to the VM server, routing through http_client if available.""" + if self.http_client: + return self.http_client.post(endpoint, **kwargs) + url = self.http_server + endpoint + return requests.post(url, **kwargs) + + def _vm_get(self, endpoint: str, **kwargs) -> requests.Response: + """GET from the VM server, routing through http_client if available.""" + if self.http_client: + return self.http_client.get(endpoint, **kwargs) + url = self.http_server + endpoint + return requests.get(url, **kwargs) + + def _get_cdp_url(self) -> str: + """Get the Chrome DevTools Protocol URL. + + Always uses the local address since: + - Singularity: vm_ip=127.0.0.1, chromium_port=actual Chrome port + - NVCF: vm_ip=127.0.0.1, chromium_port=local proxy port (proxy adds auth headers) + Playwright's connect_over_cdp() can't send custom auth headers on WebSocket, + so we must go through the local proxy for NVCF rather than direct to the NVCF WSS URL. + """ + return f"http://{self.vm_ip}:{self.chromium_port}" + + def _get_cdp_headers(self) -> Optional[Dict[str, str]]: + """Get CDP headers (for NVCF auth), or None for direct connections.""" + if self.http_client and hasattr(self.http_client, 'get_cdp_headers'): + return self.http_client.get_cdp_headers() + return None + + def _restart_vm_services(self, services: str = "chrome"): + """Restart crashed services inside the VM via bash script. + + Args: + services: Which services to restart. "chrome" restarts Chrome + socat. 
+ """ + if services == "chrome": + script = """ +# Restart Chrome and socat (CDP proxy) +pkill -9 -f socat || true +pkill -9 -f 'chrome' || true +sleep 2 + +# Re-launch socat to proxy Chrome DevTools (port 9222 -> NVCF /chrome path) +nohup socat TCP-LISTEN:9222,fork,reuseaddr TCP:127.0.0.1:9223 &>/dev/null & + +# Re-launch Chrome +nohup google-chrome-wrapper --remote-debugging-port=9223 --remote-debugging-address=127.0.0.1 --remote-allow-origins=* --no-first-run --no-default-browser-check --disable-infobars --disable-session-crashed-bubble --disable-features=TranslateUI --start-maximized &>/dev/null & +sleep 3 +""" + else: + return + + try: + r = self._vm_post("/run_bash_script", json={"script": script, "timeout": 30}, timeout=60) + if r.status_code == 200: + logger.info(f"VM service restart ({services}) completed successfully") + else: + logger.warning(f"VM service restart ({services}) returned HTTP {r.status_code}") + except Exception as e: + logger.warning(f"VM service restart ({services}) failed: {e}") + def reset_cache_dir(self, cache_dir: str): self.cache_dir = cache_dir @@ -323,8 +387,8 @@ def _upload_file_setup(self, files: List[Dict[str, str]]): logger.debug(form.content_type) # Explicit connect/read timeout to avoid hanging forever - response = requests.post( - self.http_server + "/setup" + "/upload", + response = self._vm_post( + "/setup/upload", headers=headers, data=form, timeout=(10, 600) @@ -366,7 +430,7 @@ def _change_wallpaper_setup(self, path: str): # send request to server to change wallpaper # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/change_wallpaper", headers=headers, data=payload) + response = self._vm_post("/setup/change_wallpaper", headers=headers, data=payload) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -391,16 +455,28 @@ def _open_setup(self, path: str): # send request to server to 
open file # Note: This uses a custom /setup endpoint, not a standard OSWorld method - try: - # The server-side call is now blocking and can take time. - # We set a timeout that is slightly longer than the server's timeout (1800s). - response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810) - response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes - logger.info("Command executed successfully: %s", response.text) - time.sleep(self.additional_wait_time) - except requests.exceptions.RequestException as e: - logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") - raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e + max_retries = 5 + for attempt in range(max_retries): + try: + # The server-side call is now blocking and can take time. + # We set a timeout that is slightly longer than the server's timeout (1800s). + response = self._vm_post("/setup/open_file", headers=headers, data=payload, timeout=1810) + response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes + logger.info("Command executed successfully: %s", response.text) + time.sleep(self.additional_wait_time) + return # Success + except requests.exceptions.RequestException as e: + status = getattr(getattr(e, 'response', None), 'status_code', None) + if status in (502, 503, 504) and attempt < max_retries - 1: + wait_time = 20 * (attempt + 1) + logger.warning( + f"open_file attempt {attempt + 1}/{max_retries} failed for '{path}' " + f"(HTTP {status}). Retrying in {wait_time}s..." + ) + time.sleep(wait_time) + continue + logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") + raise Exception(f"Failed to open file '{path}'. 
An error occurred while trying to send the request or the server responded with an error: {e}") from e def _ensure_launch_command_finish(self, command): if isinstance(command, list): @@ -532,13 +608,18 @@ def _launch_setup(self, command: Union[str, List[str]], shell: bool = False): logger.warning("Command should be a list of strings. Now it is a string. Will split it by space.") command = command.split() + # For NVCF, rewrite launch commands (e.g. Chrome flags differ on cloud VMs) + if self.http_client and hasattr(self.http_client, 'update_launch_command'): + command = self.http_client.update_launch_command(command) + payload = json.dumps({"command": command, "shell": shell}) headers = {"Content-Type": "application/json"} # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - logger.info("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/launch") - response = requests.post(self.http_server + "/setup" + "/launch", headers=headers, data=payload) + target = "NVCF" if self.http_client else (self.http_server + "/setup/launch") + logger.info("REQUEST ADDRESS: %s", target) + response = self._vm_post("/setup/launch", headers=headers, data=payload, timeout=300) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -611,7 +692,7 @@ def replace_screen_env_in_command(command): # Execute using runtime while not terminates: try: - response = requests.post(self.http_server + "/setup" + "/execute", headers=headers, data=payload) + response = self._vm_post("/setup/execute", headers=headers, data=payload, timeout=300) if response.status_code == 200: results: Dict[str, str] = response.json() if stdout: @@ -683,7 +764,7 @@ def _execute_with_verification_setup( # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/execute_with_verification", + response = self._vm_post("/setup/execute_with_verification", headers=headers, 
data=payload, timeout=max_wait_time + 10) if response.status_code == 200: result = response.json() @@ -734,7 +815,7 @@ def _activate_window_setup(self, window_name: str, strict: bool = False, by_clas # send request to server to open file # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload) + response = self._vm_post("/setup/activate_window", headers=headers, data=payload) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -758,7 +839,7 @@ def _close_window_setup(self, window_name: str, strict: bool = False, by_class: # send request to server to open file # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/close_window", headers=headers, data=payload) + response = self._vm_post("/setup/close_window", headers=headers, data=payload) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -771,13 +852,45 @@ async def _chrome_open_tabs_setup(self, urls_to_open: List[str]): if not self.runtime: raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") - host = self.vm_ip - port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file + # Pre-validate: check if Chrome DevTools is reachable via NVCF before Playwright retries + if self.http_client and hasattr(self.http_client, 'get_cdp_headers'): + max_pre_checks = 8 + r = None + for pre_attempt in range(max_pre_checks): + try: + cdp_headers = self.http_client.get_cdp_headers() + r = requests.get( + "https://grpc.nvcf.nvidia.com/chrome/json/version", + headers=cdp_headers, + timeout=10.0, + ) + if r.status_code == 200: + logger.info("Chrome pre-check passed (HTTP 200)") + break + logger.warning(f"Chrome pre-check attempt {pre_attempt+1}/{max_pre_checks}: HTTP {r.status_code}") + except Exception as e: + logger.warning(f"Chrome pre-check attempt {pre_attempt+1}/{max_pre_checks}: {e}") + + # After 3 consecutive failures, try restarting Chrome/socat inside the VM + if pre_attempt == 2: + logger.warning("Chrome pre-check failed 3 times, attempting VM-side Chrome restart...") + self._restart_vm_services("chrome") - remote_debugging_url = f"http://{host}:{port}" + if pre_attempt < max_pre_checks - 1: + wait_time = 10 * (pre_attempt + 1) + logger.info(f"Waiting {wait_time}s before next Chrome pre-check...") + time.sleep(wait_time) + else: + raise Exception( + f"Chrome DevTools unreachable after {max_pre_checks} pre-check attempts " + f"(last status: {getattr(r, 'status_code', 'N/A')}). " + f"Chrome or socat likely crashed inside the VM." 
+ ) + + remote_debugging_url = self._get_cdp_url() logger.info("Connect to Chrome @: %s", remote_debugging_url) logger.debug("PLAYWRIGHT ENV: %s", repr(os.environ)) - for attempt in range(15): + for attempt in range(5): if attempt > 0: time.sleep(5) @@ -787,12 +900,11 @@ async def _chrome_open_tabs_setup(self, urls_to_open: List[str]): browser = await p.chromium.connect_over_cdp(remote_debugging_url) # break except Exception as e: - if attempt < 14: + if attempt < 4: logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}") - # time.sleep(10) continue else: - logger.error(f"Failed to connect after multiple attempts: {e}") + logger.error(f"Failed to connect after 5 attempts: {e}") raise e if not browser: @@ -825,22 +937,19 @@ async def _chrome_close_tabs_setup(self, urls_to_close: List[str]): time.sleep(5) # Wait for Chrome to finish launching - host = self.vm_ip - port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file - - remote_debugging_url = f"http://{host}:{port}" + remote_debugging_url = self._get_cdp_url() async with async_playwright() as p: browser = None - for attempt in range(15): + for attempt in range(5): try: browser = await p.chromium.connect_over_cdp(remote_debugging_url) break except Exception as e: - if attempt < 14: + if attempt < 4: logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}") time.sleep(5) else: - logger.error(f"Failed to connect after multiple attempts: {e}") + logger.error(f"Failed to connect after 5 attempts: {e}") raise e if not browser: @@ -962,22 +1071,19 @@ async def _login_setup(self, **config): if not self.runtime: raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") - host = self.vm_ip - port = self.chromium_port - - remote_debugging_url = f"http://{host}:{port}" + remote_debugging_url = self._get_cdp_url() async with async_playwright() as p: browser = None - for attempt in range(15): + for attempt in range(5): try: browser = await p.chromium.connect_over_cdp(remote_debugging_url) break except Exception as e: - if attempt < 14: + if attempt < 4: logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}") time.sleep(5) else: - logger.error(f"Failed to connect after multiple attempts: {e}") + logger.error(f"Failed to connect after 5 attempts: {e}") raise e if not browser: return @@ -1025,7 +1131,7 @@ def execute_python_command(self, command: str): for _ in range(3): try: - response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'}, + response = self._vm_post("/execute", headers={'Content-Type': 'application/json'}, data=payload, timeout=90) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) @@ -1139,7 +1245,7 @@ def _update_browse_history_setup(self, **config): # send request to server to upload file try: logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload") - response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form) + response = self._vm_post("/setup/upload", headers=headers, data=form) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: diff --git a/openhands/agenthub/gui_agent/function_calling.py b/openhands/agenthub/gui_agent/function_calling.py index 028092901..d3d2312bd 100644 --- a/openhands/agenthub/gui_agent/function_calling.py +++ b/openhands/agenthub/gui_agent/function_calling.py @@ -70,6 +70,7 @@ def response_to_actions( mcp_tool_names: list[str] | None = None, timeout: float | None = None, ) -> Action: + actions: list[Action] = [] assert len(response.choices) == 1, 
'Only one choice is supported for now' choice = response.choices[0] assistant_msg = choice.message @@ -84,247 +85,250 @@ def response_to_actions( thought += msg['text'] # Process each tool call to OpenHands action - tool_call = assistant_msg.tool_calls[0] - action: Action - logger.debug(f'Tool call in function_calling.py: {tool_call}') - try: - arguments = json.loads(tool_call.function.arguments) - except json.decoder.JSONDecodeError as e: - raise FunctionCallValidationError( - f'Failed to parse tool call arguments: {tool_call.function.arguments}' - ) from e + for tool_call in assistant_msg.tool_calls: + action: Action + logger.debug(f'Tool call in function_calling.py: {tool_call}') + try: + arguments = json.loads(tool_call.function.arguments) + except json.decoder.JSONDecodeError as e: + raise FunctionCallValidationError( + f'Failed to parse tool call arguments: {tool_call.function.arguments}' + ) from e - # ================================================ - # ClickTool - # ================================================ + # ================================================ + # ClickTool + # ================================================ - if tool_call.function.name == ClickTool['function']['name']: - arguments = validate_arguments(arguments, tool_call.function.name) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'CLICK', - 'parameters': arguments, - } - }, - ) - # ================================================ - # RightClickTool - # ================================================ - elif tool_call.function.name == RightClickTool['function']['name']: - arguments = validate_arguments(arguments, tool_call.function.name) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'RIGHT_CLICK', - 'parameters': arguments, - } - }, - ) - # ================================================ - # MiddleClickTool - # 
================================================ - elif tool_call.function.name == MiddleClickTool['function']['name']: - arguments = validate_arguments(arguments, tool_call.function.name) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'MIDDLE_CLICK', - 'parameters': arguments, - } - }, - ) - # ================================================ - # DoubleClickTool - # ================================================ - elif tool_call.function.name == DoubleClickTool['function']['name']: - arguments = validate_arguments(arguments, tool_call.function.name) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'DOUBLE_CLICK', - 'parameters': arguments, - } - }, - ) - # ================================================ - # TripleClickTool - # ================================================ - elif tool_call.function.name == TripleClickTool['function']['name']: - arguments = validate_arguments(arguments, tool_call.function.name) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'TRIPLE_CLICK', - 'parameters': arguments, - } - }, - ) - # ================================================ - # MoveToTool - # ================================================ - elif tool_call.function.name == MoveToTool['function']['name']: - arguments = validate_arguments(arguments, tool_call.function.name) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'MOVE_TO', - 'parameters': arguments, - } - }, - ) - # ================================================ - # DragToTool - # ================================================ - elif tool_call.function.name == DragToTool['function']['name']: - arguments = validate_arguments(arguments, tool_call.function.name) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 
'action': { - 'action_type': 'DRAG_TO', - 'parameters': arguments, - } - }, - ) - # ================================================ - # ScrollTool - # ================================================ - elif tool_call.function.name == ScrollTool['function']['name']: - if 'amount' not in arguments: - raise FunctionCallValidationError( - f'Missing required argument "amount" in tool call {tool_call.function.name}' + if tool_call.function.name == ClickTool['function']['name']: + arguments = validate_arguments(arguments, tool_call.function.name) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'CLICK', + 'parameters': arguments, + } + }, ) - # Map vertical scroll amount to dy; dx defaults to 0 - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'SCROLL', - 'parameters': arguments, - } - }, - ) - # ================================================ - # HorizontalScrollTool - # ================================================ - elif tool_call.function.name == HorizontalScrollTool['function']['name']: - if 'amount' not in arguments: - raise FunctionCallValidationError( - f'Missing required argument "amount" in tool call {tool_call.function.name}' + # ================================================ + # RightClickTool + # ================================================ + elif tool_call.function.name == RightClickTool['function']['name']: + arguments = validate_arguments(arguments, tool_call.function.name) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'RIGHT_CLICK', + 'parameters': arguments, + } + }, ) - # Map vertical scroll amount to dy; dx defaults to 0 - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'SCROLL', - 'parameters': arguments, - } - }, - ) - # ================================================ - # WriteTool 
- # ================================================ - elif tool_call.function.name == WriteTool['function']['name']: - if 'text' not in arguments: - raise FunctionCallValidationError( - f'Missing required argument "text" in tool call {tool_call.function.name}' + # ================================================ + # MiddleClickTool + # ================================================ + elif tool_call.function.name == MiddleClickTool['function']['name']: + arguments = validate_arguments(arguments, tool_call.function.name) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'MIDDLE_CLICK', + 'parameters': arguments, + } + }, ) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'TYPING', - 'parameters': arguments, - } - }, - ) - # ================================================ - # PressTool - # ================================================ - elif tool_call.function.name == PressTool['function']['name']: - if 'key' not in arguments: - raise FunctionCallValidationError( - f'Missing required argument "key" in tool call {tool_call.function.name}' + # ================================================ + # DoubleClickTool + # ================================================ + elif tool_call.function.name == DoubleClickTool['function']['name']: + arguments = validate_arguments(arguments, tool_call.function.name) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'DOUBLE_CLICK', + 'parameters': arguments, + } + }, ) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'PRESS', - 'parameters': arguments, - } - }, - ) - # ================================================ - # HotkeyTool - # ================================================ - elif tool_call.function.name == HotkeyTool['function']['name']: - if 'keys' not in 
arguments: - raise FunctionCallValidationError( - f'Missing required argument "keys" in tool call {tool_call.function.name}' + # ================================================ + # TripleClickTool + # ================================================ + elif tool_call.function.name == TripleClickTool['function']['name']: + arguments = validate_arguments(arguments, tool_call.function.name) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'TRIPLE_CLICK', + 'parameters': arguments, + } + }, ) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'HOTKEY', - 'parameters': arguments, - } - }, - ) - # ================================================ - # FailTool - # ================================================ - elif tool_call.function.name == FailTool['function']['name']: - action = AgentFinishAction( - task_completed='false', - ) - # ================================================ - # FinishTool - # ================================================ - elif tool_call.function.name == FinishTool['function']['name']: - action = AgentFinishAction( - task_completed='true', - ) - # ================================================ - # WaitTool - # ================================================ - elif tool_call.function.name == WaitTool['function']['name']: - if 'seconds' not in arguments: - raise FunctionCallValidationError( - f'Missing required argument "seconds" in tool call {tool_call.function.name}' + # ================================================ + # MoveToTool + # ================================================ + elif tool_call.function.name == MoveToTool['function']['name']: + arguments = validate_arguments(arguments, tool_call.function.name) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'MOVE_TO', + 'parameters': arguments, + } + }, + ) + # 
================================================ + # DragToTool + # ================================================ + elif tool_call.function.name == DragToTool['function']['name']: + arguments = validate_arguments(arguments, tool_call.function.name) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'DRAG_TO', + 'parameters': arguments, + } + }, + ) + # ================================================ + # ScrollTool + # ================================================ + elif tool_call.function.name == ScrollTool['function']['name']: + if 'amount' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "amount" in tool call {tool_call.function.name}' + ) + # Map vertical scroll amount to dy; dx defaults to 0 + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'SCROLL', + 'parameters': arguments, + } + }, + ) + # ================================================ + # HorizontalScrollTool + # ================================================ + elif tool_call.function.name == HorizontalScrollTool['function']['name']: + if 'amount' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "amount" in tool call {tool_call.function.name}' + ) + # Map vertical scroll amount to dy; dx defaults to 0 + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'SCROLL', + 'parameters': arguments, + } + }, + ) + # ================================================ + # WriteTool + # ================================================ + elif tool_call.function.name == WriteTool['function']['name']: + if 'text' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "text" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 
'TYPING', + 'parameters': arguments, + } + }, + ) + # ================================================ + # PressTool + # ================================================ + elif tool_call.function.name == PressTool['function']['name']: + if 'key' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "key" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'PRESS', + 'parameters': arguments, + } + }, + ) + # ================================================ + # HotkeyTool + # ================================================ + elif tool_call.function.name == HotkeyTool['function']['name']: + if 'keys' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "keys" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'HOTKEY', + 'parameters': arguments, + } + }, + ) + # ================================================ + # FailTool + # ================================================ + elif tool_call.function.name == FailTool['function']['name']: + action = AgentFinishAction( + task_completed='false', + ) + # ================================================ + # FinishTool + # ================================================ + elif tool_call.function.name == FinishTool['function']['name']: + action = AgentFinishAction( + task_completed='true', + ) + # ================================================ + # WaitTool + # ================================================ + elif tool_call.function.name == WaitTool['function']['name']: + if 'seconds' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "seconds" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'WAIT', + 
'parameters': arguments, + } + }, + ) + else: + raise FunctionCallNotExistsError( + f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.' ) - action = OSWorldInteractiveAction( - method='execute_agentic_action', - params={ - 'action': { - 'action_type': 'WAIT', - 'parameters': arguments, - } - }, - ) - else: - raise FunctionCallNotExistsError( - f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.' - ) - - action = combine_thought(action, thought) - # Add metadata for tool calling - action.tool_call_metadata = ToolCallMetadata( - tool_call_id=tool_call.id, - function_name=tool_call.function.name, - model_response=response, - total_calls_in_response=len(assistant_msg.tool_calls), - ) + + action = combine_thought(action, thought) + # Add metadata for tool calling + action.tool_call_metadata = ToolCallMetadata( + tool_call_id=tool_call.id, + function_name=tool_call.function.name, + model_response=response, + total_calls_in_response=len(assistant_msg.tool_calls), + ) + if timeout: + action.set_hard_timeout(timeout) + actions.append(action) else: action = MessageAction( content=str(assistant_msg.content) if assistant_msg.content else '', @@ -336,12 +340,14 @@ def response_to_actions( model_response=response, total_calls_in_response=0, ) + actions.append(action) # Add response id to actions # This will ensure we can match both actions without tool calls (e.g. MessageAction) # and actions with tool calls (e.g. CmdRunAction, IPythonRunCellAction, etc.) 
# with the token usage data - action.response_id = response.id - if timeout: - action.set_hard_timeout(timeout) - return action + for action in actions: + action.response_id = response.id + + assert len(actions) >= 1 + return actions diff --git a/openhands/agenthub/gui_agent/osworld_agent.py b/openhands/agenthub/gui_agent/osworld_agent.py index 6d474f0b8..3c1fddb28 100644 --- a/openhands/agenthub/gui_agent/osworld_agent.py +++ b/openhands/agenthub/gui_agent/osworld_agent.py @@ -1,5 +1,6 @@ import os from jinja2 import Template +from collections import deque from openhands.agenthub.gui_agent.tools import OSWORLD_TOOLS @@ -210,6 +211,8 @@ def __init__( self.llm.model_info['supports_vision'] = True else: self.llm.model_info = {'supports_vision': True} + + self.pending_actions: deque['Action'] = deque() self.reset() @@ -218,6 +221,7 @@ def reset(self) -> None: super().reset() self.cost_accumulator = 0 self.error_accumulator = 0 + self.pending_actions.clear() def _get_initial_user_message(self, history: list[Event]) -> MessageAction: """Get the initial user message from the conversation history. 
@@ -272,6 +276,9 @@ def _get_messages( include_a11y_tree = self.config.enable_a11y_tree total_screenshot_count = 0 + llm_response_ids_action = set() + llm_response_ids_observation = set() + # Build history prompts (alternating assistant/user messages) in reverse order for event in reversed(events): include_screenshot = self.config.enable_vision @@ -279,16 +286,24 @@ def _get_messages( include_screenshot = False if isinstance(event, OSWorldInteractiveAction): + llm_response_id = event.tool_call_metadata.model_response.id + if llm_response_id in llm_response_ids_action: + continue + llm_response_ids_action.add(llm_response_id) messages.append(convert_action_to_message(event)) elif isinstance(event, MessageAction): messages.append(convert_message_action_to_message( event, include_a11y_tree=include_a11y_tree, include_screenshot=include_screenshot)) total_screenshot_count += 1 elif isinstance(event, OSWorldOutputObservation) or isinstance(event, ErrorObservation): + llm_response_id = event.tool_call_metadata.model_response.id + if llm_response_id in llm_response_ids_observation: + continue msg = convert_observation_to_message( event, instruction, include_a11y_tree=include_a11y_tree, include_screenshot=include_screenshot) messages.append(msg) total_screenshot_count += 1 + llm_response_ids_observation.add(llm_response_id) # System message messages.append(Message(role='system', content=[TextContent(text=self.system_prompt)])) @@ -308,6 +323,9 @@ def step(self, state: State) -> Action: - MessageAction(content) - Message action to run (e.g. 
ask for clarification) - AgentFinishAction() - end the interaction """ + # Continue with pending actions if any + if self.pending_actions: + return self.pending_actions.popleft() format_error = state.get_last_agent_format_error() if format_error and isinstance(format_error, str): @@ -332,14 +350,20 @@ def step(self, state: State) -> Action: } params['tools'] = self.tools params['extra_body'] = {'metadata': state.to_llm_metadata(agent_name=self.name)} + for msg in params['messages']: + if msg.get('role') == 'tool': + msg['role'] = 'user' response = self.llm.completion(**params) logger.debug(f'Response from LLM: {response}') - action = codeact_function_calling.response_to_actions(response, timeout=self.config.action_timeout) - if self.pause_time > 0.5: - logger.info(f'Setting pause time to {self.pause_time} seconds for agentic action') - action.pause_time = self.pause_time - logger.debug(f'Actions after response_to_actions: {action}') - return action + actions = codeact_function_calling.response_to_actions(response, timeout=self.config.action_timeout) + logger.debug(f'Actions after response_to_actions: {actions}') + for action in actions: + if self.pause_time > 0.5: + logger.info(f'Setting pause time to {self.pause_time} seconds for agentic action') + action.pause_time = self.pause_time + self.pending_actions.append(action) + + return self.pending_actions.popleft() def _get_messages_from_agent_state( self, events: list[Event], initial_user_message: MessageAction, @@ -356,6 +380,7 @@ def _get_messages_from_agent_state( Returns: list[dict]: A list of formatted messages ready for LLM consumption """ + """ messages: list[Message] = [] # System message @@ -376,7 +401,42 @@ def _get_messages_from_agent_state( msg = convert_observation_to_message_full_state( event, instruction, include_a11y_tree=include_a11y_tree) messages.append(msg) + """ + messages: list[Message] = [] + + # Get instruction from initial user message + # User message is a MessageAction with content and 
image_urls, will be processed in events + instruction = get_instruction(initial_user_message) + include_a11y_tree = self.config.enable_a11y_tree + + llm_response_ids_action = set() + llm_response_ids_observation = set() + + # Build history prompts (alternating assistant/user messages) in reverse order + for event in reversed(events): + if isinstance(event, AgentFinishAction): + messages.append(convert_action_to_message(event)) + elif isinstance(event, OSWorldInteractiveAction): + llm_response_id = event.tool_call_metadata.model_response.id + if llm_response_id in llm_response_ids_action: + continue + llm_response_ids_action.add(llm_response_id) + messages.append(convert_action_to_message(event)) + elif isinstance(event, MessageAction): + messages.append(convert_message_action_to_message_full_state(event, include_a11y_tree=include_a11y_tree)) + elif isinstance(event, OSWorldOutputObservation) or isinstance(event, ErrorObservation): + llm_response_id = event.tool_call_metadata.model_response.id + if llm_response_id in llm_response_ids_observation: + continue + msg = convert_observation_to_message_full_state( + event, instruction, include_a11y_tree=include_a11y_tree) + messages.append(msg) + llm_response_ids_observation.add(llm_response_id) + + # System message + messages.append(Message(role='system', content=[TextContent(text=self.system_prompt)])) + messages = messages[::-1] # set flags to know how to serialize the messages for message in messages: message.cache_enabled = False diff --git a/openhands/agenthub/gui_agent/prompts/osworld.py b/openhands/agenthub/gui_agent/prompts/osworld.py index d738c6250..5cc3018ce 100644 --- a/openhands/agenthub/gui_agent/prompts/osworld.py +++ b/openhands/agenthub/gui_agent/prompts/osworld.py @@ -1,14 +1,10 @@ OSWORLD_OBSERVATION_FEEDBACK_PROMPT = """Action executed. Please generate the next move according to the UI screenshot and instruction. 
Instruction: {instruction} - -First describe the screenshot in detail, think step by step, then generate the next move. You need to at least make a tool call. """ ERROR_OBSERVATION_FEEDBACK_PROMPT = """Action failed. Please continue working on the task according to the instruction. Error message: {error_message} Instruction: {instruction} - -First describe the screenshot in detail, think step by step, then generate the next move. You need to at least make a tool call. """ \ No newline at end of file diff --git a/openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 b/openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 index 4a9feaa3d..ffa37d92a 100644 --- a/openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 +++ b/openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 @@ -40,57 +40,7 @@ In such cases, reassess and choose a different action. ### 4. Sudo Password -My computer’s password is **`{CLIENT_PASSWORD}`**. +My computer's password is **`{CLIENT_PASSWORD}`**. Use it whenever `sudo` rights are required. --- - -## Response Structure Requirements - -Each response **must** follow this structure: - ---- - -### Observation: -Provide a detailed description of the screenshot, including: -- Visible UI elements -- Buttons, menus, icons, windows, dialogs -- Pop-ups, notifications, warnings, errors -- Loading indicators or anything affecting progress - -Include **all relevant details**. - ---- - -### Thought: -Provide detailed reasoning before choosing an action. - -#### Step-by-Step Progress Assessment -- Summarize what has been accomplished so far. -- Identify unexpected outcomes or errors. -- If the previous action seems incorrect, explain how to recover. - -#### Next Action Analysis -- List possible next actions based on the current screen. -- Evaluate them considering the current state and previous actions. -- Select the **most logical** action. -- Anticipate the consequence of that action. 
- ---- - -### Action: -Output exactly **one** action using the allowed tools: -- Mouse actions -- Keyboard actions -- `wait()` -- `fail()` -- `finish()` - -Only **one** tool call per step. - ---- - -## Final Notes - -- Your decisions must be grounded **strictly in the screenshot**. -- Do not assume any UI that is not visible. \ No newline at end of file diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index dc541f64f..aa8f142ad 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -204,6 +204,7 @@ def __init__( kwargs: dict[str, Any] = { 'temperature': self.config.temperature, 'max_completion_tokens': self.config.max_output_tokens, + 'skip_special_tokens': False, } if self.config.top_k is not None: # openai doesn't expose top_k diff --git a/openhands/nvidia/async_server_osworld.py b/openhands/nvidia/async_server_osworld.py index 420148045..ebdf53e60 100644 --- a/openhands/nvidia/async_server_osworld.py +++ b/openhands/nvidia/async_server_osworld.py @@ -88,6 +88,11 @@ def __init__( 'No reward server IP provided. Evaluations would only work for swebench problems.' ) + # For sequential VM start-ups to mitigate boot storm + self._launch_lock = threading.Lock() + self._last_launch_time = 0 + self._launch_delay_seconds = 15.0 # Wait 15s between starts + def get_unique_id(self, instance, max_retries=10): base = f'{get_instance_id(instance)}_{instance["trajectory_id"]}' base_hash = hashlib.sha256(base.encode('utf-8')).hexdigest()[:16] @@ -396,6 +401,25 @@ async def _process_init(self, job_id: str, job_details, dataset_type: str, wid: if job_details.timer is None: raise RuntimeError('Timer is not initialized') + # Rate Limit Logic: Prevent Boot Storm + wait_time = 0.0 + with self._launch_lock: + now = time.time() + # The earliest this worker can start is either NOW, + # or 15s after the last scheduled launch. 
+ target_start_time = max(now, self._last_launch_time + self._launch_delay_seconds) + + wait_time = target_start_time - now + + # Reserve this slot by updating the global timestamp immediately + self._last_launch_time = target_start_time + + # Perform the wait asynchronously (outside the lock) + if wait_time > 0: + if wait_time > 1.0: + logger.info(f"Delayed boot-up: waiting {wait_time:.1f}s...") + await asyncio.sleep(wait_time) + with phase_context(job_details.timer, 'init'): init_coro = func(job_details=job_details, sid=job_id) runtime, metadata, config = await run_with_timeout_awareness( diff --git a/openhands/nvidia/os_world/controllers/python.py b/openhands/nvidia/os_world/controllers/python.py index 7844d7307..292dfb441 100644 --- a/openhands/nvidia/os_world/controllers/python.py +++ b/openhands/nvidia/os_world/controllers/python.py @@ -5,18 +5,34 @@ import traceback import requests from openhands.core.logger import openhands_logger as logger +from openhands.runtime.utils.osworld_http_client import OSWorldHttpClient KEYBOARD_KEYS = ['\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 
'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright'] class PythonController: def __init__(self, vm_ip: str, server_port: int, - pkgs_prefix: str = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}"): + pkgs_prefix: str = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}", + http_client: OSWorldHttpClient = None): self.vm_ip = vm_ip self.http_server = f"http://{vm_ip}:{server_port}" self.pkgs_prefix = pkgs_prefix # fixme: this is a hacky way to execute python commands. 
fix it and combine it with installation of packages self.retry_times = 3 self.retry_interval = 5 + # Use http_client if provided, otherwise fall back to direct requests + self.client = http_client + + def _get(self, endpoint: str, **kwargs) -> requests.Response: + """Make a GET request using client or direct requests.""" + if self.client: + return self.client.get(endpoint, **kwargs) + return requests.get(self.http_server + endpoint, **kwargs) + + def _post(self, endpoint: str, **kwargs) -> requests.Response: + """Make a POST request using client or direct requests.""" + if self.client: + return self.client.post(endpoint, **kwargs) + return requests.post(self.http_server + endpoint, **kwargs) @staticmethod def _is_valid_image_response(content_type: str, data: Optional[bytes]) -> bool: @@ -43,7 +59,7 @@ def get_screenshot(self) -> Optional[bytes]: for attempt_idx in range(self.retry_times): try: - response = requests.get(self.http_server + "/screenshot", timeout=10) + response = self._get("/screenshot", timeout=10) if response.status_code == 200: content_type = response.headers.get("Content-Type", "") content = response.content @@ -71,7 +87,7 @@ def get_accessibility_tree(self) -> Optional[str]: for _ in range(self.retry_times): try: - response: requests.Response = requests.get(self.http_server + "/accessibility") + response: requests.Response = self._get("/accessibility") if response.status_code == 200: logger.info("Got accessibility tree successfully") return response.json()["AT"] @@ -93,7 +109,7 @@ def get_terminal_output(self) -> Optional[str]: for _ in range(self.retry_times): try: - response = requests.get(self.http_server + "/terminal") + response = self._get("/terminal") if response.status_code == 200: logger.info("Got terminal output successfully") return response.json()["output"] @@ -115,7 +131,7 @@ def get_file(self, file_path: str) -> Optional[bytes]: for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/file", 
data={"file_path": file_path}) + response = self._post("/file", data={"file_path": file_path}) if response.status_code == 200: logger.info("File downloaded successfully") return response.content @@ -141,8 +157,8 @@ def execute_python_command(self, command: str) -> None: for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'}, - data=payload, timeout=90) + response = self._post("/execute", headers={'Content-Type': 'application/json'}, + data=payload, timeout=90) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) return response.json() @@ -167,8 +183,8 @@ def run_python_script(self, script: str) -> Optional[Dict[str, Any]]: for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/run_python", headers={'Content-Type': 'application/json'}, - data=payload, timeout=90) + response = self._post("/run_python", headers={'Content-Type': 'application/json'}, + data=payload, timeout=90) if response.status_code == 200: return response.json() else: @@ -200,8 +216,8 @@ def run_bash_script(self, script: str, timeout: int = 30, working_dir: Optional[ for _ in range(self.retry_times): try: - response = requests.post( - self.http_server + "/run_bash_script", + response = self._post( + "/run_bash_script", headers={'Content-Type': 'application/json'}, data=payload, timeout=timeout + 100 # Add buffer to HTTP timeout @@ -416,7 +432,7 @@ def start_recording(self): for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/start_recording") + response = self._post("/start_recording") if response.status_code == 200: logger.info("Recording started successfully") return @@ -437,7 +453,7 @@ def end_recording(self, dest: str): for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/end_recording") + response = self._post("/end_recording") if response.status_code == 200: 
logger.info("Recording stopped successfully") with open(dest, 'wb') as f: @@ -469,7 +485,7 @@ def get_vm_screen_size(self): for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/screen_size") + response = self._post("/screen_size") if response.status_code == 200: logger.info("Got screen size successfully") return response.json() @@ -491,7 +507,7 @@ def get_vm_window_size(self, app_class_name: str): for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/window_size", data={"app_class_name": app_class_name}) + response = self._post("/window_size", data={"app_class_name": app_class_name}) if response.status_code == 200: logger.info("Got window size successfully") return response.json() @@ -513,7 +529,7 @@ def get_vm_wallpaper(self): for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/wallpaper") + response = self._post("/wallpaper") if response.status_code == 200: logger.info("Got wallpaper successfully") return response.content @@ -535,7 +551,7 @@ def get_vm_desktop_path(self) -> Optional[str]: for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/desktop_path") + response = self._post("/desktop_path") if response.status_code == 200: logger.info("Got desktop path successfully") return response.json()["desktop_path"] @@ -558,7 +574,7 @@ def get_vm_directory_tree(self, path) -> Optional[Dict[str, Any]]: for _ in range(self.retry_times): try: - response = requests.post(self.http_server + "/list_directory", headers={'Content-Type': 'application/json'}, data=payload) + response = self._post("/list_directory", headers={'Content-Type': 'application/json'}, data=payload) if response.status_code == 200: logger.info("Got directory tree successfully") return response.json()["directory_tree"] diff --git a/openhands/nvidia/os_world/controllers/setup.py b/openhands/nvidia/os_world/controllers/setup.py index 1cd5353a4..ab85b2b75 100644 --- 
a/openhands/nvidia/os_world/controllers/setup.py +++ b/openhands/nvidia/os_world/controllers/setup.py @@ -21,13 +21,14 @@ from openhands.events.action.os import OSWorldInteractiveAction from openhands.core.logger import openhands_logger as logger from openhands.nvidia.os_world.metrics.utils import compare_urls +from openhands.runtime.utils.osworld_http_client import OSWorldHttpClient FILE_PATH = os.path.dirname(os.path.abspath(__file__)) MAX_RETRIES = 20 class SetupController: - def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080, runtime=None): + def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080, runtime=None, http_client: OSWorldHttpClient = None): self.vm_ip: str = vm_ip self.server_port: int = server_port self.chromium_port: int = chromium_port @@ -40,6 +41,20 @@ def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 922 self.screen_height: int = screen_height self.runtime = runtime # Runtime object for interacting with the environment self.additional_wait_time = 3 + # Use http_client if provided, otherwise fall back to direct requests + self.client = http_client + + def _get(self, endpoint: str, **kwargs) -> requests.Response: + """Make a GET request using client or direct requests.""" + if self.client: + return self.client.get(endpoint, **kwargs) + return requests.get(self.http_server + endpoint, **kwargs) + + def _post(self, endpoint: str, **kwargs) -> requests.Response: + """Make a POST request using client or direct requests.""" + if self.client: + return self.client.post(endpoint, **kwargs) + return requests.post(self.http_server + endpoint, **kwargs) def reset_cache_dir(self, cache_dir: str): self.cache_dir = cache_dir 
@@ -227,8 +242,8 @@ def _upload_file_setup(self, files: List[Dict[str, str]]): logger.debug(form.content_type) # Explicit connect/read timeout to avoid hanging forever - response = requests.post( - self.http_server + "/setup" + "/upload", + response = self._post( + "/setup/upload", headers=headers, data=form, timeout=(10, 600) @@ -270,7 +285,7 @@ def _change_wallpaper_setup(self, path: str): # send request to server to change wallpaper # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/change_wallpaper", headers=headers, data=payload) + response = self._post("/setup/change_wallpaper", headers=headers, data=payload) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -295,16 +310,30 @@ def _open_setup(self, path: str): # send request to server to open file # Note: This uses a custom /setup endpoint, not a standard OSWorld method - try: - # The server-side call is now blocking and can take time. - # We set a timeout that is slightly longer than the server's timeout (1800s). - response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810) - response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes - logger.info("Command executed successfully: %s", response.text) - time.sleep(self.additional_wait_time) - except requests.exceptions.RequestException as e: - logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") - raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e + max_retries = 3 + last_error = None + for attempt in range(max_retries): + try: + # The server-side call is now blocking and can take time. 
+ # We set a timeout that is slightly longer than the server's timeout (1800s). + response = self._post("/setup/open_file", headers=headers, data=payload, timeout=1810) + response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes + logger.info("Command executed successfully: %s", response.text) + time.sleep(self.additional_wait_time) + return # Success + except requests.exceptions.RequestException as e: + last_error = e + status = getattr(getattr(e, 'response', None), 'status_code', None) + if status in (502, 503, 504) and attempt < max_retries - 1: + wait_time = 10 * (attempt + 1) + logger.warning( + f"open_file attempt {attempt + 1}/{max_retries} failed for '{path}' " + f"(HTTP {status}). Retrying in {wait_time}s..." + ) + time.sleep(wait_time) + continue + logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") + raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e def _ensure_launch_command_finish(self, command): if isinstance(command, list): @@ -436,13 +465,16 @@ def _launch_setup(self, command: Union[str, List[str]], shell: bool = False): logger.warning("Command should be a list of strings. Now it is a string. 
Will split it by space.") command = command.split() + if self.client: + command = self.client.update_launch_command(command) + payload = json.dumps({"command": command, "shell": shell}) headers = {"Content-Type": "application/json"} # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - logger.info("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/launch") - response = requests.post(self.http_server + "/setup" + "/launch", headers=headers, data=payload) + logger.info("REQUEST ADDRESS: %s", self.http_server + "/setup/launch") + response = self._post("/setup/launch", headers=headers, data=payload) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -516,7 +548,7 @@ def replace_screen_env_in_command(command): # Execute using runtime while not terminates: try: - response = requests.post(self.http_server + "/setup" + "/execute", headers=headers, data=payload) + response = self._post("/setup/execute", headers=headers, data=payload) if response.status_code == 200: results: Dict[str, str] = response.json() if stdout: @@ -588,8 +620,8 @@ def _execute_with_verification_setup( # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/execute_with_verification", - headers=headers, data=payload, timeout=max_wait_time + 10) + response = self._post("/setup/execute_with_verification", + headers=headers, data=payload, timeout=max_wait_time + 10) if response.status_code == 200: result = response.json() logger.info("Command executed and verified successfully: %s -> %s" @@ -639,7 +671,7 @@ def _activate_window_setup(self, window_name: str, strict: bool = False, by_clas # send request to server to open file # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload) + response = 
self._post("/setup/activate_window", headers=headers, data=payload) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -663,7 +695,7 @@ def _close_window_setup(self, window_name: str, strict: bool = False, by_class: # send request to server to open file # Note: This uses a custom /setup endpoint, not a standard OSWorld method try: - response = requests.post(self.http_server + "/setup" + "/close_window", headers=headers, data=payload) + response = self._post("/setup/close_window", headers=headers, data=payload) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: @@ -676,10 +708,16 @@ async def _chrome_open_tabs_setup(self, urls_to_open: List[str]): if not self.runtime: raise Exception("Runtime is required for SetupController. Please provide a runtime object.") - host = self.vm_ip - port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file + # Get CDP URL and headers from client if available + if self.client: + remote_debugging_url = self.client.get_cdp_url() + cdp_headers = self.client.get_cdp_headers() + else: + host = self.vm_ip + port = self.chromium_port + remote_debugging_url = f"http://{host}:{port}" + cdp_headers = None - remote_debugging_url = f"http://{host}:{port}" logger.info("Connect to Chrome @: %s", remote_debugging_url) logger.debug("PLAYWRIGHT ENV: %s", repr(os.environ)) for attempt in range(15): @@ -689,7 +727,10 @@ async def _chrome_open_tabs_setup(self, urls_to_open: List[str]): browser = None async with async_playwright() as p: try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp( + remote_debugging_url, + headers=cdp_headers or {} + ) # break except Exception as e: if attempt < 14: @@ -730,15 +771,24 @@ async def _chrome_close_tabs_setup(self, urls_to_close: List[str]): time.sleep(5) # Wait for Chrome to finish launching - host = self.vm_ip 
- port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file + # Get CDP URL and headers from client if available + if self.client: + remote_debugging_url = self.client.get_cdp_url() + cdp_headers = self.client.get_cdp_headers() + else: + host = self.vm_ip + port = self.chromium_port + remote_debugging_url = f"http://{host}:{port}" + cdp_headers = None - remote_debugging_url = f"http://{host}:{port}" async with async_playwright() as p: browser = None for attempt in range(15): try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp( + remote_debugging_url, + headers=cdp_headers or {} + ) break except Exception as e: if attempt < 14: @@ -867,15 +917,24 @@ async def _login_setup(self, **config): if not self.runtime: raise Exception("Runtime is required for SetupController. Please provide a runtime object.") - host = self.vm_ip - port = self.chromium_port + # Get CDP URL and headers from client if available + if self.client: + remote_debugging_url = self.client.get_cdp_url() + cdp_headers = self.client.get_cdp_headers() + else: + host = self.vm_ip + port = self.chromium_port + remote_debugging_url = f"http://{host}:{port}" + cdp_headers = None - remote_debugging_url = f"http://{host}:{port}" async with async_playwright() as p: browser = None for attempt in range(15): try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp( + remote_debugging_url, + headers=cdp_headers or {} + ) break except Exception as e: if attempt < 14: @@ -930,8 +989,8 @@ def execute_python_command(self, command: str): for _ in range(3): try: - response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'}, - data=payload, timeout=90) + response = self._post("/execute", headers={'Content-Type': 'application/json'}, + data=payload, timeout=90) if response.status_code == 200: logger.info("Command 
executed successfully: %s", response.text) return response.json() @@ -1042,8 +1101,8 @@ def _update_browse_history_setup(self, **config): # send request to server to upload file try: - logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload") - response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form) + logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup/upload") + response = self._post("/setup/upload", headers=headers, data=form) if response.status_code == 200: logger.info("Command executed successfully: %s", response.text) else: diff --git a/openhands/nvidia/os_world/evaluate.py b/openhands/nvidia/os_world/evaluate.py index cd3478021..f87bde184 100644 --- a/openhands/nvidia/os_world/evaluate.py +++ b/openhands/nvidia/os_world/evaluate.py @@ -25,12 +25,18 @@ def __init__(self, task_config: Dict[str, Any], controller): self.client_password = controller.client_password self.screen_width = controller.screen_width self.screen_height = controller.screen_height + + # Get http_client from controller for runtime-agnostic HTTP calls + self.client = getattr(controller, 'client', None) # Assume Linux platform for OS World VMs # TODO: get from runtime/controller, mismatch initial letter is lowercase self.vm_platform = 'Linux' + + # Current proxy setting (for chrome getters) + self.current_use_proxy = False - self.controller = PythonController(self.vm_ip, self.server_port) + self.controller = PythonController(self.vm_ip, self.server_port, http_client=self.client) self._set_evaluator_info(task_config) def _set_evaluator_info(self, task_config: Dict[str, Any]): diff --git a/openhands/nvidia/os_world/getters/chrome.py b/openhands/nvidia/os_world/getters/chrome.py index cd2acd906..40a707888 100644 --- a/openhands/nvidia/os_world/getters/chrome.py +++ b/openhands/nvidia/os_world/getters/chrome.py @@ -18,6 +18,28 @@ from openhands.core.logger import openhands_logger as logger + +def _get_cdp_connection_info(env): + 
"""Get CDP URL and headers from env, using http_client if available.""" + if hasattr(env, 'client') and env.client: + return env.client.get_cdp_url(), env.client.get_cdp_headers() + else: + host = env.vm_ip + port = env.chromium_port + return f"http://{host}:{port}", None + + +def _make_post_request(env, endpoint: str, **kwargs): + """Make a POST request using env.client if available, otherwise direct.""" + if hasattr(env, 'client') and env.client: + return env.client.post(endpoint, **kwargs) + else: + host = env.vm_ip + port = env.server_port + url = f"http://{host}:{port}{endpoint}" + return requests.post(url, **kwargs) + + _accessibility_ns_map = { "st": "uri:deskat:state.at-spi.gnome.org", "attr": "uri:deskat:attributes.at-spi.gnome.org", @@ -58,11 +80,7 @@ async def get_info_from_website(env, config: Dict[Any, Any]) -> Any: logger.debug(f"[INFO_FROM_WEBSITE] Full config: {config}") try: - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port - remote_debugging_url = f"http://{host}:{port}" - backend_url = f"http://{host}:{server_port}" + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) use_proxy = env.current_use_proxy logger.info(f"[INFO_FROM_WEBSITE] Connecting to Chrome at {remote_debugging_url}") @@ -70,7 +88,7 @@ async def get_info_from_website(env, config: Dict[Any, Any]) -> Any: async with async_playwright() as p: # connect to remote Chrome instance try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to existing Chrome instance") except Exception as e: logger.warning(f"[INFO_FROM_WEBSITE] Failed to connect to existing Chrome instance: {e}") @@ -89,10 +107,9 @@ async def get_info_from_website(env, config: Dict[Any, Any]) -> Any: logger.info(f"[INFO_FROM_WEBSITE] Starting 
browser with command: {' '.join(command)}") payload = json.dumps({"command": command, "shell": False}) headers = {"Content-Type": "application/json"} - #requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) - requests.post(backend_url + "/setup" + "/launch", headers=headers, data=payload) + _make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(5) - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to new Chrome instance") page = await browser.new_page() @@ -536,12 +553,8 @@ def get_extensions_installed_from_shop(env, config: Dict[str, str]): # port info to allow remote debugging, see README.md for details async def get_page_info(env, config: Dict[str, str]): - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) url = config["url"] - - remote_debugging_url = f"http://{host}:{port}" # Configuration for retry and timeout max_retries = 2 @@ -554,12 +567,11 @@ async def get_page_info(env, config: Dict[str, str]): async with async_playwright() as p: # connect to remote Chrome instance try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance") except Exception as e: logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}") # If the connection fails, start a new browser instance - platform.machine() if "arm" in platform.machine(): # start a new browser instance if the connection fails payload = json.dumps({"command": [ @@ -573,9 +585,9 @@ async def 
get_page_info(env, config: Dict[str, str]): ], "shell": False}) headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + _make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(5) - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance") page = await browser.new_page() @@ -621,11 +633,7 @@ async def get_page_info(env, config: Dict[str, str]): async def get_open_tabs_info(env, config: Dict[str, str]): - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port - - remote_debugging_url = f"http://{host}:{port}" + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) # Configuration for retry and timeout max_retries = 2 @@ -638,12 +646,11 @@ async def get_open_tabs_info(env, config: Dict[str, str]): async with async_playwright() as p: # connect to remote Chrome instance try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance") except Exception as e: logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}") # If the connection fails, start a new browser instance - platform.machine() if "arm" in platform.machine(): # start a new browser instance if the connection fails payload = json.dumps({"command": [ @@ -657,10 +664,10 @@ async def get_open_tabs_info(env, config: Dict[str, str]): ], "shell": False}) headers = {"Content-Type": "application/json"} - requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload) + 
_make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(5) try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance") except Exception as e: logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}") @@ -796,10 +803,7 @@ async def get_active_tab_info(env, config: Dict[str, str]): logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}") - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - - remote_debugging_url = f"http://{host}:{port}" + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) # Configuration for retry and timeout max_retries = 2 @@ -812,7 +816,7 @@ async def get_active_tab_info(env, config: Dict[str, str]): async with async_playwright() as p: # connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance") except Exception as e: logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}") @@ -879,11 +883,7 @@ async def get_pdf_from_url(env, config: Dict[str, str]) -> str: logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}") logger.info(f"[PDF_FROM_URL] Target path: {_path}") - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port - - remote_debugging_url = f"http://{host}:{port}" + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) # Configuration for retry and timeout max_retries = 3 @@ -895,14 
+895,13 @@ async def get_pdf_from_url(env, config: Dict[str, str]) -> str: async with async_playwright() as p: try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance") except Exception as e: logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}") logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...") # If the connection fails, start a new browser instance - platform.machine() if "arm" in platform.machine(): # start a new browser instance if the connection fails payload = json.dumps({"command": [ @@ -916,9 +915,9 @@ async def get_pdf_from_url(env, config: Dict[str, str]) -> str: ], "shell": False}) headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + _make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(5) - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance") page = await browser.new_page() @@ -983,11 +982,7 @@ async def get_pdf_from_url(env, config: Dict[str, str]) -> str: # fixme: needs to be changed (maybe through post-processing) since it's not working async def get_chrome_saved_address(env, config: Dict[str, str]): - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port - - remote_debugging_url = f"http://{host}:{port}" + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) # Configuration for retry and timeout max_retries = 2 @@ -1000,12 +995,11 @@ async def get_chrome_saved_address(env, config: Dict[str, 
str]): async with async_playwright() as p: # connect to remote Chrome instance try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance") except Exception as e: logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}") # If the connection fails, start a new browser instance - platform.machine() if "arm" in platform.machine(): # start a new browser instance if the connection fails payload = json.dumps({"command": [ @@ -1019,9 +1013,9 @@ async def get_chrome_saved_address(env, config: Dict[str, str]): ], "shell": False}) headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + _make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(5) - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance") page = await browser.new_page() @@ -1097,11 +1091,7 @@ def get_shortcuts_on_desktop(env, config: Dict[str, str]): async def get_number_of_search_results(env, config: Dict[str, str]): # todo: move into the config file url, result_selector = "https://google.com/search?q=query", '.search-result' - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port - - remote_debugging_url = f"http://{host}:{port}" + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) # Configuration for retry and timeout max_retries = 2 @@ -1113,12 +1103,11 @@ async def get_number_of_search_results(env, config: Dict[str, str]): async with async_playwright() as 
p: try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance") except Exception as e: logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}") # If the connection fails, start a new browser instance - platform.machine() if "arm" in platform.machine(): # start a new browser instance if the connection fails payload = json.dumps({"command": [ @@ -1132,9 +1121,9 @@ async def get_number_of_search_results(env, config: Dict[str, str]): ], "shell": False}) headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + _make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(5) - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance") page = await browser.new_page() @@ -1520,11 +1509,8 @@ async def get_active_tab_html_parse(env, config: Dict[str, Any]): if not isinstance(active_tab_url, str): logger.error(f"[DEBUG] active_tab_url is not a string, got {type(active_tab_url)}: {active_tab_url}") return None - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port - - remote_debugging_url = f"http://{host}:{port}" + + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) # DEBUG: Add logging for configuration logger.info(f"[DEBUG] get_active_tab_html_parse called with config: {config}") @@ -1532,10 +1518,9 @@ async def get_active_tab_html_parse(env, config: Dict[str, Any]): async with async_playwright() as p: # connect to remote Chrome instance try: - 
browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) except Exception as e: # If the connection fails, start a new browser instance - platform.machine() if "arm" in platform.machine(): # start a new browser instance if the connection fails payload = json.dumps({"command": [ @@ -1549,9 +1534,9 @@ async def get_active_tab_html_parse(env, config: Dict[str, Any]): ], "shell": False}) headers = {"Content-Type": "application/json"} - requests.post("http://" + host + ":" + str(server_port) + "/setup" + "/launch", headers=headers, data=payload) + _make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(5) - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) target_page = None for context in browser.contexts: for page in context.pages: @@ -1984,13 +1969,8 @@ async def get_gotoRecreationPage_and_get_html_content(env, config: Dict[str, Any logger.info(f"[RECREATION_PAGE] Starting recreation.gov page processing") logger.debug(f"[RECREATION_PAGE] Config: {config}") - host = env.vm_ip - port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file - server_port = env.server_port + remote_debugging_url, cdp_headers = _get_cdp_connection_info(env) use_proxy = env.current_use_proxy - - remote_debugging_url = f"http://{host}:{port}" - backend_url = f"http://{host}:{server_port}" # Configuration for retry and timeout max_retries = 3 @@ -2013,7 +1993,7 @@ async def get_gotoRecreationPage_and_get_html_content(env, config: Dict[str, Any async with async_playwright() as p: # Connect to remote Chrome instance try: - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[RECREATION_PAGE] 
Successfully connected to existing Chrome instance") except Exception as e: logger.warning(f"[RECREATION_PAGE] Failed to connect to existing Chrome instance: {e}") @@ -2038,9 +2018,9 @@ async def get_gotoRecreationPage_and_get_html_content(env, config: Dict[str, Any logger.info(f"[RECREATION_PAGE] Starting browser with command: {' '.join(command)}") payload = json.dumps({"command": command, "shell": False}) headers = {"Content-Type": "application/json"} - requests.post(backend_url + "/setup/launch", headers=headers, data=payload) + _make_post_request(env, "/setup/launch", headers=headers, data=payload) await asyncio.sleep(8) # Give more time for browser to start - browser = await p.chromium.connect_over_cdp(remote_debugging_url) + browser = await p.chromium.connect_over_cdp(remote_debugging_url, headers=cdp_headers or {}) logger.info(f"[RECREATION_PAGE] Successfully connected to new Chrome instance") page = await browser.new_page() diff --git a/openhands/nvidia/os_world/getters/general.py b/openhands/nvidia/os_world/getters/general.py index 30dc239b9..37e13024e 100644 --- a/openhands/nvidia/os_world/getters/general.py +++ b/openhands/nvidia/os_world/getters/general.py @@ -3,36 +3,54 @@ from openhands.core.logger import openhands_logger as logger + +def _make_post_request(env, endpoint: str, **kwargs): + """Make a POST request using env.client if available, otherwise direct.""" + if hasattr(env, 'client') and env.client: + return env.client.post(endpoint, **kwargs) + else: + vm_ip = env.vm_ip + port = env.server_port + url = f"http://{vm_ip}:{port}{endpoint}" + return requests.post(url, **kwargs) + + def get_vm_command_line(env, config: Dict[str, str]): - vm_ip = env.vm_ip - port = env.server_port command = config["command"] shell = config.get("shell", False) - response = requests.post(f"http://{vm_ip}:{port}/execute", json={"command": command, "shell": shell}) - - print(response.json()) + response = _make_post_request(env, "/execute", json={"command": command, 
"shell": shell}) if response.status_code == 200: - return response.json()["output"] + try: + result = response.json() + logger.debug(f"VM command response: {result}") + return result.get("output") + except Exception as e: + logger.error(f"Failed to parse VM command response: {e}") + return None else: - logger.error("Failed to get vm command line. Status code: %d", response.status_code) + logger.error("Failed to get vm command line. Status code: %d, Response: %s", + response.status_code, response.text[:200] if response.text else "empty") return None def get_vm_command_error(env, config: Dict[str, str]): - vm_ip = env.vm_ip - port = env.server_port command = config["command"] shell = config.get("shell", False) - response = requests.post(f"http://{vm_ip}:{port}/execute", json={"command": command, "shell": shell}) - - print(response.json()) + response = _make_post_request(env, "/execute", json={"command": command, "shell": shell}) if response.status_code == 200: - return response.json()["error"] + try: + result = response.json() + logger.debug(f"VM command error response: {result}") + return result.get("error") + except Exception as e: + logger.error(f"Failed to parse VM command error response: {e}") + return None else: - logger.error("Failed to get vm command line error. Status code: %d", response.status_code) + logger.error("Failed to get vm command line error. Status code: %d, Response: %s", + response.status_code, response.text[:200] if response.text else "empty") return None diff --git a/openhands/nvidia/os_world/getters/vlc.py b/openhands/nvidia/os_world/getters/vlc.py index c84816acd..d4177ceab 100644 --- a/openhands/nvidia/os_world/getters/vlc.py +++ b/openhands/nvidia/os_world/getters/vlc.py @@ -11,14 +11,21 @@ def get_vlc_playing_info(env, config: Dict[str, str]): """ Gets the current playing information from VLC's HTTP interface. 
""" - - host = env.vm_ip - port = env.vlc_port password = 'password' - _path = os.path.join(env.cache_dir, config["dest"]) - url = f'http://{host}:{port}/requests/status.xml' - response = requests.get(url, auth=('', password)) + + # Use http_client if available for VLC URL + if hasattr(env, 'client') and env.client: + vlc_url = env.client.get_vlc_url() + url = f'{vlc_url}/requests/status.xml' + headers = env.client.get_cdp_headers() or {} + response = requests.get(url, auth=('', password), headers=headers) + else: + host = env.vm_ip + port = env.vlc_port + url = f'http://{host}:{port}/requests/status.xml' + response = requests.get(url, auth=('', password)) + if response.status_code == 200: content = response.content else: diff --git a/openhands/nvidia/os_world/nvcf/__init__.py b/openhands/nvidia/os_world/nvcf/__init__.py new file mode 100644 index 000000000..637fea4b8 --- /dev/null +++ b/openhands/nvidia/os_world/nvcf/__init__.py @@ -0,0 +1,15 @@ +"""OSWorld NVCF: deploy and interact with OSWorld on NVIDIA Cloud Functions (NVCF).""" + +from openhands.nvidia.os_world.nvcf.config import ( + DEFAULT_CONTAINER_IMAGE, + OSWorldDeploymentConfig, + OSWorldFunctionConfig, +) +from openhands.nvidia.os_world.nvcf.deployer import OSWorldDeployer + +__all__ = [ + "OSWorldDeployer", + "OSWorldFunctionConfig", + "OSWorldDeploymentConfig", + "DEFAULT_CONTAINER_IMAGE", +] diff --git a/openhands/nvidia/os_world/nvcf/config.py b/openhands/nvidia/os_world/nvcf/config.py new file mode 100644 index 000000000..4924433bc --- /dev/null +++ b/openhands/nvidia/os_world/nvcf/config.py @@ -0,0 +1,93 @@ +"""Configuration dataclasses for OSWorld NVCF deployment.""" + +import os +import logging +logger = logging.getLogger(__name__) +from dataclasses import dataclass, field +from typing import List, Optional + +# NGC_ORG = os.environ.get("NGC_ORG", "nvidian") +# DEFAULT_CONTAINER_IMAGE = f"nvcr.io/{NGC_ORG}/nemo:osworld-linux-2" + +DEFAULT_CONTAINER_IMAGE = 
"nvcr.io/i01fc6pe8nwm/nemo:osworld-linux-2-debug" + + +@dataclass +class OSWorldFunctionConfig: + """Configuration for creating an OSWorld NVCF function. + + Attributes: + name: Display name of the function in NVCF. + container_image: NGC container image path. Defaults to DEFAULT_CONTAINER_IMAGE. + inference_url: Endpoint path for inference requests. + inference_port: Port the container exposes for inference. + health_uri: Endpoint path for health checks. + description: Optional description of the function. + container_args: Optional arguments to pass to the container. + container_environment_variables: Optional environment variables + in the format ["KEY1:value1", "KEY2:value2"]. + tags: Optional list of tags for the function. + """ + + name: str = "osworld-linux" + container_image: Optional[str] = None # Set via __post_init__ + inference_url: str = "/api" + inference_port: int = 8000 + health_uri: str = "/api/version" + description: str = "OSWorld Linux environment for AI agent evaluation" + container_args: Optional[str] = None + container_environment_variables: Optional[List[str]] = None + tags: Optional[List[str]] = field(default_factory=lambda: ["osworld"]) + + def __post_init__(self): + """Set default container image if not provided.""" + if self.container_image is None: + self.container_image = DEFAULT_CONTAINER_IMAGE + logger.info(f"Using container image: {self.container_image}") + + +@dataclass +class OSWorldDeploymentConfig: + """Configuration for deploying an OSWorld function. + + Attributes: + gpu: GPU type to use (e.g., "L40", "H100", "A100"). + instance_type: Specific instance type (e.g., "GFN.GPU.L40_1x"). + If not provided, will use a default based on GPU type. + min_instances: Minimum number of instances (0 allows scale-to-zero). + max_instances: Maximum number of instances for autoscaling. + max_request_concurrency: Maximum concurrent requests per instance. + backend: Cluster backend (e.g., "GFN", "AZURE", "GCP", "OCI"). 
+ regions: Optional list of regions (e.g., ["us-west-2", "us-east-1"]). + clusters: Optional list of specific clusters. + attributes: Optional list of cluster attributes (e.g., ["HIPAA", "SOC2"]). + configuration: Optional dict of helm chart value overrides. + """ + + gpu: str = "L40S" + instance_type: Optional[str] = None + min_instances: int = 1 + max_instances: int = 1 + max_request_concurrency: int = 1 + backend: str = "GFN" + regions: Optional[List[str]] = None + clusters: Optional[List[str]] = None + attributes: Optional[List[str]] = None + configuration: Optional[dict] = None + + def get_instance_type(self) -> str: + """Get the instance type, using a default if not explicitly set.""" + if self.instance_type: + return self.instance_type + + # Default instance types for common GPU/backend combinations + # (Experiment.md: use exact type from --list-gpus when auto-detection fails) + defaults = { + ("GFN", "L40"): "gl40_1.br20_2xlarge", + ("GFN", "L40G"): "gl40g_1.br25_2xlarge", + ("GFN", "L40S"): "gl40s_4.br25_small", + ("GFN", "T10"): "g6.full", + ("AZURE", "H100"): "AZURE.GPU.H100_1x", + ("GCP", "H100"): "a3-highgpu-8g_1x", + } + return defaults.get((self.backend, self.gpu), f"{self.backend}.GPU.{self.gpu}_1x") diff --git a/openhands/nvidia/os_world/nvcf/deployer.py b/openhands/nvidia/os_world/nvcf/deployer.py new file mode 100644 index 000000000..6cbaed45c --- /dev/null +++ b/openhands/nvidia/os_world/nvcf/deployer.py @@ -0,0 +1,324 @@ +"""OSWorld deployer using the NGC SDK.""" + +import os +import time +from typing import Any, Dict, Iterator, List, Optional + +from openhands.nvidia.os_world.nvcf.config import ( + OSWorldDeploymentConfig, + OSWorldFunctionConfig, +) + +# NGC SDK imports (optional dependency: pip install ngcsdk) +try: + from ngcsdk import Client + from nvcf.api.deployment_spec import TargetedDeploymentSpecification +except ImportError as e: + Client = None # type: ignore[misc, assignment] + TargetedDeploymentSpecification = None # type: 
ignore[misc, assignment] + _NGCSDK_IMPORT_ERROR = e +else: + _NGCSDK_IMPORT_ERROR = None + + +class OSWorldDeployer: + """Manages OSWorld deployment to NVCF using the NGC SDK. + + This class provides a simplified interface for deploying OSWorld + containers to NVIDIA Cloud Functions. + + Example: + >>> deployer = OSWorldDeployer( + ... api_key="nvapi-xxx", + ... org_name="my-org", + ... ) + >>> result = deployer.create_function() + >>> func_id = result["function"]["id"] + >>> ver_id = result["function"]["versionId"] + >>> deployer.deploy(func_id, ver_id) + """ + + def __init__( + self, + api_key: Optional[str] = None, + org_name: Optional[str] = None, + team_name: str = "no-team", + ): + """Initialize the OSWorld deployer. + + Args: + api_key: NGC API key. If not provided, reads from NGC_API_KEY + environment variable. + org_name: NGC organization name. If not provided, reads from + NGC_ORG environment variable. + team_name: NGC team name. Defaults to "no-team". + + Raises: + ValueError: If api_key or org_name is not provided and not + found in environment variables. + ImportError: If ngcsdk is not installed. + """ + if _NGCSDK_IMPORT_ERROR is not None: + raise ImportError( + "ngcsdk is required for OSWorldDeployer. Install with: pip install ngcsdk" + ) from _NGCSDK_IMPORT_ERROR + + api_key = api_key or os.environ.get("NGC_API_KEY") + org_name = org_name or os.environ.get("NGC_ORG") + + if not api_key: + raise ValueError( + "NGC API key required. Provide api_key or set NGC_API_KEY env var." + ) + if not org_name: + raise ValueError( + "NGC org name required. Provide org_name or set NGC_ORG env var." 
+ ) + + self._api_key = api_key + self._org_name = org_name + self._client = Client() + self._client.configure(api_key, org_name=org_name, team_name=team_name) + + @property + def client(self) -> "Client": + """Access the underlying NGC SDK client.""" + return self._client + + def create_function( + self, + config: Optional[OSWorldFunctionConfig] = None, + function_id: Optional[str] = None, + ) -> Dict[str, Any]: + """Create an OSWorld function in NVCF.""" + if config is None: + config = OSWorldFunctionConfig() + + kwargs: Dict[str, Any] = { + "name": config.name, + "inference_url": config.inference_url, + "container_image": config.container_image, + "inference_port": config.inference_port, + "health_uri": config.health_uri, + "description": config.description, + } + + if config.container_args: + kwargs["container_args"] = config.container_args + if config.container_environment_variables: + kwargs["container_environment_variables"] = ( + config.container_environment_variables + ) + if config.tags: + kwargs["tags"] = config.tags + if function_id: + kwargs["function_id"] = function_id + + return self._client.cloud_function.functions.create(**kwargs) + + def deploy( + self, + function_id: str, + function_version_id: str, + config: Optional[OSWorldDeploymentConfig] = None, + ) -> Dict[str, Any]: + """Deploy an OSWorld function with specified GPU configuration.""" + if config is None: + config = OSWorldDeploymentConfig() + + spec = TargetedDeploymentSpecification( + gpu=config.gpu, + instance_type=config.get_instance_type(), + min_instances=config.min_instances, + max_instances=config.max_instances, + max_request_concurrency=config.max_request_concurrency, + regions=config.regions, + clusters=config.clusters, + attributes=config.attributes, + configuration=config.configuration, + ) + + return self._client.cloud_function.functions.deployments.create( + function_id=function_id, + function_version_id=function_version_id, + targeted_deployment_specifications=[spec], + ) + + 
def update_deployment( + self, + function_id: str, + function_version_id: str, + config: OSWorldDeploymentConfig, + ) -> Dict[str, Any]: + """Update an existing deployment configuration.""" + spec = TargetedDeploymentSpecification( + gpu=config.gpu, + instance_type=config.get_instance_type(), + min_instances=config.min_instances, + max_instances=config.max_instances, + max_request_concurrency=config.max_request_concurrency, + regions=config.regions, + clusters=config.clusters, + attributes=config.attributes, + configuration=config.configuration, + ) + + return self._client.cloud_function.functions.deployments.update( + function_id=function_id, + function_version_id=function_version_id, + targeted_deployment_specifications=[spec], + ) + + def get_function_info( + self, + function_id: str, + function_version_id: str, + ) -> Dict[str, Any]: + """Get information about a function version.""" + return self._client.cloud_function.functions.info( + function_id=function_id, + function_version_id=function_version_id, + ) + + def get_deployment_info( + self, + function_id: str, + function_version_id: str, + ) -> Dict[str, Any]: + """Get information about a function's deployment.""" + return self._client.cloud_function.functions.deployments.info( + function_id=function_id, + function_version_id=function_version_id, + ) + + def get_status( + self, + function_id: str, + function_version_id: str, + ) -> str: + """Get the current status of a function.""" + info = self.get_function_info(function_id, function_version_id) + return info.get("function", {}).get("status", "UNKNOWN") + + def wait_for_active( + self, + function_id: str, + function_version_id: str, + timeout: int = 1800, + poll_interval: int = 30, + ) -> str: + """Wait for a function to become ACTIVE.""" + start_time = time.time() + while time.time() - start_time < timeout: + status = self.get_status(function_id, function_version_id) + if status == "ACTIVE": + return status + if status == "ERROR": + raise 
RuntimeError("Function entered ERROR state") + time.sleep(poll_interval) + + raise TimeoutError( + f"Function did not become ACTIVE within {timeout} seconds. " + f"Last status: {status}" + ) + + def list_functions( + self, + name_pattern: Optional[str] = None, + access_filter: Optional[List[str]] = None, + ) -> Dict[str, Any]: + """List functions available to the organization.""" + return self._client.cloud_function.functions.list( + name_pattern=name_pattern, + access_filter=access_filter or ["private"], + ) + + def list_available_gpus(self) -> Dict[str, Any]: + """List available GPU types for deployment.""" + return self._client.cloud_function.gpus.list() + + def get_gpu_info(self, gpu_name: str) -> Dict[str, Any]: + """Get detailed information about a specific GPU type.""" + return self._client.cloud_function.gpus.info(gpu_name) + + def invoke( + self, + function_id: str, + payload: Dict[str, Any], + api_key: Optional[str] = None, + function_version_id: Optional[str] = None, + ) -> Dict[str, Any]: + """Invoke an OSWorld function.""" + return self._client.cloud_function.functions.invoke( + function_id=function_id, + payload=payload, + starfleet_api_key=api_key or self._api_key, + function_version_id=function_version_id, + ) + + def invoke_stream( + self, + function_id: str, + payload: Dict[str, Any], + api_key: Optional[str] = None, + function_version_id: Optional[str] = None, + ) -> Iterator[bytes]: + """Invoke an OSWorld function with streaming response.""" + return self._client.cloud_function.functions.invoke_stream( + function_id=function_id, + payload=payload, + starfleet_api_key=api_key or self._api_key, + function_version_id=function_version_id, + ) + + def undeploy( + self, + function_id: str, + function_version_id: str, + graceful: bool = True, + ) -> None: + """Remove a deployment (undeploy a function).""" + self._client.cloud_function.functions.deployments.delete( + function_id=function_id, + function_version_id=function_version_id, + 
graceful=graceful, + ) + + def delete_function( + self, + function_id: str, + function_version_id: str, + ) -> None: + """Delete a function version. Must be undeployed first.""" + self._client.cloud_function.functions.delete( + function_id=function_id, + function_version_id=function_version_id, + ) + + def query_deployment_logs( + self, + function_id: str, + function_version_id: str, + duration: Optional[str] = None, + ) -> Iterator[Dict[str, Any]]: + """Query deployment logs for a function.""" + from datetime import timedelta + + td = None + if duration: + unit = duration[-1].upper() + value = int(duration[:-1]) + if unit == "H": + td = timedelta(hours=value) + elif unit == "M": + td = timedelta(minutes=value) + elif unit == "D": + td = timedelta(days=value) + elif unit == "S": + td = timedelta(seconds=value) + + return self._client.cloud_function.functions.deployments.query_logs( + function_id=function_id, + function_version_id=function_version_id, + duration=td, + ) diff --git a/openhands/nvidia/os_world/osworld_utils.py b/openhands/nvidia/os_world/osworld_utils.py index ff72c385e..da7654b3c 100644 --- a/openhands/nvidia/os_world/osworld_utils.py +++ b/openhands/nvidia/os_world/osworld_utils.py @@ -67,7 +67,7 @@ def get_config( sandbox_config = SandboxConfig( base_container_image='ubuntu:24.04', - run_as_fakeroot=True, + run_as_fakeroot=False, ) config = OpenHandsConfig( @@ -116,11 +116,9 @@ def get_instruction(instance: pd.Series | dict, metadata: EvalMetadata, runtime: include_screenshot = True #runtime.config.agents['agent'].enable_vision include_a11y_tree = True #runtime.config.agents['agent'].enable_a11y_tree - instruction = f"""Work on the following task accourding to the UI screenshot. + instruction = f"""Work on the following task according to the UI screenshot. Instruction: {instance['instruction']} - -First describe the screenshot in detail, think step by step, then generate the next move. 
""" if include_a11y_tree: @@ -216,6 +214,7 @@ async def initialize_runtime(runtime: Runtime, instance: dict, metadata: EvalMet vm_ip="127.0.0.1", server_port=runtime._vm_server_port, chromium_port=runtime._chromium_port, + vlc_port=runtime._vlc_port, cache_dir="/tmp/osworld_example", client_password="password", runtime=runtime # Pass your runtime object here @@ -514,4 +513,4 @@ def eval_exception(job_details: JobDetails, e: Exception): } ############################################################################### # End of exception handling -############################################################################### \ No newline at end of file +############################################################################### diff --git a/openhands/runtime/__init__.py b/openhands/runtime/__init__.py index 4ff594d7f..8ce667d88 100644 --- a/openhands/runtime/__init__.py +++ b/openhands/runtime/__init__.py @@ -12,6 +12,7 @@ from openhands.runtime.impl.runloop.runloop_runtime import RunloopRuntime from openhands.runtime.impl.singularity.singularity_runtime import SingularityRuntime from openhands.runtime.impl.singularity.osworld_singularity_runtime import OSWorldSingularityRuntime +from openhands.runtime.impl.nvcf import OSWorldNVCFRuntime from openhands.utils.import_utils import get_impl # mypy: disable-error-code="type-abstract" @@ -27,6 +28,7 @@ 'enroot': EnrootRuntime, 'singularity': SingularityRuntime, 'osworld': OSWorldSingularityRuntime, + 'osworld_nvcf': OSWorldNVCFRuntime, 'cli': CLIRuntime, } @@ -60,5 +62,6 @@ def get_runtime_cls(name: str) -> type[Runtime]: 'EnrootRuntime', 'SingularityRuntime', 'OSWorldSingularityRuntime', + 'OSWorldNVCFRuntime', 'get_runtime_cls', ] diff --git a/openhands/runtime/impl/nvcf/__init__.py b/openhands/runtime/impl/nvcf/__init__.py new file mode 100644 index 000000000..660ea3afd --- /dev/null +++ b/openhands/runtime/impl/nvcf/__init__.py @@ -0,0 +1,9 @@ +"""NVCF (NVIDIA Cloud Functions) runtime implementation.""" + +from 
openhands.runtime.impl.nvcf.nvcf_runtime import NVCFRuntime +from openhands.runtime.impl.nvcf.osworld_nvcf_runtime import OSWorldNVCFRuntime + +__all__ = [ + "NVCFRuntime", + "OSWorldNVCFRuntime", +] diff --git a/openhands/runtime/impl/nvcf/nvcf_proxy.py b/openhands/runtime/impl/nvcf/nvcf_proxy.py new file mode 100644 index 000000000..dd2ead1b9 --- /dev/null +++ b/openhands/runtime/impl/nvcf/nvcf_proxy.py @@ -0,0 +1,222 @@ +"""Local proxy for NVCF services that injects authentication headers. + +This module provides a local HTTP/WebSocket proxy that forwards requests to NVCF +with the required Authorization and Function-ID headers. This is necessary for +services like Chrome DevTools Protocol and VLC that don't support custom headers. +""" + +import asyncio +import socket +import threading +import time +from http.server import HTTPServer, BaseHTTPRequestHandler +from typing import Optional +import urllib.request +import urllib.error +import ssl +import json + +from openhands.core.logger import openhands_logger as logger + + +class NVCFProxyHandler(BaseHTTPRequestHandler): + """HTTP request handler that forwards to NVCF with auth headers.""" + + # Class-level configuration (set by NVCFLocalProxy) + nvcf_base_url: str = "" + nvcf_path_prefix: str = "" + api_key: str = "" + function_id: str = "" + + def log_message(self, format, *args): + """Suppress default logging.""" + pass + + def _get_target_url(self, path: str) -> str: + """Convert local path to NVCF target URL.""" + # Remove leading slash for clean join + path = path.lstrip("/") + # Build target URL: base + path_prefix + path + prefix = self.nvcf_path_prefix.strip("/") + if prefix: + return f"{self.nvcf_base_url}/{prefix}/{path}" + return f"{self.nvcf_base_url}/{path}" + + def _forward_request(self, method: str) -> None: + """Forward request to NVCF.""" + target_url = self._get_target_url(self.path) + + # Read request body + content_length = int(self.headers.get('Content-Length', 0)) + body = 
self.rfile.read(content_length) if content_length > 0 else None + + # Build headers + headers = { + "Authorization": f"Bearer {self.api_key}", + "Function-ID": self.function_id, + } + + # Copy relevant headers from original request + for header in ['Content-Type', 'Accept', 'User-Agent']: + if header in self.headers: + headers[header] = self.headers[header] + + try: + # Create request + req = urllib.request.Request( + target_url, + data=body, + headers=headers, + method=method, + ) + + # Default SSL context: verifies server certificates and hostname + ctx = ssl.create_default_context() + + # Make request + with urllib.request.urlopen(req, context=ctx, timeout=60) as response: + # Send response status + self.send_response(response.status) + + # Forward response headers + for header, value in response.headers.items(): + if header.lower() not in ('transfer-encoding', 'connection'): + self.send_header(header, value) + self.end_headers() + + # Forward response body + self.wfile.write(response.read()) + + except urllib.error.HTTPError as e: + self.send_response(e.code) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + error_body = e.read() if e.fp else b'' + self.wfile.write(error_body or f"HTTP Error {e.code}: {e.reason}".encode()) + except urllib.error.URLError as e: + self.send_response(502) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(f"Proxy Error: {e.reason}".encode()) + except Exception as e: + self.send_response(502) + self.send_header('Content-Type', 'text/plain') + self.end_headers() + self.wfile.write(f"Proxy Error: {e}".encode()) + + def do_GET(self): + self._forward_request("GET") + + def do_POST(self): + self._forward_request("POST") + + def do_PUT(self): + self._forward_request("PUT") + + def do_DELETE(self): + self._forward_request("DELETE") + + def do_OPTIONS(self): + self._forward_request("OPTIONS") + + def do_HEAD(self): + self._forward_request("HEAD") + + +class NVCFLocalProxy: + """Local 
proxy that forwards requests to NVCF with auth headers. + + Supports HTTP connections. Used for Chrome DevTools and VLC web interface + access through NVCF. + + Note: This is a simple HTTP proxy. For full WebSocket support (needed for + Chrome DevTools), you may need a more sophisticated solution. + """ + + def __init__( + self, + nvcf_base_url: str, + nvcf_path_prefix: str, + api_key: str, + function_id: str, + local_port: Optional[int] = None, + ): + """Initialize the NVCF local proxy. + + Args: + nvcf_base_url: Base URL for NVCF (e.g., https://grpc.nvcf.nvidia.com) + nvcf_path_prefix: Path prefix for the service (e.g., /chrome, /vlc) + api_key: NGC API key for authentication + function_id: NVCF function ID + local_port: Local port to listen on (auto-assigned if None) + """ + self.nvcf_base_url = nvcf_base_url.rstrip("/") + self.nvcf_path_prefix = nvcf_path_prefix + self.api_key = api_key + self.function_id = function_id + self.local_port = local_port or self._find_available_port() + + self._server: Optional[HTTPServer] = None + self._thread: Optional[threading.Thread] = None + self._running = False + + @staticmethod + def _find_available_port() -> int: + """Find an available port on localhost.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + s.listen(1) + port = s.getsockname()[1] + return port + + @property + def local_url(self) -> str: + """Get the local proxy URL.""" + return f"http://127.0.0.1:{self.local_port}" + + def _create_handler_class(self): + """Create a handler class with configuration bound.""" + class ConfiguredHandler(NVCFProxyHandler): + nvcf_base_url = self.nvcf_base_url + nvcf_path_prefix = self.nvcf_path_prefix + api_key = self.api_key + function_id = self.function_id + return ConfiguredHandler + + def _run_server(self) -> None: + """Run the HTTP server in a thread.""" + handler_class = self._create_handler_class() + self._server = HTTPServer(("127.0.0.1", self.local_port), handler_class) + 
self._running = True + self._server.serve_forever() + + def start(self) -> None: + """Start the proxy server in a background thread.""" + if self._running: + return + self._thread = threading.Thread(target=self._run_server, daemon=True) + self._thread.start() + # Wait for server to be ready + for _ in range(50): # 5 seconds timeout + time.sleep(0.1) + if self._running: + break + logger.debug(f"NVCF proxy started on port {self.local_port}") + + def stop(self) -> None: + """Stop the proxy server.""" + if not self._running: + return + self._running = False + if self._server: + self._server.shutdown() + self._server = None + if self._thread: + self._thread.join(timeout=5) + self._thread = None + logger.debug(f"NVCF proxy stopped on port {self.local_port}") + + +def find_available_port() -> int: + """Find an available port on localhost.""" + return NVCFLocalProxy._find_available_port() diff --git a/openhands/runtime/impl/nvcf/nvcf_runtime.py b/openhands/runtime/impl/nvcf/nvcf_runtime.py new file mode 100644 index 000000000..536b4041e --- /dev/null +++ b/openhands/runtime/impl/nvcf/nvcf_runtime.py @@ -0,0 +1,175 @@ +"""Base NVCF runtime: extends ActionExecutionClient with deploy-on-connect and undeploy-on-close. + +This runtime does not run a local action execution server; connect() deploys an NVCF +function (or attaches to an existing one), and close() undeploys if we deployed in +this session. Subclasses (e.g. OSWorldNVCFRuntime) implement run_action by calling +the NVCF API. 
+""" + +import os +from typing import Any + +from openhands.core.config import OpenHandsConfig +from openhands.core.exceptions import AgentRuntimeDisconnectedError +from openhands.core.logger import openhands_logger as logger +from openhands.events import EventStream +from openhands.events.observation import ErrorObservation, Observation +from openhands.integrations.provider import PROVIDER_TOKEN_TYPE +from openhands.runtime.impl.action_execution.action_execution_client import ( + ActionExecutionClient, +) +from openhands.runtime.plugins import PluginRequirement + + +class NVCFRuntime(ActionExecutionClient): + """Base runtime for NVIDIA Cloud Functions (NVCF). + + - connect(): Deploys an NVCF function if nvcf_function_id is not set (uses + openhands.nvidia.os_world.nvcf), then marks runtime as initialized. + - close(): Undeploys the function if we deployed it in this session and + undeploy_on_close is True, then closes the session. + - check_if_alive(): Raises if runtime is not connected (no action server to ping). + - run_action(): Returns ErrorObservation (no local action server); subclasses + override to dispatch to NVCF API (e.g. OSWorld). 
+ """ + + def __init__( + self, + config: OpenHandsConfig, + event_stream: EventStream, + sid: str = 'default', + plugins: list[PluginRequirement] | None = None, + env_vars: dict[str, str] | None = None, + status_callback: Any | None = None, + attach_to_existing: bool = False, + headless_mode: bool = True, + user_id: str | None = None, + git_provider_tokens: PROVIDER_TOKEN_TYPE | None = None, + nvcf_function_id: str | None = None, + nvcf_version_id: str | None = None, + nvcf_api_key: str | None = None, + nvcf_org: str | None = None, + nvcf_function_config: Any = None, + nvcf_deployment_config: Any = None, + undeploy_on_close: bool = True, + ): + self._nvcf_api_key = nvcf_api_key or os.environ.get("NGC_API_KEY") + self._nvcf_org = nvcf_org or os.environ.get("NGC_ORG") + _fid = nvcf_function_id or os.environ.get("NVCF_FUNCTION_ID") + self._nvcf_function_id = ( + _fid.strip() if isinstance(_fid, str) and _fid else (_fid or None) + ) + _vid = nvcf_version_id or os.environ.get("NVCF_VERSION_ID") + self._nvcf_version_id = ( + _vid.strip() if isinstance(_vid, str) and _vid else (_vid or None) + ) + self._nvcf_function_config = nvcf_function_config + self._nvcf_deployment_config = nvcf_deployment_config + self._undeploy_on_close = undeploy_on_close + self._we_deployed_this = False + self._deployer = None + + if not self._nvcf_api_key: + raise ValueError( + "NGC API key required for NVCF. Provide nvcf_api_key or set NGC_API_KEY." + ) + if not self._nvcf_function_id and not self._nvcf_org: + raise ValueError( + "When deploying on connect, NGC org required. " + "Provide nvcf_org or set NGC_ORG." 
+ ) + + super().__init__( + config=config, + event_stream=event_stream, + sid=sid, + plugins=plugins, + env_vars=env_vars, + status_callback=status_callback, + attach_to_existing=attach_to_existing, + headless_mode=headless_mode, + user_id=user_id, + git_provider_tokens=git_provider_tokens, + ) + + def _deploy_nvcf(self) -> tuple[str, str]: + """Create and deploy an NVCF function; return (function_id, version_id).""" + from openhands.nvidia.os_world.nvcf import ( + OSWorldDeployer, + OSWorldFunctionConfig, + OSWorldDeploymentConfig, + ) + func_config = self._nvcf_function_config + if func_config is None: + func_config = OSWorldFunctionConfig( + name=f"nvcf-runtime-{self.sid}", + description="NVCF runtime deploy-on-connect", + ) + deploy_config = self._nvcf_deployment_config + if deploy_config is None: + deploy_config = OSWorldDeploymentConfig( + gpu="L40S", + min_instances=1, + max_instances=1, + ) + self.log("info", "Deploying NVCF function...") + self.send_status_message("STATUS$PREPARING_CONTAINER") + self._deployer = OSWorldDeployer( + api_key=self._nvcf_api_key, + org_name=self._nvcf_org, + ) + result = self._deployer.create_function(func_config) + function = result.get("function", {}) + function_id = function.get("id") + version_id = function.get("versionId") + if not function_id or not version_id: + raise RuntimeError(f"NVCF create_function did not return ids: {result}") + self.log("info", f"Created function {function_id}; deploying...") + self._deployer.deploy(function_id, version_id, deploy_config) + self.log("info", "Waiting for NVCF function to become ACTIVE...") + self._deployer.wait_for_active( + function_id, version_id, timeout=1800, poll_interval=30 + ) + self.log("info", "NVCF function is ACTIVE") + return function_id, version_id + + async def connect(self) -> None: + """Deploy NVCF function if needed, then mark runtime as connected.""" + self.send_status_message("STATUS$STARTING_RUNTIME") + from openhands.utils.async_utils import 
call_sync_from_async + if self._nvcf_function_id is None: + self._nvcf_function_id, self._nvcf_version_id = await call_sync_from_async( + self._deploy_nvcf + ) + self._we_deployed_this = True + self._runtime_initialized = True + self.log("info", f"NVCF runtime connected: {self._nvcf_function_id}") + + def check_if_alive(self) -> None: + """NVCF runtime has no local action server; require subclass to implement.""" + if not getattr(self, "_runtime_initialized", False): + raise AgentRuntimeDisconnectedError("NVCF runtime is not connected.") + + def run_action(self, action) -> Observation: + """Base NVCF runtime does not execute actions; subclasses override for NVCF API.""" + return ErrorObservation( + "This runtime does not support the requested action. " + "Use OSWorldNVCFRuntime for OSWorld actions." + ) + + def close(self, rm_all_containers: bool | None = None) -> None: + """Undeploy NVCF function if we deployed in this session, then close session.""" + if self._we_deployed_this and self._undeploy_on_close and self._deployer: + if self._nvcf_function_id and self._nvcf_version_id: + try: + self.log("info", "Undeploying NVCF function...") + self._deployer.undeploy( + self._nvcf_function_id, + self._nvcf_version_id, + graceful=True, + ) + self.log("info", "NVCF function undeployed") + except Exception as e: + logger.warning(f"Failed to undeploy NVCF function: {e}") + self._deployer = None + super().close() diff --git a/openhands/runtime/impl/nvcf/osworld_nvcf_runtime.py b/openhands/runtime/impl/nvcf/osworld_nvcf_runtime.py new file mode 100644 index 000000000..291f25a13 --- /dev/null +++ b/openhands/runtime/impl/nvcf/osworld_nvcf_runtime.py @@ -0,0 +1,961 @@ +"""OSWorld NVCF runtime: same API as OSWorldSingularityRuntime but dispatches to NVCF API.""" + +import os +import time +import threading +from typing import TYPE_CHECKING, Any, Optional + +import httpx + +from openhands.core.config import OpenHandsConfig +from openhands.core.exceptions import 
AgentRuntimeDisconnectedError +from openhands.core.logger import openhands_logger as logger +from openhands.events import EventStream +from openhands.events.tool import ToolCallMetadata +from openhands.runtime.impl.nvcf.nvcf_runtime import NVCFRuntime +from openhands.runtime.impl.nvcf.nvcf_proxy import NVCFLocalProxy +from openhands.runtime.plugins import PluginRequirement +from openhands.runtime.utils.osworld_http_client import NVCFHttpClient + +if TYPE_CHECKING: + from openhands.events.observation import Observation + +NVCF_API_BASE = "https://grpc.nvcf.nvidia.com/api" +NVCF_BASE_URL = "https://grpc.nvcf.nvidia.com" + + +class OSWorldNVCFRuntime(NVCFRuntime): + """Runtime for OSWorld via NVCF. Same API as OSWorldSingularityRuntime; dispatches to NVCF.""" + + def __init__( + self, + config: OpenHandsConfig, + event_stream: EventStream, + sid: str = 'default', + plugins: list[PluginRequirement] | None = None, + env_vars: dict[str, str] | None = None, + status_callback: Any | None = None, + attach_to_existing: bool = False, + headless_mode: bool = True, + user_id: str | None = None, + git_provider_tokens: Any = None, + nvcf_function_id: str | None = None, + nvcf_version_id: str | None = None, + nvcf_api_key: str | None = None, + nvcf_org: str | None = None, + nvcf_function_config: Any = None, + nvcf_deployment_config: Any = None, + undeploy_on_close: bool = True, + os_type: str = 'linux', + enable_chrome_proxy: bool = True, + enable_vlc_proxy: bool = True, + ): + self.os_type = os_type.lower() + self._nvcf_client: httpx.Client | None = None + self.screen_size: tuple[int, int] = (1920, 1080) # updated by get_vm_screen_size + + # Local proxy settings + self._enable_chrome_proxy = enable_chrome_proxy + self._enable_vlc_proxy = enable_vlc_proxy + self._chrome_proxy: Optional[NVCFLocalProxy] = None + self._vlc_proxy: Optional[NVCFLocalProxy] = None + + super().__init__( + config=config, + event_stream=event_stream, + sid=sid, + plugins=plugins, + env_vars=env_vars, + 
status_callback=status_callback, + attach_to_existing=attach_to_existing, + headless_mode=headless_mode, + user_id=user_id, + git_provider_tokens=git_provider_tokens, + nvcf_function_id=nvcf_function_id, + nvcf_version_id=nvcf_version_id, + nvcf_api_key=nvcf_api_key, + nvcf_org=nvcf_org, + nvcf_function_config=nvcf_function_config, + nvcf_deployment_config=nvcf_deployment_config, + undeploy_on_close=undeploy_on_close, + ) + + async def connect(self) -> None: + await super().connect() + self.log("debug", "Connecting to OSWorld NVCF function...") + headers = { + "Authorization": f"Bearer {self._nvcf_api_key}", + "Function-ID": self._nvcf_function_id, + } + self._nvcf_client = httpx.Client( + base_url=NVCF_API_BASE, + headers=headers, + timeout=60.0, + ) + # Verify endpoint and capture NVCF session headers + r = self._nvcf_client.get("/screenshot", timeout=15.0) + if r.status_code != 200: + body = r.text[:500] + self._nvcf_client.close() + self._nvcf_client = None + raise AgentRuntimeDisconnectedError( + f"NVCF function returned HTTP {r.status_code}: {body}" + ) + # NVCF stateful functions return session routing headers — persist them + self._nvcf_session_headers = {} + self.log("info", f"NVCF init response headers: {dict(r.headers)}") + for hdr in ("NVCF-REQID", "NVCF-SESSION-ID", "nvcf-reqid", "nvcf-session-id"): + val = r.headers.get(hdr) + if val: + self._nvcf_client.headers[hdr] = val + self._nvcf_session_headers[hdr] = val + self.log("info", f"Captured NVCF session header: {hdr}={val}") + self.log("info", f"NVCF session headers captured: {self._nvcf_session_headers}") + self.log("info", f"OSWorld NVCF client ready: {self._nvcf_function_id}") + + # Start NVCF session keepalive to prevent idle timeout (~30-60s) + self._keepalive_stop = threading.Event() + self._keepalive_thread = threading.Thread( + target=self._nvcf_keepalive_loop, daemon=True + ) + self._keepalive_thread.start() + + # Soft-reset VM state from previous job (kill leftover apps, clear temp files) + 
self._reset_vm() + # self.log("info", "Skipping VM soft-reset") + + # Start local proxies for Chrome DevTools and VLC + self._start_local_proxies() + + def _reset_vm(self) -> None: + """Soft-reset VM state between jobs: kill leftover apps, clear temp files. + + Important: Do NOT kill chrome, chromium, or socat -- these are part of the + VM base state and are expected by setup steps (_chrome_open_tabs_setup uses + socat to proxy Chrome DevTools). Setup will re-launch them if needed. + """ + script = """ +# Kill common leftover apps from OSWorld tasks +# NOTE: Do NOT kill chrome/chromium/socat - they are part of the VM base state +pkill -f thunderbird || true +pkill -f libreoffice || true +pkill -f vlc || true +pkill -f gimp || true +pkill -f nautilus || true +pkill -f gedit || true +pkill -f "code " || true +pkill -f evince || true +pkill -f eog || true + +# Clean temp files (screenshot temp files + general tmp) +rm -f /tmp/tmp*.png 2>/dev/null || true +rm -f /tmp/tmp*.jpg 2>/dev/null || true +rm -rf /tmp/tmp[A-Za-z0-9_]* 2>/dev/null || true +rm -rf /home/user/Downloads/* 2>/dev/null || true + +# Remove files uploaded to Desktop by previous jobs +find /home/user/Desktop -maxdepth 1 -newer /etc/hostname -not -name "*.desktop" -delete 2>/dev/null || true + +sleep 0.5 +""" + try: + r = self._nvcf_post("/run_bash_script", json={"script": script, "timeout": 30}) + if r.status_code == 200: + self.log("info", "VM soft-reset completed successfully") + else: + self.log("warning", f"VM soft-reset returned HTTP {r.status_code}") + except Exception as e: + self.log("warning", f"VM soft-reset failed: {e}") + + def _nvcf_keepalive_loop(self) -> None: + """Ping the NVCF function every 20s to prevent session idle timeout.""" + while not self._keepalive_stop.wait(20.0): + try: + r = self.http_client.get("/platform", timeout=10.0) + if r.status_code != 200: + self.log("warning", f"Keepalive got HTTP {r.status_code}") + except Exception as e: + self.log("warning", f"Keepalive failed: 
{e}") + + def _start_local_proxies(self) -> None: + """Start local proxies for Chrome DevTools and VLC web interface.""" + if self._enable_chrome_proxy: + try: + self._chrome_proxy = NVCFLocalProxy( + nvcf_base_url=NVCF_BASE_URL, + nvcf_path_prefix="/chrome", + api_key=self._nvcf_api_key, + function_id=self._nvcf_function_id, + ) + self._chrome_proxy.start() + self.log("info", f"Chrome DevTools proxy started at {self._chrome_proxy.local_url}") + except Exception as e: + self.log("warning", f"Failed to start Chrome proxy: {e}") + self._chrome_proxy = None + + if self._enable_vlc_proxy: + try: + self._vlc_proxy = NVCFLocalProxy( + nvcf_base_url=NVCF_BASE_URL, + nvcf_path_prefix="/vlc", + api_key=self._nvcf_api_key, + function_id=self._nvcf_function_id, + ) + self._vlc_proxy.start() + self.log("info", f"VLC web interface proxy started at {self._vlc_proxy.local_url}") + except Exception as e: + self.log("warning", f"Failed to start VLC proxy: {e}") + self._vlc_proxy = None + + def _stop_local_proxies(self) -> None: + """Stop all local proxies.""" + if self._chrome_proxy: + try: + self._chrome_proxy.stop() + self.log("debug", "Chrome DevTools proxy stopped") + except Exception as e: + self.log("warning", f"Failed to stop Chrome proxy: {e}") + self._chrome_proxy = None + + if self._vlc_proxy: + try: + self._vlc_proxy.stop() + self.log("debug", "VLC web interface proxy stopped") + except Exception as e: + self.log("warning", f"Failed to stop VLC proxy: {e}") + self._vlc_proxy = None + + def check_if_alive(self) -> None: + if not self._nvcf_client: + raise AgentRuntimeDisconnectedError("OSWorld NVCF runtime is not connected.") + r = self._nvcf_get("/screenshot", timeout=5.0) + if r.status_code != 200: + raise AgentRuntimeDisconnectedError("NVCF function is not responding") + + def close(self, rm_all_containers: bool | None = None) -> None: + # Stop keepalive thread + if hasattr(self, '_keepalive_stop'): + self._keepalive_stop.set() + + # Stop local proxies first + 
self._stop_local_proxies() + + # Clear shared http_client + if hasattr(self, '_http_client'): + self._http_client = None + + if self._nvcf_client: + try: + self._nvcf_client.close() + except Exception as e: + logger.warning(f"Failed to close NVCF client: {e}") + self._nvcf_client = None + super().close(rm_all_containers) + + # --- Properties (match OSWorldSingularityRuntime) --- + @property + def osworld_vm_url(self) -> str: + return NVCF_API_BASE + + @property + def vnc_url(self) -> str: + # VNC is not currently proxied (would need WebSocket support for noVNC) + return "vnc://nvcf-not-available" + + @property + def chromium_devtools_url(self) -> str: + """Get the Chrome DevTools URL (local proxy or placeholder).""" + if self._chrome_proxy and self._chrome_proxy._running: + return self._chrome_proxy.local_url + return "http://nvcf-not-available" + + @property + def chromium_port(self) -> int: + """Get the local Chrome DevTools proxy port.""" + if self._chrome_proxy and self._chrome_proxy._running: + return self._chrome_proxy.local_port + return 9222 # Default fallback + + @property + def vlc_url(self) -> str: + """Get the VLC web interface URL (local proxy or placeholder).""" + if self._vlc_proxy and self._vlc_proxy._running: + return self._vlc_proxy.local_url + return "http://nvcf-not-available" + + @property + def vlc_port(self) -> int: + """Get the local VLC proxy port.""" + if self._vlc_proxy and self._vlc_proxy._running: + return self._vlc_proxy.local_port + return 8080 # Default fallback + + @property + def vm_ip(self) -> str: + """Get the VM IP for setup controller compatibility. + + For NVCF, this returns localhost since we use local proxies. + """ + return "127.0.0.1" + + @property + def http_client(self): + """Get the HTTP client for runtime-agnostic communication. + + Returns a shared NVCFHttpClient that handles NVCF authentication and URL rewriting. + Reuses the same instance so NVCF session state is preserved across all callers. 
+ """ + if not hasattr(self, '_http_client') or self._http_client is None: + self._http_client = NVCFHttpClient( + api_key=self._nvcf_api_key, + function_id=self._nvcf_function_id, + session_headers=getattr(self, '_nvcf_session_headers', None), + ) + return self._http_client + + # --- OSWorld API (NVCF HTTP) --- + def _nvcf_get(self, endpoint: str, **kwargs): + """GET via shared http_client (requests-based) to maintain NVCF session.""" + return self.http_client.get(endpoint, **kwargs) + + def _nvcf_post(self, endpoint: str, **kwargs): + """POST via shared http_client (requests-based) to maintain NVCF session.""" + return self.http_client.post(endpoint, **kwargs) + + def get_vm_screenshot(self) -> bytes | None: + max_retries = 5 + for attempt in range(max_retries): + try: + r = self._nvcf_get("/screenshot", timeout=30.0) + if r.status_code == 200: + return r.content + body = r.text[:200] if r.text else "" + self.log("warning", + f"Screenshot attempt {attempt + 1}/{max_retries} failed: " + f"HTTP {r.status_code} fn={self._nvcf_function_id} body={body}") + except Exception as e: + self.log("warning", f"Screenshot attempt {attempt + 1}/{max_retries} failed: {e}") + if attempt < max_retries - 1: + backoff = min(5 * (2 ** attempt), 30) # 5s, 10s, 20s, 30s + time.sleep(backoff) + self.log("error", f"Failed to get VM screenshot after {max_retries} retries (fn={self._nvcf_function_id})") + return None + + def get_vm_accessibility_tree(self) -> str | None: + try: + r = self._nvcf_get("/accessibility", timeout=30.0) + if r.status_code != 200: + return None + try: + return r.json().get("AT") or r.json().get("accessibility_tree") or r.text + except Exception: + return r.text + except Exception as e: + self.log("error", f"Failed to get VM accessibility tree: {e}") + return None + + def _execute_pyautogui_command(self, pyautogui_command: str) -> dict: + command = ( + "import pyautogui; import time; pyautogui.FAILSAFE = False; " + + pyautogui_command + ) + payload = {"command": 
["python", "-c", command], "shell": False} + max_retries = 5 + for attempt in range(max_retries): + try: + r = self._nvcf_post("/execute", json=payload, timeout=30.0) + if r.status_code == 200: + return r.json() + body = r.text[:200] if r.text else "" + self.log("warning", + f"Execute attempt {attempt + 1}/{max_retries} failed: " + f"HTTP {r.status_code} fn={self._nvcf_function_id} body={body}") + except Exception as e: + self.log("warning", f"Execute attempt {attempt + 1}/{max_retries} failed: {e}") + if attempt < max_retries - 1: + backoff = min(5 * (2 ** attempt), 30) # 5s, 10s, 20s, 30s + time.sleep(backoff) + return {"status": "error", "message": f"Failed after {max_retries} retries"} + + def _action_to_pyautogui_command(self, action_type: str, parameters: dict) -> str | None: + import random + move_mode = random.choice([ + "pyautogui.easeInQuad", "pyautogui.easeOutQuad", "pyautogui.easeInOutQuad", + "pyautogui.easeInBounce", "pyautogui.easeInElastic", + ]) + if action_type == "CLICK": + x, y = parameters.get("x"), parameters.get("y") + button = parameters.get("button", "left") + num_clicks = parameters.get("clicks", 1) + interval = parameters.get("interval", 0.0) + duration = parameters.get("duration", 0.0) + if x is not None and y is not None: + return f"pyautogui.click(x={x}, y={y}, button='{button}', clicks={num_clicks}, interval={interval}, duration={duration})" + return "pyautogui.click()" + elif action_type == "DOUBLE_CLICK": + x, y = parameters.get("x"), parameters.get("y") + button = parameters.get("button", "left") + interval = parameters.get("interval", 0.0) + duration = parameters.get("duration", 0.0) + if x is not None and y is not None: + return f"pyautogui.doubleClick(x={x}, y={y}, button='{button}', interval={interval}, duration={duration})" + return "pyautogui.doubleClick()" + elif action_type == "TRIPLE_CLICK": + x, y = parameters.get("x"), parameters.get("y") + button = parameters.get("button", "left") + interval = parameters.get("interval", 
0.0) + duration = parameters.get("duration", 0.0) + if x is not None and y is not None: + return f"pyautogui.tripleClick(x={x}, y={y}, button='{button}', interval={interval}, duration={duration})" + return "pyautogui.tripleClick()" + elif action_type == "RIGHT_CLICK": + x, y = parameters.get("x"), parameters.get("y") + interval = parameters.get("interval", 0.0) + duration = parameters.get("duration", 0.0) + if x is not None and y is not None: + return f"pyautogui.rightClick(x={x}, y={y}, interval={interval}, duration={duration})" + return "pyautogui.rightClick()" + elif action_type == "MIDDLE_CLICK": + x, y = parameters.get("x"), parameters.get("y") + interval = parameters.get("interval", 0.0) + duration = parameters.get("duration", 0.0) + if x is not None and y is not None: + return f"pyautogui.middleClick(x={x}, y={y}, interval={interval}, duration={duration})" + return "pyautogui.click(button='middle')" + elif action_type == "MOVE_TO": + x, y = parameters.get("x"), parameters.get("y") + duration = parameters.get("duration", 0.0) + if x is not None and y is not None: + return f"pyautogui.moveTo({x}, {y}, {duration}, {move_mode})" + return "pyautogui.moveTo()" + elif action_type == "DRAG_TO": + x, y = parameters.get("x"), parameters.get("y") + duration = parameters.get("duration", 0.0) + button = parameters.get("button", "left") + mouseDownUp = parameters.get("mouseDownUp", True) + if x is not None and y is not None: + return f"pyautogui.dragTo({x}, {y}, button='{button}', duration={duration}, mouseDownUp={mouseDownUp})" + return None + elif action_type == "SCROLL": + x, y = parameters.get("x"), parameters.get("y") + amount = parameters.get("amount", 1) + return f"pyautogui.scroll({amount}, x={x}, y={y})" + elif action_type == "HSCROLL": + x, y = parameters.get("x"), parameters.get("y") + amount = parameters.get("amount", 1) + return f"pyautogui.hscroll({amount}, x={x}, y={y})" + elif action_type == "TYPING": + text = parameters.get("text", "") + interval = 
parameters.get("interval", 0.0) + return f"pyautogui.typewrite({repr(text)}, interval={interval})" + elif action_type == "PRESS": + key = parameters.get("key", "") + presses = parameters.get("presses", 1) + if isinstance(key, list): + return f"pyautogui.hotkey({', '.join(repr(k) for k in key)})" + if presses > 1: + return f"pyautogui.press('{key}', presses={presses})" + return f"pyautogui.press('{key}')" + elif action_type == "HOTKEY": + keys = parameters.get("keys", []) + if isinstance(keys, list) and keys: + return f"pyautogui.hotkey({', '.join(repr(k) for k in keys)})" + return None + elif action_type == "KEY_DOWN": + return f"pyautogui.keyDown('{parameters.get('key', '')}')" + elif action_type == "KEY_UP": + return f"pyautogui.keyUp('{parameters.get('key', '')}')" + elif action_type == "MOUSE_DOWN": + return f"pyautogui.mouseDown(button='{parameters.get('button', 'left')}')" + elif action_type == "MOUSE_UP": + return f"pyautogui.mouseUp(button='{parameters.get('button', 'left')}')" + elif action_type == "WAIT": + return f"time.sleep({parameters.get('seconds', 1)})" + return None + + def execute_vm_action(self, action_data: dict) -> dict: + action_type = action_data.get("action_type") + parameters = action_data.get("parameters", {}) + cmd = self._action_to_pyautogui_command(action_type, parameters) + if cmd is None: + return {"status": "error", "message": f"Unknown action type: {action_type}"} + return self._execute_pyautogui_command(cmd) + + def run_action(self, action) -> "Observation": + from openhands.events.action.os import OSWorldInteractiveAction + if isinstance(action, OSWorldInteractiveAction): + return self.osworld_interactive(action) + return super().run_action(action) + + def osworld_interactive(self, action) -> "Observation": + from openhands.events.observation import ErrorObservation + method = action.method + params = action.params or {} + try: + if method == "execute_action": + return self._handle_execute_action(params) + if method == 
"execute_agentic_action": + return self._handle_execute_agentic_action( + params, action.tool_call_metadata, action.pause_time + ) + if method == "get_screenshot": + return self._handle_get_screenshot() + if method == "get_accessibility_tree": + return self._handle_get_accessibility_tree() + if method == "get_terminal_output": + return self._handle_get_terminal_output() + if method == "get_file": + return self._handle_get_file(params) + if method == "execute_python_command": + return self._handle_execute_python_command(params) + if method == "run_python_script": + return self._handle_run_python_script(params) + if method == "run_bash_script": + return self._handle_run_bash_script(params) + if method == "start_recording": + return self._handle_start_recording() + if method == "end_recording": + return self._handle_end_recording(params) + if method == "get_vm_platform": + return self._handle_get_vm_platform() + if method == "get_vm_screen_size": + return self._handle_get_vm_screen_size() + if method == "get_vm_window_size": + return self._handle_get_vm_window_size(params) + if method == "get_vm_wallpaper": + return self._handle_get_vm_wallpaper() + if method == "get_vm_desktop_path": + return self._handle_get_vm_desktop_path() + if method == "get_vm_directory_tree": + return self._handle_get_vm_directory_tree(params) + return ErrorObservation(f"Unknown OSWorld method: {method}") + except Exception as e: + self.log("error", f"OSWorld action failed: {e}") + return ErrorObservation(str(e)) + + def _handle_execute_action(self, params: dict) -> "Observation": + from openhands.events.observation import CmdOutputObservation + action_data = params.get("action", params) + result = self.execute_vm_action(action_data) + if result.get("status") == "success": + return CmdOutputObservation( + content=result.get("output", "Action executed successfully"), + command=str(action_data), + exit_code=0, + ) + return CmdOutputObservation( + content=result.get("error", result.get("message", 
"Unknown error")), + command=str(action_data), + exit_code=1, + ) + + def _handle_execute_agentic_action( + self, params: dict, tool_call_metadata: ToolCallMetadata | None, pause_time: float = 0.0 + ) -> "Observation": + from openhands.events.observation import ErrorObservation + from openhands.events.observation.osworld import OSWorldOutputObservation + import base64 + action_data = params.get("action", params) + if not getattr(self, "screen_size", None): + self._handle_get_vm_screen_size() + width, height = self.screen_size + if "parameters" in action_data: + p = action_data["parameters"] + if "x" in p: + p["x"] = int(p["x"] * width) + if "y" in p: + p["y"] = int(p["y"] * height) + result = self.execute_vm_action(action_data) + if result.get("status") != "success": + return ErrorObservation( + result.get("error", result.get("message", "Unknown error")), + error_id=getattr(tool_call_metadata, "tool_call_id", None), + name=getattr(tool_call_metadata, "function_name", None), + ) + if pause_time > 0.5: + time.sleep(pause_time) + screenshot_b64 = None + screenshot_bytes = self.get_vm_screenshot() + if screenshot_bytes: + screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8") + a11y = self.get_vm_accessibility_tree() + return OSWorldOutputObservation( + command=str(action_data), + screenshot=screenshot_b64, + accessibility_tree=a11y, + tool_call_id=tool_call_metadata.tool_call_id if tool_call_metadata else None, + name=tool_call_metadata.function_name if tool_call_metadata else None, + ) + + def _handle_get_screenshot(self) -> "Observation": + from openhands.events.observation import CmdOutputObservation, ErrorObservation + # NVCF may need a moment after connect; retry once to avoid transient failure + screenshot_bytes = self.get_vm_screenshot() + if not screenshot_bytes: + time.sleep(2.0) + screenshot_bytes = self.get_vm_screenshot() + if screenshot_bytes: + return CmdOutputObservation( + content="Screenshot captured", + command="get_screenshot", + 
exit_code=0, + ) + return ErrorObservation("Failed to capture screenshot") + + def _handle_get_accessibility_tree(self) -> "Observation": + from openhands.events.observation import CmdOutputObservation + # NVCF: GET /api/accessibility only + at = self.get_vm_accessibility_tree() + if at: + return CmdOutputObservation(content=at, command="get_accessibility_tree", exit_code=0) + return CmdOutputObservation(content="", command="get_accessibility_tree", exit_code=0) + + def _handle_get_terminal_output(self) -> "Observation": + from openhands.events.observation import CmdOutputObservation, ErrorObservation + for attempt in range(5): + try: + r = self._nvcf_get("/terminal", timeout=30.0) + if r.status_code == 200: + output = r.json().get("output") or "" + return CmdOutputObservation( + content=output, + command="get_terminal_output", + exit_code=0, + ) + body = r.text[:200] if r.text else "" + self.log("warning", f"Terminal output attempt {attempt + 1}/5: HTTP {r.status_code} body={body}") + except Exception as e: + self.log("warning", f"Terminal output attempt {attempt + 1}/5: {e}") + if attempt < 4: + time.sleep(5.0) + return ErrorObservation("Failed to get terminal output after 5 retries") + + def _handle_get_file(self, params: dict) -> "Observation": + from openhands.events.observation import CmdOutputObservation, ErrorObservation + import base64 + file_path = params.get("file_path", "") + if not file_path: + return ErrorObservation("file_path parameter required") + try: + r = self._nvcf_post( + "/file", + data={"file_path": file_path}, + timeout=30.0, + ) + if r.status_code == 200: + content_b64 = base64.b64encode(r.content).decode("utf-8") + return CmdOutputObservation( + content=f"base64:{content_b64}", + command=f"get_file {file_path}", + exit_code=0, + ) + return ErrorObservation(f"Failed to get file: {r.status_code}") + except Exception as e: + return ErrorObservation(f"Failed to get file: {e}") + + def _execute_pyautogui_command(self, pyautogui_command: str) 
-> dict: + """Execute a PyAutoGUI command string in the VM (same as singularity). + + Args: + pyautogui_command: Raw PyAutoGUI command(s) to execute + + Returns: + Response dictionary from OSWorld server (status, output, error, returncode or message). + """ + wrapped = ( + "import pyautogui; import time; pyautogui.FAILSAFE = False; " + f"{pyautogui_command}" + ) + payload = {"command": ["python", "-c", wrapped], "shell": False} + max_retries = 5 + for attempt in range(max_retries): + try: + r = self._nvcf_post("/execute", json=payload, timeout=30.0) + if r.status_code == 200: + return r.json() + body = r.text[:200] if r.text else "" + self.log("warning", + f"Execute attempt {attempt + 1}/{max_retries} failed: " + f"HTTP {r.status_code} fn={self._nvcf_function_id} body={body}") + except Exception as e: + self.log("warning", f"Execute attempt {attempt + 1}/{max_retries} failed: {e}") + if attempt < max_retries - 1: + time.sleep(5.0) + self.log("error", f"Failed to execute PyAutoGUI command after {max_retries} retries: {pyautogui_command[:100]}") + return {"status": "error", "message": f"Failed after {max_retries} retries"} + + def _handle_execute_python_command(self, params: dict) -> "Observation": + """Handle execute_python_command - raw Python command execution (PyAutoGUI-style, same as singularity).""" + from openhands.events.observation import CmdOutputObservation + + command = params.get("command", "") + if not command: + return CmdOutputObservation( + content="Error: command parameter required", + command="execute_python_command", + exit_code=1, + ) + + result = self._execute_pyautogui_command(command) + + if result.get("status") == "success": + return CmdOutputObservation( + content=result.get("output", ""), + command=command, + exit_code=result.get("returncode", 0), + ) + else: + error_msg = result.get("error", result.get("message", "Unknown error")) + return CmdOutputObservation( + content=f"Error: {error_msg}", + command=command, + 
exit_code=result.get("returncode", 1), + ) + + def _handle_run_python_script(self, params: dict) -> "Observation": + from openhands.events.observation import CmdOutputObservation + script = params.get("script", "") + if not script: + return CmdOutputObservation(content="script parameter required", command="run_python_script", exit_code=1) + try: + r = self._nvcf_post("/run_python", json={"code": script}, timeout=90.0) + if r.status_code == 200: + res = r.json() + out, err = res.get("output", ""), res.get("error", "") + return CmdOutputObservation( + content=f"Output:\n{out}" + (f"\nError:\n{err}" if err else ""), + command="run_python_script", + exit_code=res.get("returncode", 0), + ) + return CmdOutputObservation(content=r.text or "Run failed", command="run_python_script", exit_code=1) + except Exception as e: + return CmdOutputObservation(content=str(e), command="run_python_script", exit_code=1) + + def _handle_run_bash_script(self, params: dict) -> "Observation": + """Handle run_bash_script. 
Uses POST /api/run_bash_script.""" + from openhands.events.observation import CmdOutputObservation, ErrorObservation + + script = params.get("script", "") + if not script: + return ErrorObservation("script parameter required") + timeout = params.get("timeout", 30) + working_dir = params.get("working_dir") + + payload = {"script": script, "timeout": timeout} + if working_dir is not None: + payload["working_dir"] = working_dir + + for attempt in range(5): + try: + r = self._nvcf_post( + "/run_bash_script", + json=payload, + timeout=timeout + 10.0, + ) + if r.status_code == 200: + result = r.json() + output = result.get("output", "") + error = result.get("error", "") + content = output + if error: + content = f"{content}\n{error}" if content else error + return CmdOutputObservation( + content=content, + command="run_bash_script", + exit_code=result.get("returncode", 0), + ) + if r.status_code in (404, 502, 503, 504): + body = r.text[:200] if r.text else "" + self.log("warning", + f"run_bash_script attempt {attempt + 1}/5: HTTP {r.status_code} body={body}") + if attempt < 4: + time.sleep(5.0) + continue + try: + error_detail = r.json() + error_msg = error_detail.get("output", error_detail.get("message", "Unknown error")) + except Exception: + error_msg = r.text or "Unknown error" + return ErrorObservation(f"Failed to run bash script (HTTP {r.status_code}): {error_msg}") + except Exception as e: + self.log("warning", f"run_bash_script attempt {attempt + 1}/5: {e}") + if attempt < 4: + time.sleep(5.0) + continue + return ErrorObservation(f"Failed to run bash script: {e}") + return ErrorObservation("Failed to run bash script after 5 retries") + + def _handle_start_recording(self) -> "Observation": + """Handle start_recording. 
Uses POST /api/start_recording.""" + from openhands.events.observation import CmdOutputObservation, ErrorObservation + + try: + r = self._nvcf_post("/start_recording", timeout=10.0) + if r.status_code == 200: + return CmdOutputObservation( + content="Recording started", + command="start_recording", + exit_code=0, + ) + return ErrorObservation(f"Failed to start recording: {r.status_code}") + except Exception as e: + return ErrorObservation(f"Failed to start recording: {e}") + + def _handle_end_recording(self, params: dict) -> "Observation": + """Handle end_recording. POST /api/end_recording returns video file (binary); return as base64.""" + from openhands.events.observation import CmdOutputObservation, ErrorObservation + import base64 + + try: + r = self._nvcf_post("/end_recording", timeout=60.0) + if r.status_code == 200: + video_content = r.content + content_b64 = base64.b64encode(video_content).decode("utf-8") + return CmdOutputObservation( + content=f"base64:{content_b64}", + command="end_recording", + exit_code=0, + ) + return ErrorObservation(f"Failed to end recording: {r.status_code}") + except Exception as e: + return ErrorObservation(f"Failed to end recording: {e}") + + def _handle_get_vm_platform(self) -> "Observation": + """Handle get_vm_platform. Uses GET /api/platform (returns plain text e.g. 
Linux).""" + from openhands.events.observation import CmdOutputObservation + + try: + r = self._nvcf_get("/platform", timeout=30.0) + if r.status_code == 200: + platform_str = (r.text or "").strip() + return CmdOutputObservation( + content=platform_str or "Unknown", + command="get_vm_platform", + exit_code=0, + ) + except Exception: + pass + result = self._execute_pyautogui_command("import platform; print(platform.system())") + if result.get("status") == "success": + return CmdOutputObservation( + content=result.get("output", "").strip() or "Unknown", + command="get_vm_platform", + exit_code=0, + ) + return CmdOutputObservation(content="Unknown", command="get_vm_platform", exit_code=1) + + def _handle_get_vm_screen_size(self) -> "Observation": + """Handle get_vm_screen_size. Uses POST /api/screen_size.""" + from openhands.events.observation import CmdOutputObservation, ErrorObservation + + try: + if hasattr(self, "screen_size") and self.screen_size: + width, height = self.screen_size + else: + r = self._nvcf_post("/screen_size", timeout=30.0) + if r.status_code != 200: + return ErrorObservation(f"Failed to get screen size: {r.status_code}") + size = r.json() + width = size.get("width", 1920) + height = size.get("height", 1080) + self.screen_size = (width, height) + return CmdOutputObservation( + content=f"Width: {width}, Height: {height}", + command="get_vm_screen_size", + exit_code=0, + ) + except Exception as e: + return ErrorObservation(f"Failed to get screen size: {e}") + + def _handle_get_vm_window_size(self, params: dict) -> "Observation": + from openhands.events.observation import CmdOutputObservation + # NVCF has no /api/window_size; use /api/execute with wmctrl only + app = params.get("app_class_name", "") or "window" + try: + payload = {"command": ["wmctrl", "-l", "-G"], "shell": False} + r = self._nvcf_post("/execute", json=payload, timeout=30.0) + if r.status_code == 200: + res = r.json() + out = (res.get("output") or "").strip() + if out: + return 
CmdOutputObservation( + content=out[:2000], + command=f"get_vm_window_size {app}", + exit_code=0, + ) + return CmdOutputObservation( + content="Window geometry not available (use get_vm_screen_size for display size).", + command=f"get_vm_window_size {app}", + exit_code=0, + ) + except Exception: + return CmdOutputObservation( + content="Window geometry not available (use get_vm_screen_size for display size).", + command=f"get_vm_window_size {app}", + exit_code=0, + ) + + def _handle_get_vm_wallpaper(self) -> "Observation": + """Handle get_vm_wallpaper. POST /api/wallpaper returns wallpaper image (binary); return as base64.""" + from openhands.events.observation import CmdOutputObservation, ErrorObservation + import base64 + + try: + r = self._nvcf_post("/wallpaper", timeout=30.0) + if r.status_code == 200: + wallpaper_bytes = r.content + content_b64 = base64.b64encode(wallpaper_bytes).decode("utf-8") + return CmdOutputObservation( + content=f"base64:{content_b64}", + command="get_vm_wallpaper", + exit_code=0, + ) + return ErrorObservation(f"Failed to get wallpaper: {r.status_code}") + except Exception as e: + return ErrorObservation(f"Failed to get wallpaper: {e}") + + def _handle_get_vm_desktop_path(self) -> "Observation": + """Handle get_vm_desktop_path. Uses POST /api/desktop_path.""" + from openhands.events.observation import CmdOutputObservation, ErrorObservation + + try: + r = self._nvcf_post("/desktop_path", timeout=30.0) + if r.status_code == 200: + desktop_path = r.json().get("desktop_path", "") + return CmdOutputObservation( + content=desktop_path, + command="get_vm_desktop_path", + exit_code=0, + ) + return ErrorObservation(f"Failed to get desktop path: {r.status_code}") + except Exception as e: + return ErrorObservation(f"Failed to get desktop path: {e}") + + def _handle_get_vm_directory_tree(self, params: dict) -> "Observation": + """Handle get_vm_directory_tree. 
Uses POST /api/list_directory with JSON {path}.""" + from openhands.events.observation import CmdOutputObservation, ErrorObservation + import json + + path = params.get("path", "") + if not path: + return ErrorObservation("path parameter required") + try: + r = self._nvcf_post( + "/list_directory", + json={"path": path}, + timeout=30.0, + ) + if r.status_code == 200: + directory_tree = r.json().get("directory_tree", {}) + content = json.dumps(directory_tree, indent=2) + return CmdOutputObservation( + content=content, + command=f"get_vm_directory_tree {path}", + exit_code=0, + ) + return ErrorObservation(f"Failed to get directory tree: {r.status_code}") + except Exception as e: + return ErrorObservation(f"Failed to get directory tree: {e}") + + def get_microagents_from_selected_repo(self, selected_repository: str | None): + return [] diff --git a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py index 85b5c18d1..37260248d 100644 --- a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py +++ b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py @@ -32,6 +32,7 @@ from openhands.runtime.plugins import PluginRequirement from openhands.runtime.utils import find_available_tcp_port from openhands.runtime.utils.command import DEFAULT_MAIN_MODULE +from openhands.runtime.utils.osworld_http_client import DirectHttpClient from openhands.events.tool import ToolCallMetadata @@ -586,6 +587,19 @@ def vlc_url(self) -> str: """ return f'http://localhost:{self._vlc_port}' + + @property + def http_client(self): + """Get the HTTP client for runtime-agnostic communication. + + Returns a DirectHttpClient that makes direct HTTP requests to the VM. + """ + return DirectHttpClient( + base_url=self.osworld_vm_url, + chromium_port=self._chromium_port, + vlc_port=self._vlc_port + ) + def get_vm_screenshot(self) -> bytes | None: """Get screenshot from the VM. 
diff --git a/openhands/runtime/utils/osworld_http_client.py b/openhands/runtime/utils/osworld_http_client.py new file mode 100644 index 000000000..76b02e591 --- /dev/null +++ b/openhands/runtime/utils/osworld_http_client.py @@ -0,0 +1,216 @@ +"""HTTP client abstraction for OSWorld VM communication. + +This module provides a unified HTTP client interface that works with both: +- Singularity runtime (direct HTTP to localhost) +- NVCF runtime (authenticated HTTPS to NVIDIA cloud) + +This module is placed in openhands.runtime.utils to avoid circular imports +with openhands.nvidia, which has side effects on import (handler registration). +""" + +from typing import Protocol, Optional, Dict +import requests + +from openhands.core.logger import openhands_logger as logger + + +class OSWorldHttpClient(Protocol): + """Protocol for HTTP communication with OSWorld VM. + + This abstraction allows controllers and getters to use the same code + regardless of whether they're talking to a local VM or NVCF. + """ + + def get(self, endpoint: str, **kwargs) -> requests.Response: + """Make a GET request to the VM server.""" + ... + + def post(self, endpoint: str, **kwargs) -> requests.Response: + """Make a POST request to the VM server.""" + ... + + def get_cdp_url(self) -> str: + """Get the Chrome DevTools Protocol URL for Playwright.""" + ... + + def get_cdp_headers(self) -> Optional[Dict[str, str]]: + """Get headers needed for CDP connection (None for direct connection).""" + ... + + def get_vlc_url(self) -> str: + """Get the VLC web interface base URL.""" + ... + + def update_launch_command(self, command: str) -> str: + """Update the launch command to use the HTTP client.""" + ... + + +class DirectHttpClient: + """Direct HTTP client for Singularity/local runtime. + + Makes direct HTTP requests to the VM server running on localhost. + No special authentication or URL rewriting needed. 
+ """ + + def __init__(self, base_url: str, chromium_port: int, vlc_port: int): + """Initialize the direct HTTP client. + + Args: + base_url: Base URL for the VM server (e.g., http://127.0.0.1:5000) + chromium_port: Port for Chrome DevTools Protocol + vlc_port: Port for VLC web interface + """ + self.base_url = base_url.rstrip('/') + self.chromium_port = chromium_port + self.vlc_port = vlc_port + + def get(self, endpoint: str, **kwargs) -> requests.Response: + """Make a GET request to the VM server.""" + url = self.base_url + endpoint + return requests.get(url, **kwargs) + + def post(self, endpoint: str, **kwargs) -> requests.Response: + """Make a POST request to the VM server.""" + url = self.base_url + endpoint + return requests.post(url, **kwargs) + + def get_cdp_url(self) -> str: + """Get the Chrome DevTools Protocol URL.""" + return f"http://127.0.0.1:{self.chromium_port}" + + def get_cdp_headers(self) -> Optional[Dict[str, str]]: + """No special headers needed for direct connection.""" + return None + + def get_vlc_url(self) -> str: + """Get the VLC web interface URL.""" + return f"http://127.0.0.1:{self.vlc_port}" + + def update_launch_command(self, command: str) -> str: + """Update the launch command to use the HTTP client.""" + return command + +class NVCFHttpClient: + """NVCF HTTP client with authentication and URL rewriting. + + Makes authenticated HTTPS requests to NVIDIA Cloud Functions. + Handles WebSocket URL rewriting for Chrome DevTools Protocol. + """ + + NVCF_API_BASE = "https://grpc.nvcf.nvidia.com/api" + NVCF_CHROME_BASE = "https://grpc.nvcf.nvidia.com/chrome" + NVCF_VLC_BASE = "https://grpc.nvcf.nvidia.com/vlc" + + def __init__(self, api_key: str, function_id: str, session_headers: Optional[Dict[str, str]] = None): + """Initialize the NVCF HTTP client. + + Args: + api_key: NGC API key for authentication + function_id: NVCF function ID + session_headers: Optional NVCF session routing headers (e.g. 
NVCF-SESSION-ID) + """ + self.headers = { + "Authorization": f"Bearer {api_key}", + "Function-ID": function_id, + } + if session_headers: + self.headers.update(session_headers) + self._session = requests.Session() + self._session.headers.update(self.headers) + self._cached_cdp_url: Optional[str] = None + logger.info(f"NVCFHttpClient created with headers: { {k: v[:20]+'...' if len(str(v))>20 else v for k,v in self.headers.items()} }") + + def get(self, endpoint: str, **kwargs) -> requests.Response: + """Make an authenticated GET request to NVCF.""" + url = self.NVCF_API_BASE + endpoint + # Merge auth headers with any provided headers + headers = {**self.headers, **kwargs.pop('headers', {})} + return self._session.get(url, headers=headers, **kwargs) + + def post(self, endpoint: str, **kwargs) -> requests.Response: + """Make an authenticated POST request to NVCF.""" + url = self.NVCF_API_BASE + endpoint + # Merge auth headers with any provided headers + headers = {**self.headers, **kwargs.pop('headers', {})} + return self._session.post(url, headers=headers, **kwargs) + + def get_cdp_url(self) -> str: + """Get the Chrome DevTools Protocol URL with WebSocket rewriting. + + Playwright's connect_over_cdp() performs discovery by fetching /json/version. + Chrome returns ws://localhost/... which doesn't work for NVCF. + We fetch the discovery ourselves and rewrite the WebSocket URL. + """ + try: + # Fetch the WebSocket URL from Chrome's discovery endpoint + response = requests.get( + f"{self.NVCF_CHROME_BASE}/json/version", + headers=self.headers, + timeout=30 + ) + response.raise_for_status() + data = response.json() + ws_url = data.get("webSocketDebuggerUrl", "") + + if not ws_url: + logger.warning("No webSocketDebuggerUrl in Chrome discovery response") + return self.NVCF_CHROME_BASE + + # Rewrite ws://localhost/... to wss://grpc.nvcf.nvidia.com/chrome/... 
+ rewritten_url = ws_url.replace( + "ws://localhost/", + "wss://grpc.nvcf.nvidia.com/chrome/" + ) + logger.debug(f"CDP URL rewritten: {ws_url} -> {rewritten_url}") + return rewritten_url + + except Exception as e: + logger.error(f"Failed to get CDP URL from NVCF: {e}") + # Return base URL as fallback - Playwright will do its own discovery + return self.NVCF_CHROME_BASE + + def get_cdp_headers(self) -> Dict[str, str]: + """Get authentication headers for CDP connection.""" + return self.headers.copy() + + def get_vlc_url(self) -> str: + """Get the VLC web interface URL through NVCF.""" + return self.NVCF_VLC_BASE + + def update_launch_command(self, command) -> list: + """Update the launch command for NVCF VMs. + + Replaces google-chrome with google-chrome-wrapper and adds + required NVCF flags, while preserving the original + --remote-debugging-port from the setup config. + """ + if not command: + return command + + cmd_name = command[0] if isinstance(command, list) else command + if cmd_name != "google-chrome": + return command + + # Extract the original debugging port from the command args + original_args = command[1:] if isinstance(command, list) else [] + debug_port = "9222" # default + for arg in original_args: + if arg.startswith("--remote-debugging-port="): + debug_port = arg.split("=", 1)[1] + break + + command = [ + "google-chrome-wrapper", + f"--remote-debugging-port={debug_port}", + "--remote-debugging-address=127.0.0.1", + "--remote-allow-origins=*", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--disable-session-crashed-bubble", + "--disable-features=TranslateUI", + "--start-maximized", + ] + + return command diff --git a/pyproject.toml b/pyproject.toml index bbcc920fd..e71876497 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,6 +135,9 @@ datasets = "*" [tool.poetry.scripts] openhands = "openhands.cli.main:main" +[tool.poetry.group.osworld-nvcf.dependencies] +ngcsdk = ">=3.64.0" + 
[tool.poetry.group.testgeneval.dependencies] fuzzywuzzy = "^0.18.0" rouge = "^1.0.1" diff --git a/scripts/eval/README.md b/scripts/eval/README.md new file mode 100644 index 000000000..3e8d748b4 --- /dev/null +++ b/scripts/eval/README.md @@ -0,0 +1,227 @@ +# OSWorld Evaluation Pipeline + +This directory contains a three-stage pipeline for running OSWorld evaluations, processing results, and generating visualizations. + +## Pipeline Overview + +``` +osworld.py → unpack.py → generate_gifs.py + ↓ ↓ ↓ +JSON files Extracted MP4 videos + dirs with + PNGs & JSON +``` + +## Stage 1: osworld.py - Run OSWorld Evaluation + +The first stage runs OSWorld benchmark tasks using an OpenHands agent server with an LLM backend. + +### Purpose +- Submits OSWorld test instances to an agent server +- Processes tasks in parallel with configurable concurrency +- Saves complete interaction logs as JSON files +- Supports resumption (skips already-completed tasks) + +### Usage + +```bash +python osworld.py \ + --data-file /path/to/osworld_test.json \ + --output-dir ./osworld_results \ + --llm-server-address http://localhost:8000/v1 \ + --model "hosted_vllm/Qwen/Qwen3-VL-235B-A22B-Instruct" \ + --max-parallel-jobs 2 \ + --max-iterations 15 \ + --timeout 6000 +``` + +### Key Arguments +- `--data-file`: Path to OSWorld test data (JSONL format) +- `--output-dir`: Directory to save result JSON files (default: `./osworld_results`) +- `--llm-server-address`: LLM server endpoint (default: `http://localhost:8000/v1`) +- `--model`: Model identifier for the LLM +- `--max-parallel-jobs`: Number of concurrent tasks (default: 2) +- `--max-iterations`: Max agent iterations per task (default: 15) +- `--timeout`: Timeout per task in seconds (default: 6000) +- `--enable-vision/--no-vision`: Enable/disable vision capabilities (default: enabled) +- `--enable-a11y-tree`: Enable accessibility tree +- `--max-image-history`: Number of images to keep in context (default: 4) +- `--temperature`: Sampling temperature 
(default: 0.0) +- `--total-jobs`: Limit number of jobs to run (default: all) + +### Output +Creates one JSON file per task in `--output-dir`: +- `.json`: Complete interaction log including messages, tool calls, and images +- `error_.json`: Error logs for failed tasks + +Each JSON contains: +- `instance_id`: Task identifier +- `messages`: Complete conversation history with the agent +- `resolved`: Boolean indicating task success +- Embedded base64-encoded screenshots + +## Stage 2: unpack.py - Extract Images and Conversations + +The second stage unpacks the JSON files into directories with extracted images and structured conversation data. + +### Purpose +- Extracts base64-encoded images from JSON files +- Creates organized directory structure for each task +- Converts message history to readable conversation format +- Computes overall accuracy statistics + +### Usage + +```bash +python unpack.py \ + --dir_path ./osworld_results \ + --workers 64 +``` + +### Key Arguments +- `--dir_path`: Directory containing JSON files from Stage 1 (default: current directory) +- `--workers`: Number of parallel worker processes (default: 64) +- `--skip_save`: Skip saving result.txt file (useful for dry runs) + +### Output Structure +For each `.json`, creates: +``` +/ +├── conversation.json # Structured conversation with roles and content +├── result.txt # Boolean result (True/False) +├── 0.png # First screenshot +├── 1.png # Second screenshot +└── ... # Additional screenshots +``` + +The `conversation.json` format: +```json +[ + { + "role": "user", + "content": "Task instruction...", + "img": "0.png" + }, + { + "role": "assistant", + "content": "I'll help with that...", + "tool_calls": [ + { + "name": "mouse_click", + "arguments": "{\"x\": 0.5, \"y\": 0.3}" + } + ] + }, + ... 
+] +``` + +### Statistics +Prints summary statistics: +- Total correct tasks +- Total tasks processed +- Overall accuracy + +## Stage 3: generate_gifs.py - Create Video Visualizations + +The final stage generates MP4 videos showing the agent's actions with annotations. + +### Purpose +- Creates videos from conversation and screenshots +- Overlays user instructions and assistant responses +- Visualizes click actions with red markers +- Generates one video per task + +### Usage + +```bash +python generate_gifs.py ./osworld_results \ + --fps 0.2 \ + --output-name output.mp4 +``` + +### Key Arguments +- `work_dir`: Directory containing subdirectories from Stage 2 (positional) +- `--fps`: Frames per second for video (default: 0.2 = 5 seconds per frame) +- `--output-name`: Output filename (default: `output.mp4`) + +### Output +Creates `output.mp4` in each task subdirectory with: +- **Top overlay** (first frame only): User instruction +- **Bottom overlay** (all frames): Assistant response +- **Red markers**: Click locations with tool names +- **Duration**: Configurable via FPS (default 5s per frame) + +### Video Features +- Semi-transparent text backgrounds for readability +- Centered text with automatic wrapping +- Red circles mark click coordinates (normalized 0-1) +- Tool names labeled above click markers +- MP4 format with mp4v codec + +## Complete Pipeline Example + +```bash +# Step 1: Run evaluation +python osworld.py \ + --data-file /data/osworld_test.json \ + --output-dir ./osworld_results \ + --max-parallel-jobs 4 \ + --max-iterations 15 + +# Step 2: Extract images and conversations +python unpack.py \ + --dir_path ./osworld_results \ + --workers 64 + +# Step 3: Generate videos +python generate_gifs.py ./osworld_results --fps 0.2 +``` + +### Expected Directory Structure + +After running the complete pipeline: + +``` +osworld_results/ +├── .json +├── .json +├── / +│ ├── conversation.json +│ ├── result.txt +│ ├── 0.png +│ ├── 1.png +│ └── output.mp4 +├── / +│ ├── 
conversation.json +│ ├── result.txt +│ ├── 0.png +│ └── output.mp4 +└── ... +``` + +## Requirements + +### Python Dependencies +- `PIL` (Pillow): Image processing +- `cv2` (opencv-python): Video generation +- `numpy`: Array operations +- `openhands.nvidia.async_server_osworld`: Agent server interface + +### System Dependencies +- Fonts for text overlay (DejaVu or Liberation Sans) +- Sufficient disk space for videos + +## Tips and Best Practices + +1. **Resumption**: osworld.py automatically skips completed tasks, so you can safely re-run it after interruptions + +2. **Parallel Processing**: + - osworld.py: Set `--max-parallel-jobs` based on available GPU/CPU resources + - unpack.py: Uses multiprocessing; adjust `--workers` for your CPU count + +3. **Video Customization**: Adjust `--fps` in generate_gifs.py: + - Lower FPS (e.g., 0.2) = More time per frame = Easier to read + - Higher FPS (e.g., 1.0) = Faster playback = Shorter videos + +4. **Storage**: Each task with screenshots and video can be several MB; plan storage accordingly diff --git a/scripts/eval/generate_gifs.py b/scripts/eval/generate_gifs.py new file mode 100644 index 000000000..00a7a0848 --- /dev/null +++ b/scripts/eval/generate_gifs.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +Generate videos from conversation data and screenshots. + +This script takes a work directory containing conversation.json and PNG files, +matches each assistant turn with corresponding PNG images, overlays user turn +content as text on the images, draws click markers for tool calls with coordinates, +and creates a video with configurable FPS (default 0.2 FPS = 5 seconds per frame). 
+""" +import json +import argparse +from pathlib import Path +from typing import List, Dict, Tuple +from PIL import Image, ImageDraw, ImageFont +import textwrap +import cv2 +import numpy as np + + +def parse_conversation(conversation_path: Path) -> Tuple[List[Dict], List[Dict]]: + """ + Parse conversation.json to extract user and assistant turns. + + Args: + conversation_path: Path to conversation.json file + + Returns: + Tuple of (user_turns, assistant_turns) where: + - user_turns is a list of dicts with 'content' key + - assistant_turns is a list of dicts with 'content' and 'tool_calls' keys + """ + with open(conversation_path, 'r', encoding='utf-8') as f: + conversation = json.load(f) + + user_turns = [] + assistant_turns = [] + + for msg in conversation: + role = msg.get('role', '') + content = msg.get('content', '') + + if role in ['user', 'tool']: + user_turns.append({ + 'content': content + }) + elif role == 'assistant': + assistant_turns.append({ + 'content': content, + 'tool_calls': msg.get('tool_calls', []) + }) + + return user_turns, assistant_turns + + +def add_text_to_image(image: Image.Image, top_text: str = None, bottom_text: str = None, font_size: int = 15) -> Image.Image: + """ + Add text overlay to an image at the top and/or bottom. 
+ + Args: + image: PIL Image object + top_text: Text to overlay at the top of the image + bottom_text: Text to overlay at the bottom of the image + font_size: Font size for the text + + Returns: + Modified PIL Image with text overlay + """ + # Create a copy to avoid modifying the original + img_with_text = image.copy() + draw = ImageDraw.Draw(img_with_text) + + # Try to use a better font, fall back to default if not available + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size) + except: + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", font_size) + except: + # Fall back to default font + font = ImageFont.load_default() + + # Get image dimensions + img_width, img_height = img_with_text.size + + # Helper function to add text at a specific position + def add_text_overlay(text: str, position: str): + """Add text overlay at 'top' or 'bottom' position""" + # Wrap text to fit image width (with some padding) + max_chars_per_line = int(img_width / (font_size * 0.6)) # Rough estimate + wrapped_text = textwrap.fill(text, width=max_chars_per_line) + + # Calculate text bounding box + # For multi-line text, we need to calculate height manually + lines = wrapped_text.split('\n') + line_height = font_size + 5 # Add some spacing between lines + text_height = len(lines) * line_height + + # Create a semi-transparent black background for the text + padding = 10 + text_bg_height = text_height + 2 * padding + text_bg = Image.new('RGBA', (img_width, text_bg_height), (0, 0, 0, 180)) + + # Composite the background onto the image at the appropriate position + if position == 'top': + y_position = 0 + img_with_text.paste(text_bg, (0, y_position), text_bg) + y_offset = padding + else: # bottom + y_position = img_height - text_bg_height + img_with_text.paste(text_bg, (0, y_position), text_bg) + y_offset = y_position + padding + + # Draw each line of text + for line in lines: + # Center each line 
horizontally + line_width = draw.textlength(line, font=font) + x_position = (img_width - line_width) / 2 + draw.text((x_position, y_offset), line, fill=(255, 255, 255), font=font) + y_offset += line_height + + # Add top text if provided + if top_text: + add_text_overlay(top_text, 'top') + + # Add bottom text if provided + if bottom_text: + add_text_overlay(bottom_text, 'bottom') + + return img_with_text + + +def add_click_marker(image: Image.Image, x: float, y: float, tool_name: str = None, radius: int = 10, font_size: int = 14) -> Image.Image: + """ + Add a filled red circle at the specified coordinates with optional tool name label. + + Args: + image: PIL Image object + x: X coordinate (normalized 0-1, will be scaled to image width) + y: Y coordinate (normalized 0-1, will be scaled to image height) + tool_name: Name of the tool call to display above the circle + radius: Radius of the circle in pixels + font_size: Font size for the tool name text + + Returns: + Modified PIL Image with circle marker and optional label + """ + # Create a copy to avoid modifying the original + img_with_marker = image.copy() + draw = ImageDraw.Draw(img_with_marker) + + # Get image dimensions + img_width, img_height = img_with_marker.size + + # Convert normalized coordinates to pixel coordinates + pixel_x = x * img_width + pixel_y = y * img_height + + # Draw filled red circle + left_up = (pixel_x - radius, pixel_y - radius) + right_down = (pixel_x + radius, pixel_y + radius) + draw.ellipse([left_up, right_down], fill=(255, 0, 0, 255), outline=(255, 0, 0, 255)) + + # Draw tool name above the circle if provided + if tool_name: + # Try to use a better font, fall back to default if not available + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size) + except: + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", font_size) + except: + # Fall back to default font + font = ImageFont.load_default() + + 
# Calculate text position (centered above the circle) + text_width = draw.textlength(tool_name, font=font) + text_x = pixel_x - (text_width / 2) + text_y = pixel_y - radius - font_size - 5 # 5 pixels padding above circle + + # Draw text in red + draw.text((text_x, text_y), tool_name, fill=(255, 0, 0, 255), font=font) + + return img_with_marker + + +def create_video(work_dir: Path, output_path: Path = None, fps: float = 0.2) -> None: + """ + Create a video from conversation and PNG files. + + Args: + work_dir: Directory containing conversation.json and PNG files + output_path: Path for the output video (default: work_dir/output.mp4) + fps: Frames per second (default: 0.2 = 5 seconds per frame) + """ + work_dir = Path(work_dir) + conversation_path = work_dir / 'conversation.json' + + if not conversation_path.exists(): + raise FileNotFoundError(f"conversation.json not found in {work_dir}") + + # Parse conversation + print(f"Parsing conversation from {conversation_path}") + user_turns, assistant_turns = parse_conversation(conversation_path) + + print(f"Found {len(user_turns)} user turns and {len(assistant_turns)} assistant turns") + + # Find all PNG files, sorted numerically + png_files = sorted(work_dir.glob('*.png'), key=lambda x: int(x.stem) if x.stem.isdigit() else float('inf')) + + if not png_files: + raise FileNotFoundError(f"No PNG files found in {work_dir}") + + print(f"Found {len(png_files)} PNG files") + + # Process images and match with turns + processed_images = [] + + for i, png_path in enumerate(png_files): + # Match with assistant turn (0.png -> first assistant turn, etc.) 
+ if i < len(assistant_turns): + # Get the corresponding user turn and assistant turn + user_turn = user_turns[i] if i < len(user_turns) else {'content': "No user instruction available"} + assistant_turn = assistant_turns[i] + + user_text = user_turn['content'] + assistant_text = assistant_turn['content'] + + ## Truncate long text for better display + #if len(user_text) > 500: + # user_text = user_text[:500] + "..." + #if len(assistant_text) > 500: + # assistant_text = assistant_text[:500] + "..." + + print(f"Processing {png_path.name} with user turn {i} and assistant turn {i}") + + # Load image + img = Image.open(png_path) + + # Check if there are tool calls with x, y coordinates in the assistant turn + tool_calls = assistant_turn.get('tool_calls', []) + if tool_calls and len(tool_calls) > 0: + first_tool_call = tool_calls[0] + tool_name = first_tool_call.get('name', '') # Default to '' if no name + arguments = first_tool_call.get('arguments', {}) + + # Parse arguments if it's a string (JSON) + if isinstance(arguments, str): + try: + arguments = json.loads(arguments) + except json.JSONDecodeError: + print(f" Warning: Could not parse tool_call arguments for {png_path.name}") + arguments = {} + + # Check if x and y coordinates exist + if 'x' in arguments and 'y' in arguments: + try: + x = float(arguments['x']) + y = float(arguments['y']) + print(f" Drawing click marker at ({x}, {y}) for tool '{tool_name}'") + img = add_click_marker(img, x, y, tool_name=tool_name) + except (ValueError, TypeError) as e: + print(f" Warning: Invalid x/y coordinates in {png_path.name}: {e}") + + # Add text (user at top, assistant at bottom) + if i == 0: + img_with_text = add_text_to_image(img, top_text=user_text, bottom_text=assistant_text) + else: + img_with_text = add_text_to_image(img, bottom_text=assistant_text) + + # Convert to RGB if necessary (OpenCV uses BGR, but we'll convert later) + if img_with_text.mode != 'RGB': + img_with_text = img_with_text.convert('RGB') + + 
processed_images.append(img_with_text) + else: + print(f"Warning: No assistant turn for {png_path.name}, skipping") + + if not processed_images: + raise ValueError("No images were processed") + + # Set output path + if output_path is None: + output_path = work_dir / 'output.mp4' + else: + output_path = Path(output_path) + + # Get dimensions from first image + first_img = processed_images[0] + width, height = first_img.size + + # Create video writer + print(f"Creating video at {output_path}") + fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for MP4 + video_writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) + + if not video_writer.isOpened(): + raise RuntimeError(f"Failed to open video writer for {output_path}") + + # Write frames to video + for img_pil in processed_images: + # Convert PIL Image to numpy array + img_array = np.array(img_pil) + # Convert RGB to BGR (OpenCV uses BGR) + img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR) + # Write frame + video_writer.write(img_bgr) + + video_writer.release() + + duration_per_frame = 1.0 / fps + total_duration = len(processed_images) * duration_per_frame + + print(f"✓ Video created successfully: {output_path}") + print(f" - {len(processed_images)} frames") + print(f" - {fps} FPS ({duration_per_frame:.1f}s per frame)") + print(f" - Total duration: {total_duration:.1f}s") + print(f" - Resolution: {width}x{height}") + + +def main(): + parser = argparse.ArgumentParser( + description='Generate videos from conversation data and screenshots' + ) + parser.add_argument( + 'work_dir', + type=str, + help='Directory containing subdirectories with conversation.json and PNG files' + ) + parser.add_argument( + '--fps', + type=float, + default=0.2, + help='Frames per second (default: 0.2, which is 5 seconds per frame)' + ) + parser.add_argument( + '--output-name', + type=str, + default='output.mp4', + help='Output video filename (default: output.mp4)' + ) + + args = parser.parse_args() + + work_dir = 
Path(args.work_dir) + + if not work_dir.exists(): + print(f"Error: Directory {work_dir} does not exist") + return 1 + + if not work_dir.is_dir(): + print(f"Error: {work_dir} is not a directory") + return 1 + + # Find all subdirectories that contain conversation.json + subdirs_to_process = [] + for subdir in work_dir.iterdir(): + if subdir.is_dir(): + conversation_file = subdir / 'conversation.json' + if conversation_file.exists(): + subdirs_to_process.append(subdir) + + if not subdirs_to_process: + print(f"No subdirectories with conversation.json found in {work_dir}") + return 1 + + print(f"Found {len(subdirs_to_process)} subdirectories to process") + print("=" * 80) + + # Process each subdirectory + successful = 0 + failed = 0 + errors = [] + + for i, subdir in enumerate(subdirs_to_process, 1): + print(f"\n[{i}/{len(subdirs_to_process)}] Processing: {subdir.name}") + print("-" * 80) + + try: + output_path = subdir / args.output_name + create_video( + work_dir=subdir, + output_path=output_path, + fps=args.fps + ) + successful += 1 + except Exception as e: + print(f"✗ Error processing {subdir.name}: {e}") + failed += 1 + errors.append((subdir.name, str(e))) + + # Print summary + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + print(f"Total subdirectories: {len(subdirs_to_process)}") + print(f"Successful: {successful}") + print(f"Failed: {failed}") + + if errors: + print("\nFailed subdirectories:") + for subdir_name, error in errors: + print(f" - {subdir_name}: {error}") + + return 0 if failed == 0 else 1 + + +if __name__ == '__main__': + exit(main()) + diff --git a/scripts/eval/osworld.py b/scripts/eval/osworld.py index b4b48d674..06253c58a 100644 --- a/scripts/eval/osworld.py +++ b/scripts/eval/osworld.py @@ -186,8 +186,8 @@ def parse_args(): parser.add_argument( '--temperature', type=float, - default=0.0, - help='Sampling temperature (0.0 for deterministic)' + default=0.2, + help='Sampling temperature' ) parser.add_argument( '--max-iterations', @@ 
-218,7 +218,7 @@ def parse_args(): parser.add_argument( '--max-image-history', type=int, - default=4, + default=3, help='Maximum number of images to keep in history' ) diff --git a/scripts/eval/visualize/app.py b/scripts/eval/visualize/app.py new file mode 100644 index 000000000..7d3946f04 --- /dev/null +++ b/scripts/eval/visualize/app.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +""" +Trajectory Viewer - A web app to visualize agent trajectories with screenshots. +""" + +import argparse +import json +from pathlib import Path +from flask import Flask, render_template, send_from_directory, jsonify + +app = Flask(__name__) + + +def get_results_dir(): + """Get the results directory from app config.""" + return app.config.get("RESULTS_DIR", Path(__file__).parent / "results") + + +def get_trajectories(): + """Get list of all trajectory folders.""" + results_dir = get_results_dir() + if not results_dir.exists(): + return [] + + trajectories = [] + for folder in sorted(results_dir.iterdir()): + if folder.is_dir(): + result_file = folder / "result.txt" + result = None + if result_file.exists(): + result = result_file.read_text().strip().lower() == "true" + + trajectories.append({ + "id": folder.name, + "result": result + }) + + return trajectories + + +def parse_coordinates(arguments_str): + """Extract normalized coordinates (0-1) from tool call arguments.""" + try: + args = json.loads(arguments_str) + coords = [] + + # Handle x, y coordinates (normalized 0-1) + if "x" in args and "y" in args: + x = float(args["x"]) + y = float(args["y"]) + coords.append({"x": x, "y": y}) + + return coords + except (json.JSONDecodeError, KeyError, TypeError): + return [] + + +def load_conversation(traj_id): + """Load conversation.json for a trajectory.""" + conv_file = get_results_dir() / traj_id / "conversation.json" + if not conv_file.exists(): + return [], "" + + with open(conv_file, "r") as f: + conversation = json.load(f) + + # Extract task instruction from first user message + 
task_instruction = "" + for msg in conversation: + if msg.get("role") == "user": + task_instruction = msg.get("content", "") + break + + # Build list of steps: each step pairs a screenshot with the next assistant action + steps = [] + + for i, msg in enumerate(conversation): + role = msg.get("role", "unknown") + img = msg.get("img") + + # Only process user/tool turns that have screenshots + if role in ("user", "tool") and img: + # Find the next assistant turn to get tool calls + next_assistant = None + for j in range(i + 1, len(conversation)): + if conversation[j].get("role") == "assistant": + next_assistant = conversation[j] + break + + # Extract tool calls from next assistant turn + tool_calls = [] + assistant_content = "" + if next_assistant: + assistant_content = next_assistant.get("content", "") + if "tool_calls" in next_assistant and next_assistant["tool_calls"]: + for tc in next_assistant["tool_calls"]: + tool_call = { + "name": tc.get("name", "unknown"), + "arguments": tc.get("arguments", "{}"), + "coordinates": [] + } + coords = parse_coordinates(tc.get("arguments", "{}")) + tool_call["coordinates"] = coords + tool_calls.append(tool_call) + + steps.append({ + "step_num": len(steps), + "img": img, + "tool_calls": tool_calls, + "assistant_content": assistant_content + }) + + return steps, task_instruction + + +def get_screenshot_list(traj_id): + """Get list of screenshot files for a trajectory.""" + traj_dir = get_results_dir() / traj_id + if not traj_dir.exists(): + return [] + + screenshots = [] + for f in traj_dir.iterdir(): + if f.suffix.lower() == ".png" and f.stem.isdigit(): + screenshots.append(f.name) + + # Sort by numeric value + screenshots.sort(key=lambda x: int(Path(x).stem)) + return screenshots + + +@app.route("/") +def index(): + """Home page listing all trajectories.""" + trajectories = get_trajectories() + return render_template("index.html", trajectories=trajectories) + + +@app.route("/trajectory/") +def view_trajectory(traj_id): + """View 
a specific trajectory.""" + steps, task_instruction = load_conversation(traj_id) + + # Get result + result_file = get_results_dir() / traj_id / "result.txt" + result = None + if result_file.exists(): + result = result_file.read_text().strip().lower() == "true" + + return render_template( + "trajectory.html", + traj_id=traj_id, + steps=steps, + task_instruction=task_instruction, + result=result + ) + + +@app.route("/api/trajectory/") +def api_trajectory(traj_id): + """API endpoint to get trajectory data as JSON.""" + steps, task_instruction = load_conversation(traj_id) + + result_file = get_results_dir() / traj_id / "result.txt" + result = None + if result_file.exists(): + result = result_file.read_text().strip().lower() == "true" + + return jsonify({ + "id": traj_id, + "result": result, + "task_instruction": task_instruction, + "steps": steps + }) + + +@app.route("/results//") +def serve_screenshot(traj_id, filename): + """Serve screenshot files.""" + return send_from_directory(get_results_dir() / traj_id, filename) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Trajectory Viewer - Visualize agent trajectories") + parser.add_argument( + "--results-dir", "-r", + type=str, + default="./results", + help="Path to the results folder containing trajectory directories (default: ./results)" + ) + parser.add_argument( + "--port", "-p", + type=int, + default=5000, + help="Port to run the server on (default: 5000)" + ) + parser.add_argument( + "--host", + type=str, + default="0.0.0.0", + help="Host to bind the server to (default: 0.0.0.0)" + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + results_dir = Path(args.results_dir).resolve() + + # Store in app config so it's accessible in routes + app.config["RESULTS_DIR"] = results_dir + + if not results_dir.exists(): + print(f"Warning: Results directory '{results_dir}' does not exist.") + else: + print(f"Serving trajectories from: {results_dir}") + + 
app.run(debug=True, host=args.host, port=args.port) diff --git a/scripts/eval/visualize/templates/index.html b/scripts/eval/visualize/templates/index.html new file mode 100644 index 000000000..1ecea9e09 --- /dev/null +++ b/scripts/eval/visualize/templates/index.html @@ -0,0 +1,128 @@ + + + + + + Trajectory Viewer + + + +
+

Trajectory Viewer

+ + {% if trajectories %} + + {% else %} +
+

No trajectories found

+

Add trajectory folders to the results/ directory to get started.

+
+ {% endif %} +
+ + diff --git a/scripts/eval/visualize/templates/trajectory.html b/scripts/eval/visualize/templates/trajectory.html new file mode 100644 index 000000000..d4ea60e56 --- /dev/null +++ b/scripts/eval/visualize/templates/trajectory.html @@ -0,0 +1,427 @@ + + + + + + Trajectory: {{ traj_id }} + + + +
+
+ ← Back to Trajectories + {{ traj_id }} + {% if result is none %} + Unknown + {% elif result %} + PASS + {% else %} + FAIL + {% endif %} +
+
+ +
+ {% if task_instruction %} +
+
Task Instruction
+
{{ task_instruction }}
+
+ {% endif %} + +
+ {% for step in steps %} +
+
+ Step {{ step.step_num }} + {{ step.img }} +
+
+
+
+ Screenshot {{ step.img }} +
+
+
+ +
+
Next Action
+ {% if step.tool_calls %} + {% for tc in step.tool_calls %} +
+ {{ tc.name }} + {{ tc.arguments }} +
+ {% endfor %} + {% else %} +
No action taken
+ {% endif %} + + {% if step.assistant_content %} +
+
Assistant Reasoning
+
{{ step.assistant_content }}
+
+ {% endif %} +
+
+
+ {% endfor %} +
+
+ + + + diff --git a/tests/unit/test_osworld_nvcf_runtime.py b/tests/unit/test_osworld_nvcf_runtime.py new file mode 100644 index 000000000..8057724a3 --- /dev/null +++ b/tests/unit/test_osworld_nvcf_runtime.py @@ -0,0 +1,237 @@ +"""Unit tests for OSWorld NVCF Runtime (openhands.runtime.impl.nvcf).""" + +import os +import unittest +from unittest.mock import MagicMock, Mock, patch + +from openhands.core.config import OpenHandsConfig +from openhands.events import EventStream +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.runtime.impl.nvcf import NVCFRuntime, OSWorldNVCFRuntime + + +class TestNVCFRuntime(unittest.TestCase): + """Test base NVCFRuntime (ActionExecutionClient, deploy/close).""" + + def setUp(self): + self.config = OpenHandsConfig() + self.config.runtime = "osworld_nvcf" + self.event_stream = Mock(spec=EventStream) + self.event_stream.file_store = Mock() + self.event_stream.file_store.write = Mock() + self.event_stream.file_store.read = Mock(side_effect=FileNotFoundError) + self.event_stream.file_store.delete = Mock() + + def test_nvcf_runtime_extends_action_execution_client(self): + """NVCFRuntime must extend ActionExecutionClient, not SingularityRuntime.""" + from openhands.runtime.impl.action_execution.action_execution_client import ( + ActionExecutionClient, + ) + self.assertEqual(NVCFRuntime.__bases__[0], ActionExecutionClient) + self.assertTrue(issubclass(NVCFRuntime, ActionExecutionClient)) + + def test_nvcf_runtime_requires_api_key(self): + """Without NGC_API_KEY / nvcf_api_key, init raises.""" + with patch.dict(os.environ, {}, clear=False): + for k in ("NGC_API_KEY", "NVCF_FUNCTION_ID", "NGC_ORG"): + if k in os.environ: + del os.environ[k] + with self.assertRaises(ValueError) as ctx: + NVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + ) + self.assertIn("api key", str(ctx.exception).lower()) + + def test_nvcf_runtime_init_with_env(self): + """With NGC_API_KEY and 
NVCF_FUNCTION_ID, init succeeds.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "test-key", "NVCF_FUNCTION_ID": "test-fid"}, + clear=False, + ): + r = NVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + ) + self.assertEqual(r._nvcf_function_id, "test-fid") + self.assertEqual(r._nvcf_api_key, "test-key") + + +class TestOSWorldNVCFRuntime(unittest.TestCase): + """Test OSWorld NVCF runtime (same API as OSWorld Singularity, NVCF backend).""" + + def setUp(self): + self.config = OpenHandsConfig() + self.config.runtime = "osworld_nvcf" + self.event_stream = Mock(spec=EventStream) + self.event_stream.file_store = Mock() + self.event_stream.file_store.write = Mock() + self.event_stream.file_store.read = Mock(side_effect=FileNotFoundError) + self.event_stream.file_store.delete = Mock() + + def test_osworld_nvcf_extends_nvcf_runtime(self): + """OSWorldNVCFRuntime extends NVCFRuntime.""" + self.assertTrue(issubclass(OSWorldNVCFRuntime, NVCFRuntime)) + + def test_osworld_nvcf_init_os_type_and_screen_size(self): + """OS type and default screen_size are set.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "k", "NVCF_FUNCTION_ID": "f"}, + clear=False, + ): + r = OSWorldNVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + os_type="linux", + ) + self.assertEqual(r.os_type, "linux") + self.assertEqual(r.screen_size, (1920, 1080)) + + def test_osworld_vm_url(self): + """osworld_vm_url is NVCF API base.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "k", "NVCF_FUNCTION_ID": "f"}, + clear=False, + ): + r = OSWorldNVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + ) + self.assertEqual(r.osworld_vm_url, "https://grpc.nvcf.nvidia.com/api") + + def test_action_to_pyautogui_click(self): + """_action_to_pyautogui_command produces click command.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "k", "NVCF_FUNCTION_ID": "f"}, + clear=False, + ): + r = OSWorldNVCFRuntime( + 
config=self.config, + event_stream=self.event_stream, + sid="test", + ) + cmd = r._action_to_pyautogui_command( + "CLICK", + {"x": 100, "y": 200, "button": "left"}, + ) + self.assertIn("pyautogui.click", cmd) + self.assertIn("100", cmd) + self.assertIn("200", cmd) + + def test_action_to_pyautogui_typing(self): + """_action_to_pyautogui_command produces typewrite for TYPING.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "k", "NVCF_FUNCTION_ID": "f"}, + clear=False, + ): + r = OSWorldNVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + ) + cmd = r._action_to_pyautogui_command( + "TYPING", + {"text": "hello"}, + ) + self.assertIn("pyautogui.typewrite", cmd) + self.assertIn("hello", cmd) + + def test_run_action_get_screenshot_mocked_client(self): + """run_action(get_screenshot) uses _nvcf_client and returns observation.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "k", "NVCF_FUNCTION_ID": "f"}, + clear=False, + ): + r = OSWorldNVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + ) + mock_client = MagicMock() + mock_client.get.return_value = MagicMock(status_code=200, content=b"png") + r._nvcf_client = mock_client + r._runtime_initialized = True + + action = OSWorldInteractiveAction( + method="get_screenshot", + params={}, + thought="test", + ) + obs = r.run_action(action) + self.assertIsNotNone(obs) + self.assertEqual(obs.content, "Screenshot captured") + mock_client.get.assert_called() + + def test_run_action_execute_action_mocked_client(self): + """run_action(execute_action) calls execute_vm_action and returns observation.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "k", "NVCF_FUNCTION_ID": "f"}, + clear=False, + ): + r = OSWorldNVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + ) + mock_client = MagicMock() + mock_client.post.return_value = MagicMock( + status_code=200, + json=lambda: {"status": "success", "output": "ok"}, + ) + r._nvcf_client 
= mock_client + r._runtime_initialized = True + + action = OSWorldInteractiveAction( + method="execute_action", + params={ + "action": { + "action_type": "CLICK", + "parameters": {"x": 10, "y": 10}, + } + }, + thought="test", + ) + obs = r.run_action(action) + self.assertIsNotNone(obs) + self.assertEqual(obs.exit_code, 0) + mock_client.post.assert_called() + + def test_run_action_unknown_method_returns_error_observation(self): + """Unknown method returns ErrorObservation.""" + with patch.dict( + os.environ, + {"NGC_API_KEY": "k", "NVCF_FUNCTION_ID": "f"}, + clear=False, + ): + r = OSWorldNVCFRuntime( + config=self.config, + event_stream=self.event_stream, + sid="test", + ) + r._nvcf_client = MagicMock() + r._runtime_initialized = True + + action = OSWorldInteractiveAction( + method="no_such_method", + params={}, + thought="test", + ) + obs = r.run_action(action) + from openhands.events.observation import ErrorObservation + self.assertIsInstance(obs, ErrorObservation) + self.assertIn("Unknown", obs.content) + + +if __name__ == "__main__": + unittest.main()