From 43b58f0ff9c9cff6451f7686b3b11a3e1ef319c7 Mon Sep 17 00:00:00 2001 From: Paul Jiang Date: Tue, 3 Dec 2024 16:36:42 +0800 Subject: [PATCH 1/5] Inital version of support agent on server side --- .gitignore | 7 +- scripts/build-container-image.sh | 4 + scripts/clear-agents.sh | 59 ++++++++++++ scripts/prepare-agents.sh | 65 +++++++++++++ .../client/lib_run_single.py | 92 ++++++++++--------- .../client/mm_agents/server_agents/agent.py | 13 +++ src/win-arena-container/client/run.py | 3 +- .../vm/setup/server/main.py | 60 ++++++++++++ src/win-arena-container/vm/setup/server/s | 0 src/win-arena-container/vm/setup/setup.ps1 | 7 ++ .../vm/setup/setupAgents.ps1 | 39 ++++++++ "\342\200\216AgentRepoConfig.json" | 19 ++++ 12 files changed, 323 insertions(+), 45 deletions(-) create mode 100644 scripts/clear-agents.sh create mode 100644 scripts/prepare-agents.sh create mode 100644 src/win-arena-container/client/mm_agents/server_agents/agent.py create mode 100644 src/win-arena-container/vm/setup/server/s create mode 100644 src/win-arena-container/vm/setup/setupAgents.ps1 create mode 100644 "\342\200\216AgentRepoConfig.json" diff --git a/.gitignore b/.gitignore index 0d26666..5a96edb 100644 --- a/.gitignore +++ b/.gitignore @@ -282,4 +282,9 @@ src/win-arena-container/client/.vscode/launch.json # src/win-arena-container/client/evaluation_examples_windows/examples/chrome/12086550-11c0-466b-b367-1d9e75b3910e-wos.json # src/win-arena-container/client/evaluation_examples_windows/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc-wos.json # src/win-arena-container/client/evaluation_examples_windows/examples/chrome/7f52cab9-535c-4835-ac8c-391ee64dc930-wos.json -# src/win-arena-container/client/evaluation_examples_windows/examples/chrome/cabb3bae-cccb-41bd-9f5d-0f3a9fecd825-wos.json \ No newline at end of file +# src/win-arena-container/client/evaluation_examples_windows/examples/chrome/cabb3bae-cccb-41bd-9f5d-0f3a9fecd825-wos.json + +# Ignore the files when preparing agents. 
+src/win-arena-container/vm/setup/agents.json +src/win-arena-container/vm/setup/mm_agents/* +src/win-arena-container/vm/setup/Logs.txt diff --git a/scripts/build-container-image.sh b/scripts/build-container-image.sh index 7c04423..322b820 100644 --- a/scripts/build-container-image.sh +++ b/scripts/build-container-image.sh @@ -35,6 +35,10 @@ done SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +# Run prepare-agents.sh +echo "Running prepare-agents.sh..." +$SCRIPT_DIR/prepare-agents.sh + echo "$SCRIPT_DIR/../" if [ "$build_base_image" = true ]; then diff --git a/scripts/clear-agents.sh b/scripts/clear-agents.sh new file mode 100644 index 0000000..9efea3e --- /dev/null +++ b/scripts/clear-agents.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Get the directory of the current script +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") + +# Path to the JSON configuration file +CONFIG_FILE="$SCRIPT_DIR/../AgentRepoConfig.json" +AGENTS_JSON_FILE="$SCRIPT_DIR/../src/win-arena-container/vm/setup/agents.json" +CLIENT_AGENT_FOLDER="$SCRIPT_DIR/../src/win-arena-container/client/mm_agents" +SERVER_AGENT_FOLDER="$SCRIPT_DIR/../src/win-arena-container/vm/setup/mm_agents" +WIN_STORAGE="$SCRIPT_DIR/../src/win-arena-container/vm/storage" + +# Remove the AGENTS_JSON_FILE if it exists +if [ -f "$AGENTS_JSON_FILE" ]; then + echo "Remove $AGENTS_JSON_FILE." + rm "$AGENTS_JSON_FILE" +fi + +# Remove the WIN_STORAGE if it exists +if [ -d "$WIN_STORAGE" ]; then + echo "Remove $WIN_STORAGE." + sudo rm -r $WIN_STORAGE +fi + +# Check if jq is installed +if ! command -v jq &> /dev/null +then + echo "jq could not be found, installing jq..." 
+ sudo apt-get update && sudo apt-get install -y jq +fi + +# Initialize an empty array to hold server repositories +server_repos=() + +# Read the JSON file and clone the repositories +repos=$(jq -c '.repositories[]' "$CONFIG_FILE") +for repo in $repos; do + REPO_DIR_NAME=$(echo "$repo" | jq -r '.name') + RUNNING_MODE=$(echo "$repo" | jq -r '.runningmode') + + # Set the target folder based on the running mode + if [ "$RUNNING_MODE" == "client" ]; then + TARGET_FOLDER="$CLIENT_AGENT_FOLDER" + elif [ "$RUNNING_MODE" == "server" ]; then + TARGET_FOLDER="$SERVER_AGENT_FOLDER" + server_repos+=("$repo") + else + echo "Invalid running mode: $RUNNING_MODE" + exit 1 + fi + + REPO_DIR="$TARGET_FOLDER/$REPO_DIR_NAME" + + if [ -d "$REPO_DIR" ]; then + echo "Remove $REPO_DIR." + # Remove the repository directory + sudo rm -r $REPO_DIR + fi +done \ No newline at end of file diff --git a/scripts/prepare-agents.sh b/scripts/prepare-agents.sh new file mode 100644 index 0000000..d279393 --- /dev/null +++ b/scripts/prepare-agents.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Get the directory of the current script +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") + +# Path to the JSON configuration file +CONFIG_FILE="$SCRIPT_DIR/../AgentRepoConfig.json" +CLIENT_AGENT_FOLDER="$SCRIPT_DIR/../src/win-arena-container/client/mm_agents" +SERVER_AGENT_FOLDER="$SCRIPT_DIR/../src/win-arena-container/vm/setup/mm_agents" +AGENTS_JSON_FILE="$SCRIPT_DIR/../src/win-arena-container/vm/setup/agents.json" + +# Remove the AGENTS_JSON_FILE if it exists +if [ -f "$AGENTS_JSON_FILE" ]; then + rm "$AGENTS_JSON_FILE" +fi + +# Check if jq is installed +if ! command -v jq &> /dev/null +then + echo "jq could not be found, installing jq..." 
+ sudo apt-get update && sudo apt-get install -y jq +fi + +# Initialize an empty array to hold server repositories +server_repos=() + +# Read the JSON file and clone the repositories +repos=$(jq -c '.repositories[]' "$CONFIG_FILE") +for repo in $repos; do + REPO_URL=$(echo "$repo" | jq -r '.url') + REPO_DIR_NAME=$(echo "$repo" | jq -r '.name') + REPO_FOLDER=$(echo "$repo" | jq -r '.foldertocopy') + RUNNING_MODE=$(echo "$repo" | jq -r '.runningmode') + + # Set the target folder based on the running mode + if [ "$RUNNING_MODE" == "client" ]; then + TARGET_FOLDER="$CLIENT_AGENT_FOLDER" + elif [ "$RUNNING_MODE" == "server" ]; then + TARGET_FOLDER="$SERVER_AGENT_FOLDER" + server_repos+=("$repo") + else + echo "Invalid running mode: $RUNNING_MODE" + exit 1 + fi + + REPO_DIR="$TARGET_FOLDER/$REPO_DIR_NAME" + + if [ -d "$REPO_DIR" ]; then + echo "Directory $REPO_DIR already exists. Skipping clone." + else + echo "Cloning $REPO_URL into $REPO_DIR..." + git clone --no-checkout "$REPO_URL" "$REPO_DIR" + pushd "$REPO_DIR" + git sparse-checkout init --cone + echo "$REPO_FOLDER" > .git/info/sparse-checkout + git checkout + popd + fi +done + +# Print the server_repos array +printf '%s\n' "Repo is : ${server_repos[@]}" + +# Create the agents.json file with the list of server repositories +jq -n --argjson repos "$(printf '%s\n' "${server_repos[@]}" | jq -s .)" '{"server_repositories": $repos}' > "$AGENTS_JSON_FILE" diff --git a/src/win-arena-container/client/lib_run_single.py b/src/win-arena-container/client/lib_run_single.py index a1146a4..344c852 100644 --- a/src/win-arena-container/client/lib_run_single.py +++ b/src/win-arena-container/client/lib_run_single.py @@ -31,53 +31,59 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl init_timestamp = start_time.strftime("%Y%m%d@%H%M%S") recorder.record_init(obs, example, init_timestamp) - while not done and step_idx < max_steps: - if obs is None: - logger.error("Observation is None. 
Waiting a little to do next step.") - time.sleep(5) - step_idx += 1 - continue - - logger.info("Agent: Thinking...") - response, actions, logs, computer_update_args = agent.predict( - instruction, - obs - ) - - # update the computer object, used by navi's action space - if computer_update_args: - env.controller.update_computer(**computer_update_args) + from mm_agents.server_agents.agent import ServerAgent + if isinstance(agent, ServerAgent): + logger.info("Agent: Running server agent %s...", agent.agent_name) + env.controller.run_agent(agent.agent_name, instruction) - # step environment with agent actions - for action in actions: - # Capture the timestamp before executing the action - action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - elapsed_timestamp = f"{datetime.datetime.now() - start_time}" - logger.info("Step %d: %s", step_idx + 1, action) - - obs, reward, done, info = env.step(action, args.sleep_after_execution) + else: + while not done and step_idx < max_steps: + if obs is None: + logger.error("Observation is None. 
Waiting a little to do next step.") + time.sleep(5) + step_idx += 1 + continue - logger.info("Reward: %.2f", reward) - logger.info("Done: %s", done) - - # Record step data - recorder.record_step( - obs, - logs, - step_idx, - action_timestamp, - elapsed_timestamp, - action, - reward, - done, - info + logger.info("Agent: Thinking...") + response, actions, logs, computer_update_args = agent.predict( + instruction, + obs ) - if done: - logger.info("The episode is done.") - break - # inc step counter - step_idx += 1 + # update the computer object, used by navi's action space + if computer_update_args: + env.controller.update_computer(**computer_update_args) + + # step environment with agent actions + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + elapsed_timestamp = f"{datetime.datetime.now() - start_time}" + logger.info("Step %d: %s", step_idx + 1, action) + + obs, reward, done, info = env.step(action, args.sleep_after_execution) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + + # Record step data + recorder.record_step( + obs, + logs, + step_idx, + action_timestamp, + elapsed_timestamp, + action, + reward, + done, + info + ) + + if done: + logger.info("The episode is done.") + break + # inc step counter + step_idx += 1 logger.info("Running evaluator(s)...") result = env.evaluate() diff --git a/src/win-arena-container/client/mm_agents/server_agents/agent.py b/src/win-arena-container/client/mm_agents/server_agents/agent.py new file mode 100644 index 0000000..87b2119 --- /dev/null +++ b/src/win-arena-container/client/mm_agents/server_agents/agent.py @@ -0,0 +1,13 @@ +from typing import Dict, List +from uuid import uuid4 + +class ServerAgent: + def __init__(self, agent_name="") -> None: + self.action_space = "code_block" + self.agent_name = agent_name + + def predict(self, instruction: str, obs: Dict) -> List: + return (None, ['DONE'], {}, {}) + 
+ def reset(self): + pass \ No newline at end of file diff --git a/src/win-arena-container/client/run.py b/src/win-arena-container/client/run.py index 15259f7..a9c2a07 100644 --- a/src/win-arena-container/client/run.py +++ b/src/win-arena-container/client/run.py @@ -190,7 +190,8 @@ def test( from mm_agents.claude.agent import ClaudeAgent agent = ClaudeAgent() else: - raise ValueError(f"Unknown agent name: {cfg_args['agent_name']}") + from mm_agents.server_agents.agent import ServerAgent + agent = ServerAgent(agent_name=cfg_args["agent_name"]) env = DesktopEnv( action_space=agent.action_space, diff --git a/src/win-arena-container/vm/setup/server/main.py b/src/win-arena-container/vm/setup/server/main.py index 8d218ff..af5395d 100644 --- a/src/win-arena-container/vm/setup/server/main.py +++ b/src/win-arena-container/vm/setup/server/main.py @@ -1755,10 +1755,70 @@ def get_check_if_world_clock_exists(): else: return jsonify({'status': 'error', 'message': 'World clock does not exist'}), 400 +# Load JSON configuration from file +import json +def load_config(): + with open('../agents.json') as f: + return json.load(f) +config = load_config() + +@app.route('/run_server_agent', methods=['POST']) +def run_server_agent(): + try: + data = request.json + instruction = data.get('instruction', "") + agent_name = data.get('agent', "") + + # Find the repository configuration for the specified agent + repo_config = next((repo for repo in config['server_repositories'] if repo['name'] == agent_name), None) + + if repo_config is None: + return jsonify({"status": "error", "message": "Agent not found"}), 404 + + # Extract the entry point from the repository configuration + startup_type = repo_config.get('startuptype', "") + startup_point = os.path.join(r'\\host.lan\Data', 'mm_agents', agent_name, repo_config.get('startuppoint', "").replace('/', '\\')) + + # Determine the command based on the startup type + if startup_type.lower() == 'powershell': + command = ['powershell', 
'-ExecutionPolicy', 'Bypass', '-File', startup_point, '-instruction', instruction] + elif startup_type.lower() == 'python': + command = ['py', startup_point, '--instruction', instruction] + else: + return jsonify({"status": "error", "message": "Unsupported startup type"}), 400 + + # Set environment variable to ensure UTF-8 encoding + env = os.environ.copy() + env['PYTHONIOENCODING'] = 'utf-8' + + # Run the entry point script with the instruction argument + result = subprocess.run(command, capture_output=True, text=True, encoding='utf-8', env=env) + + # Check if the script ran successfully + if result.returncode == 0: + response = { + "status": "success", + "output": result.stdout + } + else: + response = { + "status": "error", + "output": result.stderr + } + + return jsonify(response) + + except Exception as e: + return jsonify({"status": "error", "message": str(e)}) + + except Exception as e: + print(f"Error running agent: {e}") + if __name__ == '__main__': app.run(debug=True, host="0.0.0.0", port=args.port) + # example command to test server. get platform # curl -X GET http://127.0.0.1:5000/platform # on windows: diff --git a/src/win-arena-container/vm/setup/server/s b/src/win-arena-container/vm/setup/server/s new file mode 100644 index 0000000..e69de29 diff --git a/src/win-arena-container/vm/setup/setup.ps1 b/src/win-arena-container/vm/setup/setup.ps1 index 39550d8..5c93fbd 100644 --- a/src/win-arena-container/vm/setup/setup.ps1 +++ b/src/win-arena-container/vm/setup/setup.ps1 @@ -398,6 +398,13 @@ if (Test-Path $requirementsFile) { exit } +# Install all setup for agents. 
+try { + & "$PSScriptRoot\setupAgents.ps1" +} catch { + Write-Host "An error occurred while executing setupAgents.ps1: $($_.Exception.Message)" +} + # Add a firewall rule to allow incoming connections on the specified port for the Python executable $pythonServerRuleName = "PythonHTTPServer-$pythonServerPort" if (-not (Get-NetFirewallRule -Name $pythonServerRuleName -ErrorAction SilentlyContinue)) { diff --git a/src/win-arena-container/vm/setup/setupAgents.ps1 b/src/win-arena-container/vm/setup/setupAgents.ps1 new file mode 100644 index 0000000..8436a0e --- /dev/null +++ b/src/win-arena-container/vm/setup/setupAgents.ps1 @@ -0,0 +1,39 @@ +$scriptFolder = "\\host.lan\Data" + +Import-Module (Join-Path $scriptFolder -ChildPath "setup-tools.psm1") + +# Load the JSON configuration from the same folder as the current script +$config = Get-Content -Raw -Path (Join-Path $scriptFolder -ChildPath "agents.json") | ConvertFrom-Json + +# Path to the Logs.txt file in the same folder as the script +$logFilePath = Join-Path -Path $PSScriptRoot -ChildPath "Logs.txt" + +# Remove the Logs.txt file if it exists +if (Test-Path -Path $logFilePath -PathType Leaf) { + Remove-Item -Path $logFilePath +} + +# Iterate through each server repository and execute the setup script +foreach ($repo in $config.server_repositories) { + $setupScriptPath = Join-Path -Path $scriptFolder -ChildPath "mm_agents" + $setupScriptPath = Join-Path -Path $setupScriptPath -ChildPath $repo.name + $setupScriptPath = Join-Path -Path $setupScriptPath -ChildPath $repo.setupscript + + try { + if (Test-Path -Path $setupScriptPath -PathType Leaf) { + Write-Host "Executing setup script for $($repo.name)..." + powershell -ExecutionPolicy Bypass -File $setupScriptPath + $successMessage = "Successfully executed setup script for $($repo.name)" + Write-Host $successMessage + $successMessage | Out-File -FilePath $logFilePath -Append + } else { + $notFoundMessage = "Setup script not found for $setupScriptPath." 
+ Write-Host $notFoundMessage + $notFoundMessage | Out-File -FilePath $logFilePath -Append + } + } catch { + $errorMessage = "Failed to execute setup script for $($repo.name): $($_.Exception.Message)" + Write-Host $errorMessage + $errorMessage | Out-File -FilePath $logFilePath -Append + } +} \ No newline at end of file diff --git "a/\342\200\216AgentRepoConfig.json" "b/\342\200\216AgentRepoConfig.json" new file mode 100644 index 0000000..4f3f838 --- /dev/null +++ "b/\342\200\216AgentRepoConfig.json" @@ -0,0 +1,19 @@ +{ + "repositories": [ + { + "name": "Agent_s", + "url": "https://github.com/simular-ai/Agent-S", + "foldertocopy": "agent_s", + "runningmode": "client" + }, + { + "name": "UFO", + "url": "https://github.com/PaulJiangMS/UFO", + "foldertocopy": "ufo", + "runningmode": "server", + "setupscript": "ufo/setup.ps1", + "startuptype": "powershell", + "startuppoint": "ufo/startup.ps1" + } + ] + } \ No newline at end of file From 6edf059410087bbef96e8b984e7b455753fefe9e Mon Sep 17 00:00:00 2001 From: Paul Jiang Date: Thu, 2 Jan 2025 14:13:09 +0800 Subject: [PATCH 2/5] Add service model onboard and agent settings --- AgentRepoConfig.json | 12 +++++++++++ scripts/prepare-agents.sh | 8 +------ scripts/run-local.sh | 16 +++++++++++++- scripts/run.sh | 18 +++++++++++++++- .../client/desktop_env/controllers/python.py | 20 ++++++++++++++++++ .../client/lib_run_single.py | 3 ++- .../client/mm_agents/server_agents/agent.py | 3 ++- src/win-arena-container/client/run.py | 21 +++++++++++++++++-- src/win-arena-container/entry.sh | 11 ++++++++-- src/win-arena-container/start_client.sh | 12 +++++++++-- .../vm/setup/server/main.py | 11 ++++++++-- "\342\200\216AgentRepoConfig.json" | 19 ----------------- 12 files changed, 116 insertions(+), 38 deletions(-) create mode 100644 AgentRepoConfig.json delete mode 100644 "\342\200\216AgentRepoConfig.json" diff --git a/AgentRepoConfig.json b/AgentRepoConfig.json new file mode 100644 index 0000000..ca0d86f --- /dev/null +++ 
b/AgentRepoConfig.json @@ -0,0 +1,12 @@ +{ + "repositories": [ + { + "name": "UFO", + "url": "https://github.com/PaulJiangMS/UFO", + "runningmode": "server", + "setupscript": "windows-arena/setup.ps1", + "startuptype": "powershell", + "startuppoint": "windows-arena/startup.ps1" + } + ] +} \ No newline at end of file diff --git a/scripts/prepare-agents.sh b/scripts/prepare-agents.sh index d279393..a2610d5 100644 --- a/scripts/prepare-agents.sh +++ b/scripts/prepare-agents.sh @@ -29,7 +29,6 @@ repos=$(jq -c '.repositories[]' "$CONFIG_FILE") for repo in $repos; do REPO_URL=$(echo "$repo" | jq -r '.url') REPO_DIR_NAME=$(echo "$repo" | jq -r '.name') - REPO_FOLDER=$(echo "$repo" | jq -r '.foldertocopy') RUNNING_MODE=$(echo "$repo" | jq -r '.runningmode') # Set the target folder based on the running mode @@ -49,12 +48,7 @@ for repo in $repos; do echo "Directory $REPO_DIR already exists. Skipping clone." else echo "Cloning $REPO_URL into $REPO_DIR..." - git clone --no-checkout "$REPO_URL" "$REPO_DIR" - pushd "$REPO_DIR" - git sparse-checkout init --cone - echo "$REPO_FOLDER" > .git/info/sparse-checkout - git checkout - popd + git clone "$REPO_URL" "$REPO_DIR" fi done diff --git a/scripts/run-local.sh b/scripts/run-local.sh index 100382e..ddcf6ef 100644 --- a/scripts/run-local.sh +++ b/scripts/run-local.sh @@ -26,6 +26,8 @@ model="gpt-4-vision-preview" som_origin="oss" a11y_backend="uia" gpu_enabled=false +json_name="evaluation_examples_windows/test_all.json" +agent_settings="" # Parse the command line arguments while [[ $# -gt 0 ]]; do @@ -110,6 +112,14 @@ while [[ $# -gt 0 ]]; do mode=$2 shift 2 ;; + --json-name) + json_name=$2 + shift 2 + ;; + --agent-settings) + agent_settings=$2 + shift 2 + ;; --help) echo "Usage: $0 [options]" echo "Options:" @@ -133,6 +143,8 @@ while [[ $# -gt 0 ]]; do echo " --a11y-backend : The a11y accessibility backend to use (default: uia, available options are: uia, win32)" echo " --gpu-enabled : Enable GPU support (default: false)" echo " 
--mode : Mode (default: azure)" + echo " --json-name The name of the JSON file to use (default: test_all.json)" + echo " --agent-settings The additional agent settings, which should be a json string." exit 0 ;; *) @@ -161,4 +173,6 @@ if [[ -z "$OPENAI_API_KEY" && (-z "$AZURE_API_KEY" || -z "$AZURE_ENDPOINT") ]]; log_error_exit "Either OPENAI_API_KEY must be set or both AZURE_API_KEY and AZURE_ENDPOINT must be set: $1" fi -./run.sh --mode $mode --prepare-image $prepare_image --container-name $container_name --skip-build $skip_build --interactive $interactive --connect $connect --use-kvm $use_kvm --ram-size $ram_size --cpu-cores $cpu_cores --mount-vm-storage $mount_vm_storage --mount-client $mount_client --mount-server $mount_server --browser-port $browser_port --rdp-port $rdp_port --start-client $start_client --agent $agent --model $model --som-origin $som_origin --a11y-backend $a11y_backend --gpu-enabled $gpu_enabled --openai-api-key $OPENAI_API_KEY --azure-api-key $AZURE_API_KEY --azure-endpoint $AZURE_ENDPOINT \ No newline at end of file +echo "$agent_settings" + +./run.sh --mode $mode --prepare-image $prepare_image --container-name $container_name --skip-build $skip_build --interactive $interactive --connect $connect --use-kvm $use_kvm --ram-size $ram_size --cpu-cores $cpu_cores --mount-vm-storage $mount_vm_storage --mount-client $mount_client --mount-server $mount_server --browser-port $browser_port --rdp-port $rdp_port --start-client $start_client --agent $agent --model $model --som-origin $som_origin --a11y-backend $a11y_backend --gpu-enabled $gpu_enabled --openai-api-key $OPENAI_API_KEY --azure-api-key $AZURE_API_KEY --azure-endpoint $AZURE_ENDPOINT --json-name $json_name --agent-settings "$agent_settings" \ No newline at end of file diff --git a/scripts/run.sh b/scripts/run.sh index 3fe93eb..c4a065a 100644 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -26,6 +26,8 @@ model="gpt-4-vision-preview" som_origin="oss" a11y_backend="uia" gpu_enabled=false 
+agent_settings="" +json_name="evaluation_examples_windows/test_all.json" OPENAI_API_KEY="" AZURE_API_KEY="" AZURE_ENDPOINT="" @@ -109,6 +111,10 @@ while [[ $# -gt 0 ]]; do gpu_enabled=$2 shift 2 ;; + --agent-settings) + agent_settings=$2 + shift 2 + ;; --openai-api-key) OPENAI_API_KEY="$2" shift 2 @@ -125,6 +131,10 @@ while [[ $# -gt 0 ]]; do mode=$2 shift 2 ;; + --json-name) + json_name=$2 + shift 2 + ;; --help) echo "Usage: $0 [options]" echo "Options:" @@ -151,6 +161,8 @@ while [[ $# -gt 0 ]]; do echo " --azure-api-key : The Azure OpenAI API key" echo " --azure-endpoint : The Azure OpenAI Endpoint" echo " --mode : Mode (default: azure)" + echo " --json-name The name of the JSON file to use (default: test_all.json)" + echo " --agent-settings The additional agent settings, which should be a json string." exit 0 ;; *) @@ -201,6 +213,7 @@ echo "Using VM Setup Image path: $vm_setup_image_path" echo "Using VM storage mount path: $vm_storage_mount_path" echo "Using server mount path: $server_mount_path" echo "Using client mount path: $client_mount_path" +echo "$agent_settings" # Check if /dev/kvm exists if [ ! 
-e /dev/kvm ]; then @@ -301,8 +314,11 @@ invoke_docker_container() { # Add the image name with tag docker_command+=" $winarena_full_image_name:$winarena_image_tag" + # Escape double quotes + escaped_agent_settings=$(echo "$agent_settings" | sed 's/"/\\"/g') + # Set the entrypoint arguments - entrypoint_args=" -c './entry.sh --prepare-image $prepare_image --start-client $start_client --agent $agent --model $model --som-origin $som_origin --a11y-backend $a11y_backend'" + entrypoint_args=" -c './entry.sh --prepare-image $prepare_image --start-client $start_client --agent $agent --model $model --som-origin $som_origin --a11y-backend $a11y_backend --json-name $json_name --agent-settings \"$escaped_agent_settings\"'" if [ "$interactive" = true ]; then entrypoint_args="" fi diff --git a/src/win-arena-container/client/desktop_env/controllers/python.py b/src/win-arena-container/client/desktop_env/controllers/python.py index 596b752..01ed495 100644 --- a/src/win-arena-container/client/desktop_env/controllers/python.py +++ b/src/win-arena-container/client/desktop_env/controllers/python.py @@ -713,3 +713,23 @@ def execute_shell_command(self, command): except requests.exceptions.RequestException as e: logger.error("An error occurred while trying to execute the command: %s", e) return None + + def run_agent(self, agent_name, instruction, agent_settings): + """ + Run the agent. + """ + # Prepare the data payload + payload = { + "agent": agent_name, + "instruction": instruction, + "agent_settings": agent_settings + } + + response = requests.post(self.http_server + "/run_server_agent", json=payload) + if response.status_code == 200: + logger.info("Successed running agent: %s", agent_name) + logger.info("Agent response: %s", response.json()) + return response.json() + else: + logger.error("Failed to run agent. 
Status code: %s", response) + return None diff --git a/src/win-arena-container/client/lib_run_single.py b/src/win-arena-container/client/lib_run_single.py index 344c852..a9ac3a0 100644 --- a/src/win-arena-container/client/lib_run_single.py +++ b/src/win-arena-container/client/lib_run_single.py @@ -34,7 +34,8 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl from mm_agents.server_agents.agent import ServerAgent if isinstance(agent, ServerAgent): logger.info("Agent: Running server agent %s...", agent.agent_name) - env.controller.run_agent(agent.agent_name, instruction) + logger.info("Agent settings: %s...", agent.agent_settings) + env.controller.run_agent(agent.agent_name, instruction, agent.agent_settings) else: while not done and step_idx < max_steps: diff --git a/src/win-arena-container/client/mm_agents/server_agents/agent.py b/src/win-arena-container/client/mm_agents/server_agents/agent.py index 87b2119..2e11448 100644 --- a/src/win-arena-container/client/mm_agents/server_agents/agent.py +++ b/src/win-arena-container/client/mm_agents/server_agents/agent.py @@ -2,9 +2,10 @@ from uuid import uuid4 class ServerAgent: - def __init__(self, agent_name="") -> None: + def __init__(self, agent_name="", agent_settings = {}) -> None: self.action_space = "code_block" self.agent_name = agent_name + self.agent_settings = agent_settings def predict(self, instruction: str, obs: Dict) -> List: return (None, ['DONE'], {}, {}) diff --git a/src/win-arena-container/client/run.py b/src/win-arena-container/client/run.py index a9c2a07..74765f9 100644 --- a/src/win-arena-container/client/run.py +++ b/src/win-arena-container/client/run.py @@ -122,7 +122,10 @@ def config() -> argparse.Namespace: parser.add_argument("--num_workers", type=int, default=1, help="Total number of workers") # benchmark difficulty level - parser.add_argument("--diff_lvl", type=str, default="normal", help="Difficulty level of the benchmark") + parser.add_argument("--diff_lvl", 
type=str, default="normal", help="Difficulty level of the benchmark") + + # agent setting + parser.add_argument("--agent_settings", type=str, default='', help="JSON string of agent settings in key-value pairs") args, unknownargs = parser.parse_known_args() @@ -162,6 +165,16 @@ def test( "num_workers": args.num_workers, } + # Convert the JSON string to a dictionary + if args.agent_settings: + try: + agent_settings = json.loads(args.agent_settings) + except json.JSONDecodeError: + print("Invalid JSON string provided for --agent_settings") + agent_settings = {} + else: + agent_settings = {} + if cfg_args["agent_name"] == "navi": if cfg_args["som_origin"] in ["a11y", "omni", "mixed-omni"]: som_config = None @@ -191,7 +204,11 @@ def test( agent = ClaudeAgent() else: from mm_agents.server_agents.agent import ServerAgent - agent = ServerAgent(agent_name=cfg_args["agent_name"]) + if agent_settings is not None and len(agent_settings) > 0: + agent_name = agent_settings.get("agent_name", cfg_args["agent_name"]) + else: + agent_name = cfg_args["agent_name"] + agent = ServerAgent(agent_name=agent_name, agent_settings=agent_settings) env = DesktopEnv( action_space=agent.action_space, diff --git a/src/win-arena-container/entry.sh b/src/win-arena-container/entry.sh index 3b58116..33ae744 100644 --- a/src/win-arena-container/entry.sh +++ b/src/win-arena-container/entry.sh @@ -15,7 +15,8 @@ clean_results=true worker_id="0" num_workers="1" result_dir="./results" -json_name="evaluation_examples_windows/test_all.json" +json_name="evaluation_examples_windows/test_all.json" +agent_settings="" while [[ $# -gt 0 ]]; do case "$1" in @@ -63,6 +64,10 @@ while [[ $# -gt 0 ]]; do json_name=$2 shift 2 ;; + --agent-settings) + agent_settings=$2 + shift 2 + ;; --help) echo "Usage: $0 [options]" echo "Options:" @@ -77,6 +82,7 @@ while [[ $# -gt 0 ]]; do echo " --num-workers The number of workers" echo " --result-dir The directory to store the results (default: ./results)" echo " --json-name The 
name of the JSON file to use (default: test_all.json)" + echo " --agent-settings The additional agent settings, which should be a json string." exit 0 ;; *) @@ -103,7 +109,8 @@ else # Start the client script if [ "$start_client" = "true" ]; then echo "Starting client..." - ./start_client.sh --agent $agent --model $model --som-origin $som_origin --a11y-backend $a11y_backend --clean-results $clean_results --worker-id $worker_id --num-workers $num_workers --result-dir $result_dir --json-name $json_name + echo $agent_settings + ./start_client.sh --agent $agent --model $model --som-origin $som_origin --a11y-backend $a11y_backend --clean-results $clean_results --worker-id $worker_id --num-workers $num_workers --result-dir $result_dir --json-name $json_name --agent-settings "$agent_settings" else echo "Keeping container alive" while true; do diff --git a/src/win-arena-container/start_client.sh b/src/win-arena-container/start_client.sh index 49378a9..ffb825d 100644 --- a/src/win-arena-container/start_client.sh +++ b/src/win-arena-container/start_client.sh @@ -10,6 +10,7 @@ num_workers="1" result_dir="./results" json_name="evaluation_examples_windows/test_all.json" diff_lvl="normal" +agent_settings="" # parse agent argument while [[ $# -gt 0 ]]; do @@ -53,7 +54,11 @@ while [[ $# -gt 0 ]]; do --diff-lvl) diff_lvl=$2 shift 2 - ;; + ;; + --agent-settings) + agent_settings=$2 + shift 2 + ;; --help) echo "Usage: $0 [options]" echo "Options:" @@ -67,12 +72,15 @@ while [[ $# -gt 0 ]]; do echo " --result-dir The directory to store the results (default: ./results)" echo " --json-name The name of the JSON file to use (default: test_all.json)" echo " --diff-lvl The difficulty level of benchmark (default: normal, available options are: normal, hard)" + echo " --agent-settings The additional agent settings, which should be a json string." exit 0 ;; *) esac done +echo "Running agent $agent..." + cd /client if [ "$clean_results" = true ]; then echo "Cleaning results directory..." 
@@ -80,4 +88,4 @@ if [ "$clean_results" = true ]; then fi echo "Running agent $agent..." -python run.py --agent "$agent" --model "$model" --som_origin "$som_origin" --a11y_backend "$a11y_backend" --worker_id "$worker_id" --num_workers "$num_workers" --result_dir "$result_dir" --test_all_meta_path "$json_name" --diff_lvl "$diff_lvl" \ No newline at end of file +python run.py --agent_name "$agent" --model "$model" --som_origin "$som_origin" --a11y_backend "$a11y_backend" --worker_id "$worker_id" --num_workers "$num_workers" --result_dir "$result_dir" --test_all_meta_path "$json_name" --diff_lvl "$diff_lvl" --agent_settings "$agent_settings" \ No newline at end of file diff --git a/src/win-arena-container/vm/setup/server/main.py b/src/win-arena-container/vm/setup/server/main.py index af5395d..4262ed8 100644 --- a/src/win-arena-container/vm/setup/server/main.py +++ b/src/win-arena-container/vm/setup/server/main.py @@ -1768,6 +1768,7 @@ def run_server_agent(): data = request.json instruction = data.get('instruction', "") agent_name = data.get('agent', "") + agent_settings = data.get('agent_settings', {}) # Find the repository configuration for the specified agent repo_config = next((repo for repo in config['server_repositories'] if repo['name'] == agent_name), None) @@ -1781,9 +1782,15 @@ def run_server_agent(): # Determine the command based on the startup type if startup_type.lower() == 'powershell': - command = ['powershell', '-ExecutionPolicy', 'Bypass', '-File', startup_point, '-instruction', instruction] + if agent_settings == "": + command = ['powershell', '-ExecutionPolicy', 'Bypass', '-File', startup_point, '-instruction', instruction] + else: + command = ['powershell', '-ExecutionPolicy', 'Bypass', '-File', startup_point, '-instruction', instruction, '-agent_settings', json.dumps(agent_settings)] elif startup_type.lower() == 'python': - command = ['py', startup_point, '--instruction', instruction] + if agent_settings == "": + command = ['py', startup_point, 
'--instruction', instruction] + else: + command = ['py', startup_point, '--instruction', instruction, '--agent_setting', agent_settings] else: return jsonify({"status": "error", "message": "Unsupported startup type"}), 400 diff --git "a/\342\200\216AgentRepoConfig.json" "b/\342\200\216AgentRepoConfig.json" deleted file mode 100644 index 4f3f838..0000000 --- "a/\342\200\216AgentRepoConfig.json" +++ /dev/null @@ -1,19 +0,0 @@ -{ - "repositories": [ - { - "name": "Agent_s", - "url": "https://github.com/simular-ai/Agent-S", - "foldertocopy": "agent_s", - "runningmode": "client" - }, - { - "name": "UFO", - "url": "https://github.com/PaulJiangMS/UFO", - "foldertocopy": "ufo", - "runningmode": "server", - "setupscript": "ufo/setup.ps1", - "startuptype": "powershell", - "startuppoint": "ufo/startup.ps1" - } - ] - } \ No newline at end of file From 58dd27ddd6ee729dd9f4b6b585d05c340a0d4326 Mon Sep 17 00:00:00 2001 From: Paul Jiang Date: Fri, 3 Jan 2025 15:50:30 +0800 Subject: [PATCH 3/5] Refine code to make sure azure mode runing correctly --- AgentRepoConfig.json | 4 ++-- src/win-arena-container/Dockerfile-WinArena | 3 +++ src/win-arena-container/vm/setup/on-logon.vbs | 10 ++++++++++ src/win-arena-container/vm/setup/server/main.py | 2 +- src/win-arena-container/vm/setup/setup.ps1 | 2 +- 5 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 src/win-arena-container/vm/setup/on-logon.vbs diff --git a/AgentRepoConfig.json b/AgentRepoConfig.json index ca0d86f..8bfcd96 100644 --- a/AgentRepoConfig.json +++ b/AgentRepoConfig.json @@ -4,9 +4,9 @@ "name": "UFO", "url": "https://github.com/PaulJiangMS/UFO", "runningmode": "server", - "setupscript": "windows-arena/setup.ps1", + "setupscript": "windows_arena/setup.ps1", "startuptype": "powershell", - "startuppoint": "windows-arena/startup.ps1" + "startuppoint": "windows_arena/startup.ps1" } ] } \ No newline at end of file diff --git a/src/win-arena-container/Dockerfile-WinArena 
b/src/win-arena-container/Dockerfile-WinArena index 00b15ce..ab082ef 100644 --- a/src/win-arena-container/Dockerfile-WinArena +++ b/src/win-arena-container/Dockerfile-WinArena @@ -30,6 +30,9 @@ RUN if [ "${DEPLOY_MODE}" = "azure" ]; then \ sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/install.bat"; \ sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/on-logon.ps1"; \ sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/setup.ps1"; \ + sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/setupAgents.ps1"; \ + sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/server/main.py"; \ + sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/on-logon.vbs"; \ fi # Copy client application diff --git a/src/win-arena-container/vm/setup/on-logon.vbs b/src/win-arena-container/vm/setup/on-logon.vbs new file mode 100644 index 0000000..7813a30 --- /dev/null +++ b/src/win-arena-container/vm/setup/on-logon.vbs @@ -0,0 +1,10 @@ +Dim scriptFolder, pythonScriptFile, pythonServerPort +scriptFolder = "\\host.lan\Data" +pythonScriptFile = scriptFolder & "\server\main.py" +pythonServerPort = 5000 + +' Start the Caddy reverse proxy in a non-blocking manner +CreateObject("WScript.Shell").Run "cmd /c caddy reverse-proxy --from :9222 --to :1337", 0, False + +' Start the WinArena server +CreateObject("WScript.Shell").Run "cmd /c py """ & pythonScriptFile & """ --port " & pythonServerPort, 0, False diff --git a/src/win-arena-container/vm/setup/server/main.py b/src/win-arena-container/vm/setup/server/main.py index 4262ed8..320090c 100644 --- a/src/win-arena-container/vm/setup/server/main.py +++ b/src/win-arena-container/vm/setup/server/main.py @@ -1758,7 +1758,7 @@ def get_check_if_world_clock_exists(): # Load JSON configuration from file import json def load_config(): - with open('../agents.json') as f: + with open(os.path.join(r'\\host.lan\Data', 'agents.json')) 
as f: return json.load(f) config = load_config() diff --git a/src/win-arena-container/vm/setup/setup.ps1 b/src/win-arena-container/vm/setup/setup.ps1 index 5c93fbd..1ba41e7 100644 --- a/src/win-arena-container/vm/setup/setup.ps1 +++ b/src/win-arena-container/vm/setup/setup.ps1 @@ -423,7 +423,7 @@ if (-not (Get-NetFirewallRule -Name $caddyProxyRuleName -ErrorAction SilentlyCon Write-Host "Firewall rule already exists. $caddyProxyRuleName " } -$onLogonScriptPath = "$scriptFolder\on-logon.ps1" +$onLogonScriptPath = "$scriptFolder\on-logon.vbs" # Check if the scheduled task exists before unregistering it if (Get-ScheduledTask -TaskName $onLogonTaskName -ErrorAction SilentlyContinue) { Write-Host "Scheduled task $onLogonTaskName already exists." From 1b3d6f63a068350aea8ea739ddc8dacef03a2f4c Mon Sep 17 00:00:00 2001 From: Paul Jiang Date: Thu, 16 Jan 2025 12:59:12 +0800 Subject: [PATCH 4/5] Remove unused code --- src/win-arena-container/Dockerfile-WinArena | 1 - src/win-arena-container/vm/setup/on-logon.ps1 | 11 ----------- src/win-arena-container/vm/setup/server/s | 0 3 files changed, 12 deletions(-) delete mode 100644 src/win-arena-container/vm/setup/on-logon.ps1 delete mode 100644 src/win-arena-container/vm/setup/server/s diff --git a/src/win-arena-container/Dockerfile-WinArena b/src/win-arena-container/Dockerfile-WinArena index ab082ef..ef54e23 100644 --- a/src/win-arena-container/Dockerfile-WinArena +++ b/src/win-arena-container/Dockerfile-WinArena @@ -28,7 +28,6 @@ RUN if [ "${DEPLOY_MODE}" = "azure" ]; then \ WINDOWS_OEM_FOLDER='C:\\oem'; \ OEM_FOLDER='oem'; \ sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/install.bat"; \ - sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/on-logon.ps1"; \ sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/setup.ps1"; \ sed -i "s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/setupAgents.ps1"; \ sed -i 
"s|${WINDOWS_DATA_FOLDER}|${WINDOWS_OEM_FOLDER}|g" "/${OEM_FOLDER}/server/main.py"; \ diff --git a/src/win-arena-container/vm/setup/on-logon.ps1 b/src/win-arena-container/vm/setup/on-logon.ps1 deleted file mode 100644 index 33be278..0000000 --- a/src/win-arena-container/vm/setup/on-logon.ps1 +++ /dev/null @@ -1,11 +0,0 @@ -$scriptFolder = "\\host.lan\Data" -$pythonScriptFile = "$scriptFolder\server\main.py" -$pythonServerPort = 5000 - -# Start the Caddy reverse proxy in a non-blocking manner -Write-Host "Running the Caddy reverse proxy from port 9222 to port 1337" -Start-Process -NoNewWindow -FilePath "powershell" -ArgumentList "-Command", "caddy reverse-proxy --from :9222 --to :1337" - -# Start the WinArena server -Write-Host "Running the WinArena server on port $pythonServerPort..." -python $pythonScriptFile --port $pythonServerPort diff --git a/src/win-arena-container/vm/setup/server/s b/src/win-arena-container/vm/setup/server/s deleted file mode 100644 index e69de29..0000000 From 9111c141b7ad077d57df42ce96238bf5fa07b7c7 Mon Sep 17 00:00:00 2001 From: Paul Jiang Date: Sun, 2 Feb 2025 19:08:05 +0800 Subject: [PATCH 5/5] Add docs for server mode onboarding --- docs/Develop-Agent.md | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/docs/Develop-Agent.md b/docs/Develop-Agent.md index b855dfc..9d0609e 100644 --- a/docs/Develop-Agent.md +++ b/docs/Develop-Agent.md @@ -2,8 +2,11 @@ Want to test your own agents in Windows Agent Arena? You can use our default agent as a template and create your own folder under `src/win-arena-container/client/mm_agents`. You just need to ensure that your `agent.py` file includes the `predict()` and `reset()` functions. -## Steps to Create Your Custom Agent +# Steps to Create Your Custom Agent +The windows Agent Arena support two types of the agent, first is to only to have the prediction and utitlize the Desktop environment sdk to do the action exection. 
The second runs +entirely in server mode. + +## Client mode onboarding ### 1. Create a New Agent Folder Navigate to the `mm_agents` directory: @@ -105,7 +108,40 @@ execute_actions(actions) # Function to execute the predicted actions Once your agent is ready, submit a Pull Request (PR) to the repository with your new agent folder and code changes. Ensure your code follows the project's guidelines and is well-documented. -## Important Considerations +## Server mode onboarding +### Define your environment script + +Prepare a script to set up Windows for your agent. You can refer to [UFO setup](https://github.com/microsoft/UFO/blob/dev/waa/windows_arena/setup.ps1) + +### Define the startup script to accept the WAA prompt + +This script accepts the WAA prompt and runs your agent on Windows; refer to [UFO startup](https://github.com/microsoft/UFO/blob/dev/waa/windows_arena/startup.ps1) + +### Define your agent repo setting to easily clone the agent code base + +Add a JSON element for your agent to `AgentRepoConfig.json`, as shown below: + +```json +{ + "repositories": [ + { + "name": "UFO", + "url": "https://github.com/PaulJiangMS/UFO", + "runningmode": "server", + "setupscript": "windows_arena/setup.ps1", + "startuptype": "powershell", + "startuppoint": "windows_arena/startup.ps1" + } + ] +} +``` + +### Note for server mode + +You need to finish the steps for server mode first, then follow the steps to prepare the Windows image. + + +## Important Considerations - **Observation Data**: The `obs` dictionary contains vital information like screenshots, window titles, and clipboard content. Use this data to inform your agent's decisions. - **Action Format**: The list returned by `predict()` should contain executable actions or code blocks that the environment can interpret.