Skip to content

Commit 488806e

Browse files
SmartManojxingyaowwopenhands-agent
authored
Add image file viewing support to FileEditor (#1016)
Co-authored-by: Xingyao Wang <[email protected]> Co-authored-by: openhands <[email protected]>
1 parent 30449e1 commit 488806e

File tree

11 files changed

+361
-23
lines changed

11 files changed

+361
-23
lines changed

openhands-tools/openhands/tools/browser_use/definition.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,29 @@
2424
# Maximum output size for browser observations
2525
MAX_BROWSER_OUTPUT_SIZE = 50000
2626

27+
# Mapping of base64 prefixes to MIME types for image detection
28+
BASE64_IMAGE_PREFIXES = {
29+
"/9j/": "image/jpeg",
30+
"iVBORw0KGgo": "image/png",
31+
"R0lGODlh": "image/gif",
32+
"UklGR": "image/webp",
33+
}
34+
35+
36+
def detect_image_mime_type(base64_data: str) -> str:
37+
"""Detect MIME type from base64-encoded image data.
38+
39+
Args:
40+
base64_data: Base64-encoded image data
41+
42+
Returns:
43+
Detected MIME type, defaults to "image/png" if not detected
44+
"""
45+
for prefix, mime_type in BASE64_IMAGE_PREFIXES.items():
46+
if base64_data.startswith(prefix):
47+
return mime_type
48+
return "image/png"
49+
2750

2851
class BrowserObservation(Observation):
2952
"""Base observation for browser operations."""
@@ -48,15 +71,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
4871
)
4972

5073
if self.screenshot_data:
51-
mime_type = "image/png"
52-
if self.screenshot_data.startswith("/9j/"):
53-
mime_type = "image/jpeg"
54-
elif self.screenshot_data.startswith("iVBORw0KGgo"):
55-
mime_type = "image/png"
56-
elif self.screenshot_data.startswith("R0lGODlh"):
57-
mime_type = "image/gif"
58-
elif self.screenshot_data.startswith("UklGR"):
59-
mime_type = "image/webp"
74+
mime_type = detect_image_mime_type(self.screenshot_data)
6075
# Convert base64 to data URL format for ImageContent
6176
data_url = f"data:{mime_type};base64,{self.screenshot_data}"
6277
llm_content.append(ImageContent(image_urls=[data_url]))

openhands-tools/openhands/tools/file_editor/definition.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,28 @@ def create(
209209
# Initialize the executor
210210
executor = FileEditorExecutor(workspace_root=conv_state.workspace.working_dir)
211211

212+
# Build the tool description with conditional image viewing support
213+
# Split TOOL_DESCRIPTION to insert image viewing line after the second bullet
214+
description_lines = TOOL_DESCRIPTION.split("\n")
215+
base_description = "\n".join(description_lines[:2]) # First two lines
216+
remaining_description = "\n".join(description_lines[2:]) # Rest of description
217+
218+
# Add image viewing line if LLM supports vision
219+
if conv_state.agent.llm.vision_is_active():
220+
tool_description = (
221+
f"{base_description}\n"
222+
"* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, "
223+
".bmp), `view` displays the image content\n"
224+
f"{remaining_description}"
225+
)
226+
else:
227+
tool_description = TOOL_DESCRIPTION
228+
212229
# Add working directory information to the tool description
213230
# to guide the agent to use the correct directory instead of root
214231
working_dir = conv_state.workspace.working_dir
215232
enhanced_description = (
216-
f"{TOOL_DESCRIPTION}\n\n"
233+
f"{tool_description}\n\n"
217234
f"Your current working directory is: {working_dir}\n"
218235
f"When exploring project structure, start with this directory "
219236
f"instead of the root filesystem."

openhands-tools/openhands/tools/file_editor/editor.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import base64
2+
import mimetypes
13
import os
24
import re
35
import shutil
@@ -7,6 +9,7 @@
79

810
from binaryornot.check import is_binary
911

12+
from openhands.sdk import ImageContent, TextContent
1013
from openhands.sdk.logger import get_logger
1114
from openhands.sdk.utils.truncate import maybe_truncate
1215
from openhands.tools.file_editor.definition import (
@@ -36,6 +39,9 @@
3639

3740
logger = get_logger(__name__)
3841

42+
# Supported image extensions for viewing as base64-encoded content
43+
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}
44+
3945

4046
class FileEditor:
4147
"""
@@ -327,6 +333,34 @@ def view(
327333
prev_exist=True,
328334
)
329335

336+
# Check if the file is an image
337+
file_extension = path.suffix.lower()
338+
if file_extension in IMAGE_EXTENSIONS:
339+
# Read image file as base64
340+
try:
341+
with open(path, "rb") as f:
342+
image_bytes = f.read()
343+
image_base64 = base64.b64encode(image_bytes).decode("utf-8")
344+
345+
mime_type, _ = mimetypes.guess_type(str(path))
346+
if not mime_type or not mime_type.startswith("image/"):
347+
mime_type = "image/png"
348+
output_msg = (
349+
f"Image file {path} read successfully. Displaying image content."
350+
)
351+
image_url = f"data:{mime_type};base64,{image_base64}"
352+
return FileEditorObservation(
353+
command="view",
354+
content=[
355+
TextContent(text=output_msg),
356+
ImageContent(image_urls=[image_url]),
357+
],
358+
path=str(path),
359+
prev_exist=True,
360+
)
361+
except Exception as e:
362+
raise ToolError(f"Failed to read image file {path}: {e}") from None
363+
330364
# Validate file and count lines
331365
self.validate_file(path)
332366
num_lines = self._count_lines(path)
@@ -609,8 +643,9 @@ def validate_file(self, path: Path) -> None:
609643
),
610644
)
611645

612-
# Check file type
613-
if is_binary(str(path)):
646+
# Check file type - allow image files
647+
file_extension = path.suffix.lower()
648+
if is_binary(str(path)) and file_extension not in IMAGE_EXTENSIONS:
614649
raise FileValidationError(
615650
path=str(path),
616651
reason=(

tests/integration/base.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,23 @@
2626
from openhands.sdk.tool import Tool
2727

2828

29+
class SkipTest(Exception):
30+
"""
31+
Exception raised to indicate that a test should be skipped.
32+
33+
This is useful for tests that require specific capabilities (e.g., vision)
34+
that may not be available in all LLMs.
35+
"""
36+
37+
pass
38+
39+
2940
class TestResult(BaseModel):
3041
"""Result of an integration test."""
3142

3243
success: bool
3344
reason: str | None = None
45+
skipped: bool = False
3446

3547

3648
class BaseIntegrationTest(ABC):

tests/integration/run_infer.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from pydantic import BaseModel, ConfigDict
1818

1919
from openhands.sdk.logger import get_logger
20-
from tests.integration.base import BaseIntegrationTest, TestResult
20+
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
2121
from tests.integration.schemas import ModelTestResults
2222
from tests.integration.utils.format_costs import format_cost
2323

@@ -171,6 +171,20 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval
171171
log_file_path=log_file_path,
172172
)
173173

174+
except SkipTest as e:
175+
# Test should be skipped (e.g., LLM doesn't support required capabilities)
176+
logger.info("Test %s skipped: %s", instance.instance_id, str(e))
177+
return EvalOutput(
178+
instance_id=instance.instance_id,
179+
test_result=TestResult(
180+
success=False,
181+
reason=str(e),
182+
skipped=True,
183+
),
184+
llm_model=llm_config.get("model", "unknown"),
185+
cost=0.0,
186+
)
187+
174188
except Exception as e:
175189
logger.error("Error running test %s: %s", instance.instance_id, e)
176190
return EvalOutput(
@@ -274,11 +288,17 @@ def generate_structured_results(
274288
# Print summary for console output
275289
success_rate = structured_results.success_rate
276290
successful = structured_results.successful_tests
291+
skipped = structured_results.skipped_tests
277292
total = structured_results.total_tests
278293
logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total)
294+
if skipped > 0:
295+
logger.info("Skipped tests: %d", skipped)
279296
logger.info("Evaluation Results:")
280297
for instance in structured_results.test_instances:
281-
status = "✓" if instance.test_result.success else "✗"
298+
if instance.test_result.skipped:
299+
status = "⊘" # Skipped symbol
300+
else:
301+
status = "✓" if instance.test_result.success else "✗"
282302
reason = instance.test_result.reason or "N/A"
283303
logger.info("%s: %s - %s", instance.instance_id, status, reason)
284304
logger.info("Total cost: %s", format_cost(structured_results.total_cost))

tests/integration/schemas.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class TestResultData(BaseModel):
2020

2121
success: bool
2222
reason: str | None = None
23+
skipped: bool = False
2324

2425

2526
class TestInstanceResult(BaseModel):
@@ -46,6 +47,7 @@ class ModelTestResults(BaseModel):
4647
# Summary statistics
4748
total_tests: int
4849
successful_tests: int
50+
skipped_tests: int
4951
success_rate: float
5052
total_cost: float
5153

@@ -75,6 +77,7 @@ def from_eval_outputs(
7577
test_result=TestResultData(
7678
success=output.test_result.success,
7779
reason=output.test_result.reason,
80+
skipped=output.test_result.skipped,
7881
),
7982
cost=output.cost,
8083
error_message=output.error_message,
@@ -84,6 +87,7 @@ def from_eval_outputs(
8487
# Calculate summary statistics
8588
total_tests = len(test_instances)
8689
successful_tests = sum(1 for t in test_instances if t.test_result.success)
90+
skipped_tests = sum(1 for t in test_instances if t.test_result.skipped)
8791
success_rate = successful_tests / total_tests if total_tests > 0 else 0.0
8892
total_cost = sum(t.cost for t in test_instances)
8993

@@ -94,6 +98,7 @@ def from_eval_outputs(
9498
test_instances=test_instances,
9599
total_tests=total_tests,
96100
successful_tests=successful_tests,
101+
skipped_tests=skipped_tests,
97102
success_rate=success_rate,
98103
total_cost=total_cost,
99104
eval_note=eval_note,
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""Test that an agent can view and analyze image files using FileEditor."""
2+
3+
import os
4+
import urllib.request
5+
6+
from openhands.sdk import TextContent, get_logger
7+
from openhands.sdk.event.llm_convertible import MessageEvent
8+
from openhands.sdk.tool import Tool, register_tool
9+
from openhands.tools.file_editor import FileEditorTool
10+
from openhands.tools.terminal import TerminalTool
11+
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
12+
13+
14+
INSTRUCTION = (
15+
"Please view the logo.png file in the current directory and tell me what "
16+
"colors you see in it. Is the logo blue, yellow, or green? Please analyze "
17+
"the image and provide your answer."
18+
)
19+
20+
IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png"
21+
22+
logger = get_logger(__name__)
23+
24+
25+
class ImageFileViewingTest(BaseIntegrationTest):
26+
"""Test that an agent can view and analyze image files."""
27+
28+
INSTRUCTION: str = INSTRUCTION
29+
30+
def __init__(self, *args, **kwargs):
31+
super().__init__(*args, **kwargs)
32+
self.logo_path: str = os.path.join(self.workspace, "logo.png")
33+
34+
# Verify that the LLM supports vision
35+
if not self.llm.vision_is_active():
36+
raise SkipTest(
37+
"This test requires a vision-capable LLM model. "
38+
"Please use a model that supports image input."
39+
)
40+
41+
@property
42+
def tools(self) -> list[Tool]:
43+
"""List of tools available to the agent."""
44+
register_tool("TerminalTool", TerminalTool)
45+
register_tool("FileEditorTool", FileEditorTool)
46+
return [
47+
Tool(name="TerminalTool"),
48+
Tool(name="FileEditorTool"),
49+
]
50+
51+
def setup(self) -> None:
52+
"""Download the OpenHands logo for the agent to analyze."""
53+
try:
54+
urllib.request.urlretrieve(IMAGE_URL, self.logo_path)
55+
logger.info(f"Downloaded test logo to: {self.logo_path}")
56+
except Exception as e:
57+
logger.error(f"Failed to download logo: {e}")
58+
raise
59+
60+
def verify_result(self) -> TestResult:
61+
"""Verify that the agent identified yellow as one of the logo colors."""
62+
if not os.path.exists(self.logo_path):
63+
return TestResult(
64+
success=False, reason="Logo file not found after agent execution"
65+
)
66+
67+
# Check the agent's responses in collected events
68+
# Look for messages mentioning yellow color
69+
agent_responses = []
70+
for event in self.collected_events:
71+
if isinstance(event, MessageEvent):
72+
message = event.llm_message
73+
if message.role == "assistant":
74+
for content_item in message.content:
75+
if isinstance(content_item, TextContent):
76+
agent_responses.append(content_item.text.lower())
77+
78+
combined_response = " ".join(agent_responses)
79+
80+
if "yellow" in combined_response:
81+
return TestResult(
82+
success=True,
83+
reason="Agent successfully identified yellow color in the logo",
84+
)
85+
else:
86+
return TestResult(
87+
success=False,
88+
reason=(
89+
f"Agent did not identify yellow color in the logo. "
90+
f"Response: {combined_response[:500]}"
91+
),
92+
)

0 commit comments

Comments
 (0)