Skip to content

Commit 89b63b4

Browse files
Fix RemoteConversation stats field mismatch and state updates (#1042)
Co-authored-by: openhands <[email protected]>
1 parent a44f769 commit 89b63b4

File tree

3 files changed

+249
-0
lines changed

3 files changed

+249
-0
lines changed

openhands-agent-server/openhands/agent_server/event_service.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ async def run(self):
271271
raise ValueError("inactive_service")
272272
loop = asyncio.get_running_loop()
273273
await loop.run_in_executor(None, self._conversation.run)
274+
# Publish state update after run completes to ensure stats are updated
275+
await self._publish_state_update()
274276

275277
async def respond_to_confirmation(self, request: ConfirmationResponseRequest):
276278
if request.accept:
@@ -282,6 +284,8 @@ async def pause(self):
282284
if self._conversation:
283285
loop = asyncio.get_running_loop()
284286
await loop.run_in_executor(None, self._conversation.pause)
287+
# Publish state update after pause to ensure stats are updated
288+
await self._publish_state_update()
285289

286290
async def update_secrets(self, secrets: dict[str, SecretValue]):
287291
"""Update secrets in the conversation."""

tests/cross/test_remote_conversation_live_server.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,3 +426,146 @@ def test_file_upload_endpoint_with_live_server(server_env, tmp_path: Path):
426426
assert check_result.stdout == test_content, (
427427
f"File content mismatch. Expected:\n{test_content}\nGot:\n{check_result.stdout}"
428428
)
429+
430+
431+
def test_conversation_stats_with_live_server(
    server_env, monkeypatch: pytest.MonkeyPatch
):
    """Integration test verifying conversation stats are correctly propagated.

    This test validates the fix for issue #1041 where accumulated cost was
    always 0. It checks:
    1. RemoteConversation reads stats from the correct 'stats' field (not
       'conversation_stats')
    2. Stats updates are propagated after run() completes
    3. Accumulated cost and token usage are correctly tracked

    This is a regression test for the field mismatch and state update issues.
    """

    def fake_completion_with_cost(
        self,
        messages,
        tools,
        return_metrics=False,
        add_security_risk_prediction=False,
        **kwargs,
    ):  # type: ignore[no-untyped-def]
        from openhands.sdk.llm.llm_response import LLMResponse
        from openhands.sdk.llm.message import Message
        from openhands.sdk.llm.utils.metrics import MetricsSnapshot, TokenUsage

        # Create a minimal ModelResponse with a single assistant message
        litellm_msg = LiteLLMMessage.model_validate(
            {"role": "assistant", "content": "Test response"}
        )
        raw_response = ModelResponse(
            id="test-resp-with-cost",
            created=int(time.time()),
            model="test-model",
            choices=[Choices(index=0, finish_reason="stop", message=litellm_msg)],
        )

        # Convert to OpenHands Message
        message = Message.from_llm_chat_message(litellm_msg)

        # Simulate cost accumulation in the LLM's metrics so the server-side
        # stats have something non-zero to propagate back to the client.
        if self.metrics:
            self.metrics.add_cost(0.0025)
            self.metrics.add_token_usage(
                prompt_tokens=100,
                completion_tokens=50,
                cache_read_tokens=0,
                cache_write_tokens=0,
                context_window=8192,
                response_id="test-resp-with-cost",
                reasoning_tokens=0,
            )
            metrics_snapshot = self.metrics.get_snapshot()
        else:
            # Create a default metrics snapshot if no metrics exist
            metrics_snapshot = MetricsSnapshot(
                model_name=self.model,
                accumulated_cost=0.0025,
                accumulated_token_usage=TokenUsage(
                    model=self.model,
                    prompt_tokens=100,
                    completion_tokens=50,
                    response_id="test-resp-with-cost",
                ),
            )

        return LLMResponse(
            message=message, metrics=metrics_snapshot, raw_response=raw_response
        )

    # Patch LLM.completion with our cost-tracking version
    monkeypatch.setattr(LLM, "completion", fake_completion_with_cost, raising=True)

    # Create an Agent with a real LLM object
    llm = LLM(model="gpt-4", api_key=SecretStr("test"))
    agent = Agent(llm=llm, tools=[])

    # Create conversation via factory pointing at the live server
    workspace = RemoteWorkspace(
        host=server_env["host"], working_dir="/tmp/workspace/project"
    )
    conv: RemoteConversation = Conversation(agent=agent, workspace=workspace)

    # Verify initial stats are empty/zero
    initial_stats = conv.conversation_stats
    assert initial_stats is not None
    initial_cost = initial_stats.get_combined_metrics().accumulated_cost
    assert initial_cost == 0.0, f"Expected initial cost to be 0.0, got {initial_cost}"

    # Send a message and run the conversation
    conv.send_message("Test message")
    conv.run()

    # Wait for the conversation to finish and stats to update.
    # The fix ensures stats are published after run() completes.
    #
    # Bug fix: the previous loop only raised from inside the `except` block,
    # so if accumulated_cost simply stayed 0 (no exception raised) the loop
    # fell through silently and the regression condition was never enforced.
    # The `for ... else` now makes "stats never updated" a hard failure.
    max_attempts = 50
    last_error = None
    for _attempt in range(max_attempts):
        try:
            stats = conv.conversation_stats
            combined_metrics = stats.get_combined_metrics()
            accumulated_cost = combined_metrics.accumulated_cost
        except (KeyError, AttributeError) as e:
            # Stats may be transiently unavailable while the server publishes
            # its state update; remember the error and retry.
            last_error = e
        else:
            if accumulated_cost > 0:
                # Stats have been updated; verify token usage is tracked too.
                usage = combined_metrics.accumulated_token_usage
                if usage:
                    assert usage.prompt_tokens > 0, (
                        "Expected prompt_tokens > 0 after run()"
                    )
                    assert usage.completion_tokens > 0, (
                        "Expected completion_tokens > 0 after run()"
                    )
                break
        time.sleep(0.1)
    else:
        raise AssertionError(
            f"Stats not properly updated after {max_attempts} attempts. "
            f"Last error: {last_error}"
        )

    # Final verification: stats are read from 'stats' field, not 'conversation_stats'
    info = conv.state._get_conversation_info()
    assert "stats" in info, "Expected 'stats' field in conversation info"

    # Verify the RemoteConversation is correctly reading from 'stats'
    stats_from_field = info.get("stats", {})
    assert stats_from_field, "Expected non-empty stats in the 'stats' field after run()"

    conv.close()

tests/sdk/conversation/test_remote_conversation_state_updates.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,3 +329,105 @@ def test_state_update_callback_integration():
329329
# Verify the state was updated
330330
assert conv.state._cached_state is not None
331331
assert conv.state._cached_state["execution_status"] == "running"
332+
333+
334+
def test_conversation_stats_reads_from_stats_field():
    """Verify the conversation_stats property is populated from 'stats'.

    Seeds the cached REST state with a payload whose metrics live under the
    'stats' key and checks that RemoteConversation surfaces them.
    """
    agent = create_test_agent()

    # HTTP client stub backing the RemoteConversation's REST calls.
    http_client = create_mock_http_client()

    # Per-LLM metrics as the server would report them.
    llm_metrics = {
        "model_name": "gpt-4o-mini",
        "accumulated_cost": 1.23,
        "accumulated_token_usage": {
            "prompt_tokens": 100,
            "completion_tokens": 50,
        },
    }
    # Conversation-info payload carrying the metrics under the 'stats' key.
    info_payload = {
        "conversation_id": "test-id",
        "execution_status": "idle",
        "stats": {"usage_to_metrics": {"test-llm": llm_metrics}},
    }

    ws_target = (
        "openhands.sdk.conversation.impl.remote_conversation"
        ".WebSocketCallbackClient"
    )
    with patch("httpx.Client", return_value=http_client), patch(ws_target):
        # Build the conversation against the stubbed transport.
        conv = RemoteConversation(
            agent=agent,
            workspace=RemoteWorkspace(working_dir="/tmp", host="http://localhost:3000"),
        )

        # Inject the payload as if it came from the REST API.
        conv.state._cached_state = info_payload

        # Reading the property must pick up the 'stats' field contents.
        stats = conv.conversation_stats
        assert stats is not None
        assert "test-llm" in stats.usage_to_metrics
        assert stats.usage_to_metrics["test-llm"].accumulated_cost == 1.23
382+
383+
384+
def test_stats_update_via_state_event():
    """Verify stats updates arrive through ConversationStateUpdateEvent.

    Feeds a full-state event with empty stats, then a targeted 'stats'
    event, and checks the conversation_stats property tracks both.
    """
    agent = create_test_agent()

    # HTTP client stub backing the RemoteConversation's REST calls.
    http_client = create_mock_http_client()

    ws_target = (
        "openhands.sdk.conversation.impl.remote_conversation"
        ".WebSocketCallbackClient"
    )
    with patch("httpx.Client", return_value=http_client), patch(ws_target):
        # Build the conversation against the stubbed transport.
        conv = RemoteConversation(
            agent=agent,
            workspace=RemoteWorkspace(working_dir="/tmp", host="http://localhost:3000"),
        )

        # First event: full state snapshot with no metrics yet.
        empty_snapshot = {
            "execution_status": "running",
            "stats": {"usage_to_metrics": {}},
        }
        conv.state.update_state_from_event(
            ConversationStateUpdateEvent(key="full_state", value=empty_snapshot)
        )

        # With no usage recorded, the stats container is present but empty.
        stats = conv.conversation_stats
        assert stats is not None
        assert stats.usage_to_metrics == {}

        # Second event: a targeted 'stats' update carrying real metrics.
        fresh_stats = {
            "usage_to_metrics": {
                "test-llm": {
                    "model_name": "gpt-4o-mini",
                    "accumulated_cost": 2.45,
                }
            }
        }
        conv.state.update_state_from_event(
            ConversationStateUpdateEvent(key="stats", value=fresh_stats)
        )

        # The property now reflects the updated metrics.
        stats = conv.conversation_stats
        assert stats is not None
        assert "test-llm" in stats.usage_to_metrics
        assert stats.usage_to_metrics["test-llm"].accumulated_cost == 2.45

0 commit comments

Comments
 (0)