Skip to content

Commit 89b63b4

Browse files
Fix RemoteConversation stats field mismatch and state updates (#1042)
Co-authored-by: openhands <[email protected]>
1 parent a44f769 commit 89b63b4

File tree

3 files changed

+249
-0
lines changed

3 files changed

+249
-0
lines changed

openhands-agent-server/openhands/agent_server/event_service.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ async def run(self):
271271
raise ValueError("inactive_service")
272272
loop = asyncio.get_running_loop()
273273
await loop.run_in_executor(None, self._conversation.run)
274+
# Publish state update after run completes to ensure stats are updated
275+
await self._publish_state_update()
274276

275277
async def respond_to_confirmation(self, request: ConfirmationResponseRequest):
276278
if request.accept:
@@ -282,6 +284,8 @@ async def pause(self):
282284
if self._conversation:
283285
loop = asyncio.get_running_loop()
284286
await loop.run_in_executor(None, self._conversation.pause)
287+
# Publish state update after pause to ensure stats are updated
288+
await self._publish_state_update()
285289

286290
async def update_secrets(self, secrets: dict[str, SecretValue]):
287291
"""Update secrets in the conversation."""

tests/cross/test_remote_conversation_live_server.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,3 +426,146 @@ def test_file_upload_endpoint_with_live_server(server_env, tmp_path: Path):
426426
assert check_result.stdout == test_content, (
427427
f"File content mismatch. Expected:\n{test_content}\nGot:\n{check_result.stdout}"
428428
)
429+
430+
431+
def test_conversation_stats_with_live_server(
    server_env, monkeypatch: pytest.MonkeyPatch
):
    """Integration test verifying conversation stats are correctly propagated.

    This test validates the fix for issue #1041 where accumulated cost was
    always 0. It checks:
    1. RemoteConversation reads stats from the correct 'stats' field (not
       'conversation_stats')
    2. Stats updates are propagated after run() completes
    3. Accumulated cost and token usage are correctly tracked

    This is a regression test for the field mismatch and state update issues.
    """

    def fake_completion_with_cost(
        self,
        messages,
        tools,
        return_metrics=False,
        add_security_risk_prediction=False,
        **kwargs,
    ):  # type: ignore[no-untyped-def]
        from openhands.sdk.llm.llm_response import LLMResponse
        from openhands.sdk.llm.message import Message
        from openhands.sdk.llm.utils.metrics import MetricsSnapshot, TokenUsage

        # Create a minimal ModelResponse with a single assistant message
        litellm_msg = LiteLLMMessage.model_validate(
            {"role": "assistant", "content": "Test response"}
        )
        raw_response = ModelResponse(
            id="test-resp-with-cost",
            created=int(time.time()),
            model="test-model",
            choices=[Choices(index=0, finish_reason="stop", message=litellm_msg)],
        )

        # Convert to OpenHands Message
        message = Message.from_llm_chat_message(litellm_msg)

        # Simulate cost accumulation in the LLM's metrics so the server-side
        # stats have something non-zero to propagate back to the client.
        if self.metrics:
            self.metrics.add_cost(0.0025)
            self.metrics.add_token_usage(
                prompt_tokens=100,
                completion_tokens=50,
                cache_read_tokens=0,
                cache_write_tokens=0,
                context_window=8192,
                response_id="test-resp-with-cost",
                reasoning_tokens=0,
            )
            metrics_snapshot = self.metrics.get_snapshot()
        else:
            # Create a default metrics snapshot if no metrics exist
            metrics_snapshot = MetricsSnapshot(
                model_name=self.model,
                accumulated_cost=0.0025,
                accumulated_token_usage=TokenUsage(
                    model=self.model,
                    prompt_tokens=100,
                    completion_tokens=50,
                    response_id="test-resp-with-cost",
                ),
            )

        return LLMResponse(
            message=message, metrics=metrics_snapshot, raw_response=raw_response
        )

    # Patch LLM.completion with our cost-tracking version
    monkeypatch.setattr(LLM, "completion", fake_completion_with_cost, raising=True)

    # Create an Agent with a real LLM object
    llm = LLM(model="gpt-4", api_key=SecretStr("test"))
    agent = Agent(llm=llm, tools=[])

    # Create conversation via factory pointing at the live server
    workspace = RemoteWorkspace(
        host=server_env["host"], working_dir="/tmp/workspace/project"
    )
    conv: RemoteConversation = Conversation(agent=agent, workspace=workspace)

    # Verify initial stats are empty/zero
    initial_stats = conv.conversation_stats
    assert initial_stats is not None
    initial_cost = initial_stats.get_combined_metrics().accumulated_cost
    assert initial_cost == 0.0, f"Expected initial cost to be 0.0, got {initial_cost}"

    # Send a message and run the conversation
    conv.send_message("Test message")
    conv.run()

    # Wait for the conversation to finish and stats to update.
    # The fix ensures stats are published after run() completes.
    #
    # Bug fix: the previous loop only raised from inside the `except` block,
    # so if accumulated_cost simply stayed 0 (no exception raised) the loop
    # fell through silently and the regression condition was never enforced.
    # The `for ... else` now makes "stats never updated" a hard failure.
    max_attempts = 50
    last_error = None
    for _attempt in range(max_attempts):
        try:
            stats = conv.conversation_stats
            combined_metrics = stats.get_combined_metrics()
            accumulated_cost = combined_metrics.accumulated_cost
        except (KeyError, AttributeError) as e:
            # Stats may be transiently unavailable while the server publishes
            # its state update; remember the error and retry.
            last_error = e
        else:
            if accumulated_cost > 0:
                # Stats have been updated; verify token usage is tracked too.
                usage = combined_metrics.accumulated_token_usage
                if usage:
                    assert usage.prompt_tokens > 0, (
                        "Expected prompt_tokens > 0 after run()"
                    )
                    assert usage.completion_tokens > 0, (
                        "Expected completion_tokens > 0 after run()"
                    )
                break
        time.sleep(0.1)
    else:
        raise AssertionError(
            f"Stats not properly updated after {max_attempts} attempts. "
            f"Last error: {last_error}"
        )

    # Final verification: stats are read from 'stats' field, not 'conversation_stats'
    info = conv.state._get_conversation_info()
    assert "stats" in info, "Expected 'stats' field in conversation info"

    # Verify the RemoteConversation is correctly reading from 'stats'
    stats_from_field = info.get("stats", {})
    assert stats_from_field, "Expected non-empty stats in the 'stats' field after run()"

    conv.close()

tests/sdk/conversation/test_remote_conversation_state_updates.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,3 +329,105 @@ def test_state_update_callback_integration():
329329
# Verify the state was updated
330330
assert conv.state._cached_state is not None
331331
assert conv.state._cached_state["execution_status"] == "running"
332+
333+
334+
def test_conversation_stats_reads_from_stats_field():
    """Verify the conversation_stats property is populated from 'stats'.

    Seeds the cached REST state with a payload whose metrics live under the
    'stats' key and checks that RemoteConversation surfaces them.
    """
    agent = create_test_agent()

    # HTTP client stub backing the RemoteConversation's REST calls.
    http_client = create_mock_http_client()

    # Per-LLM metrics as the server would report them.
    llm_metrics = {
        "model_name": "gpt-4o-mini",
        "accumulated_cost": 1.23,
        "accumulated_token_usage": {
            "prompt_tokens": 100,
            "completion_tokens": 50,
        },
    }
    # Conversation-info payload carrying the metrics under the 'stats' key.
    info_payload = {
        "conversation_id": "test-id",
        "execution_status": "idle",
        "stats": {"usage_to_metrics": {"test-llm": llm_metrics}},
    }

    ws_target = (
        "openhands.sdk.conversation.impl.remote_conversation"
        ".WebSocketCallbackClient"
    )
    with patch("httpx.Client", return_value=http_client), patch(ws_target):
        # Build the conversation against the stubbed transport.
        conv = RemoteConversation(
            agent=agent,
            workspace=RemoteWorkspace(working_dir="/tmp", host="http://localhost:3000"),
        )

        # Inject the payload as if it came from the REST API.
        conv.state._cached_state = info_payload

        # Reading the property must pick up the 'stats' field contents.
        stats = conv.conversation_stats
        assert stats is not None
        assert "test-llm" in stats.usage_to_metrics
        assert stats.usage_to_metrics["test-llm"].accumulated_cost == 1.23
382+
383+
384+
def test_stats_update_via_state_event():
    """Verify stats updates arrive through ConversationStateUpdateEvent.

    Feeds a full-state event with empty stats, then a targeted 'stats'
    event, and checks the conversation_stats property tracks both.
    """
    agent = create_test_agent()

    # HTTP client stub backing the RemoteConversation's REST calls.
    http_client = create_mock_http_client()

    ws_target = (
        "openhands.sdk.conversation.impl.remote_conversation"
        ".WebSocketCallbackClient"
    )
    with patch("httpx.Client", return_value=http_client), patch(ws_target):
        # Build the conversation against the stubbed transport.
        conv = RemoteConversation(
            agent=agent,
            workspace=RemoteWorkspace(working_dir="/tmp", host="http://localhost:3000"),
        )

        # First event: full state snapshot with no metrics yet.
        empty_snapshot = {
            "execution_status": "running",
            "stats": {"usage_to_metrics": {}},
        }
        conv.state.update_state_from_event(
            ConversationStateUpdateEvent(key="full_state", value=empty_snapshot)
        )

        # With no usage recorded, the stats container is present but empty.
        stats = conv.conversation_stats
        assert stats is not None
        assert stats.usage_to_metrics == {}

        # Second event: a targeted 'stats' update carrying real metrics.
        fresh_stats = {
            "usage_to_metrics": {
                "test-llm": {
                    "model_name": "gpt-4o-mini",
                    "accumulated_cost": 2.45,
                }
            }
        }
        conv.state.update_state_from_event(
            ConversationStateUpdateEvent(key="stats", value=fresh_stats)
        )

        # The property now reflects the updated metrics.
        stats = conv.conversation_stats
        assert stats is not None
        assert "test-llm" in stats.usage_to_metrics
        assert stats.usage_to_metrics["test-llm"].accumulated_cost == 2.45

0 commit comments

Comments
 (0)