diff --git a/src/agentunit/adapters/agentops_adapter.py b/src/agentunit/adapters/agentops_adapter.py
index f02d2b4..768fda6 100644
--- a/src/agentunit/adapters/agentops_adapter.py
+++ b/src/agentunit/adapters/agentops_adapter.py
@@ -67,13 +67,15 @@ def __init__(
         self.project_id = project_id
         self.default_tags = default_tags or []
         self.auto_start_session = auto_start_session
+        self.enable_tracing = kwargs.get("enable_tracing", True)
+        self.client: Any = None
         self.platform = MonitoringPlatform.AGENTOPS
         """
         Initialize LangSmith adapter.

         Args:
             api_key: LangSmith API key
-            project_name: Project name for organizing traces
+            project_id: LangSmith project ID
             endpoint: Optional custom LangSmith endpoint
             enable_tracing: Whether to enable automatic tracing
             enable_feedback: Whether to collect feedback data
@@ -98,9 +100,10 @@ def _initialize_agentops(self):
         """Initialize AgentOps client and verify connection."""
         try:
             # Import AgentOps SDK
-            import agentops
+            import agentops  # type: ignore[import-not-found]

             self.agentops = agentops
+            self.client = agentops

             # Initialize AgentOps
             if self.api_key:
@@ -124,11 +127,6 @@ def _initialize_agentops(self):
             logger.error(f"Failed to connect to AgentOps: {e}")
             raise

-    @property
-    def platform(self) -> MonitoringPlatform:
-        """Return the monitoring platform type."""
-        return MonitoringPlatform.AGENTOPS
-
     def create_agent(self, role: AgentRole, agent_id: str | None = None, **kwargs) -> AgentMetadata:
         """
         Create an agent for AgentOps monitoring.
@@ -244,7 +242,7 @@ def send_message(
         interaction = AgentInteraction(
             interaction_id=interaction_id,
             from_agent=from_agent,
-            to_agent=to_agent,
+            to_agent=to_agent or "broadcast",
             content=message,
             timestamp=timestamp,
             metadata=metadata or {},
@@ -339,12 +337,15 @@ def calculate_coordination_metrics(self) -> dict[str, float]:
         for interaction in self.session_interactions:
             unique_agents.add(interaction.from_agent)
             if interaction.to_agent:
-                unique_agents.add(interaction.to_agent)
+                if isinstance(interaction.to_agent, list):
+                    unique_agents.update(interaction.to_agent)
+                else:
+                    unique_agents.add(interaction.to_agent)

         agent_participation = len(unique_agents)

         # Calculate message distribution
-        agent_counts = {}
+        agent_counts: dict[str, int] = {}
         for interaction in self.session_interactions:
             from_agent = interaction.from_agent
             agent_counts[from_agent] = agent_counts.get(from_agent, 0) + 1
@@ -374,7 +375,7 @@ def calculate_coordination_metrics(self) -> dict[str, float]:
             else 0.0,
         }

-    def run_scenario(self, scenario: Scenario) -> ScenarioResult:
+    async def run_scenario(self, scenario: Scenario) -> ScenarioResult:
         """
         Run a scenario with LangSmith integration.

@@ -390,14 +391,14 @@ def run_scenario(self, scenario: Scenario) -> ScenarioResult:
         scenario_run_id = None
         if self.enable_tracing:
             try:
-                run = self.client.create_run(
+                run_id = self.agentops.start_trace(
                     name=f"Scenario: {scenario.name}",
                     run_type="chain",
-                    project_name=self.project_name,
-                    inputs={"scenario": scenario.name, "description": scenario.description},
+                    project_name=self.project_id,
+                    inputs={"scenario": scenario.name},
                     tags=["agentunit", "scenario"],
                 )
-                scenario_run_id = str(run.id)
+                scenario_run_id = str(run_id)
             except Exception as e:
                 logger.warning(f"Failed to create scenario run: {e}")

@@ -483,15 +484,11 @@ def run_scenario(self, scenario: Scenario) -> ScenarioResult:
         # Update LangSmith run with results
         if scenario_run_id and self.enable_tracing:
             try:
-                self.client.update_run(
-                    run_id=scenario_run_id,
-                    outputs={
-                        "result": result.passed,
-                        "execution_time": execution_time,
-                        "details": result.details,
-                    },
-                    end_time=datetime.now(timezone.utc),
+                self.agentops.update_trace_metadata(
+                    trace_id=scenario_run_id,
+                    metadata={"result": result.success_rate, "details": result.to_dict()},
                 )
+                self.agentops.end_trace(trace_id=scenario_run_id, status_code="SUCCESS")
             except Exception as e:
                 logger.warning(f"Failed to update scenario run: {e}")

@@ -527,11 +524,11 @@ def run_scenario(self, scenario: Scenario) -> ScenarioResult:
             # Update LangSmith run with error
             if scenario_run_id and self.enable_tracing:
                 try:
-                    self.client.update_run(
-                        run_id=scenario_run_id,
-                        outputs={"error": str(e)},
-                        end_time=datetime.now(timezone.utc),
+                    self.agentops.update_trace_metadata(
+                        trace_id=scenario_run_id,
+                        metadata={"error": str(e)},
                     )
+                    self.agentops.end_trace(trace_id=scenario_run_id, status_code="ERROR")
                 except Exception as e:
                     logger.warning(f"Failed to update failed scenario run: {e}")

@@ -551,7 +548,7 @@ def collect_metrics(self, scenario: Any, result: Any, **kwargs) -> ProductionMet
         """
         try:
             # Query recent runs from LangSmith
-            runs = list(self.client.list_runs(project_name=self.project_name, limit=100))
+            runs = list(self.client.list_runs(project_name=self.project_id, limit=100))

             if not runs:
                 return ProductionMetrics(
@@ -637,7 +634,7 @@ def establish_baseline(
         # Query historical runs
         runs = list(
             self.client.list_runs(
-                project_name=self.project_name, start_time=start_date, end_time=end_date
+                project_name=self.project_id, start_time=start_date, end_time=end_date
             )
         )

@@ -749,7 +746,7 @@ def create_evaluation_dataset(
             logger.error(f"Failed to create LangSmith dataset: {e}")
             raise

-    def run_evaluation(self, dataset_id: str, evaluator_function: Any, **kwargs) -> dict[str, Any]:
+    def run_evaluation(self, dataset_id: str, evaluator_function: Any, **kwargs) -> Any:
         """
         Run evaluation on a LangSmith dataset.

@@ -767,7 +764,7 @@ def run_evaluation(self, dataset_id: str, evaluator_function: Any, **kwargs) ->
             results = evaluate(
                 evaluator_function,
                 data=dataset_id,
-                project_name=f"{self.project_name}-evaluation",
+                experiment_prefix=f"{self.project_id}-evaluation",
                 **kwargs,
             )
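Note for call sites: `run_scenario` is now a coroutine, so every synchronous caller of this adapter breaks. A minimal migration sketch, assuming the adapter is exported as `AgentOpsAdapter` with the `api_key`/`project_id` constructor arguments seen in this diff (the scenario fixture is left hypothetical):

```python
import asyncio

# Hypothetical wiring: the class name and constructor arguments are assumed
# from this diff, not from published documentation.
from agentunit.adapters.agentops_adapter import AgentOpsAdapter


async def main() -> None:
    adapter = AgentOpsAdapter(api_key="ao-...", project_id="my-project")
    scenario = ...  # an agentunit Scenario, e.g. from Scenario.from_openai_agents(...)

    # Before this change: result = adapter.run_scenario(scenario)
    # After: the coroutine must be awaited (or driven by asyncio.run).
    result = await adapter.run_scenario(scenario)
    print(result)


asyncio.run(main())
```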
diff --git a/src/agentunit/core/scenario.py b/src/agentunit/core/scenario.py
index 70bba2a..3523707 100644
--- a/src/agentunit/core/scenario.py
+++ b/src/agentunit/core/scenario.py
@@ -7,7 +7,7 @@
 import random
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any

 from agentunit.datasets.registry import resolve_dataset

@@ -68,7 +68,7 @@ def from_openai_agents(
         adapter = OpenAIAgentsAdapter.from_flow(flow, **options)
         ds = resolve_dataset(dataset)
-        scenario_name = name or getattr(flow, "__name__", "openai-agents-scenario")
+        scenario_name = name or getattr(flow, "__name__", None) or "openai-agents-scenario"
         return cls(name=scenario_name, adapter=adapter, dataset=ds)

     @classmethod
@@ -95,7 +95,7 @@ def from_autogen(
         orchestrator: object,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         """
         Create scenario from AutoGen orchestrator.
@@ -113,7 +113,7 @@ def from_haystack(
         pipeline: object,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.haystack import HaystackAdapter

@@ -128,7 +128,7 @@ def from_llama_index(
         engine: object,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.llama_index import LlamaIndexAdapter

@@ -143,7 +143,7 @@ def from_semantic_kernel(
         invoker: object,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.semantic_kernel import SemanticKernelAdapter

@@ -158,7 +158,7 @@ def from_phidata(
         agent: object,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.phidata import PhidataAdapter

@@ -173,7 +173,7 @@ def from_promptflow(
         flow: object,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.promptflow import PromptFlowAdapter

@@ -188,7 +188,7 @@ def from_openai_swarm(
         swarm: object,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.openai_swarm import OpenAISwarmAdapter

@@ -204,7 +204,7 @@ def from_anthropic_bedrock(
         model_id: str,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.anthropic_bedrock import AnthropicBedrockAdapter

@@ -219,7 +219,7 @@ def from_mistral_server(
         base_url: str,
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
    ) -> Scenario:
         from agentunit.adapters.mistral_server import MistralServerAdapter

@@ -239,7 +239,7 @@ def from_rasa_endpoint(
         target: str | Callable[[dict], object],
         dataset: str | DatasetSource | None = None,
         name: str | None = None,
-        **options: object,
+        **options: Any,
     ) -> Scenario:
         from agentunit.adapters.rasa import RasaAdapter
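Why the `from_openai_agents` naming change matters: `getattr` with a default only guards against a *missing* `__name__`, not one that is present but falsy. A small sketch of the difference (the `Flow` class is hypothetical):

```python
class Flow:
    """Callable whose __name__ attribute exists but is unset."""

    __name__ = None

    def __call__(self, payload: dict) -> dict:
        return payload


flow = Flow()

# Old form: the default never fires, because the attribute exists.
print(getattr(flow, "__name__", "openai-agents-scenario"))  # None

# New form: `or` also catches None (and ""), so the fallback applies.
print(getattr(flow, "__name__", None) or "openai-agents-scenario")  # openai-agents-scenario
```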
diff --git a/src/agentunit/datasets/builtins.py b/src/agentunit/datasets/builtins.py
index 4b10a0f..4d67445 100644
--- a/src/agentunit/datasets/builtins.py
+++ b/src/agentunit/datasets/builtins.py
@@ -2,7 +2,9 @@

 from __future__ import annotations

-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, TypedDict
+
+from typing_extensions import NotRequired

 from .base import DatasetCase, DatasetSource

@@ -11,7 +13,18 @@
     from collections.abc import Iterable


-_GAIA_L1_SHOPPING: list[dict[str, object]] = [
+class DatasetRow(TypedDict):
+    """Typed structure for built-in dataset rows."""
+
+    id: str
+    query: str
+    expected_output: str | None
+    tools: list[str] | None
+    context: list[str] | None
+    metadata: NotRequired[dict[str, object]]
+
+
+_GAIA_L1_SHOPPING: list[DatasetRow] = [
     {
         "id": "gaia-shopping-001",
         "query": "Find the best price for a pack of AA rechargeable batteries with at least 2500mAh capacity.",
@@ -28,7 +41,7 @@
     },
 ]

-_SWE_BENCH_LITE: list[dict[str, object]] = [
+_SWE_BENCH_LITE: list[DatasetRow] = [
     {
         "id": "swe-lite-001",
         "query": "Fix the bug where the API returns HTTP 500 when the username is missing.",
@@ -48,7 +61,7 @@
 ]


-def _build_loader(rows: list[dict[str, object]]) -> Iterable[DatasetCase]:
+def _build_loader(rows: list[DatasetRow]) -> Iterable[DatasetCase]:
     for row in rows:
         yield DatasetCase(
             id=row["id"],
diff --git a/src/agentunit/metrics/builtin.py b/src/agentunit/metrics/builtin.py
index 3369dc4..638d0c7 100644
--- a/src/agentunit/metrics/builtin.py
+++ b/src/agentunit/metrics/builtin.py
@@ -143,8 +143,9 @@ def evaluate(self, case: DatasetCase, trace: TraceLog, outcome: Any) -> MetricRe
         cost = 0.0

         # Check trace metadata
-        if trace.metadata and "cost" in trace.metadata:
-            cost = float(trace.metadata["cost"])
+        metadata = getattr(trace, "metadata", {})
+        if metadata and "cost" in metadata:
+            cost = float(metadata["cost"])

         # Check outcome
         elif hasattr(outcome, "cost"):
@@ -168,8 +169,9 @@ def evaluate(self, case: DatasetCase, trace: TraceLog, outcome: Any) -> MetricRe
         total_tokens = 0

         # Check trace metadata
-        if trace.metadata and "usage" in trace.metadata:
-            usage = trace.metadata["usage"]
+        metadata = getattr(trace, "metadata", {})
+        if metadata and "usage" in metadata:
+            usage = metadata["usage"]
             prompt_tokens = usage.get("prompt_tokens", 0)
             completion_tokens = usage.get("completion_tokens", 0)
             total_tokens = usage.get("total_tokens", 0)
diff --git a/src/agentunit/production/integrations.py b/src/agentunit/production/integrations.py
index f5ab9a7..a7199a1 100644
--- a/src/agentunit/production/integrations.py
+++ b/src/agentunit/production/integrations.py
@@ -97,7 +97,7 @@ def _calculate_baseline_stats(
         self, historical_data: list[dict[str, Any]], metrics: list[str]
     ) -> dict[str, dict[str, dict[str, float]]]:
         """Calculate baseline statistics from historical data."""
-        baseline_stats = {
+        baseline_stats: dict[str, dict[str, dict[str, float]]] = {
             "performance_baseline": {},
             "quality_baseline": {},
             "reliability_baseline": {},
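The new `DatasetRow` TypedDict makes the row shape checkable. Restating it standalone to show what a type checker now enforces: required keys must appear even when their value is `None`, while `metadata` may be omitted entirely:

```python
from __future__ import annotations

from typing import TypedDict

from typing_extensions import NotRequired


class DatasetRow(TypedDict):
    id: str
    query: str
    expected_output: str | None
    tools: list[str] | None
    context: list[str] | None
    metadata: NotRequired[dict[str, object]]


row: DatasetRow = {
    "id": "example-001",
    "query": "What is 2 + 2?",
    "expected_output": "4",
    "tools": None,    # required key: must appear even when None
    "context": None,  # same here
    # "metadata" omitted: allowed, because it is NotRequired
}
```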
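Likewise, the metrics change swaps direct attribute access for `getattr` with a `{}` default, so evaluation no longer assumes every trace object exposes `metadata`. A sketch of the resulting behavior with minimal stand-in traces (illustrative classes, not the real `TraceLog`):

```python
class BareTrace:
    """Stand-in trace object with no metadata attribute."""


class RichTrace:
    """Stand-in trace object carrying cost metadata."""

    metadata = {"cost": 0.0042}


def extract_cost(trace: object) -> float:
    # Mirrors the diff: tolerate traces that lack a `metadata` attribute.
    metadata = getattr(trace, "metadata", {})
    if metadata and "cost" in metadata:
        return float(metadata["cost"])
    return 0.0


print(extract_cost(BareTrace()))  # 0.0 (previously an AttributeError risk)
print(extract_cost(RichTrace()))  # 0.0042
```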