diff --git a/samples/calculator/evals/eval-sets/legacy.json b/samples/calculator/evals/eval-sets/legacy.json index dbb8744cb..7d0c345c4 100644 --- a/samples/calculator/evals/eval-sets/legacy.json +++ b/samples/calculator/evals/eval-sets/legacy.json @@ -19,7 +19,7 @@ "operator": "+" }, "expectedOutput": { - "result": 2 + "result": 2.0 }, "expectedAgentBehavior": "The operation should produce the right output.", "evalSetId": "default-eval-set-id", @@ -35,7 +35,7 @@ "operator": "random" }, "expectedOutput": { - "result": 2 + "result": 2.0 }, "expectedAgentBehavior": "The operation should produce the right output.", "mockingStrategy": { @@ -71,7 +71,7 @@ "operator": "random" }, "expectedOutput": { - "result": 2 + "result": 2.0 }, "expectedAgentBehavior": "The operation should produce the right output.", "mockingStrategy": { diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py index 5d1d17f38..4e0d7a177 100644 --- a/src/uipath/_cli/_evals/_console_progress_reporter.py +++ b/src/uipath/_cli/_evals/_console_progress_reporter.py @@ -7,7 +7,6 @@ from rich.rule import Rule from rich.table import Table -from uipath._cli._evals._models._evaluation_set import AnyEvaluator from uipath._events._event_bus import EventBus from uipath._events._events import ( EvalRunCreatedEvent, @@ -16,6 +15,7 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) +from uipath.eval.evaluators import BaseEvaluator from uipath.eval.models import ScoreType logger = logging.getLogger(__name__) @@ -26,7 +26,7 @@ class ConsoleProgressReporter: def __init__(self): self.console = Console() - self.evaluators: Dict[str, AnyEvaluator] = {} + self.evaluators: Dict[str, BaseEvaluator[Any, Any, Any]] = {} self.display_started = False self.eval_results_by_name: Dict[str, list[Any]] = {} diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py index da4b3aa84..f00b63614 100644 --- a/src/uipath/_cli/_evals/_evaluator_factory.py +++ b/src/uipath/_cli/_evals/_evaluator_factory.py @@ -6,7 +6,6 @@ from pydantic import TypeAdapter from uipath._cli._evals._helpers import try_extract_file_and_class_name # type: ignore -from uipath._cli._evals._models._evaluation_set import AnyEvaluator from uipath._cli._evals._models._evaluator import ( EqualsEvaluatorParams, EvaluatorConfig, @@ -72,10 +71,11 @@ class EvaluatorFactory: """Factory class for creating evaluator instances based on configuration.""" @classmethod - def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator: + def create_evaluator(cls, data: Dict[str, Any]) -> BaseEvaluator[Any, Any, Any]: if data.get("version", None) == "1.0": return cls._create_evaluator_internal(data) - return cls._create_legacy_evaluator_internal(data) + else: + return cls._create_legacy_evaluator_internal(data) @staticmethod def _create_evaluator_internal( @@ -352,14 +352,14 @@ def _create_legacy_exact_match_evaluator( params: EqualsEvaluatorParams, ) -> LegacyExactMatchEvaluator: """Create a deterministic evaluator.""" - return LegacyExactMatchEvaluator(**params.model_dump()) + return LegacyExactMatchEvaluator(**params.model_dump(), config={}) @staticmethod def _create_legacy_json_similarity_evaluator( params: JsonSimilarityEvaluatorParams, ) -> LegacyJsonSimilarityEvaluator: """Create a deterministic evaluator.""" - return LegacyJsonSimilarityEvaluator(**params.model_dump()) + return LegacyJsonSimilarityEvaluator(**params.model_dump(), config={}) @staticmethod def _create_legacy_llm_as_judge_evaluator( 
@@ -376,7 +376,7 @@ def _create_legacy_llm_as_judge_evaluator( "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator." ) - return LegacyLlmAsAJudgeEvaluator(**params.model_dump()) + return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={}) @staticmethod def _create_legacy_trajectory_evaluator( @@ -393,4 +393,4 @@ def _create_legacy_trajectory_evaluator( "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator." ) - return LegacyTrajectoryEvaluator(**params.model_dump()) + return LegacyTrajectoryEvaluator(**params.model_dump(), config={}) diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py index 137f4f06e..31e87dd95 100644 --- a/src/uipath/_cli/_evals/_models/_evaluation_set.py +++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py @@ -1,11 +1,9 @@ from enum import Enum, IntEnum from typing import Annotated, Any, Dict, List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag +from pydantic import BaseModel, ConfigDict, Field from pydantic.alias_generators import to_camel -from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator - class EvaluationSimulationTool(BaseModel): name: str = Field(..., alias="name") @@ -216,15 +214,3 @@ def _discriminate_eval_set( if version == "1.0": return "evaluation_set" return "legacy_evaluation_set" - - -AnyEvaluationSet = Annotated[ - Union[ - Annotated[EvaluationSet, Tag("evaluation_set")], - Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")], - ], - Discriminator(_discriminate_eval_set), -] - -AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem] -AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]] diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index 5c0516e9b..968b5b4ae 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -13,11 +13,10 @@ from uipath import UiPath from uipath._cli._evals._models._evaluation_set import ( - AnyEvaluationItem, - AnyEvaluator, EvaluationItem, EvaluationStatus, ) +from uipath._cli._evals._models._evaluator import Evaluator from uipath._cli._evals._models._sw_reporting import ( StudioWebAgentSnapshot, StudioWebProgressItem, @@ -133,7 +132,9 @@ def _get_endpoint_prefix(self) -> str: return "api/" return "agentsruntime_/api/" - def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool: + def _is_coded_evaluator( + self, evaluators: List[BaseEvaluator[Any, Any, Any]] + ) -> bool: """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator). Args: @@ -235,7 +236,7 @@ async def create_eval_set_run_sw( @gracefully_handle_errors async def create_eval_run( - self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False + self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False ) -> str: """Create a new evaluation run in StudioWeb. 
@@ -262,7 +263,7 @@ async def create_eval_run( async def update_eval_run( self, sw_progress_item: StudioWebProgressItem, - evaluators: dict[str, AnyEvaluator], + evaluators: dict[str, Evaluator], is_coded: bool = False, spans: list[Any] | None = None, ): @@ -704,7 +705,7 @@ def _update_coded_eval_run_spec( ) def _create_eval_run_spec( - self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False + self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False ) -> RequestSpec: # Legacy API expects eval IDs as GUIDs, coded accepts strings # Convert string IDs to deterministic GUIDs for legacy diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index b9a7e7d6f..ad2399c07 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -25,7 +25,7 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator +from ...eval.evaluators import BaseEvaluator from ...eval.models import EvaluationResult from ...eval.models.models import AgentExecution, EvalItemResult from .._runtime._contracts import ( @@ -43,11 +43,8 @@ from ..models.runtime_schema import Entrypoint from ._evaluator_factory import EvaluatorFactory from ._models._evaluation_set import ( - AnyEvaluationItem, - AnyEvaluationSet, - AnyEvaluator, EvaluationItem, - LegacyEvaluationItem, + EvaluationSet, ) from ._models._exceptions import EvaluationRuntimeException from ._models._output import ( @@ -270,8 +267,8 @@ async def execute(self) -> UiPathRuntimeResult: async def _execute_sequential( self, - evaluation_set: AnyEvaluationSet, - evaluators: List[AnyEvaluator], + evaluation_set: EvaluationSet, + evaluators: List[BaseEvaluator[Any, Any, Any]], event_bus: EventBus, ) -> List[EvaluationRunResult]: all_eval_run_result: list[EvaluationRunResult] = [] @@ -285,13 +282,13 @@ async def _execute_sequential( async def _execute_parallel( self, - evaluation_set: AnyEvaluationSet, - evaluators: List[AnyEvaluator], + evaluation_set: EvaluationSet, + evaluators: List[BaseEvaluator[Any, Any, Any]], event_bus: EventBus, workers: int, ) -> List[EvaluationRunResult]: # Create a queue with max concurrency - queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue( + queue: asyncio.Queue[tuple[int, EvaluationItem]] = asyncio.Queue( maxsize=workers ) @@ -301,7 +298,7 @@ async def _execute_parallel( # Producer task to fill the queue async def producer() -> None: for index, eval_item in enumerate(evaluation_set.evaluations): - await queue.put((index, eval_item)) # type: ignore[arg-type] + await queue.put((index, eval_item)) # Signal completion by putting None markers for _ in range(workers): await queue.put(None) # type: ignore @@ -343,8 +340,8 @@ async def worker(worker_id: int) -> None: async def _execute_eval( self, - eval_item: AnyEvaluationItem, - evaluators: List[AnyEvaluator], + eval_item: EvaluationItem, + evaluators: List[BaseEvaluator[Any, Any, Any]], event_bus: EventBus, ) -> EvaluationRunResult: # Generate LLM-based input if input_mocking_strategy is defined @@ -416,41 +413,21 @@ async def _execute_eval( evaluation_item_results: list[EvalItemResult] = [] for evaluator in evaluators: - # Determine which evaluator method to use based on evaluation set/item type - evaluation_result: Optional[EvaluationResult] = None - - match eval_item: - case LegacyEvaluationItem(): - # Legacy evaluation - use run_legacy_evaluator - evaluation_result = await self.run_legacy_evaluator( - evaluator=evaluator, 
# type: ignore - execution_output=agent_execution_output, - eval_item=eval_item, - ) - case EvaluationItem() if ( - evaluator.id in eval_item.evaluation_criterias - ): - # New evaluation with criteria - evaluation_criteria = eval_item.evaluation_criterias[ - evaluator.id - ] - - evaluation_result = await self.run_evaluator( - evaluator=evaluator, # type: ignore - execution_output=agent_execution_output, - eval_item=eval_item, - evaluation_criteria=evaluator.evaluation_criteria_type( # type: ignore - **evaluation_criteria - ) - if evaluation_criteria - else evaluator.evaluator_config.default_evaluation_criteria, # type: ignore - ) - case _: - # Skip if evaluator not in evaluation criteria - continue - - if evaluation_result is None: + if evaluator.id not in eval_item.evaluation_criterias: + # Skip! continue + evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] + + evaluation_result = await self.run_evaluator( + evaluator=evaluator, + execution_output=agent_execution_output, + eval_item=eval_item, + evaluation_criteria=evaluator.evaluation_criteria_type( + **evaluation_criteria + ) + if evaluation_criteria + else evaluator.evaluator_config.default_evaluation_criteria, + ) dto_result = EvaluationResultDto.from_evaluation_result( evaluation_result @@ -527,8 +504,8 @@ async def _execute_eval( return evaluation_run_results async def _generate_input_for_eval( - self, eval_item: AnyEvaluationItem - ) -> AnyEvaluationItem: + self, eval_item: EvaluationItem + ) -> EvaluationItem: """Use LLM to generate a mock input for an evaluation item.""" generated_input = await generate_llm_input( eval_item, (await self.get_entrypoint()).input @@ -549,7 +526,7 @@ def _get_and_clear_execution_data( return spans, logs async def execute_runtime( - self, eval_item: AnyEvaluationItem, execution_id: str + self, eval_item: EvaluationItem, execution_id: str ) -> UiPathEvalRunExecutionOutput: context_args = self.context.model_dump() context_args["execution_id"] = execution_id @@ -622,28 +599,9 @@ async def run_evaluator( return result - async def run_legacy_evaluator( - self, - evaluator: LegacyBaseEvaluator[Any], - execution_output: UiPathEvalRunExecutionOutput, - eval_item: LegacyEvaluationItem, - ) -> EvaluationResult: - agent_execution = AgentExecution( - agent_input=eval_item.inputs, - agent_output=execution_output.result.output or {}, - agent_trace=execution_output.spans, - expected_agent_behavior=eval_item.expected_agent_behavior, - ) - - result = await evaluator.evaluate( - agent_execution=agent_execution, - # at the moment evaluation_criteria is always the expected output - evaluation_criteria=eval_item.expected_output, - ) - - return result - - def _load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]: + def _load_evaluators( + self, evaluation_set: EvaluationSet + ) -> list[BaseEvaluator[Any, Any, Any]]: """Load evaluators referenced by the evaluation set.""" evaluators = [] evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators" # type: ignore diff --git a/src/uipath/_cli/_evals/mocks/input_mocker.py b/src/uipath/_cli/_evals/mocks/input_mocker.py index 1530cb362..291251005 100644 --- a/src/uipath/_cli/_evals/mocks/input_mocker.py +++ b/src/uipath/_cli/_evals/mocks/input_mocker.py @@ -5,8 +5,8 @@ from typing import Any, Dict from uipath import UiPath -from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem -from uipath.tracing import traced +from uipath._cli._evals._models._evaluation_set import EvaluationItem +from 
uipath.tracing._traced import traced from .mocker import UiPathInputMockingError @@ -54,7 +54,7 @@ def get_input_mocking_prompt( @traced(name="__mocker__", recording=False) async def generate_llm_input( - evaluation_item: AnyEvaluationItem, + evaluation_item: EvaluationItem, input_schema: Dict[str, Any], ) -> Dict[str, Any]: """Generate synthetic input using an LLM based on the evaluation context.""" diff --git a/src/uipath/_cli/_evals/mocks/llm_mocker.py b/src/uipath/_cli/_evals/mocks/llm_mocker.py index bb869c1db..1fbac8441 100644 --- a/src/uipath/_cli/_evals/mocks/llm_mocker.py +++ b/src/uipath/_cli/_evals/mocks/llm_mocker.py @@ -9,10 +9,7 @@ from uipath.tracing import traced from uipath.tracing._utils import _SpanUtils -from .._models._evaluation_set import ( - AnyEvaluationItem, - LLMMockingStrategy, -) +from .._models._evaluation_set import EvaluationItem, LLMMockingStrategy from .._models._mocks import ExampleCall from .mocker import ( Mocker, @@ -77,7 +74,7 @@ def pydantic_to_dict_safe(obj: Any) -> Any: class LLMMocker(Mocker): """LLM Based Mocker.""" - def __init__(self, evaluation_item: AnyEvaluationItem): + def __init__(self, evaluation_item: EvaluationItem): """LLM Mocker constructor.""" self.evaluation_item = evaluation_item assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy) diff --git a/src/uipath/_cli/_evals/mocks/mocker_factory.py b/src/uipath/_cli/_evals/mocks/mocker_factory.py index 5e024f65b..a3bdd47cd 100644 --- a/src/uipath/_cli/_evals/mocks/mocker_factory.py +++ b/src/uipath/_cli/_evals/mocks/mocker_factory.py @@ -1,7 +1,7 @@ """Mocker Factory.""" from uipath._cli._evals._models._evaluation_set import ( - AnyEvaluationItem, + EvaluationItem, LLMMockingStrategy, MockitoMockingStrategy, ) @@ -14,7 +14,7 @@ class MockerFactory: """Mocker factory.""" @staticmethod - def create(evaluation_item: AnyEvaluationItem) -> Mocker: + def create(evaluation_item: EvaluationItem) -> Mocker: """Create a mocker instance.""" match evaluation_item.mocking_strategy: case LLMMockingStrategy(): diff --git a/src/uipath/_cli/_evals/mocks/mockito_mocker.py b/src/uipath/_cli/_evals/mocks/mockito_mocker.py index d9d145be1..2a951f12d 100644 --- a/src/uipath/_cli/_evals/mocks/mockito_mocker.py +++ b/src/uipath/_cli/_evals/mocks/mockito_mocker.py @@ -9,7 +9,7 @@ from mockito import invocation, mocking # type: ignore[import-untyped] from uipath._cli._evals._models._evaluation_set import ( - AnyEvaluationItem, + EvaluationItem, MockingAnswerType, MockitoMockingStrategy, ) @@ -38,7 +38,7 @@ def func(*_args, **_kwargs): class MockitoMocker(Mocker): """Mockito Mocker.""" - def __init__(self, evaluation_item: AnyEvaluationItem): + def __init__(self, evaluation_item: EvaluationItem): """Instantiate a mockito mocker.""" self.evaluation_item = evaluation_item assert isinstance(self.evaluation_item.mocking_strategy, MockitoMockingStrategy) diff --git a/src/uipath/_cli/_evals/mocks/mocks.py b/src/uipath/_cli/_evals/mocks/mocks.py index ad555c8ab..e2416aedf 100644 --- a/src/uipath/_cli/_evals/mocks/mocks.py +++ b/src/uipath/_cli/_evals/mocks/mocks.py @@ -4,13 +4,13 @@ from contextvars import ContextVar from typing import Any, Callable, Optional -from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem +from uipath._cli._evals._models._evaluation_set import EvaluationItem from uipath._cli._evals._span_collection import ExecutionSpanCollector from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError from uipath._cli._evals.mocks.mocker_factory import 
MockerFactory # Context variables for evaluation items and mockers -evaluation_context: ContextVar[Optional[AnyEvaluationItem]] = ContextVar( +evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar( "evaluation", default=None ) @@ -30,7 +30,7 @@ def set_execution_context( - eval_item: AnyEvaluationItem, + eval_item: EvaluationItem, span_collector: ExecutionSpanCollector, execution_id: str, ) -> None: diff --git a/src/uipath/_cli/_utils/_eval_set.py b/src/uipath/_cli/_utils/_eval_set.py index 10c3b9ab3..f42ef1aed 100644 --- a/src/uipath/_cli/_utils/_eval_set.py +++ b/src/uipath/_cli/_utils/_eval_set.py @@ -5,7 +5,12 @@ import click from pydantic import TypeAdapter, ValidationError -from uipath._cli._evals._models._evaluation_set import AnyEvaluationSet +from uipath._cli._evals._models._evaluation_set import ( + EvaluationItem, + EvaluationSet, + LegacyEvaluationItem, + LegacyEvaluationSet, +) from uipath._cli._utils._console import ConsoleLogger console = ConsoleLogger() @@ -58,7 +63,7 @@ def auto_discover_eval_set() -> str: @staticmethod def load_eval_set( eval_set_path: str, eval_ids: Optional[List[str]] = None - ) -> tuple[AnyEvaluationSet, str]: + ) -> tuple[EvaluationSet, str]: """Load the evaluation set from file. Args: @@ -66,7 +71,7 @@ def load_eval_set( eval_ids: Optional list of evaluation IDs to filter Returns: - Tuple of (AnyEvaluationSet, resolved_path) + Tuple of (EvaluationSet, resolved_path) """ # If the file doesn't exist at the given path, try looking in evals/eval-sets/ resolved_path = eval_set_path @@ -92,9 +97,41 @@ def load_eval_set( ) from e try: - eval_set: AnyEvaluationSet = TypeAdapter(AnyEvaluationSet).validate_python( - data - ) + eval_set: EvaluationSet | LegacyEvaluationSet = TypeAdapter( + EvaluationSet | LegacyEvaluationSet + ).validate_python(data) + if isinstance(eval_set, LegacyEvaluationSet): + + def migrate_evaluation_item( + evaluation: LegacyEvaluationItem, evaluators: list[str] + ) -> EvaluationItem: + return EvaluationItem.model_validate( + { + "id": evaluation.id, + "name": evaluation.name, + "inputs": evaluation.inputs, + "expectedAgentBehavior": evaluation.expected_agent_behavior, + "mockingStrategy": evaluation.mocking_strategy, + "inputMockingStrategy": evaluation.input_mocking_strategy, + "evaluationCriterias": { + k: { + "expectedOutput": evaluation.expected_output, + "expectedAgentBehavior": evaluation.expected_agent_behavior, + } + for k in evaluators + }, + } + ) + + eval_set = EvaluationSet( + id=eval_set.id, + name=eval_set.name, + evaluator_refs=eval_set.evaluator_refs, + evaluations=[ + migrate_evaluation_item(evaluation, eval_set.evaluator_refs) + for evaluation in eval_set.evaluations + ], + ) except ValidationError as e: raise ValueError( f"Invalid evaluation set format in '{resolved_path}': {str(e)}. 
" diff --git a/src/uipath/_events/_events.py b/src/uipath/_events/_events.py index ffffcff14..486c48aaf 100644 --- a/src/uipath/_events/_events.py +++ b/src/uipath/_events/_events.py @@ -5,7 +5,8 @@ from opentelemetry.sdk.trace import ReadableSpan from pydantic import BaseModel, ConfigDict, Field, SkipValidation, model_validator -from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem, AnyEvaluator +from uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath.eval.evaluators import BaseEvaluator from uipath.eval.models import EvalItemResult @@ -23,12 +24,12 @@ class EvalSetRunCreatedEvent(BaseModel): eval_set_run_id: Optional[str] = None no_of_evals: int # skip validation to avoid abstract class instantiation - evaluators: SkipValidation[List[AnyEvaluator]] + evaluators: SkipValidation[List[BaseEvaluator[Any, Any, Any]]] class EvalRunCreatedEvent(BaseModel): execution_id: str - eval_item: AnyEvaluationItem + eval_item: EvaluationItem class EvalItemExceptionDetails(BaseModel): @@ -42,7 +43,7 @@ class EvalRunUpdatedEvent(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) execution_id: str - eval_item: AnyEvaluationItem + eval_item: EvaluationItem eval_results: List[EvalItemResult] success: bool agent_output: Any diff --git a/src/uipath/eval/evaluators/legacy_base_evaluator.py b/src/uipath/eval/evaluators/legacy_base_evaluator.py index 26bb3f227..c31331817 100644 --- a/src/uipath/eval/evaluators/legacy_base_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_base_evaluator.py @@ -6,7 +6,7 @@ from collections.abc import Callable from typing import Any, Generic, TypeVar -from pydantic import BaseModel, ConfigDict +from pydantic import ConfigDict, Field from uipath.eval.models import EvaluationResult from uipath.eval.models.models import ( @@ -16,6 +16,8 @@ LegacyEvaluatorType, ) +from .base_evaluator import BaseEvaluationCriteria, BaseEvaluator, BaseEvaluatorConfig + def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]: """Decorator to track evaluation metrics and handle errors gracefully.""" @@ -39,16 +41,36 @@ async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult: return wrapper -T = TypeVar("T") +# Legacy evaluator config (non-generic version for simplicity) +class LegacyEvaluatorConfig(BaseEvaluatorConfig[BaseEvaluationCriteria]): + """Configuration for legacy evaluators.""" + + name: str = "LegacyEvaluator" + default_evaluation_criteria: None = None # Legacy evaluators don't use this + + +class LegacyEvaluationCriteria(BaseEvaluationCriteria): + """Legacy evaluation criteria.""" + + expected_output: Any = Field(alias="expectedOutput") + expected_agent_behavior: str = Field(alias="expectedAgentBehavior") + + +T = TypeVar("T", bound=LegacyEvaluatorConfig) -class LegacyBaseEvaluator(BaseModel, Generic[T], ABC): - """Abstract base class for all evaluators.""" +class LegacyBaseEvaluator( + BaseEvaluator[LegacyEvaluationCriteria, T, str], Generic[T], ABC +): + """Abstract base class for all legacy evaluators. + + Inherits from BaseEvaluator to share common evaluator infrastructure while maintaining + legacy-specific fields and behavior. 
+ """ model_config = ConfigDict(arbitrary_types_allowed=True) - id: str - name: str + # Legacy-specific fields (in addition to inherited fields from BaseEvaluator) description: str target_output_key: str = "*" created_at: str @@ -56,23 +78,27 @@ class LegacyBaseEvaluator(BaseModel, Generic[T], ABC): category: LegacyEvaluatorCategory evaluator_type: LegacyEvaluatorType - def __init_subclass__(cls, **kwargs: Any): - """Hook for subclass creation - automatically applies evaluation metrics tracking.""" - super().__init_subclass__(**kwargs) - - if hasattr(cls, "evaluate") and not getattr( - cls.evaluate, "_has_metrics_decorator", False - ): - cls.evaluate = track_evaluation_metrics(cls.evaluate) # type: ignore[method-assign] - cls.evaluate._has_metrics_decorator = True # type: ignore[attr-defined] + # Note: __init_subclass__ is inherited from BaseEvaluator and handles metrics tracking def model_post_init(self, __context: Any): """Post-initialization hook for Pydantic models.""" - pass + # Ensure config is set up for legacy evaluators + super().model_post_init(__context) + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id. + + For legacy evaluators, this returns a placeholder. Actual evaluator instances + have an 'id' field that identifies them. + """ + return "legacy-evaluator" @abstractmethod async def evaluate( - self, agent_execution: AgentExecution, evaluation_criteria: T + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, ) -> EvaluationResult: """Evaluate the given data and return a result. @@ -81,9 +107,14 @@ async def evaluate( - agent_input: The input received by the agent - actual_output: The actual output from the agent - spans: The execution spans to use for the evaluation - evaluation_criteria: The criteria to evaluate + evaluation_criteria: The criteria to evaluate (legacy evaluators accept any type) Returns: EvaluationResult containing the score and details + + Note: + The type: ignore[override] is necessary because legacy evaluators accept + evaluation_criteria of any type T, while the base class expects BaseEvaluationCriteria. + This is intentional to maintain backward compatibility with legacy evaluators. """ pass diff --git a/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py b/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py index c2eee78ef..798029148 100644 --- a/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py +++ b/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py @@ -2,14 +2,14 @@ import json from abc import ABC -from typing import Any, TypeVar +from typing import Any, Generic, TypeVar -from .legacy_base_evaluator import LegacyBaseEvaluator +from .legacy_base_evaluator import LegacyBaseEvaluator, LegacyEvaluatorConfig -T = TypeVar("T") +T = TypeVar("T", bound=LegacyEvaluatorConfig) -class DeterministicEvaluatorBase(LegacyBaseEvaluator[T], ABC): +class DeterministicEvaluatorBase(LegacyBaseEvaluator[T], Generic[T], ABC): """Base class for evaluators that produce deterministic, reproducible results. 
This class provides utility methods for canonical JSON comparison and number normalization diff --git a/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py b/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py index 7c4729445..5dc1dd149 100644 --- a/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py @@ -1,14 +1,21 @@ """Exact match evaluator for binary pass/fail evaluation of agent outputs.""" -from typing import Any - from uipath.eval.models import BooleanEvaluationResult, EvaluationResult from ..models.models import AgentExecution +from .legacy_base_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase -class LegacyExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): +class LegacyExactMatchEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy exact-match evaluators.""" + + name: str = "LegacyExactMatchEvaluator" + + +class LegacyExactMatchEvaluator( + DeterministicEvaluatorBase[LegacyExactMatchEvaluatorConfig] +): """Evaluator that performs exact structural matching between expected and actual outputs. This evaluator returns True if the actual output exactly matches the expected output @@ -17,7 +24,9 @@ class LegacyExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): """ async def evaluate( - self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any] + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, ) -> EvaluationResult: """Evaluate whether actual output exactly matches expected output. @@ -33,5 +42,5 @@ async def evaluate( """ return BooleanEvaluationResult( score=self._canonical_json(agent_execution.agent_output) - == self._canonical_json(evaluation_criteria) + == self._canonical_json(evaluation_criteria.expected_output) ) diff --git a/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py b/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py index 30d3df868..09b467482 100644 --- a/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py @@ -6,12 +6,21 @@ from uipath.eval.models import EvaluationResult, NumericEvaluationResult from ..models.models import AgentExecution +from .legacy_base_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase T = TypeVar("T") -class LegacyJsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): +class LegacyJsonSimilarityEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy json-similarity evaluators.""" + + name: str = "LegacyJsonSimilarityEvaluator" + + +class LegacyJsonSimilarityEvaluator( + DeterministicEvaluatorBase[LegacyJsonSimilarityEvaluatorConfig] +): """Legacy deterministic evaluator that scores structural JSON similarity between expected and actual output. Compares expected versus actual JSON-like structures and returns a @@ -20,7 +29,9 @@ class LegacyJsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): """ async def evaluate( - self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any] + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, ) -> EvaluationResult: """Evaluate similarity between expected and actual JSON outputs. 
@@ -37,7 +48,9 @@ async def evaluate( EvaluationResult: Numerical score between 0-100 indicating similarity """ return NumericEvaluationResult( - score=self._compare_json(evaluation_criteria, agent_execution.agent_output) + score=self._compare_json( + evaluation_criteria.expected_output, agent_execution.agent_output + ) ) def _compare_json(self, expected: Any, actual: Any) -> float: diff --git a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py index c55296583..edcf7d882 100644 --- a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py @@ -10,10 +10,20 @@ from ..._services import UiPathLlmChatService from ..._utils.constants import COMMUNITY_agents_SUFFIX from ..models.models import AgentExecution, EvaluationResult, LLMResponse -from .legacy_base_evaluator import LegacyBaseEvaluator +from .legacy_base_evaluator import ( + LegacyBaseEvaluator, + LegacyEvaluationCriteria, + LegacyEvaluatorConfig, +) -class LegacyLlmAsAJudgeEvaluator(LegacyBaseEvaluator[dict[str, Any]]): +class LegacyLlmAsAJudgeEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy LLM-as-a-judge evaluators.""" + + name: str = "LegacyLlmAsAJudgeEvaluator" + + +class LegacyLlmAsAJudgeEvaluator(LegacyBaseEvaluator[LegacyLlmAsAJudgeEvaluatorConfig]): """Legacy evaluator that uses an LLM to judge the quality of agent output.""" prompt: str @@ -47,7 +57,7 @@ def _initialize_llm(self): async def evaluate( self, agent_execution: AgentExecution, - evaluation_criteria: dict[str, Any], + evaluation_criteria: LegacyEvaluationCriteria, ) -> EvaluationResult: """Evaluate using an LLM as a judge. @@ -65,7 +75,7 @@ async def evaluate( """ # Create the evaluation prompt evaluation_prompt = self._create_evaluation_prompt( - expected_output=evaluation_criteria, + expected_output=evaluation_criteria.expected_output, actual_output=agent_execution.agent_output, ) diff --git a/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py index 8e2a68219..53d698d5f 100644 --- a/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py @@ -16,10 +16,20 @@ NumericEvaluationResult, TrajectoryEvaluationTrace, ) -from .legacy_base_evaluator import LegacyBaseEvaluator +from .legacy_base_evaluator import ( + LegacyBaseEvaluator, + LegacyEvaluationCriteria, + LegacyEvaluatorConfig, +) + + +class LegacyTrajectoryEvaluatorConfig(LegacyEvaluatorConfig): + """Configuration for legacy trajectory evaluators.""" + + name: str = "LegacyTrajectoryEvaluator" -class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]): +class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[LegacyTrajectoryEvaluatorConfig]): """Legacy evaluator that analyzes the trajectory/path taken to reach outputs.""" prompt: str @@ -53,7 +63,7 @@ def _initialize_llm(self): async def evaluate( self, agent_execution: AgentExecution, - evaluation_criteria: dict[str, Any], + evaluation_criteria: LegacyEvaluationCriteria, ) -> EvaluationResult: """Evaluate using trajectory analysis. 
diff --git a/src/uipath/eval/evaluators/output_evaluator.py b/src/uipath/eval/evaluators/output_evaluator.py index 2aa362e18..1f76e1e43 100644 --- a/src/uipath/eval/evaluators/output_evaluator.py +++ b/src/uipath/eval/evaluators/output_evaluator.py @@ -18,7 +18,7 @@ class OutputEvaluationCriteria(BaseEvaluationCriteria): """Base class for all output evaluation criteria.""" - expected_output: dict[str, Any] | str + expected_output: dict[str, Any] | str = Field(..., alias="expectedOutput") T = TypeVar("T", bound=BaseEvaluationCriteria) diff --git a/tests/cli/eval/evals/eval-sets/default.json b/tests/cli/eval/evals/eval-sets/default.json index a890ccca8..f2136b94a 100644 --- a/tests/cli/eval/evals/eval-sets/default.json +++ b/tests/cli/eval/evals/eval-sets/default.json @@ -1,21 +1,22 @@ { - "fileName": "default.json", + "version": "1.0", "id": "default-eval-set-id", - "name": "Basic Calculator Evaluation Set", - "batchSize": 10, + "name": "Basic Evaluation Set", "evaluatorRefs": [ - "equality" + "ExactMatchEvaluator" ], "evaluations": [ { - "id": "test-addition", - "name": "Test Addition", + "id": "default", + "name": "Default", "inputs": {"foo": "bar"}, - "expectedOutput": {"foo": "bar"}, - "expectedAgentBehavior": "", - "evalSetId": "default-eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z" + "evaluationCriterias": { + "ExactMatchEvaluator": { + "expectedOutput": { + "foo": "bar" + } + } + } } ], "modelSettings": [], diff --git a/tests/cli/eval/evals/evaluators/equality.json b/tests/cli/eval/evals/evaluators/equality.json deleted file mode 100644 index 10e073c8e..000000000 --- a/tests/cli/eval/evals/evaluators/equality.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "fileName": "equality.json", - "id": "equality", - "name": "Equality Evaluator", - "description": "An evaluator that judges the agent based on expected output.", - "category": 0, - "type": 1, - "targetOutputKey": "*", - "createdAt": "2025-06-26T17:45:39.651Z", - "updatedAt": "2025-06-26T17:45:39.651Z" -} diff --git a/tests/cli/eval/evals/evaluators/exact-match.json b/tests/cli/eval/evals/evaluators/exact-match.json new file mode 100644 index 000000000..013030875 --- /dev/null +++ b/tests/cli/eval/evals/evaluators/exact-match.json @@ -0,0 +1,17 @@ +{ + "version": "1.0", + "id": "ExactMatchEvaluator", + "description": "Checks if the response text exactly matches the expected value.", + "evaluatorTypeId": "uipath-exact-match", + "evaluatorConfig": { + "name": "ExactMatchEvaluator", + "targetOutputKey": "*", + "negated": false, + "ignoreCase": false, + "defaultEvaluationCriteria": { + "expectedOutput": { + "foo": "bar" + } + } + } +} diff --git a/tests/cli/eval/mocks/test_mocks.py b/tests/cli/eval/mocks/test_mocks.py index 894f80862..e385a310d 100644 --- a/tests/cli/eval/mocks/test_mocks.py +++ b/tests/cli/eval/mocks/test_mocks.py @@ -6,7 +6,7 @@ from pytest_httpx import HTTPXMock from uipath._cli._evals._models._evaluation_set import ( - LegacyEvaluationItem, + EvaluationItem, LLMMockingStrategy, MockitoMockingStrategy, ) @@ -31,8 +31,9 @@ def foofoo(*args, **kwargs): "id": "evaluation-id", "name": "Mock foo", "inputs": {}, - "expectedOutput": {}, - "expectedAgentBehavior": "", + "evaluationCriterias": { + "ExactMatchEvaluator": None, + }, "mockingStrategy": { "type": "mockito", "behaviors": [ @@ -46,11 +47,8 @@ def foofoo(*args, **kwargs): } ], }, - "evalSetId": "eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z", } - evaluation = 
LegacyEvaluationItem(**evaluation_item) + evaluation = EvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy) # Act & Assert @@ -91,8 +89,9 @@ async def foofoo(*args, **kwargs): "id": "evaluation-id", "name": "Mock foo", "inputs": {}, - "expectedOutput": {}, - "expectedAgentBehavior": "", + "evaluationCriterias": { + "ExactMatchEvaluator": None, + }, "mockingStrategy": { "type": "mockito", "behaviors": [ @@ -106,11 +105,8 @@ async def foofoo(*args, **kwargs): } ], }, - "evalSetId": "eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z", } - evaluation = LegacyEvaluationItem(**evaluation_item) + evaluation = EvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy) # Act & Assert @@ -154,18 +150,16 @@ def foofoo(*args, **kwargs): "id": "evaluation-id", "name": "Mock foo", "inputs": {}, - "expectedOutput": {}, - "expectedAgentBehavior": "", + "evaluationCriterias": { + "ExactMatchEvaluator": None, + }, "mockingStrategy": { "type": "llm", "prompt": "response is 'bar1'", "toolsToSimulate": [{"name": "foo"}], }, - "evalSetId": "eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z", } - evaluation = LegacyEvaluationItem(**evaluation_item) + evaluation = EvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy) httpx_mock.add_response( url="https://example.com/agenthub_/llm/api/capabilities", @@ -240,18 +234,16 @@ async def foofoo(*args, **kwargs): "id": "evaluation-id", "name": "Mock foo", "inputs": {}, - "expectedOutput": {}, - "expectedAgentBehavior": "", + "evaluationCriterias": { + "ExactMatchEvaluator": None, + }, "mockingStrategy": { "type": "llm", "prompt": "response is 'bar1'", "toolsToSimulate": [{"name": "foo"}], }, - "evalSetId": "eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z", } - evaluation = LegacyEvaluationItem(**evaluation_item) + evaluation = EvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy) # Mock capability checks diff --git a/tests/cli/eval/test_evaluate.py b/tests/cli/eval/test_evaluate.py index 5c90a3edf..5bb4458a4 100644 --- a/tests/cli/eval/test_evaluate.py +++ b/tests/cli/eval/test_evaluate.py @@ -50,13 +50,13 @@ def __init__(self): result.output["evaluationSetResults"][0]["evaluationRunResults"][0]["result"][ "score" ] - == 100.0 + == 1.0 ) assert ( result.output["evaluationSetResults"][0]["evaluationRunResults"][0][ "evaluatorId" ] - == "equality" + == "ExactMatchEvaluator" ) diff --git a/tests/cli/evaluators/test_json_similarity_evaluator.py b/tests/cli/evaluators/test_json_similarity_evaluator.py index d47907546..d3ed11829 100644 --- a/tests/cli/evaluators/test_json_similarity_evaluator.py +++ b/tests/cli/evaluators/test_json_similarity_evaluator.py @@ -9,6 +9,7 @@ from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams from uipath.eval.evaluators import LegacyJsonSimilarityEvaluator +from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria from uipath.eval.models.models import ( AgentExecution, LegacyEvaluatorCategory, @@ -34,6 +35,7 @@ class TestJsonSimilarityEvaluator: async def test_json_similarity_exact_score_1(self) -> None: evaluator = LegacyJsonSimilarityEvaluator( **_make_base_params().model_dump(), + config={}, ) expected_json = """ { @@ -70,7 +72,10 @@ async def 
test_json_similarity_exact_score_1(self) -> None: agent_trace=[], agent_output=json.loads(actual_json), ), - evaluation_criteria=json.loads(expected_json), + evaluation_criteria=LegacyEvaluationCriteria( + expected_output=json.loads(expected_json), + expected_agent_behavior="", + ), ) assert result.score == 68.0 @@ -79,6 +84,7 @@ async def test_json_similarity_exact_score_1(self) -> None: async def test_json_similarity_exact_score_2(self) -> None: evaluator = LegacyJsonSimilarityEvaluator( **_make_base_params().model_dump(), + config={}, ) expected_json = """ { @@ -105,7 +111,10 @@ async def test_json_similarity_exact_score_2(self) -> None: agent_trace=[], agent_output=json.loads(actual_json), ), - evaluation_criteria=json.loads(expected_json), + evaluation_criteria=LegacyEvaluationCriteria( + expected_output=json.loads(expected_json), + expected_agent_behavior="", + ), ) assert result.score >= 82.333 @@ -115,6 +124,7 @@ async def test_json_similarity_exact_score_2(self) -> None: async def test_json_similarity_exact_score_3(self) -> None: evaluator = LegacyJsonSimilarityEvaluator( **_make_base_params().model_dump(), + config={}, ) expected_json = """ { @@ -138,7 +148,10 @@ async def test_json_similarity_exact_score_3(self) -> None: agent_trace=[], agent_output=json.loads(actual_json), ), - evaluation_criteria=json.loads(expected_json), + evaluation_criteria=LegacyEvaluationCriteria( + expected_output=json.loads(expected_json), + expected_agent_behavior="", + ), ) assert result.score >= 33.333 @@ -148,6 +161,7 @@ async def test_json_similarity_exact_score_3(self) -> None: async def test_json_similarity_exact_score_4(self) -> None: evaluator = LegacyJsonSimilarityEvaluator( **_make_base_params().model_dump(), + config={}, ) expected_json = """ { @@ -231,7 +245,10 @@ async def test_json_similarity_exact_score_4(self) -> None: agent_trace=[], agent_output=json.loads(actual_json), ), - evaluation_criteria=json.loads(expected_json), + evaluation_criteria=LegacyEvaluationCriteria( + expected_output=json.loads(expected_json), + expected_agent_behavior="", + ), ) assert result.score == 43.24977043158861
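For anyone updating local eval assets to the new format, a minimal sketch of how the factory consumes a version "1.0" evaluator definition; the payload mirrors tests/cli/eval/evals/evaluators/exact-match.json, and how the registry resolves "uipath-exact-match" to a concrete evaluator class is assumed rather than shown in this diff:

    from uipath._cli._evals._evaluator_factory import EvaluatorFactory

    # Version "1.0" payloads take the coded-evaluator path in create_evaluator;
    # anything else falls through to the legacy path, which now supplies config={}.
    evaluator = EvaluatorFactory.create_evaluator(
        {
            "version": "1.0",
            "id": "ExactMatchEvaluator",
            "description": "Checks if the response text exactly matches the expected value.",
            "evaluatorTypeId": "uipath-exact-match",
            "evaluatorConfig": {
                "name": "ExactMatchEvaluator",
                "targetOutputKey": "*",
                "negated": False,
                "ignoreCase": False,
                "defaultEvaluationCriteria": {"expectedOutput": {"foo": "bar"}},
            },
        }
    )
    print(type(evaluator).__name__)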