6 changes: 3 additions & 3 deletions samples/calculator/evals/eval-sets/legacy.json
@@ -19,7 +19,7 @@
"operator": "+"
},
"expectedOutput": {
"result": 2
"result": 2.0
},
"expectedAgentBehavior": "The operation should produce the right output.",
"evalSetId": "default-eval-set-id",
@@ -35,7 +35,7 @@
"operator": "random"
},
"expectedOutput": {
"result": 2
"result": 2.0
},
"expectedAgentBehavior": "The operation should produce the right output.",
"mockingStrategy": {
@@ -71,7 +71,7 @@
"operator": "random"
},
"expectedOutput": {
"result": 2
"result": 2.0
},
"expectedAgentBehavior": "The operation should produce the right output.",
"mockingStrategy": {
4 changes: 2 additions & 2 deletions src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -7,7 +7,6 @@
from rich.rule import Rule
from rich.table import Table

from uipath._cli._evals._models._evaluation_set import AnyEvaluator
from uipath._events._event_bus import EventBus
from uipath._events._events import (
EvalRunCreatedEvent,
@@ -16,6 +15,7 @@
EvalSetRunUpdatedEvent,
EvaluationEvents,
)
from uipath.eval.evaluators import BaseEvaluator
from uipath.eval.models import ScoreType

logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ class ConsoleProgressReporter:

def __init__(self):
self.console = Console()
self.evaluators: Dict[str, AnyEvaluator] = {}
self.evaluators: Dict[str, BaseEvaluator[Any, Any, Any]] = {}
self.display_started = False
self.eval_results_by_name: Dict[str, list[Any]] = {}

14 changes: 7 additions & 7 deletions src/uipath/_cli/_evals/_evaluator_factory.py
@@ -6,7 +6,6 @@
from pydantic import TypeAdapter

from uipath._cli._evals._helpers import try_extract_file_and_class_name # type: ignore
from uipath._cli._evals._models._evaluation_set import AnyEvaluator
from uipath._cli._evals._models._evaluator import (
EqualsEvaluatorParams,
EvaluatorConfig,
@@ -72,10 +71,11 @@ class EvaluatorFactory:
"""Factory class for creating evaluator instances based on configuration."""

@classmethod
def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator:
def create_evaluator(cls, data: Dict[str, Any]) -> BaseEvaluator[Any, Any, Any]:
if data.get("version", None) == "1.0":
return cls._create_evaluator_internal(data)
return cls._create_legacy_evaluator_internal(data)
else:
return cls._create_legacy_evaluator_internal(data)

@staticmethod
def _create_evaluator_internal(
@@ -352,14 +352,14 @@ def _create_legacy_exact_match_evaluator(
params: EqualsEvaluatorParams,
) -> LegacyExactMatchEvaluator:
"""Create a deterministic evaluator."""
return LegacyExactMatchEvaluator(**params.model_dump())
return LegacyExactMatchEvaluator(**params.model_dump(), config={})

@staticmethod
def _create_legacy_json_similarity_evaluator(
params: JsonSimilarityEvaluatorParams,
) -> LegacyJsonSimilarityEvaluator:
"""Create a deterministic evaluator."""
return LegacyJsonSimilarityEvaluator(**params.model_dump())
return LegacyJsonSimilarityEvaluator(**params.model_dump(), config={})

@staticmethod
def _create_legacy_llm_as_judge_evaluator(
@@ -376,7 +376,7 @@ def _create_legacy_llm_as_judge_evaluator(
"'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
)

return LegacyLlmAsAJudgeEvaluator(**params.model_dump())
return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={})

@staticmethod
def _create_legacy_trajectory_evaluator(
Expand All @@ -393,4 +393,4 @@ def _create_legacy_trajectory_evaluator(
"'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
)

return LegacyTrajectoryEvaluator(**params.model_dump())
return LegacyTrajectoryEvaluator(**params.model_dump(), config={})
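
The factory now routes on the optional top-level "version" field and passes an explicit `config={}` to every legacy evaluator it builds. A minimal sketch of that dispatch shape, using hypothetical `CodedEvaluator`/`LegacyEvaluator` stand-ins rather than the real uipath classes:

```python
# Sketch of the version-based dispatch in EvaluatorFactory.create_evaluator.
# CodedEvaluator and LegacyEvaluator are hypothetical stand-ins, not the real uipath classes.
from typing import Any, Dict, Union


class CodedEvaluator:
    def __init__(self, **params: Any) -> None:
        self.params = params


class LegacyEvaluator:
    def __init__(self, config: Dict[str, Any], **params: Any) -> None:
        self.config = config  # legacy evaluators now receive an explicit empty config dict
        self.params = params


def create_evaluator(data: Dict[str, Any]) -> Union[CodedEvaluator, LegacyEvaluator]:
    # A top-level "version" of "1.0" selects the coded path; anything else falls back to legacy.
    if data.get("version", None) == "1.0":
        return CodedEvaluator(**{k: v for k, v in data.items() if k != "version"})
    else:
        return LegacyEvaluator(config={}, **data)


print(type(create_evaluator({"version": "1.0", "name": "exact-match"})).__name__)  # CodedEvaluator
print(type(create_evaluator({"name": "exact-match"})).__name__)                    # LegacyEvaluator
```
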
16 changes: 1 addition & 15 deletions src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -1,11 +1,9 @@
from enum import Enum, IntEnum
from typing import Annotated, Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
from pydantic import BaseModel, ConfigDict, Field
from pydantic.alias_generators import to_camel

from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator


class EvaluationSimulationTool(BaseModel):
name: str = Field(..., alias="name")
@@ -216,15 +214,3 @@ def _discriminate_eval_set(
if version == "1.0":
return "evaluation_set"
return "legacy_evaluation_set"


AnyEvaluationSet = Annotated[
Union[
Annotated[EvaluationSet, Tag("evaluation_set")],
Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
],
Discriminator(_discriminate_eval_set),
]

AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]
AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]]
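
For readers unfamiliar with the alias being deleted here: `AnyEvaluationSet` relied on Pydantic's callable-discriminator pattern, with `_discriminate_eval_set` mapping a "version" of "1.0" to the new schema and everything else to the legacy one. A self-contained sketch of that pattern, using hypothetical `EvalSetV1`/`LegacyEvalSet` models in place of the real `EvaluationSet`/`LegacyEvaluationSet`:

```python
# Sketch of the callable-discriminator wiring the removed AnyEvaluationSet alias used.
# EvalSetV1 and LegacyEvalSet are hypothetical stand-ins for the real models.
from typing import Annotated, Any, Union

from pydantic import BaseModel, Discriminator, Tag, TypeAdapter


class EvalSetV1(BaseModel):
    version: str
    name: str


class LegacyEvalSet(BaseModel):
    name: str


def _discriminate(value: Any) -> str:
    # Mirrors _discriminate_eval_set: version "1.0" selects the new schema.
    version = value.get("version") if isinstance(value, dict) else getattr(value, "version", None)
    if version == "1.0":
        return "evaluation_set"
    return "legacy_evaluation_set"


AnyEvalSet = Annotated[
    Union[
        Annotated[EvalSetV1, Tag("evaluation_set")],
        Annotated[LegacyEvalSet, Tag("legacy_evaluation_set")],
    ],
    Discriminator(_discriminate),
]

adapter = TypeAdapter(AnyEvalSet)
print(type(adapter.validate_python({"version": "1.0", "name": "demo"})).__name__)  # EvalSetV1
print(type(adapter.validate_python({"name": "demo"})).__name__)                    # LegacyEvalSet
```
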
13 changes: 7 additions & 6 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -13,11 +13,10 @@

from uipath import UiPath
from uipath._cli._evals._models._evaluation_set import (
AnyEvaluationItem,
AnyEvaluator,
EvaluationItem,
EvaluationStatus,
)
from uipath._cli._evals._models._evaluator import Evaluator
from uipath._cli._evals._models._sw_reporting import (
StudioWebAgentSnapshot,
StudioWebProgressItem,
@@ -133,7 +132,9 @@ def _get_endpoint_prefix(self) -> str:
return "api/"
return "agentsruntime_/api/"

def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool:
def _is_coded_evaluator(
self, evaluators: List[BaseEvaluator[Any, Any, Any]]
) -> bool:
"""Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).

Args:
@@ -235,7 +236,7 @@ async def create_eval_set_run_sw(

@gracefully_handle_errors
async def create_eval_run(
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> str:
"""Create a new evaluation run in StudioWeb.

@@ -262,7 +263,7 @@ async def update_eval_run(
async def update_eval_run(
self,
sw_progress_item: StudioWebProgressItem,
evaluators: dict[str, AnyEvaluator],
evaluators: dict[str, Evaluator],
is_coded: bool = False,
spans: list[Any] | None = None,
):
@@ -704,7 +705,7 @@ def _update_coded_eval_run_spec(
)

def _create_eval_run_spec(
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> RequestSpec:
# Legacy API expects eval IDs as GUIDs, coded accepts strings
# Convert string IDs to deterministic GUIDs for legacy
102 changes: 30 additions & 72 deletions src/uipath/_cli/_evals/_runtime.py
@@ -25,7 +25,7 @@
EvalSetRunUpdatedEvent,
EvaluationEvents,
)
from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
from ...eval.evaluators import BaseEvaluator
from ...eval.models import EvaluationResult
from ...eval.models.models import AgentExecution, EvalItemResult
from .._runtime._contracts import (
@@ -43,11 +43,8 @@
from ..models.runtime_schema import Entrypoint
from ._evaluator_factory import EvaluatorFactory
from ._models._evaluation_set import (
AnyEvaluationItem,
AnyEvaluationSet,
AnyEvaluator,
EvaluationItem,
LegacyEvaluationItem,
EvaluationSet,
)
from ._models._exceptions import EvaluationRuntimeException
from ._models._output import (
@@ -270,8 +267,8 @@ async def execute(self) -> UiPathRuntimeResult:

async def _execute_sequential(
self,
evaluation_set: AnyEvaluationSet,
evaluators: List[AnyEvaluator],
evaluation_set: EvaluationSet,
evaluators: List[BaseEvaluator[Any, Any, Any]],
event_bus: EventBus,
) -> List[EvaluationRunResult]:
all_eval_run_result: list[EvaluationRunResult] = []
@@ -285,13 +282,13 @@ async def _execute_parallel(

async def _execute_parallel(
self,
evaluation_set: AnyEvaluationSet,
evaluators: List[AnyEvaluator],
evaluation_set: EvaluationSet,
evaluators: List[BaseEvaluator[Any, Any, Any]],
event_bus: EventBus,
workers: int,
) -> List[EvaluationRunResult]:
# Create a queue with max concurrency
queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue(
queue: asyncio.Queue[tuple[int, EvaluationItem]] = asyncio.Queue(
maxsize=workers
)

@@ -301,7 +298,7 @@ async def _execute_parallel(
# Producer task to fill the queue
async def producer() -> None:
for index, eval_item in enumerate(evaluation_set.evaluations):
await queue.put((index, eval_item)) # type: ignore[arg-type]
await queue.put((index, eval_item))
# Signal completion by putting None markers
for _ in range(workers):
await queue.put(None) # type: ignore
@@ -343,8 +340,8 @@ async def worker(worker_id: int) -> None:

async def _execute_eval(
self,
eval_item: AnyEvaluationItem,
evaluators: List[AnyEvaluator],
eval_item: EvaluationItem,
evaluators: List[BaseEvaluator[Any, Any, Any]],
event_bus: EventBus,
) -> EvaluationRunResult:
# Generate LLM-based input if input_mocking_strategy is defined
@@ -416,41 +413,21 @@ async def _execute_eval(
evaluation_item_results: list[EvalItemResult] = []

for evaluator in evaluators:
# Determine which evaluator method to use based on evaluation set/item type
evaluation_result: Optional[EvaluationResult] = None

match eval_item:
case LegacyEvaluationItem():
# Legacy evaluation - use run_legacy_evaluator
evaluation_result = await self.run_legacy_evaluator(
evaluator=evaluator, # type: ignore
execution_output=agent_execution_output,
eval_item=eval_item,
)
case EvaluationItem() if (
evaluator.id in eval_item.evaluation_criterias
):
# New evaluation with criteria
evaluation_criteria = eval_item.evaluation_criterias[
evaluator.id
]

evaluation_result = await self.run_evaluator(
evaluator=evaluator, # type: ignore
execution_output=agent_execution_output,
eval_item=eval_item,
evaluation_criteria=evaluator.evaluation_criteria_type( # type: ignore
**evaluation_criteria
)
if evaluation_criteria
else evaluator.evaluator_config.default_evaluation_criteria, # type: ignore
)
case _:
# Skip if evaluator not in evaluation criteria
continue

if evaluation_result is None:
if evaluator.id not in eval_item.evaluation_criterias:
# Skip!
continue
evaluation_criteria = eval_item.evaluation_criterias[evaluator.id]

evaluation_result = await self.run_evaluator(
evaluator=evaluator,
execution_output=agent_execution_output,
eval_item=eval_item,
evaluation_criteria=evaluator.evaluation_criteria_type(
**evaluation_criteria
)
if evaluation_criteria
else evaluator.evaluator_config.default_evaluation_criteria,
)

dto_result = EvaluationResultDto.from_evaluation_result(
evaluation_result
@@ -527,8 +504,8 @@ async def _execute_eval(
return evaluation_run_results

async def _generate_input_for_eval(
self, eval_item: AnyEvaluationItem
) -> AnyEvaluationItem:
self, eval_item: EvaluationItem
) -> EvaluationItem:
"""Use LLM to generate a mock input for an evaluation item."""
generated_input = await generate_llm_input(
eval_item, (await self.get_entrypoint()).input
@@ -549,7 +526,7 @@ def _get_and_clear_execution_data(
return spans, logs

async def execute_runtime(
self, eval_item: AnyEvaluationItem, execution_id: str
self, eval_item: EvaluationItem, execution_id: str
) -> UiPathEvalRunExecutionOutput:
context_args = self.context.model_dump()
context_args["execution_id"] = execution_id
@@ -622,28 +599,9 @@ async def run_evaluator(

return result

async def run_legacy_evaluator(
self,
evaluator: LegacyBaseEvaluator[Any],
execution_output: UiPathEvalRunExecutionOutput,
eval_item: LegacyEvaluationItem,
) -> EvaluationResult:
agent_execution = AgentExecution(
agent_input=eval_item.inputs,
agent_output=execution_output.result.output or {},
agent_trace=execution_output.spans,
expected_agent_behavior=eval_item.expected_agent_behavior,
)

result = await evaluator.evaluate(
agent_execution=agent_execution,
# at the moment evaluation_criteria is always the expected output
evaluation_criteria=eval_item.expected_output,
)

return result

def _load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]:
def _load_evaluators(
self, evaluation_set: EvaluationSet
) -> list[BaseEvaluator[Any, Any, Any]]:
"""Load evaluators referenced by the evaluation set."""
evaluators = []
evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators" # type: ignore
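
The parallel path keeps a bounded `asyncio.Queue`, a producer that enqueues `(index, eval_item)` pairs followed by one `None` sentinel per worker, and workers that exit when they pull a sentinel. A runnable sketch of that pattern; `run_parallel` and `process` are hypothetical stand-ins for `_execute_parallel` and `_execute_eval`, not SDK code:

```python
# Sketch of the bounded producer/worker pattern _execute_parallel builds on.
import asyncio
from typing import Optional


async def run_parallel(items: list[str], workers: int) -> list[str]:
    queue: asyncio.Queue[Optional[tuple[int, str]]] = asyncio.Queue(maxsize=workers)
    results: list[Optional[str]] = [None] * len(items)

    async def producer() -> None:
        for index, item in enumerate(items):
            await queue.put((index, item))
        for _ in range(workers):          # one None sentinel per worker signals completion
            await queue.put(None)

    async def process(item: str) -> str:
        await asyncio.sleep(0.01)         # placeholder for the real evaluation work
        return item.upper()

    async def worker(worker_id: int) -> None:
        while True:
            entry = await queue.get()
            if entry is None:             # sentinel: no more work for this worker
                return
            index, item = entry
            results[index] = await process(item)

    await asyncio.gather(producer(), *(worker(i) for i in range(workers)))
    return [r for r in results if r is not None]


if __name__ == "__main__":
    print(asyncio.run(run_parallel(["a", "b", "c", "d"], workers=2)))
```

Writing each result into its original index keeps the output ordered by evaluation item even when workers finish out of order.
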
6 changes: 3 additions & 3 deletions src/uipath/_cli/_evals/mocks/input_mocker.py
@@ -5,8 +5,8 @@
from typing import Any, Dict

from uipath import UiPath
from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem
from uipath.tracing import traced
from uipath._cli._evals._models._evaluation_set import EvaluationItem
from uipath.tracing._traced import traced

from .mocker import UiPathInputMockingError

@@ -54,7 +54,7 @@ def get_input_mocking_prompt(

@traced(name="__mocker__", recording=False)
async def generate_llm_input(
evaluation_item: AnyEvaluationItem,
evaluation_item: EvaluationItem,
input_schema: Dict[str, Any],
) -> Dict[str, Any]:
"""Generate synthetic input using an LLM based on the evaluation context."""