6 changes: 3 additions & 3 deletions samples/calculator/evals/eval-sets/legacy.json
@@ -19,7 +19,7 @@
"operator": "+"
},
"expectedOutput": {
"result": 2
"result": 2.0
},
"expectedAgentBehavior": "The operation should produce the right output.",
"evalSetId": "default-eval-set-id",
@@ -35,7 +35,7 @@
"operator": "random"
},
"expectedOutput": {
"result": 2
"result": 2.0
},
"expectedAgentBehavior": "The operation should produce the right output.",
"mockingStrategy": {
@@ -71,7 +71,7 @@
"operator": "random"
},
"expectedOutput": {
"result": 2
"result": 2.0
},
"expectedAgentBehavior": "The operation should produce the right output.",
"mockingStrategy": {
4 changes: 2 additions & 2 deletions src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -7,7 +7,6 @@
from rich.rule import Rule
from rich.table import Table

from uipath._cli._evals._models._evaluation_set import AnyEvaluator
from uipath._events._event_bus import EventBus
from uipath._events._events import (
EvalRunCreatedEvent,
@@ -16,6 +15,7 @@
EvalSetRunUpdatedEvent,
EvaluationEvents,
)
from uipath.eval.evaluators import BaseEvaluator
from uipath.eval.models import ScoreType

logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ class ConsoleProgressReporter:

def __init__(self):
self.console = Console()
self.evaluators: Dict[str, AnyEvaluator] = {}
self.evaluators: Dict[str, BaseEvaluator[Any, Any, Any]] = {}
self.display_started = False
self.eval_results_by_name: Dict[str, list[Any]] = {}

14 changes: 7 additions & 7 deletions src/uipath/_cli/_evals/_evaluator_factory.py
@@ -6,7 +6,6 @@
from pydantic import TypeAdapter

from uipath._cli._evals._helpers import try_extract_file_and_class_name # type: ignore
from uipath._cli._evals._models._evaluation_set import AnyEvaluator
from uipath._cli._evals._models._evaluator import (
EqualsEvaluatorParams,
EvaluatorConfig,
@@ -72,10 +71,11 @@ class EvaluatorFactory:
"""Factory class for creating evaluator instances based on configuration."""

@classmethod
def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator:
def create_evaluator(cls, data: Dict[str, Any]) -> BaseEvaluator[Any, Any, Any]:
if data.get("version", None) == "1.0":
return cls._create_evaluator_internal(data)
return cls._create_legacy_evaluator_internal(data)
else:
return cls._create_legacy_evaluator_internal(data)

@staticmethod
def _create_evaluator_internal(
@@ -352,14 +352,14 @@ def _create_legacy_exact_match_evaluator(
params: EqualsEvaluatorParams,
) -> LegacyExactMatchEvaluator:
"""Create a deterministic evaluator."""
return LegacyExactMatchEvaluator(**params.model_dump())
return LegacyExactMatchEvaluator(**params.model_dump(), config={})

@staticmethod
def _create_legacy_json_similarity_evaluator(
params: JsonSimilarityEvaluatorParams,
) -> LegacyJsonSimilarityEvaluator:
"""Create a deterministic evaluator."""
return LegacyJsonSimilarityEvaluator(**params.model_dump())
return LegacyJsonSimilarityEvaluator(**params.model_dump(), config={})

@staticmethod
def _create_legacy_llm_as_judge_evaluator(
@@ -376,7 +376,7 @@ def _create_legacy_llm_as_judge_evaluator(
"'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
)

return LegacyLlmAsAJudgeEvaluator(**params.model_dump())
return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={})

@staticmethod
def _create_legacy_trajectory_evaluator(
Expand All @@ -393,4 +393,4 @@ def _create_legacy_trajectory_evaluator(
"'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
)

return LegacyTrajectoryEvaluator(**params.model_dump())
return LegacyTrajectoryEvaluator(**params.model_dump(), config={})
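
The factory now routes on the optional top-level "version" field and passes an explicit `config={}` to every legacy evaluator it builds. A minimal sketch of that dispatch shape, using hypothetical `CodedEvaluator`/`LegacyEvaluator` stand-ins rather than the real uipath classes:

```python
# Sketch of the version-based dispatch in EvaluatorFactory.create_evaluator.
# CodedEvaluator and LegacyEvaluator are hypothetical stand-ins, not the real uipath classes.
from typing import Any, Dict, Union


class CodedEvaluator:
    def __init__(self, **params: Any) -> None:
        self.params = params


class LegacyEvaluator:
    def __init__(self, config: Dict[str, Any], **params: Any) -> None:
        self.config = config  # legacy evaluators now receive an explicit empty config dict
        self.params = params


def create_evaluator(data: Dict[str, Any]) -> Union[CodedEvaluator, LegacyEvaluator]:
    # A top-level "version" of "1.0" selects the coded path; anything else falls back to legacy.
    if data.get("version", None) == "1.0":
        return CodedEvaluator(**{k: v for k, v in data.items() if k != "version"})
    else:
        return LegacyEvaluator(config={}, **data)


print(type(create_evaluator({"version": "1.0", "name": "exact-match"})).__name__)  # CodedEvaluator
print(type(create_evaluator({"name": "exact-match"})).__name__)                    # LegacyEvaluator
```
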
16 changes: 1 addition & 15 deletions src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -1,11 +1,9 @@
from enum import Enum, IntEnum
from typing import Annotated, Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
from pydantic import BaseModel, ConfigDict, Field
from pydantic.alias_generators import to_camel

from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator


class EvaluationSimulationTool(BaseModel):
name: str = Field(..., alias="name")
@@ -216,15 +214,3 @@ def _discriminate_eval_set(
if version == "1.0":
return "evaluation_set"
return "legacy_evaluation_set"


AnyEvaluationSet = Annotated[
Union[
Annotated[EvaluationSet, Tag("evaluation_set")],
Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
],
Discriminator(_discriminate_eval_set),
]

AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]
AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]]
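
For readers unfamiliar with the alias being deleted here: `AnyEvaluationSet` relied on Pydantic's callable-discriminator pattern, with `_discriminate_eval_set` mapping a "version" of "1.0" to the new schema and everything else to the legacy one. A self-contained sketch of that pattern, using hypothetical `EvalSetV1`/`LegacyEvalSet` models in place of the real `EvaluationSet`/`LegacyEvaluationSet`:

```python
# Sketch of the callable-discriminator wiring the removed AnyEvaluationSet alias used.
# EvalSetV1 and LegacyEvalSet are hypothetical stand-ins for the real models.
from typing import Annotated, Any, Union

from pydantic import BaseModel, Discriminator, Tag, TypeAdapter


class EvalSetV1(BaseModel):
    version: str
    name: str


class LegacyEvalSet(BaseModel):
    name: str


def _discriminate(value: Any) -> str:
    # Mirrors _discriminate_eval_set: version "1.0" selects the new schema.
    version = value.get("version") if isinstance(value, dict) else getattr(value, "version", None)
    if version == "1.0":
        return "evaluation_set"
    return "legacy_evaluation_set"


AnyEvalSet = Annotated[
    Union[
        Annotated[EvalSetV1, Tag("evaluation_set")],
        Annotated[LegacyEvalSet, Tag("legacy_evaluation_set")],
    ],
    Discriminator(_discriminate),
]

adapter = TypeAdapter(AnyEvalSet)
print(type(adapter.validate_python({"version": "1.0", "name": "demo"})).__name__)  # EvalSetV1
print(type(adapter.validate_python({"name": "demo"})).__name__)                    # LegacyEvalSet
```
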
13 changes: 7 additions & 6 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -13,11 +13,10 @@

from uipath import UiPath
from uipath._cli._evals._models._evaluation_set import (
AnyEvaluationItem,
AnyEvaluator,
EvaluationItem,
EvaluationStatus,
)
from uipath._cli._evals._models._evaluator import Evaluator
from uipath._cli._evals._models._sw_reporting import (
StudioWebAgentSnapshot,
StudioWebProgressItem,
@@ -133,7 +132,9 @@ def _get_endpoint_prefix(self) -> str:
return "api/"
return "agentsruntime_/api/"

def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool:
def _is_coded_evaluator(
self, evaluators: List[BaseEvaluator[Any, Any, Any]]
) -> bool:
"""Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).

Args:
@@ -235,7 +236,7 @@ async def create_eval_set_run_sw(

@gracefully_handle_errors
async def create_eval_run(
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> str:
"""Create a new evaluation run in StudioWeb.

@@ -262,7 +263,7 @@ async def update_eval_run(
async def update_eval_run(
self,
sw_progress_item: StudioWebProgressItem,
evaluators: dict[str, AnyEvaluator],
evaluators: dict[str, Evaluator],
is_coded: bool = False,
spans: list[Any] | None = None,
):
@@ -704,7 +705,7 @@ def _update_coded_eval_run_spec(
)

def _create_eval_run_spec(
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> RequestSpec:
# Legacy API expects eval IDs as GUIDs, coded accepts strings
# Convert string IDs to deterministic GUIDs for legacy
102 changes: 30 additions & 72 deletions src/uipath/_cli/_evals/_runtime.py
@@ -25,7 +25,7 @@
EvalSetRunUpdatedEvent,
EvaluationEvents,
)
from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
from ...eval.evaluators import BaseEvaluator
from ...eval.models import EvaluationResult
from ...eval.models.models import AgentExecution, EvalItemResult
from .._runtime._contracts import (
@@ -43,11 +43,8 @@
from ..models.runtime_schema import Entrypoint
from ._evaluator_factory import EvaluatorFactory
from ._models._evaluation_set import (
AnyEvaluationItem,
AnyEvaluationSet,
AnyEvaluator,
EvaluationItem,
LegacyEvaluationItem,
EvaluationSet,
)
from ._models._exceptions import EvaluationRuntimeException
from ._models._output import (
@@ -270,8 +267,8 @@ async def execute(self) -> UiPathRuntimeResult:

async def _execute_sequential(
self,
evaluation_set: AnyEvaluationSet,
evaluators: List[AnyEvaluator],
evaluation_set: EvaluationSet,
evaluators: List[BaseEvaluator[Any, Any, Any]],
event_bus: EventBus,
) -> List[EvaluationRunResult]:
all_eval_run_result: list[EvaluationRunResult] = []
@@ -285,13 +282,13 @@ async def _execute_parallel(

async def _execute_parallel(
self,
evaluation_set: AnyEvaluationSet,
evaluators: List[AnyEvaluator],
evaluation_set: EvaluationSet,
evaluators: List[BaseEvaluator[Any, Any, Any]],
event_bus: EventBus,
workers: int,
) -> List[EvaluationRunResult]:
# Create a queue with max concurrency
queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue(
queue: asyncio.Queue[tuple[int, EvaluationItem]] = asyncio.Queue(
maxsize=workers
)

@@ -301,7 +298,7 @@ async def _execute_parallel(
# Producer task to fill the queue
async def producer() -> None:
for index, eval_item in enumerate(evaluation_set.evaluations):
await queue.put((index, eval_item)) # type: ignore[arg-type]
await queue.put((index, eval_item))
# Signal completion by putting None markers
for _ in range(workers):
await queue.put(None) # type: ignore
@@ -343,8 +340,8 @@ async def worker(worker_id: int) -> None:

async def _execute_eval(
self,
eval_item: AnyEvaluationItem,
evaluators: List[AnyEvaluator],
eval_item: EvaluationItem,
evaluators: List[BaseEvaluator[Any, Any, Any]],
event_bus: EventBus,
) -> EvaluationRunResult:
# Generate LLM-based input if input_mocking_strategy is defined
@@ -416,41 +413,21 @@ async def _execute_eval(
evaluation_item_results: list[EvalItemResult] = []

for evaluator in evaluators:
# Determine which evaluator method to use based on evaluation set/item type
evaluation_result: Optional[EvaluationResult] = None

match eval_item:
case LegacyEvaluationItem():
# Legacy evaluation - use run_legacy_evaluator
evaluation_result = await self.run_legacy_evaluator(
evaluator=evaluator, # type: ignore
execution_output=agent_execution_output,
eval_item=eval_item,
)
case EvaluationItem() if (
evaluator.id in eval_item.evaluation_criterias
):
# New evaluation with criteria
evaluation_criteria = eval_item.evaluation_criterias[
evaluator.id
]

evaluation_result = await self.run_evaluator(
evaluator=evaluator, # type: ignore
execution_output=agent_execution_output,
eval_item=eval_item,
evaluation_criteria=evaluator.evaluation_criteria_type( # type: ignore
**evaluation_criteria
)
if evaluation_criteria
else evaluator.evaluator_config.default_evaluation_criteria, # type: ignore
)
case _:
# Skip if evaluator not in evaluation criteria
continue

if evaluation_result is None:
if evaluator.id not in eval_item.evaluation_criterias:
# Skip!
continue
evaluation_criteria = eval_item.evaluation_criterias[evaluator.id]

evaluation_result = await self.run_evaluator(
evaluator=evaluator,
execution_output=agent_execution_output,
eval_item=eval_item,
evaluation_criteria=evaluator.evaluation_criteria_type(
**evaluation_criteria
)
if evaluation_criteria
else evaluator.evaluator_config.default_evaluation_criteria,
)

dto_result = EvaluationResultDto.from_evaluation_result(
evaluation_result
@@ -527,8 +504,8 @@ async def _execute_eval(
return evaluation_run_results

async def _generate_input_for_eval(
self, eval_item: AnyEvaluationItem
) -> AnyEvaluationItem:
self, eval_item: EvaluationItem
) -> EvaluationItem:
"""Use LLM to generate a mock input for an evaluation item."""
generated_input = await generate_llm_input(
eval_item, (await self.get_entrypoint()).input
@@ -549,7 +526,7 @@ def _get_and_clear_execution_data(
return spans, logs

async def execute_runtime(
self, eval_item: AnyEvaluationItem, execution_id: str
self, eval_item: EvaluationItem, execution_id: str
) -> UiPathEvalRunExecutionOutput:
context_args = self.context.model_dump()
context_args["execution_id"] = execution_id
@@ -622,28 +599,9 @@ async def run_evaluator(

return result

async def run_legacy_evaluator(
self,
evaluator: LegacyBaseEvaluator[Any],
execution_output: UiPathEvalRunExecutionOutput,
eval_item: LegacyEvaluationItem,
) -> EvaluationResult:
agent_execution = AgentExecution(
agent_input=eval_item.inputs,
agent_output=execution_output.result.output or {},
agent_trace=execution_output.spans,
expected_agent_behavior=eval_item.expected_agent_behavior,
)

result = await evaluator.evaluate(
agent_execution=agent_execution,
# at the moment evaluation_criteria is always the expected output
evaluation_criteria=eval_item.expected_output,
)

return result

def _load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]:
def _load_evaluators(
self, evaluation_set: EvaluationSet
) -> list[BaseEvaluator[Any, Any, Any]]:
"""Load evaluators referenced by the evaluation set."""
evaluators = []
evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators" # type: ignore
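
The parallel path keeps a bounded `asyncio.Queue`, a producer that enqueues `(index, eval_item)` pairs followed by one `None` sentinel per worker, and workers that exit when they pull a sentinel. A runnable sketch of that pattern; `run_parallel` and `process` are hypothetical stand-ins for `_execute_parallel` and `_execute_eval`, not SDK code:

```python
# Sketch of the bounded producer/worker pattern _execute_parallel builds on.
import asyncio
from typing import Optional


async def run_parallel(items: list[str], workers: int) -> list[str]:
    queue: asyncio.Queue[Optional[tuple[int, str]]] = asyncio.Queue(maxsize=workers)
    results: list[Optional[str]] = [None] * len(items)

    async def producer() -> None:
        for index, item in enumerate(items):
            await queue.put((index, item))
        for _ in range(workers):          # one None sentinel per worker signals completion
            await queue.put(None)

    async def process(item: str) -> str:
        await asyncio.sleep(0.01)         # placeholder for the real evaluation work
        return item.upper()

    async def worker(worker_id: int) -> None:
        while True:
            entry = await queue.get()
            if entry is None:             # sentinel: no more work for this worker
                return
            index, item = entry
            results[index] = await process(item)

    await asyncio.gather(producer(), *(worker(i) for i in range(workers)))
    return [r for r in results if r is not None]


if __name__ == "__main__":
    print(asyncio.run(run_parallel(["a", "b", "c", "d"], workers=2)))
```

Writing each result into its original index keeps the output ordered by evaluation item even when workers finish out of order.
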
6 changes: 3 additions & 3 deletions src/uipath/_cli/_evals/mocks/input_mocker.py
@@ -5,8 +5,8 @@
from typing import Any, Dict

from uipath import UiPath
from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem
from uipath.tracing import traced
from uipath._cli._evals._models._evaluation_set import EvaluationItem
from uipath.tracing._traced import traced

from .mocker import UiPathInputMockingError

@@ -54,7 +54,7 @@ def get_input_mocking_prompt(

@traced(name="__mocker__", recording=False)
async def generate_llm_input(
evaluation_item: AnyEvaluationItem,
evaluation_item: EvaluationItem,
input_schema: Dict[str, Any],
) -> Dict[str, Any]:
"""Generate synthetic input using an LLM based on the evaluation context."""