1 change: 1 addition & 0 deletions hello
@@ -0,0 +1 @@
hello.txt
43 changes: 40 additions & 3 deletions src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -29,6 +29,7 @@ def __init__(self):
self.evaluators: Dict[str, AnyEvaluator] = {}
self.display_started = False
self.eval_results_by_name: Dict[str, list[Any]] = {}
self.evaluator_weights: Dict[str, float] = {}

def _convert_score_to_numeric(self, eval_result) -> float:
"""Convert evaluation result score to numeric value."""
@@ -99,6 +100,8 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N
"""Handle evaluation set run creation."""
try:
self.evaluators = {eval.id: eval for eval in payload.evaluators}
if payload.evaluator_weights:
self.evaluator_weights = payload.evaluator_weights
except Exception as e:
logger.error(f"Failed to handle create eval set run event: {e}")

@@ -206,9 +209,20 @@ def display_final_results(self):

summary_table.add_row(*row_values)

# Add separator row before average
# Add separator row before weights and average
summary_table.add_section()

# Add weights row if weights are defined
if self.evaluator_weights:
weight_row_values = ["[bold]Weights[/bold]"]
for evaluator_id in evaluator_ids:
weight = self.evaluator_weights.get(evaluator_id, "-")
if weight != "-":
weight_row_values.append(f"[bold]{weight:.1f}[/bold]")
else:
weight_row_values.append("[bold]-[/bold]")
summary_table.add_row(*weight_row_values)

# Add average row
avg_row_values = ["[bold]Average[/bold]"]
for evaluator_id in evaluator_ids:
@@ -217,8 +231,31 @@ def display_final_results(self):

summary_table.add_row(*avg_row_values)

self.console.print(summary_table)
self.console.print()
# Calculate and display weighted final score if weights are defined
if self.evaluator_weights:
weighted_total = 0.0
weights_sum = 0.0
for evaluator_id in evaluator_ids:
weight = self.evaluator_weights.get(evaluator_id)
if weight is not None:
avg_score = self.final_results[evaluator_id]
weighted_total += weight * avg_score
weights_sum += weight

# Display as a separate info line
self.console.print(summary_table)
self.console.print()
self.console.print(
f"[bold cyan]Weighted Final Score:[/bold cyan] [bold green]{weighted_total:.2f}[/bold green]"
)
if weights_sum != 1.0:
self.console.print(
f"[dim](Note: Weights sum to {weights_sum:.2f})[/dim]"
)
self.console.print()
else:
self.console.print(summary_table)
self.console.print()
else:
self.console.print(
"→ [bold green]All evaluations completed successfully![/bold green]"
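For context on the new weighted summary: the score printed above is a plain weight-times-average sum over the evaluators that have an entry in the weights mapping, with no normalization by the weight sum (hence the note when the weights do not add up to 1.0). A minimal standalone sketch of that arithmetic, using made-up evaluator IDs, averages, and weights:

# Minimal sketch of the weighted-score arithmetic above; evaluator IDs,
# averages, and weights are made up for illustration.
final_results = {"exact-match": 0.9, "llm-judge": 0.7, "latency": 0.95}
evaluator_weights = {"exact-match": 0.5, "llm-judge": 0.3}  # "latency" has no weight

weighted_total = 0.0
weights_sum = 0.0
for evaluator_id, avg_score in final_results.items():
    weight = evaluator_weights.get(evaluator_id)
    if weight is not None:  # unweighted evaluators are skipped entirely
        weighted_total += weight * avg_score
        weights_sum += weight

print(f"Weighted Final Score: {weighted_total:.2f}")  # 0.5*0.9 + 0.3*0.7 = 0.66
if weights_sum != 1.0:
    print(f"(Note: Weights sum to {weights_sum:.2f})")  # 0.80 here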
3 changes: 3 additions & 0 deletions src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -160,6 +160,9 @@ class EvaluationSet(BaseModel):
version: Literal["1.0"] = "1.0"
evaluator_refs: List[str] = Field(default_factory=list)
evaluations: List[EvaluationItem] = Field(default_factory=list)
evaluator_weights: Optional[Dict[str, float]] = Field(
default=None, alias="evaluatorWeights"
)

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
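For reference, the new evaluator_weights field (serialized as evaluatorWeights) lets an evaluation set carry a per-evaluator weight keyed by evaluator ID. A hypothetical fragment of such a set, written as a Python dict; the evaluator IDs and the surrounding keys are illustrative only, and a real set file contains additional fields not shown here:

# Hypothetical evaluation-set fragment showing the new "evaluatorWeights"
# key; evaluator IDs and neighbouring keys are illustrative, not the full schema.
eval_set_fragment = {
    "version": "1.0",
    "evaluatorRefs": ["exact-match", "llm-judge"],
    "evaluatorWeights": {
        "exact-match": 0.6,  # keys are evaluator IDs
        "llm-judge": 0.4,    # values are that evaluator's weight
    },
}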
8 changes: 7 additions & 1 deletion src/uipath/_cli/_evals/_models/_output.py
@@ -46,7 +46,7 @@ class EvaluationResultDto(BaseModel):
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

score: float
details: Optional[str | BaseModel] = None
details: Optional[str | Dict[str, Any] | BaseModel] = None
evaluation_time: Optional[float] = None

@model_serializer(mode="wrap")
@@ -56,6 +57,7 @@ def serialize_model(
info: core_schema.SerializationInfo,
) -> Any:
data = serializer(self)
# Only remove details if it's None, keep empty dicts and populated dicts
if self.details is None and isinstance(data, dict):
data.pop("details", None)
return data
@@ -85,6 +86,8 @@ class EvaluationRunResultDto(BaseModel):

evaluator_name: str
evaluator_id: str
evaluator_type: Optional[str] = None
node_id: Optional[str] = None
result: EvaluationResultDto


@@ -93,6 +96,7 @@ class EvaluationRunResult(BaseModel):

evaluation_name: str
evaluation_run_results: List[EvaluationRunResultDto]
workflow: Optional[List[str]] = None
agent_execution_output: Optional[UiPathSerializableEvalRunExecutionOutput] = None

@property
@@ -110,6 +114,8 @@ class UiPathEvalOutput(BaseModel):

evaluation_set_name: str
evaluation_set_results: List[EvaluationRunResult]
weighted_final_score: Optional[float] = None
evaluator_weights: Optional[Dict[str, float]] = None

@property
def score(self) -> float:
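Two things worth noting in the output models: details now also accepts a plain dict, and the wrap serializer only drops details when it is None, so empty dicts survive serialization. A quick sketch of that behaviour; the import path is inferred from the file path above, and the dumped output in the comments is what Pydantic's default model_dump() is expected to produce:

# Sketch of EvaluationResultDto serialization: a None `details` is dropped,
# an empty dict is kept. Module path assumed from src/uipath/_cli/_evals/_models/_output.py.
from uipath._cli._evals._models._output import EvaluationResultDto

with_details = EvaluationResultDto(score=1.0, details={})
without_details = EvaluationResultDto(score=1.0)

print(with_details.model_dump())     # e.g. {'score': 1.0, 'details': {}, 'evaluation_time': None}
print(without_details.model_dump())  # e.g. {'score': 1.0, 'evaluation_time': None}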
23 changes: 17 additions & 6 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -329,10 +329,11 @@ async def update_eval_set_run(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
weighted_final_score: float | None = None,
):
"""Update the evaluation set run status to complete."""
spec = self._update_eval_set_run_spec(
eval_set_run_id, evaluator_scores, is_coded
eval_set_run_id, evaluator_scores, is_coded, weighted_final_score
)
await self._client.request_async(
method=spec.method,
@@ -452,6 +453,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
eval_set_run_id,
payload.evaluator_scores,
is_coded=is_coded,
weighted_final_score=payload.weighted_final_score,
)
logger.debug(
f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})"
@@ -797,6 +799,7 @@ def _update_eval_set_run_spec(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
weighted_final_score: float | None = None,
) -> RequestSpec:
# Legacy API expects evaluatorId as GUID, coded accepts string
evaluator_scores_list = []
@@ -820,16 +823,24 @@

# For legacy evaluations, endpoint is without /coded
endpoint_suffix = "coded/" if is_coded else ""

# Build the JSON payload
json_payload = {
"evalSetRunId": eval_set_run_id,
"status": EvaluationStatus.COMPLETED.value,
"evaluatorScores": evaluator_scores_list,
}

# Add weighted final score if available
if weighted_final_score is not None:
json_payload["weightedFinalScore"] = weighted_final_score

return RequestSpec(
method="PUT",
endpoint=Endpoint(
f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun"
),
json={
"evalSetRunId": eval_set_run_id,
"status": EvaluationStatus.COMPLETED.value,
"evaluatorScores": evaluator_scores_list,
},
json=json_payload,
headers=self._tenant_header(),
)

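Finally, the reporter plumbing: update_eval_set_run now forwards an optional weighted_final_score into the PUT body, and the "weightedFinalScore" key is only added when the value is not None, so existing callers are unaffected. A hypothetical call showing the flow; the reporter instance, run ID, and scores below are made up:

# Hypothetical usage inside an async context; `reporter`, the run ID,
# and the scores are illustrative only.
await reporter.update_eval_set_run(
    eval_set_run_id="run-123",
    evaluator_scores={"exact-match": 0.9, "llm-judge": 0.7},
    is_coded=True,
    weighted_final_score=0.66,
)
# The spec builder adds "weightedFinalScore": 0.66 to the JSON payload only
# because the argument is not None; omit it and the request body is unchanged.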