Commit 4a86579

ankursharmas authored and copybara-github committed
feat: Added a new FastAPI endpoint to serve metric info
This endpoint could be used by ADK Web to dynamically know:
- What metrics are available in an app
- A description of those metrics
- The value range supported by those metrics

PiperOrigin-RevId: 786796446
1 parent c8f8b4a commit 4a86579

16 files changed: +394 −54 lines changed

src/google/adk/cli/adk_web_server.py

Lines changed: 20 additions & 1 deletion
```diff
@@ -48,7 +48,6 @@
 from typing_extensions import override
 from watchdog.observers import Observer
 
-from . import agent_graph
 from ..agents.live_request_queue import LiveRequest
 from ..agents.live_request_queue import LiveRequestQueue
 from ..agents.run_config import RunConfig
@@ -64,6 +63,7 @@
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import MetricInfo
 from ..evaluation.eval_result import EvalSetResult
 from ..evaluation.eval_set_results_manager import EvalSetResultsManager
 from ..evaluation.eval_sets_manager import EvalSetsManager
@@ -72,6 +72,7 @@
 from ..runners import Runner
 from ..sessions.base_session_service import BaseSessionService
 from ..sessions.session import Session
+from . import agent_graph
 from .cli_eval import EVAL_SESSION_ID_PREFIX
 from .cli_eval import EvalStatus
 from .utils import cleanup
@@ -697,6 +698,24 @@ def list_eval_results(app_name: str) -> list[str]:
       """Lists all eval results for the given app."""
       return self.eval_set_results_manager.list_eval_set_results(app_name)
 
+    @app.get(
+        "/apps/{app_name}/eval_metrics",
+        response_model_exclude_none=True,
+    )
+    def list_eval_metrics(app_name: str) -> list[MetricInfo]:
+      """Lists all eval metrics for the given app."""
+      try:
+        from ..evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
+
+        # Right now we ignore the app_name as eval metrics are not tied to the
+        # app_name, but they could be moving forward.
+        return DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics()
+      except ModuleNotFoundError as e:
+        logger.exception("%s\n%s", MISSING_EVAL_DEPENDENCIES_MESSAGE, e)
+        raise HTTPException(
+            status_code=400, detail=MISSING_EVAL_DEPENDENCIES_MESSAGE
+        ) from e
+
     @app.delete("/apps/{app_name}/users/{user_id}/sessions/{session_id}")
     async def delete_session(app_name: str, user_id: str, session_id: str):
       await self.session_service.delete_session(
```
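
For context, here is a minimal sketch of how a client such as ADK Web could consume the new endpoint. It assumes a dev server is already running locally (for example one started with `adk api_server`) on port 8000 and that an app named `my_app` exists; the host, port, and app name are placeholders, not part of the commit.

```python
import requests

resp = requests.get("http://localhost:8000/apps/my_app/eval_metrics")
resp.raise_for_status()

for metric in resp.json():
  # Top-level keys arrive in camelCase (metricName, metricValueInfo) because
  # MetricInfo uses a to_camel alias generator; see eval_metrics.py below.
  print(metric["metricName"], "-", metric.get("description", ""))
  print("  value info:", metric.get("metricValueInfo"))
```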

src/google/adk/evaluation/eval_metrics.py

Lines changed: 93 additions & 16 deletions
```diff
@@ -49,16 +49,22 @@ class JudgeModelOptions(BaseModel):
 
   judge_model: str = Field(
       default="gemini-2.5-flash",
-      description="""The judge model to use for evaluation. It can be a model name.""",
+      description=(
+          "The judge model to use for evaluation. It can be a model name."
+      ),
   )
 
   judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
-      default=None, description="""The configuration for the judge model."""
+      default=None,
+      description="The configuration for the judge model.",
   )
 
   num_samples: Optional[int] = Field(
       default=None,
-      description="""The number of times to sample the model for each invocation evaluation.""",
+      description=(
+          "The number of times to sample the model for each invocation"
+          " evaluation."
+      ),
   )
 
 
@@ -70,15 +76,20 @@ class EvalMetric(BaseModel):
       populate_by_name=True,
   )
 
-  metric_name: str
-  """The name of the metric."""
+  metric_name: str = Field(
+      description="The name of the metric.",
+  )
 
-  threshold: float
-  """A threshold value. Each metric decides how to interpret this threshold."""
+  threshold: float = Field(
+      description=(
+          "A threshold value. Each metric decides how to interpret this"
+          " threshold."
+      ),
+  )
 
   judge_model_options: Optional[JudgeModelOptions] = Field(
       default=None,
-      description="""Options for the judge model.""",
+      description="Options for the judge model.",
   )
 
 
@@ -90,8 +101,14 @@ class EvalMetricResult(EvalMetric):
       populate_by_name=True,
   )
 
-  score: Optional[float] = None
-  eval_status: EvalStatus
+  score: Optional[float] = Field(
+      default=None,
+      description=(
+          "Score obtained after evaluating the metric. Optional, as evaluation"
+          " might not have happened."
+      ),
+  )
+  eval_status: EvalStatus = Field(description="The status of this evaluation.")
 
 
 class EvalMetricResultPerInvocation(BaseModel):
@@ -102,11 +119,71 @@ class EvalMetricResultPerInvocation(BaseModel):
       populate_by_name=True,
   )
 
-  actual_invocation: Invocation
-  """The actual invocation, usually obtained by inferencing the agent."""
+  actual_invocation: Invocation = Field(
+      description=(
+          "The actual invocation, usually obtained by inferencing the agent."
+      )
+  )
+
+  expected_invocation: Invocation = Field(
+      description=(
+          "The expected invocation, usually the reference or golden invocation."
+      )
+  )
 
-  expected_invocation: Invocation
-  """The expected invocation, usually the reference or golden invocation."""
+  eval_metric_results: list[EvalMetricResult] = Field(
+      default=[],
+      description="Eval resutls for each applicable metric.",
+  )
+
+
+class Interval(BaseModel):
+  """Represents a range of numeric values, e.g. [0 ,1] or (2,3) or [-1, 6)."""
+
+  min_value: float = Field(description="The smaller end of the interval.")
+
+  open_at_min: bool = Field(
+      default=False,
+      description=(
+          "The interval is Open on the min end. The default value is False,"
+          " which means that we assume that the interval is Closed."
+      ),
+  )
+
+  max_value: float = Field(description="The larger end of the interval.")
+
+  open_at_max: bool = Field(
+      default=False,
+      description=(
+          "The interval is Open on the max end. The default value is False,"
+          " which means that we assume that the interval is Closed."
+      ),
+  )
 
-  eval_metric_results: list[EvalMetricResult] = []
-  """Eval resutls for each applicable metric."""
+
+class MetricValueInfo(BaseModel):
+  """Information about the type of metric value."""
+
+  interval: Optional[Interval] = Field(
+      default=None,
+      description="The values represented by the metric are of type interval.",
+  )
+
+
+class MetricInfo(BaseModel):
+  """Information about the metric that are used for Evals."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  metric_name: str = Field(description="The name of the metric.")
+
+  description: str = Field(
+      default=None, description="A 2 to 3 line description of the metric."
+  )
+
+  metric_value_info: MetricValueInfo = Field(
+      description="Information on the nature of values supported by the metric."
+  )
```
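
As a quick illustration of the new models, the sketch below (not part of the commit) builds a `MetricInfo` by hand and serializes it the way the endpoint would; the metric name and description are made up for this example.

```python
from google.adk.evaluation.eval_metrics import Interval
from google.adk.evaluation.eval_metrics import MetricInfo
from google.adk.evaluation.eval_metrics import MetricValueInfo

# "my_custom_metric" is a hypothetical name used only for this illustration.
info = MetricInfo(
    metric_name="my_custom_metric",
    description="Scores a hypothetical custom check; higher is better.",
    metric_value_info=MetricValueInfo(
        interval=Interval(min_value=0.0, max_value=1.0)
    ),
)

# by_alias=True emits the camelCase keys (metricName, metricValueInfo) defined
# by MetricInfo's alias generator; exclude_none mirrors the endpoint's
# response_model_exclude_none=True behaviour.
print(info.model_dump(by_alias=True, exclude_none=True))
```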

src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 22 additions & 1 deletion
```diff
@@ -22,18 +22,39 @@
 
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricValueInfo
+from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
 
 
 class RougeEvaluator(Evaluator):
-  """Calculates the ROUGE-1 metric to compare responses."""
+  """Evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric.
+
+  Value range for this metric is [0,1], with values closer to 1 more desirable.
+  """
 
   def __init__(self, eval_metric: EvalMetric):
     self._eval_metric = eval_metric
 
+  @staticmethod
+  def get_metric_info() -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+        description=(
+            "This metric evaluates if the agent's final response matches a"
+            " golden/expected final response using Rouge_1 metric. Value range"
+            " for this metric is [0,1], with values closer to 1 more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
   @override
   def evaluate_invocations(
       self,
```
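
The `[0,1]` interval declared above corresponds to a ROUGE-1 style score. The sketch below (not from the commit) shows what such a score looks like using the third-party `rouge_score` package; whether that exact package backs `RougeEvaluator` is an assumption here, and the example strings are invented.

```python
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
reference = "The capital of France is Paris."
candidate = "Paris is the capital of France."

# score(target, prediction) returns a dict keyed by rouge type; fmeasure is
# the unigram F1, which always falls inside the declared [0, 1] interval.
result = scorer.score(reference, candidate)
print(result["rouge1"].fmeasure)
```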

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 19 additions & 2 deletions
```diff
@@ -24,6 +24,10 @@
 from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricValueInfo
+from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
@@ -146,6 +150,20 @@ def __init__(
     if self._eval_metric.judge_model_options.num_samples is None:
       self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES
 
+  @staticmethod
+  def get_metric_info() -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+        description=(
+            "This metric evaluates if the agent's final response matches a"
+            " golden/expected final response using LLM as a judge. Value range"
+            " for this metric is [0,1], with values closer to 1 more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
   @override
   def format_auto_rater_prompt(
       self, actual_invocation: Invocation, expected_invocation: Invocation
@@ -185,8 +203,7 @@ def aggregate_per_invocation_samples(
       tie, consider the result to be invalid.
 
     Args:
-      per_invocation_samples: Samples of per-invocation results to
-        aggregate.
+      per_invocation_samples: Samples of per-invocation results to aggregate.
 
     Returns:
       If there is a majority of valid results, return the first valid result.
```
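
Since the evaluators now expose `get_metric_info()`, a caller can use the declared interval to sanity-check scores. A hedged sketch, assuming the eval dependencies are installed; the `in_declared_range` helper is illustrative and not part of ADK.

```python
from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator

info = FinalResponseMatchV2Evaluator.get_metric_info()
interval = info.metric_value_info.interval


def in_declared_range(score: float) -> bool:
  """Hypothetical helper: checks a score against the declared interval."""
  above_min = (
      score > interval.min_value if interval.open_at_min else score >= interval.min_value
  )
  below_max = (
      score < interval.max_value if interval.open_at_max else score <= interval.max_value
  )
  return above_min and below_max


print(info.metric_name, in_declared_range(0.85), in_declared_range(1.5))
```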

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 30 additions & 10 deletions
```diff
@@ -17,7 +17,9 @@
 import logging
 
 from ..errors.not_found_error import NotFoundError
+from ..utils.feature_decorator import experimental
 from .eval_metrics import EvalMetric
+from .eval_metrics import MetricInfo
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
@@ -29,10 +31,11 @@
 logger = logging.getLogger("google_adk." + __name__)
 
 
+@experimental
 class MetricEvaluatorRegistry:
   """A registry for metric Evaluators."""
 
-  _registry: dict[str, type[Evaluator]] = {}
+  _registry: dict[str, tuple[type[Evaluator], MetricInfo]] = {}
 
   def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
     """Returns an Evaluator for the given metric.
@@ -48,15 +51,18 @@ def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator:
     if eval_metric.metric_name not in self._registry:
       raise NotFoundError(f"{eval_metric.metric_name} not found in registry.")
 
-    return self._registry[eval_metric.metric_name](eval_metric=eval_metric)
+    return self._registry[eval_metric.metric_name][0](eval_metric=eval_metric)
 
   def register_evaluator(
-      self, metric_name: MetricName, evaluator: type[Evaluator]
+      self,
+      metric_info: MetricInfo,
+      evaluator: type[Evaluator],
   ):
-    """Registers an evaluator given the metric name.
+    """Registers an evaluator given the metric info.
 
     If a mapping already exist, then it is updated.
     """
+    metric_name = metric_info.metric_name
     if metric_name in self._registry:
       logger.info(
           "Updating Evaluator class for %s from %s to %s",
@@ -65,31 +71,45 @@ def register_evaluator(
         evaluator,
     )
 
-    self._registry[str(metric_name)] = evaluator
+    self._registry[str(metric_name)] = (evaluator, metric_info)
+
+  def get_registered_metrics(
+      self,
+  ) -> list[MetricInfo]:
+    """Returns a list of MetricInfo about the metrics registered so far."""
+    return [
+        evaluator_and_metric_info[1].model_copy(deep=True)
+        for _, evaluator_and_metric_info in self._registry.items()
+    ]
 
 
 def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   """Returns an instance of MetricEvaluatorRegistry with standard metrics already registered in it."""
   metric_evaluator_registry = MetricEvaluatorRegistry()
 
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      metric_info=TrajectoryEvaluator.get_metric_info(),
      evaluator=TrajectoryEvaluator,
   )
+
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      metric_info=ResponseEvaluator.get_metric_info(
+          PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value
+      ),
       evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      metric_info=ResponseEvaluator.get_metric_info(
+          PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
+      ),
       evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      metric_info=SafetyEvaluatorV1.get_metric_info(),
       evaluator=SafetyEvaluatorV1,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
       evaluator=FinalResponseMatchV2Evaluator,
   )
```
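
To show how the registry changes fit together, here is a sketch (not from the commit) that registers a hypothetical custom evaluator with its `MetricInfo` and then lists the registered metrics, which is exactly what the new `/apps/{app_name}/eval_metrics` endpoint returns. The `KeywordPresenceEvaluator` class and its metric name are invented for illustration.

```python
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import Interval
from google.adk.evaluation.eval_metrics import MetricInfo
from google.adk.evaluation.eval_metrics import MetricValueInfo
from google.adk.evaluation.evaluator import Evaluator
from google.adk.evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY


class KeywordPresenceEvaluator(Evaluator):
  """Hypothetical evaluator; evaluate_invocations is omitted for brevity."""

  def __init__(self, eval_metric: EvalMetric):
    self._eval_metric = eval_metric

  @staticmethod
  def get_metric_info() -> MetricInfo:
    return MetricInfo(
        metric_name="keyword_presence",  # hypothetical metric name
        description="Checks whether required keywords appear in the response.",
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    )


# Register the class alongside its MetricInfo, then list everything the
# registry knows about, mirroring what list_eval_metrics serves to ADK Web.
DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
    metric_info=KeywordPresenceEvaluator.get_metric_info(),
    evaluator=KeywordPresenceEvaluator,
)

for metric_info in DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics():
  print(metric_info.metric_name)
```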
