diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index d122c2150..b9630103b 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -15,6 +15,7 @@ from __future__ import annotations
 
 import importlib.util
+import inspect
 import json
 import logging
 import os
@@ -31,6 +32,7 @@ from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
@@ -42,6 +44,7 @@ TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 SAFETY_V1_KEY = "safety_v1"
+FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 # This evaluation is not very stable.
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -191,10 +194,16 @@ async def run_evals(
       for eval_metric in eval_metrics:
         metric_evaluator = _get_evaluator(eval_metric)
 
-        evaluation_result = metric_evaluator.evaluate_invocations(
-            actual_invocations=inference_result,
-            expected_invocations=eval_case.conversation,
-        )
+        if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+          evaluation_result = await metric_evaluator.evaluate_invocations(
+              actual_invocations=inference_result,
+              expected_invocations=eval_case.conversation,
+          )
+        else:
+          evaluation_result = metric_evaluator.evaluate_invocations(
+              actual_invocations=inference_result,
+              expected_invocations=eval_case.conversation,
+          )
 
         overall_eval_metric_results.append(
             EvalMetricResult(
@@ -260,6 +269,7 @@ async def run_evals(
 
 def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
   try:
+    from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
     from ..evaluation.response_evaluator import ResponseEvaluator
     from ..evaluation.safety_evaluator import SafetyEvaluatorV1
     from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     )
   elif eval_metric.metric_name == SAFETY_V1_KEY:
     return SafetyEvaluatorV1(eval_metric)
+  elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
+    eval_metric.judge_model_options = JudgeModelOptions()
+    return FinalResponseMatchV2Evaluator(eval_metric)
 
   raise ValueError(f"Unsupported eval metric: {eval_metric}")
diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 8cf235427..1f6acf264 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -36,6 +36,10 @@ class PrebuiltMetrics(Enum):
 
   RESPONSE_MATCH_SCORE = "response_match_score"
 
+  SAFETY_V1 = "safety_v1"
+
+  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
+
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
index ad43448d8..cd13a0736 100644
--- a/src/google/adk/evaluation/final_response_match_v2.py
+++ b/src/google/adk/evaluation/final_response_match_v2.py
@@ -21,7 +21,7 @@ from typing_extensions import override
 
 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import working_in_progress
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
   return label
 
 
-@working_in_progress
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
   """V2 final response match evaluator which uses an LLM to judge responses.
diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py
index 99a700896..c3af06563 100644
--- a/src/google/adk/evaluation/metric_evaluator_registry.py
+++ b/src/google/adk/evaluation/metric_evaluator_registry.py
@@ -21,7 +21,9 @@ from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()
 
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=type(TrajectoryEvaluator),
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
   )
 
   return metric_evaluator_registry