feat: Enable FinalResponseMatchV2 metric as an experiment #2000

Merged · 1 commit · Jul 17, 2025
21 changes: 17 additions & 4 deletions src/google/adk/cli/cli_eval.py
@@ -15,6 +15,7 @@
 from __future__ import annotations

 import importlib.util
+import inspect
 import json
 import logging
 import os
@@ -31,6 +32,7 @@
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
@@ -42,6 +44,7 @@
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 SAFETY_V1_KEY = "safety_v1"
+FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 # This evaluation is not very stable.
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -191,10 +194,16 @@ async def run_evals(
       for eval_metric in eval_metrics:
         metric_evaluator = _get_evaluator(eval_metric)

-        evaluation_result = metric_evaluator.evaluate_invocations(
-            actual_invocations=inference_result,
-            expected_invocations=eval_case.conversation,
-        )
+        if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+          evaluation_result = await metric_evaluator.evaluate_invocations(
+              actual_invocations=inference_result,
+              expected_invocations=eval_case.conversation,
+          )
+        else:
+          evaluation_result = metric_evaluator.evaluate_invocations(
+              actual_invocations=inference_result,
+              expected_invocations=eval_case.conversation,
+          )

         overall_eval_metric_results.append(
             EvalMetricResult(
Expand Down Expand Up @@ -260,6 +269,7 @@ async def run_evals(

def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
try:
from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
from ..evaluation.response_evaluator import ResponseEvaluator
from ..evaluation.safety_evaluator import SafetyEvaluatorV1
from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     )
   elif eval_metric.metric_name == SAFETY_V1_KEY:
     return SafetyEvaluatorV1(eval_metric)
+  elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
+    eval_metric.judge_model_options = JudgeModelOptions()
+    return FinalResponseMatchV2Evaluator(eval_metric)

   raise ValueError(f"Unsupported eval metric: {eval_metric}")
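
Note on the run_evals hunk above: the new branch uses inspect.iscoroutinefunction so that an evaluator whose evaluate_invocations is a coroutine function (the LLM-as-judge FinalResponseMatchV2Evaluator, presumably) can be awaited, while the existing synchronous evaluators keep working unchanged. A minimal, self-contained sketch of that dispatch pattern; the two evaluator classes here are hypothetical stand-ins, not ADK classes:

import asyncio
import inspect


class SyncEvaluator:
  """Hypothetical evaluator with a synchronous evaluate_invocations."""

  def evaluate_invocations(self, actual_invocations, expected_invocations):
    return "sync result"


class AsyncEvaluator:
  """Hypothetical evaluator with an async evaluate_invocations."""

  async def evaluate_invocations(self, actual_invocations, expected_invocations):
    await asyncio.sleep(0)  # stand-in for an LLM-as-judge round trip
    return "async result"


async def run_one_metric(evaluator, actual, expected):
  # iscoroutinefunction works on bound methods, so one call site can
  # serve both evaluator styles, mirroring the run_evals change.
  if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
    return await evaluator.evaluate_invocations(
        actual_invocations=actual, expected_invocations=expected
    )
  return evaluator.evaluate_invocations(
      actual_invocations=actual, expected_invocations=expected
  )


print(asyncio.run(run_one_metric(SyncEvaluator(), [], [])))   # sync result
print(asyncio.run(run_one_metric(AsyncEvaluator(), [], [])))  # async result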
4 changes: 4 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
@@ -36,6 +36,10 @@ class PrebuiltMetrics(Enum):

   RESPONSE_MATCH_SCORE = "response_match_score"

+  SAFETY_V1 = "safety_v1"
+
+  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
+

 MetricName: TypeAlias = Union[str, PrebuiltMetrics]

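With the enum members above and the MetricName alias, callers can refer to a metric either by PrebuiltMetrics member or by its raw string key. A small illustrative sketch of how the two forms normalize to the same lookup key; the to_metric_key helper is hypothetical, not ADK code:

from enum import Enum
from typing import Union

from typing_extensions import TypeAlias


class PrebuiltMetrics(Enum):
  SAFETY_V1 = "safety_v1"
  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"


MetricName: TypeAlias = Union[str, PrebuiltMetrics]


def to_metric_key(name: MetricName) -> str:
  """Hypothetical helper: normalize a MetricName to its string key."""
  return name.value if isinstance(name, PrebuiltMetrics) else name


assert to_metric_key(PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2) == "final_response_match_v2"
assert to_metric_key("final_response_match_v2") == "final_response_match_v2"
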
4 changes: 2 additions & 2 deletions src/google/adk/evaluation/final_response_match_v2.py
@@ -21,7 +21,7 @@
 from typing_extensions import override

 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import working_in_progress
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
   return label


-@working_in_progress
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
   """V2 final response match evaluator which uses an LLM to judge responses.

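The decorator swap above moves FinalResponseMatchV2Evaluator from work-in-progress gating to experimental status; the exact semantics live in ..utils.feature_decorator, which this diff does not show. As a rough, hypothetical illustration of the experimental pattern (warn on use rather than block), not ADK's implementation:

import functools
import warnings


def experimental_sketch(cls):
  """Hypothetical class decorator: allow use, but warn on instantiation."""
  original_init = cls.__init__

  @functools.wraps(original_init)
  def init_with_warning(self, *args, **kwargs):
    warnings.warn(
        f"{cls.__name__} is experimental and may change without notice.",
        UserWarning,
        stacklevel=2,
    )
    original_init(self, *args, **kwargs)

  cls.__init__ = init_with_warning
  return cls


@experimental_sketch
class DemoEvaluator:
  def __init__(self, threshold: float = 0.5):
    self.threshold = threshold


DemoEvaluator()  # warns: DemoEvaluator is experimental and may change without notice.
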
22 changes: 16 additions & 6 deletions src/google/adk/evaluation/metric_evaluator_registry.py
@@ -21,7 +21,9 @@
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator

 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()

   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=type(TrajectoryEvaluator),
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
+  )

   return metric_evaluator_registry
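
Two fixes land in the registry hunk above: registrations are now keyed by the enum's .value (a plain string) rather than the enum member, and the evaluator classes themselves are registered rather than type(SomeEvaluator), which evaluates to the metaclass type and could never instantiate the intended evaluator. MetricEvaluatorRegistry's full API is not shown in this diff, so the registry below is a simplified stand-in that demonstrates both points:

from typing import Dict, Type


class Evaluator:  # minimal placeholder for the ADK Evaluator interface
  pass


class TrajectoryEvaluator(Evaluator):
  pass


# Bug in the old code: type(TrajectoryEvaluator) is the metaclass `type`,
# not the evaluator class, so the registered entry was never the class
# a caller would expect to instantiate.
assert type(TrajectoryEvaluator) is type
assert type(TrajectoryEvaluator) is not TrajectoryEvaluator


class SimpleRegistry:
  """Hypothetical stand-in for MetricEvaluatorRegistry, keyed by string."""

  def __init__(self):
    self._evaluators: Dict[str, Type[Evaluator]] = {}

  def register_evaluator(self, metric_name: str, evaluator: Type[Evaluator]) -> None:
    self._evaluators[metric_name] = evaluator

  def get_evaluator(self, metric_name: str) -> Type[Evaluator]:
    return self._evaluators[metric_name]


registry = SimpleRegistry()
registry.register_evaluator(
    metric_name="tool_trajectory_avg_score",  # i.e. PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value
    evaluator=TrajectoryEvaluator,
)
assert registry.get_evaluator("tool_trajectory_avg_score") is TrajectoryEvaluator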