@@ -106,6 +106,39 @@ def test_evaluation_result(client):
# assert case_result.response_candidate_results is not None


def test_predefined_metric_with_judge_model_ignores_autorater_config(client):
"""Tests that autorater_config is ignored for predefined metrics in replays."""
prompts_df = pd.DataFrame(
{
"prompt": ["Explain the concept of machine learning in simple terms."],
"response": [
"Machine learning is a type of artificial intelligence that allows"
" computers to learn from data without being explicitly programmed."
],
}
)

eval_dataset = types.EvaluationDataset(
eval_dataset_df=prompts_df,
candidate_name="gemini-2.5-flash",
)

# Set judge_model, which should be ignored for predefined metrics
metric = types.Metric(
name="safety_v1",
judge_model="projects/model-evaluation-dev/locations/us-central1/publishers/google/models/gemini-2.5-flash"
)

evaluation_result = client.evals.evaluate(
dataset=eval_dataset,
metrics=[metric],
)

assert isinstance(evaluation_result, types.EvaluationResult)
assert evaluation_result.summary_metrics is not None
assert evaluation_result.summary_metrics[0].metric_name == "safety_v1"


def test_multi_turn_predefined_metric(client):
"""Tests that evaluate works with multi-turn predefined metrics."""
prompts_data = {
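For contrast with the new test above, a minimal sketch of the case the warning does not cover: a custom LLM-judged metric, where the judge settings are expected to be honored. The metric name and prompt_template below are illustrative assumptions, not part of this PR, and assume types.Metric accepts prompt_template as in the vertexai _genai evals types:

# Hedged sketch: a non-predefined, LLM-judged metric keeps its judge settings.
custom_metric = types.Metric(
    name="fluency_custom",
    prompt_template="Rate the fluency of the following response: {response}",
    judge_model="publishers/google/models/gemini-2.5-flash",
)

evaluation_result = client.evals.evaluate(
    dataset=eval_dataset,
    metrics=[custom_metric],
)
# Here judge_model is applied client-side and no warning is logged, because
# the metric does not resolve to a PredefinedMetricSpec.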
21 changes: 14 additions & 7 deletions vertexai/_genai/_evals_metric_handlers.py
@@ -36,7 +36,6 @@
from . import evals
from . import types


logger = logging.getLogger(__name__)
_MAX_RETRIES = 5
# HTTP status codes that are safe to retry with backoff.
@@ -972,6 +971,17 @@ def __init__(self, module: "evals.Evals", metric: types.Metric):
raise ValueError(
f"Metric '{self.metric.name}' is not a supported predefined metric."
)
if (
self.metric.judge_model
or self.metric.judge_model_generation_config
or self.metric.judge_model_sampling_count
):
logger.warning(
"Autorater config settings (judge_model, "
"judge_model_generation_config, judge_model_sampling_count) "
"are ignored for predefined metric '%s'.",
self.metric.name,
)
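# Illustrative only (not in the diff): with the guard above, a predefined
# metric constructed with judge settings would emit a log line like:
#   WARNING:vertexai._genai._evals_metric_handlers:Autorater config settings
#   (judge_model, judge_model_generation_config, judge_model_sampling_count)
#   are ignored for predefined metric 'safety_v1'.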

def _build_request_payload(
self, eval_case: types.EvalCase, response_index: int
@@ -1026,11 +1036,9 @@ def _build_request_payload(
"instance": instance_payload,
}

autorater_config = _get_autorater_config(self.metric)
if autorater_config:
request_payload["autorater_config"] = genai_types.AutoraterConfig(
**autorater_config
)
# Note: autorater_config is intentionally not passed for predefined
# metrics. The server uses its own model configuration for predefined
# metrics and ignores the autorater_config field.
return request_payload

@override
@@ -1045,7 +1053,6 @@ def get_metric_result(
lambda: self.module._evaluate_instances(
metrics=[self.metric],
instance=payload.get("instance"),
autorater_config=payload.get("autorater_config"),
),
metric_name,
)
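In short, the handler change above moves judge-model selection for predefined metrics entirely server-side. A hedged before/after sketch of the request payload (names taken from the diff; the request shape is simplified for illustration):

# Before this PR: judge settings on the metric were forwarded.
payload = {"instance": instance_payload}
autorater_config = _get_autorater_config(metric)  # reads judge_model et al.
if autorater_config:
    payload["autorater_config"] = genai_types.AutoraterConfig(**autorater_config)

# After this PR: for predefined metrics the field is never attached, and
# get_metric_result no longer passes autorater_config to _evaluate_instances;
# the backend supplies its own judge configuration for PredefinedMetricSpec.
payload = {"instance": instance_payload}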
15 changes: 9 additions & 6 deletions vertexai/_genai/types/common.py
@@ -2409,7 +2409,8 @@ class EvaluationRunConfig(_common.BaseModel):
default=None, description="""The output config for the evaluation run."""
)
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
default=None, description="""The autorater config for the evaluation run."""
default=None,
description="""The autorater config for the evaluation run. Not applicable to predefined metrics (PredefinedMetricSpec); the server uses its own model configuration and ignores this field.""",
)
prompt_template: Optional[EvaluationRunPromptTemplate] = Field(
default=None, description="""The prompt template used for inference."""
@@ -2439,7 +2440,7 @@ class EvaluationRunConfigDict(TypedDict, total=False):
"""The output config for the evaluation run."""

autorater_config: Optional[genai_types.AutoraterConfigDict]
"""The autorater config for the evaluation run."""
"""The autorater config for the evaluation run. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""

prompt_template: Optional[EvaluationRunPromptTemplateDict]
"""The prompt template used for inference."""
@@ -4746,7 +4747,8 @@ class _EvaluateInstancesRequestParameters(_common.BaseModel):
default=None, description=""""""
)
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
default=None, description=""""""
default=None,
description="""Autorater config used for evaluation. Not applicable to predefined metrics (PredefinedMetricSpec); the server uses its own model configuration and ignores this field.""",
)
metrics: Optional[list[Metric]] = Field(
default=None,
@@ -4797,7 +4799,7 @@ class _EvaluateInstancesRequestParametersDict(TypedDict, total=False):
""""""

autorater_config: Optional[genai_types.AutoraterConfigDict]
""""""
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""

metrics: Optional[list[MetricDict]]
"""The metrics used for evaluation.
@@ -19028,7 +19030,8 @@ class EvaluateDatasetRequestParameters(_common.BaseModel):
default=None, description=""""""
)
autorater_config: Optional[genai_types.AutoraterConfig] = Field(
default=None, description=""""""
default=None,
description="""Autorater config used for evaluation. Not applicable to predefined metrics (PredefinedMetricSpec); the server uses its own model configuration and ignores this field.""",
)
config: Optional[EvaluateDatasetConfig] = Field(default=None, description="""""")

@@ -19046,7 +19049,7 @@ class EvaluateDatasetRequestParametersDict(TypedDict, total=False):
""""""

autorater_config: Optional[genai_types.AutoraterConfigDict]
""""""
"""Autorater config used for evaluation. Not applicable for predefined metrics (PredefinedMetricSpec); the server uses its own model configuration for predefined metrics and this field is ignored."""

config: Optional[EvaluateDatasetConfigDict]
""""""
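Taken together, the doc updates describe the following usage boundary. A minimal sketch, assuming AutoraterConfig exposes autorater_model and sampling_count as in the google.genai types referenced above; the model name is illustrative:

from google.genai import types as genai_types
from vertexai._genai import types

# Honored when the run uses LLM-judged (non-predefined) metrics:
run_config = types.EvaluationRunConfig(
    autorater_config=genai_types.AutoraterConfig(
        autorater_model="publishers/google/models/gemini-2.5-flash",
        sampling_count=4,
    ),
)

# For predefined metrics (PredefinedMetricSpec) the server ignores
# run_config.autorater_config and applies its own judge model configuration.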