diff --git a/CHANGELOG.md b/CHANGELOG.md index cdc86d6..5d8082a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,27 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres ## [Unreleased] +### Fixed +- **`execution: azd` reports no longer ship empty `Dataset:` lines and empty + `## Rows` tables.** The `eval.yaml` parser now recognizes the `dataset_file:` + field that `azd ai agent eval init` emits, so `report.md` shows the actual + dataset path. When azd returns aggregate metrics only (the normal case), the + reporter omits the row tables entirely and instead emits a `## Per-row + breakdown` section that links to the Foundry run for the per-sample view. +- **`agentops eval run` prints a clickable Foundry deep link on success.** + After a successful azd run, the CLI now emits a `Foundry run: <url>` line + alongside the `results.json`/`report.md` paths so users can jump straight to + the per-sample table and rubric drill-downs in the Foundry portal. + +### Changed +- **Shorter azd backend log line.** Replaced the verbose `Running azd backend: + azd --no-prompt ai agent eval run --config <recipe> --output json` line + with a concise `Running azd backend: azd ai agent eval run`; the full + command remains captured in the per-failure debug logs introduced in 0.3.18. +- **`execution: azd` startup line uses a workspace-relative recipe path** so + the "delegating to azd ai agent eval" message stays readable on long + Windows paths.
+ ## [0.3.18] - 2026-06-10 ### Fixed diff --git a/src/agentops/cli/app.py b/src/agentops/cli/app.py index a793190..81cd349 100644 --- a/src/agentops/cli/app.py +++ b/src/agentops/cli/app.py @@ -2766,6 +2766,10 @@ def _run_flat_schema_eval( typer.echo(f"{_cli_label('report.md')}: {_cli_path(output_dir / 'report.md')}") if latest_dir is not None: typer.echo(f"{_cli_label('latest/')}: {_cli_path(latest_dir)}") + azd_eval = result.config.get("azd_evaluation") if isinstance(result.config, dict) else None + report_url = azd_eval.get("report_url") if isinstance(azd_eval, dict) else None + if isinstance(report_url, str) and report_url.strip(): + typer.echo(f"{_cli_label('Foundry run')}: {report_url.strip()}") if result.summary.overall_passed: typer.echo(f"{_cli_label('Threshold status')}: {style('PASSED', 'bold', 'green')}") return diff --git a/src/agentops/core/azd_eval.py b/src/agentops/core/azd_eval.py index c43c88f..008364e 100644 --- a/src/agentops/core/azd_eval.py +++ b/src/agentops/core/azd_eval.py @@ -139,6 +139,7 @@ class EvalRecipe(BaseModel): name: Optional[str] = None agent: Optional[EvalAgent] = None dataset_reference: Optional[EvalDatasetReference] = None + dataset_file: Optional[str] = None evaluators: list[EvalEvaluator] = Field(default_factory=list) options: Optional[EvalOptions] = None diff --git a/src/agentops/pipeline/azd_runner.py b/src/agentops/pipeline/azd_runner.py index 56df298..bcfc598 100644 --- a/src/agentops/pipeline/azd_runner.py +++ b/src/agentops/pipeline/azd_runner.py @@ -118,7 +118,7 @@ def run_azd_eval( "--output", "json", ] - notify(f"Running azd backend: {' '.join(command)}") + notify("Running azd backend: azd ai agent eval run") started = time.perf_counter() completed = _run_command( @@ -301,7 +301,9 @@ def normalize_to_results( "azd_evaluation": { "recipe_path": str(azd_run.recipe_path), "run_id": azd_run.run_id, + "eval_id": _extract_eval_id(azd_run.payload), "status": azd_run.status, + "report_url": 
_extract_report_url(azd_run.payload), "dataset": ( recipe.dataset_reference.model_dump(mode="json") if recipe.dataset_reference @@ -477,6 +479,14 @@ def _extract_status(payload: Dict[str, Any]) -> str: return "unknown" +def _extract_report_url(payload: Dict[str, Any]) -> Optional[str]: + for key in ("report_url", "reportUrl", "report_uri", "url"): + value = payload.get(key) + if isinstance(value, str) and value.strip().lower().startswith(("http://", "https://")): + return value.strip() + return None + + def _extract_item_count(payload: Dict[str, Any]) -> int: for key in ("items_total", "item_count", "samples", "max_samples", "row_count"): value = payload.get(key) @@ -585,6 +595,11 @@ def _looks_like_metric_name(name: str) -> bool: def _recipe_dataset_path(recipe: EvalRecipe, recipe_path: Path) -> str: + if recipe.dataset_file: + dataset = Path(recipe.dataset_file) + if not dataset.is_absolute(): + dataset = recipe_path.parent / dataset + return str(dataset) ref = recipe.dataset_reference if ref and ref.local_uri: dataset = Path(ref.local_uri) diff --git a/src/agentops/pipeline/orchestrator.py b/src/agentops/pipeline/orchestrator.py index 7ee1e2d..81dc32e 100644 --- a/src/agentops/pipeline/orchestrator.py +++ b/src/agentops/pipeline/orchestrator.py @@ -537,10 +537,14 @@ def _run_evaluation_azd( recipe_path = azd_runner.resolve_eval_recipe(workspace, config) recipe = load_eval_recipe(recipe_path) + try: + recipe_display = recipe_path.relative_to(workspace).as_posix() + except ValueError: + recipe_display = recipe_path.name progress( f"execution: {style('azd', 'bold')} - delegating to " - f"{style('azd ai agent eval', 'cyan')} with recipe " - f"{style(str(recipe_path), 'cyan')}." + f"{style('azd ai agent eval', 'cyan')} (recipe " + f"{style(recipe_display, 'cyan')})." 
) azd_run = azd_runner.run_azd_eval( diff --git a/src/agentops/pipeline/reporter.py b/src/agentops/pipeline/reporter.py index 16b1e07..20b8606 100644 --- a/src/agentops/pipeline/reporter.py +++ b/src/agentops/pipeline/reporter.py @@ -24,7 +24,8 @@ def render(result: RunResult) -> str: lines.append(f"- **Target:** `{result.target.raw}` ({result.target.kind})") if result.target.protocol: lines.append(f"- **Protocol:** {result.target.protocol}") - lines.append(f"- **Dataset:** `{result.dataset_path}`") + if result.dataset_path: + lines.append(f"- **Dataset:** `{result.dataset_path}`") lines.append(f"- **Started:** {result.started_at}") lines.append(f"- **Duration:** {result.duration_seconds:.2f}s") lines.append(f"- **Rows:** {result.summary.items_total}") @@ -62,15 +63,15 @@ def render(result: RunResult) -> str: lines.append(f"| {row.row_index} | {_short(row.error or '', 200)} |") lines.append("") - lines.append("## Rows") - lines.append("") - lines.append("| # | Latency (s) | Metrics |") - lines.append("| --- | --- | --- |") - for row in result.rows: - lines.append(_row_summary(row)) - lines.append("") - if result.rows: + lines.append("## Rows") + lines.append("") + lines.append("| # | Latency (s) | Metrics |") + lines.append("| --- | --- | --- |") + for row in result.rows: + lines.append(_row_summary(row)) + lines.append("") + lines.append("## Row Details") lines.append("") lines.append("| # | Input | Response | Expected |") @@ -78,6 +79,11 @@ def render(result: RunResult) -> str: for row in result.rows: lines.append(_row_detail(row)) lines.append("") + else: + azd_eval = result.config.get("azd_evaluation") + if isinstance(azd_eval, dict): + lines.extend(_render_azd_aggregate_note(azd_eval)) + lines.append("") cloud = result.config.get("cloud_evaluation") if isinstance(cloud, dict): @@ -120,6 +126,26 @@ def _short(text: str, limit: int) -> str: return text if len(text) <= limit else text[: limit - 1] + "…" +def _render_azd_aggregate_note(azd: dict) -> List[str]: + 
lines = ["## Per-row breakdown", ""] + lines.append( + "`execution: azd` reports aggregate metrics only; per-row scores " + "are recorded by Foundry." + ) + report_url = azd.get("report_url") + if isinstance(report_url, str) and report_url.strip(): + lines.append("") + lines.append(f"**Open the run in Foundry:** {report_url.strip()}") + else: + lines.append("") + lines.append( + "Open the latest run in the Foundry portal " + "(Agents → your agent → Evaluations) to see the per-sample table " + "and rubric drill-downs." + ) + return lines + + def _render_cloud_evaluation(cloud: dict) -> List[str]: lines = ["## Foundry Cloud Session", ""] status = str(cloud.get("status") or "unknown")