Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,27 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres

## [Unreleased]

### Fixed
- **`execution: azd` reports no longer ship empty `Dataset:` lines and empty
`## Rows` tables.** The `eval.yaml` parser now recognizes the `dataset_file:`
field that `azd ai agent eval init` emits, so `report.md` shows the actual
dataset path. When azd returns aggregate metrics only (the normal case), the
reporter omits the row tables entirely and instead emits a
`## Per-row breakdown` section that links to the Foundry run for the
per-sample view, or points to the Foundry portal when no run URL is available.
- **`agentops eval run` prints a clickable Foundry deep link on success.**
After a successful azd run that returns a report URL, the CLI now emits a
`Foundry run: <url>` line alongside the `results.json`/`report.md` paths so
users can jump straight to the per-sample table and rubric drill-downs in the
Foundry portal.

### Changed
- **Shorter azd backend log line.** Replaced the verbose `Running azd backend:
azd --no-prompt ai agent eval run --config <long path> --output json` line
with a concise `Running azd backend: azd ai agent eval run`; the full
command remains captured in the per-failure debug logs introduced in 0.3.18.
- **`execution: azd` startup line uses a workspace-relative recipe path** so
the "delegating to azd ai agent eval" message stays readable on long
Windows paths.

## [0.3.18] - 2026-06-10

### Fixed
Expand Down
4 changes: 4 additions & 0 deletions src/agentops/cli/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2766,6 +2766,10 @@ def _run_flat_schema_eval(
typer.echo(f"{_cli_label('report.md')}: {_cli_path(output_dir / 'report.md')}")
if latest_dir is not None:
typer.echo(f"{_cli_label('latest/')}: {_cli_path(latest_dir)}")
azd_eval = result.config.get("azd_evaluation") if isinstance(result.config, dict) else None
report_url = azd_eval.get("report_url") if isinstance(azd_eval, dict) else None
if isinstance(report_url, str) and report_url.strip():
typer.echo(f"{_cli_label('Foundry run')}: {report_url.strip()}")
if result.summary.overall_passed:
typer.echo(f"{_cli_label('Threshold status')}: {style('PASSED', 'bold', 'green')}")
return
Expand Down
1 change: 1 addition & 0 deletions src/agentops/core/azd_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ class EvalRecipe(BaseModel):
name: Optional[str] = None
agent: Optional[EvalAgent] = None
dataset_reference: Optional[EvalDatasetReference] = None
dataset_file: Optional[str] = None
evaluators: list[EvalEvaluator] = Field(default_factory=list)
options: Optional[EvalOptions] = None

Expand Down
17 changes: 16 additions & 1 deletion src/agentops/pipeline/azd_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def run_azd_eval(
"--output",
"json",
]
notify(f"Running azd backend: {' '.join(command)}")
notify("Running azd backend: azd ai agent eval run")

started = time.perf_counter()
completed = _run_command(
Expand Down Expand Up @@ -301,7 +301,9 @@ def normalize_to_results(
"azd_evaluation": {
"recipe_path": str(azd_run.recipe_path),
"run_id": azd_run.run_id,
"eval_id": _extract_eval_id(azd_run.payload),
"status": azd_run.status,
"report_url": _extract_report_url(azd_run.payload),
"dataset": (
recipe.dataset_reference.model_dump(mode="json")
if recipe.dataset_reference
Expand Down Expand Up @@ -477,6 +479,14 @@ def _extract_status(payload: Dict[str, Any]) -> str:
return "unknown"


def _extract_report_url(payload: Dict[str, Any]) -> Optional[str]:
for key in ("report_url", "reportUrl", "report_uri", "url"):
value = payload.get(key)
if isinstance(value, str) and value.strip().lower().startswith(("http://", "https://")):
return value.strip()
return None


def _extract_item_count(payload: Dict[str, Any]) -> int:
for key in ("items_total", "item_count", "samples", "max_samples", "row_count"):
value = payload.get(key)
Expand Down Expand Up @@ -585,6 +595,11 @@ def _looks_like_metric_name(name: str) -> bool:


def _recipe_dataset_path(recipe: EvalRecipe, recipe_path: Path) -> str:
if recipe.dataset_file:
dataset = Path(recipe.dataset_file)
if not dataset.is_absolute():
dataset = recipe_path.parent / dataset
return str(dataset)
ref = recipe.dataset_reference
if ref and ref.local_uri:
dataset = Path(ref.local_uri)
Expand Down
8 changes: 6 additions & 2 deletions src/agentops/pipeline/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,10 +537,14 @@ def _run_evaluation_azd(

recipe_path = azd_runner.resolve_eval_recipe(workspace, config)
recipe = load_eval_recipe(recipe_path)
try:
recipe_display = recipe_path.relative_to(workspace).as_posix()
except ValueError:
recipe_display = recipe_path.name
progress(
f"execution: {style('azd', 'bold')} - delegating to "
f"{style('azd ai agent eval', 'cyan')} with recipe "
f"{style(str(recipe_path), 'cyan')}."
f"{style('azd ai agent eval', 'cyan')} (recipe "
f"{style(recipe_display, 'cyan')})."
)

azd_run = azd_runner.run_azd_eval(
Expand Down
44 changes: 35 additions & 9 deletions src/agentops/pipeline/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def render(result: RunResult) -> str:
lines.append(f"- **Target:** `{result.target.raw}` ({result.target.kind})")
if result.target.protocol:
lines.append(f"- **Protocol:** {result.target.protocol}")
lines.append(f"- **Dataset:** `{result.dataset_path}`")
if result.dataset_path:
lines.append(f"- **Dataset:** `{result.dataset_path}`")
lines.append(f"- **Started:** {result.started_at}")
lines.append(f"- **Duration:** {result.duration_seconds:.2f}s")
lines.append(f"- **Rows:** {result.summary.items_total}")
Expand Down Expand Up @@ -62,22 +63,27 @@ def render(result: RunResult) -> str:
lines.append(f"| {row.row_index} | {_short(row.error or '', 200)} |")
lines.append("")

lines.append("## Rows")
lines.append("")
lines.append("| # | Latency (s) | Metrics |")
lines.append("| --- | --- | --- |")
for row in result.rows:
lines.append(_row_summary(row))
lines.append("")

if result.rows:
lines.append("## Rows")
lines.append("")
lines.append("| # | Latency (s) | Metrics |")
lines.append("| --- | --- | --- |")
for row in result.rows:
lines.append(_row_summary(row))
lines.append("")

lines.append("## Row Details")
lines.append("")
lines.append("| # | Input | Response | Expected |")
lines.append("| --- | --- | --- | --- |")
for row in result.rows:
lines.append(_row_detail(row))
lines.append("")
else:
azd_eval = result.config.get("azd_evaluation")
if isinstance(azd_eval, dict):
lines.extend(_render_azd_aggregate_note(azd_eval))
lines.append("")

cloud = result.config.get("cloud_evaluation")
if isinstance(cloud, dict):
Expand Down Expand Up @@ -120,6 +126,26 @@ def _short(text: str, limit: int) -> str:
return text if len(text) <= limit else text[: limit - 1] + "…"


def _render_azd_aggregate_note(azd: dict) -> List[str]:
lines = ["## Per-row breakdown", ""]
lines.append(
"`execution: azd` reports aggregate metrics only; per-row scores "
"are recorded by Foundry."
)
report_url = azd.get("report_url")
if isinstance(report_url, str) and report_url.strip():
lines.append("")
lines.append(f"**Open the run in Foundry:** {report_url.strip()}")
else:
lines.append("")
lines.append(
"Open the latest run in the Foundry portal "
"(Agents → your agent → Evaluations) to see the per-sample table "
"and rubric drill-downs."
)
return lines


def _render_cloud_evaluation(cloud: dict) -> List[str]:
lines = ["## Foundry Cloud Session", ""]
status = str(cloud.get("status") or "unknown")
Expand Down
Loading