diff --git a/src/pr_af/hitl/client.py b/src/pr_af/hitl/client.py index 1c5ceb2..c0edfaa 100644 --- a/src/pr_af/hitl/client.py +++ b/src/pr_af/hitl/client.py @@ -48,9 +48,19 @@ def approval_webhook_url(app: Any) -> str | None: Mirrors the URL SWE-AF's plan-approval gate uses (``{cp_base_url}/api/v1/webhooks/approval-response``). Returns ``None`` when no control-plane URL can be resolved. + + ``AGENTFIELD_PUBLIC_URL`` takes precedence: this callback is invoked by + hax-sdk, which lives in a *separate* Railway project, so it must be a + publicly reachable URL — not the ``AGENTFIELD_SERVER`` internal address + (e.g. ``control-plane.railway.internal``) the agent uses to call the CP. + Conflating the two is what made an answered review undeliverable: the + webhook ``fetch failed`` because the internal host doesn't resolve + cross-project. Falls back to ``AGENTFIELD_SERVER`` for single-project / + local setups where the two are the same. """ cp_base = ( - getattr(app, "agentfield_server", None) + os.environ.get("AGENTFIELD_PUBLIC_URL") + or getattr(app, "agentfield_server", None) or os.environ.get("AGENTFIELD_SERVER") or "" ).rstrip("/") diff --git a/src/pr_af/hitl/review_gate.py b/src/pr_af/hitl/review_gate.py index a78a3b8..501e185 100644 --- a/src/pr_af/hitl/review_gate.py +++ b/src/pr_af/hitl/review_gate.py @@ -18,6 +18,7 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any +from ..schemas.severity import normalize_severity from .client import ( create_hax_form_request_with_timeout, extract_values_from_raw, @@ -98,7 +99,10 @@ def _finding_payload(finding: ScoredFinding) -> dict[str, Any]: """One finding entry for the hax template (camelCase per its zod schema).""" entry: dict[str, Any] = { "id": finding.id, - "severity": finding.severity, + # Last line of defense before the hax-sdk zod enum: a finding built off + # the validated path could still carry a stray label. Normalize so the + # create never 422s on severity. + "severity": normalize_severity(finding.severity), "title": finding.title, # All findings start checked, matching the prior "submit posts all" default. "defaultSelected": True, diff --git a/src/pr_af/orchestrator.py b/src/pr_af/orchestrator.py index c9d8451..2ab9659 100644 --- a/src/pr_af/orchestrator.py +++ b/src/pr_af/orchestrator.py @@ -60,6 +60,7 @@ ReviewFinding, ReviewPlan, ) +from .schemas.severity import normalize_severity from .scoring import deduplicate_exact, determine_review_event, score_findings @@ -206,11 +207,20 @@ async def run(self) -> ReviewResult: # reject, expire/error, or a re-review past the revision cap → no post. reason = "revision cap reached" if decision.is_rerun else decision.action + # Surface the underlying cause loudly: a create_request/pause failure + # carries its exception text in `instructions`, and dropping a whole + # computed review on a transient hax hiccup must not look silent. + detail = (decision.instructions or "").strip() + detail_suffix = f": {detail}" if detail else "" self.app.note( - f"hitl: not posting review ({reason}; raw={decision.decision_raw})", + f"hitl: not posting review ({reason}; raw={decision.decision_raw}){detail_suffix}", tags=["hitl", "no-post", decision.action], ) - print(f"[PR-AF] HITL: not posting review ({reason})", flush=True) + print( + f"[PR-AF] HITL: NOT posting {len(scored_findings)} finding(s) " + f"({reason}{detail_suffix})", + flush=True, + ) return await self._finish(scored_findings, intake, anatomy, plan, post=False) # The loop always returns; this guards against a future edit slipping by. @@ -534,7 +544,8 @@ async def _run_evidence_verification( updated = f.model_copy( update={ "confidence": max(0.1, vf.get("revised_confidence", 0.3)), - "severity": vf.get("revised_severity", "suggestion") or "suggestion", + # model_copy bypasses validators, so normalize explicitly. + "severity": normalize_severity(vf.get("revised_severity")), } ) updated_findings.append(updated) @@ -544,8 +555,9 @@ async def _run_evidence_verification( if revised_conf is not None and isinstance(revised_conf, (int, float)): updates["confidence"] = float(revised_conf) revised_sev = vf.get("revised_severity") - if revised_sev and revised_sev in ("critical", "important", "suggestion", "nitpick"): - updates["severity"] = revised_sev + if revised_sev: + # model_copy bypasses validators, so normalize explicitly. + updates["severity"] = normalize_severity(revised_sev) if updates: updated_findings.append(f.model_copy(update=updates)) else: diff --git a/src/pr_af/reasoners/harnesses.py b/src/pr_af/reasoners/harnesses.py index c388c81..25c523c 100644 --- a/src/pr_af/reasoners/harnesses.py +++ b/src/pr_af/reasoners/harnesses.py @@ -18,6 +18,7 @@ ReviewFinding, ReviewPlan, ) +from ..schemas.severity import Severity # noqa: TC001 - runtime-needed pydantic field type from . import router @@ -44,7 +45,7 @@ class _ReviewFindingsResult(BaseModel): class _CompoundFinding(BaseModel): title: str = "" - severity: str = "suggestion" + severity: Severity = "suggestion" file_path: str = "" line_start: int = 0 line_end: int = 0 @@ -906,6 +907,9 @@ async def review_dimension( f"handling, test coverage gaps for specific scenarios. The code works but could be " f"more robust.\n" f"- **nitpick**: Naming, style, readability, documentation. Truly cosmetic.\n\n" + f"The `severity` field MUST be EXACTLY one of these four lowercase strings: " + f"`critical`, `important`, `suggestion`, `nitpick`. Do NOT use `high`, `medium`, " + f"`low`, `warning`, or any other label.\n\n" f"If you're unsure whether something is critical or important, provide your reasoning " f"in the `body` field and let the confidence score reflect your uncertainty.\n\n" f"## False-Positive Prevention (CRITICAL)\n\n" @@ -956,7 +960,17 @@ async def review_dimension( schema=_ReviewFindingsResult, cwd=repo_path or None, ) - parsed = result.parsed if result.parsed else _ReviewFindingsResult() + if result.parsed: + parsed = result.parsed + else: + # Schema parse failed entirely — don't silently report "0 findings", + # which is indistinguishable from a clean review. Make it visible. + print( + "[PR-AF] review_dimension: schema parse failed — treating as 0 " + f"findings for this dimension (error={getattr(result, 'error_message', None)!r})", + flush=True, + ) + parsed = _ReviewFindingsResult() sub_review_dicts = [] if can_spawn and parsed.sub_reviews: sub_review_dicts = [ @@ -1053,6 +1067,7 @@ async def compound_finder_phase( "- If a compound issue exists, emit NEW findings only. Do not repeat original findings.\n" "- Each output finding must include: title, severity, file_path, line_start, line_end, " "body, evidence, suggestion, confidence, tags, and contributing_findings.\n" + "- `severity` MUST be exactly one of: `critical`, `important`, `suggestion`, `nitpick`.\n" "- `contributing_findings` must list the exact titles from this cluster that combine.\n" "- Only emit findings with confidence >= 0.6 and concrete evidence.\n\n" + findings_ref diff --git a/src/pr_af/schemas/output.py b/src/pr_af/schemas/output.py index 55aff25..ccb71d3 100644 --- a/src/pr_af/schemas/output.py +++ b/src/pr_af/schemas/output.py @@ -7,6 +7,8 @@ from pydantic import BaseModel, Field +from .severity import Severity # noqa: TC001 - runtime-needed pydantic field type + class ScoredFinding(BaseModel): """A finding after scoring, dedup, and line mapping.""" @@ -19,7 +21,7 @@ class ScoredFinding(BaseModel): line_end: int diff_line: int | None = None # Line number in the diff (for GitHub API) diff_side: str = "RIGHT" # LEFT (deletion) or RIGHT (addition) - severity: str # critical | important | suggestion | nitpick + severity: Severity # critical | important | suggestion | nitpick (normalized) title: str body: str suggestion: str | None = None diff --git a/src/pr_af/schemas/pipeline.py b/src/pr_af/schemas/pipeline.py index 821668d..27116bd 100644 --- a/src/pr_af/schemas/pipeline.py +++ b/src/pr_af/schemas/pipeline.py @@ -11,6 +11,8 @@ from pydantic import BaseModel, Field +from .severity import Severity # noqa: TC001 - runtime-needed pydantic field type + # --------------------------------------------------------------------------- # Phase 1 → Phase 3: Intake Result # Format: Hybrid (structured fields for routing + pr_summary string for LLM context) @@ -169,7 +171,7 @@ class ReviewFinding(BaseModel): line_start: int line_end: int hunk_context: str = "" # Code context around the finding - severity: str # critical | important | suggestion | nitpick + severity: Severity # critical | important | suggestion | nitpick (normalized) title: str body: str # Detailed explanation (string — appears in GitHub comment) suggestion: str | None = None # Concrete fix (code block) @@ -191,7 +193,7 @@ class CrossRefInteraction(BaseModel): finding_b_title: str interaction_type: str # compound_risk | assumption_violation | consistency_gap description: str # How they interact - combined_severity: str # The severity of the combined issue + combined_severity: Severity # The severity of the combined issue (normalized) file_paths: list[str] line_references: list[str] = Field(default_factory=list) diff --git a/src/pr_af/schemas/severity.py b/src/pr_af/schemas/severity.py new file mode 100644 index 0000000..793a272 --- /dev/null +++ b/src/pr_af/schemas/severity.py @@ -0,0 +1,95 @@ +"""Canonical finding-severity vocabulary — the single source of truth. + +PR-AF findings must use exactly one of four severities. This is a *cross-repo +contract*: the hax-sdk ``pr-af-review-v1`` template enforces the same set as a +zod enum, and rejects the whole HITL request if a finding carries anything else +(this is what silently swallowed a real review — a model-emitted ``"high"`` +failed the template's enum, the create 422'd, and the gate treated it as a +reject → nothing posted). + +To keep the model honest *and* the pipeline robust we do two things with this +module: + +1. Type finding-severity fields as :data:`Severity` (an ``Annotated`` ``Literal``) + so the JSON schema handed to the model advertises the enum, **and** so a + :func:`normalize_severity` ``BeforeValidator`` coerces stray values + (``high`` → ``important``) instead of failing validation — deterministic and + cheap, with no retry storms or dropped-findings fallout. +2. Re-apply :func:`normalize_severity` at the hax-sdk boundary as defense in + depth, so a finding constructed off the validated path can never break the + create. +""" + +from __future__ import annotations + +from typing import Annotated, Literal + +from pydantic import BeforeValidator + +#: The four canonical severities, in *decreasing* order of urgency. +SeverityLiteral = Literal["critical", "important", "suggestion", "nitpick"] + +#: Tuple form for membership checks / iteration. +VALID_SEVERITIES: tuple[str, ...] = ( + "critical", + "important", + "suggestion", + "nitpick", +) + +#: Where unknown / empty labels land. "suggestion" is the mid-low rung — it +#: keeps the finding visible without inflating it to a blocker. +DEFAULT_SEVERITY: SeverityLiteral = "suggestion" + +# Common labels models reach for, mapped onto the 4-level scale by rank. +# critical > important > suggestion > nitpick. +_SEVERITY_ALIASES: dict[str, SeverityLiteral] = { + # canonical (identity) + "critical": "critical", + "important": "important", + "suggestion": "suggestion", + "nitpick": "nitpick", + # critical-tier synonyms + "blocker": "critical", + "fatal": "critical", + "severe": "critical", + # important-tier synonyms (the "high" that caused the incident) + "high": "important", + "error": "important", + "major": "important", + # suggestion-tier synonyms + "medium": "suggestion", + "moderate": "suggestion", + "warning": "suggestion", + "warn": "suggestion", + # nitpick-tier synonyms + "low": "nitpick", + "minor": "nitpick", + "nit": "nitpick", + "info": "nitpick", + "informational": "nitpick", + "trivial": "nitpick", +} + + +def normalize_severity( + value: object, default: SeverityLiteral = DEFAULT_SEVERITY +) -> SeverityLiteral: + """Coerce an arbitrary severity label to the canonical vocabulary. + + Case-insensitive and whitespace-tolerant. Known synonyms map by rank + (``high`` → ``important``, ``medium`` → ``suggestion``, ``low`` → + ``nitpick``); empty, non-string, or unrecognized input falls back to + ``default``. + """ + if not isinstance(value, str): + return default + key = value.strip().lower() + if not key: + return default + return _SEVERITY_ALIASES.get(key, default) + + +#: Field type for finding severities: advertises the enum in the JSON schema +#: (so the model is guided) while coercing stray labels before validation. +Severity = Annotated[SeverityLiteral, BeforeValidator(normalize_severity)] diff --git a/tests/test_hitl_review_gate.py b/tests/test_hitl_review_gate.py index 0e8d8f6..1bbf41a 100644 --- a/tests/test_hitl_review_gate.py +++ b/tests/test_hitl_review_gate.py @@ -17,6 +17,7 @@ ACTION_REJECT, ACTION_RERUN, HAX_REVIEW_TEMPLATE, + approval_webhook_url, build_hax_client_from_env, build_review_payload, clean_intent, @@ -167,6 +168,45 @@ def test_values_nested_under_response_key_are_found(): assert decision.selected_finding_ids == {"f2"} +# --- callback URL resolution --------------------------------------------- +# +# Contract: the approval callback is invoked by hax-sdk (a separate Railway +# project), so a public URL must win over the internal AGENTFIELD_SERVER the +# agent uses to reach the CP. Conflating them made answered reviews +# undeliverable (the webhook "fetch failed" on an internal hostname). + + +def test_webhook_url_prefers_public_env_over_internal_server(monkeypatch): + monkeypatch.setenv("AGENTFIELD_PUBLIC_URL", "https://cp.public.example") + monkeypatch.setenv("AGENTFIELD_SERVER", "http://control-plane.railway.internal:8080") + app = SimpleNamespace(agentfield_server="http://control-plane.railway.internal:8080") + assert ( + approval_webhook_url(app) + == "https://cp.public.example/api/v1/webhooks/approval-response" + ) + + +def test_webhook_url_falls_back_to_server_when_no_public(monkeypatch): + monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False) + monkeypatch.delenv("AGENTFIELD_SERVER", raising=False) + app = SimpleNamespace(agentfield_server="http://cp.local:8080") + assert approval_webhook_url(app) == "http://cp.local:8080/api/v1/webhooks/approval-response" + + +def test_webhook_url_uses_server_env_when_app_attr_missing(monkeypatch): + monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False) + monkeypatch.setenv("AGENTFIELD_SERVER", "http://cp.env:8080") + app = SimpleNamespace(agentfield_server=None) + assert approval_webhook_url(app) == "http://cp.env:8080/api/v1/webhooks/approval-response" + + +def test_webhook_url_none_when_nothing_configured(monkeypatch): + monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False) + monkeypatch.delenv("AGENTFIELD_SERVER", raising=False) + app = SimpleNamespace(agentfield_server=None) + assert approval_webhook_url(app) is None + + # --- watchdog safety ----------------------------------------------------- diff --git a/tests/test_severity.py b/tests/test_severity.py new file mode 100644 index 0000000..5b8c87e --- /dev/null +++ b/tests/test_severity.py @@ -0,0 +1,126 @@ +"""Tests for the canonical severity vocabulary and its enforcement. + +Validation contract (behavior, not implementation): + +* ``normalize_severity`` maps any label to the 4-level canonical scale: + canonical values are returned unchanged; known synonyms map by rank + (``high`` → ``important``, ``medium`` → ``suggestion``, ``low`` → ``nitpick``); + matching is case-insensitive and whitespace-tolerant; empty / non-string / + unknown input falls back to ``suggestion``. +* Finding models coerce a stray ``severity`` at validation time, so a model that + emits ``"high"`` never produces an out-of-vocabulary value downstream. +* The HITL payload (``build_review_payload`` → ``_finding_payload``) only ever + emits canonical severities — this is the contract the hax-sdk template's zod + enum enforces, and the bug this guards against (a ``"high"`` finding 422'd the + whole review-approval create). +""" + +from __future__ import annotations + +import pytest + +from pr_af.hitl import build_review_payload +from pr_af.schemas.output import ScoredFinding +from pr_af.schemas.pipeline import ReviewFinding +from pr_af.schemas.severity import VALID_SEVERITIES, normalize_severity + +# --- normalize_severity: synonym mapping -------------------------------- + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("critical", "critical"), + ("important", "important"), + ("suggestion", "suggestion"), + ("nitpick", "nitpick"), + # the incident: "high" must land on "important", not blow up + ("high", "important"), + ("medium", "suggestion"), + ("low", "nitpick"), + ("blocker", "critical"), + ("error", "important"), + ("warning", "suggestion"), + ("info", "nitpick"), + ("minor", "nitpick"), + ], +) +def test_normalize_maps_known_labels_by_rank(raw, expected): + assert normalize_severity(raw) == expected + + +def test_normalize_is_case_and_whitespace_insensitive(): + assert normalize_severity(" HIGH ") == "important" + assert normalize_severity("Critical") == "critical" + + +@pytest.mark.parametrize("bad", ["", " ", "bogus", "p1", None, 3, ["high"]]) +def test_normalize_falls_back_to_default_for_unusable_input(bad): + assert normalize_severity(bad) == "suggestion" + + +def test_normalize_respects_explicit_default(): + assert normalize_severity("", default="nitpick") == "nitpick" + + +def test_normalize_output_is_always_canonical(): + for raw in ["high", "medium", "low", "blocker", "weird", ""]: + assert normalize_severity(raw) in VALID_SEVERITIES + + +# --- model-level coercion ----------------------------------------------- + + +def test_review_finding_coerces_stray_severity(): + f = ReviewFinding( + dimension_id="d", + dimension_name="dim", + file_path="a.py", + line_start=1, + line_end=1, + severity="high", + title="t", + body="b", + ) + assert f.severity == "important" + + +def test_scored_finding_coerces_stray_severity(): + f = ScoredFinding( + id="f1", + dimension_id="d", + dimension_name="dim", + file_path="a.py", + line_start=1, + line_end=1, + severity="HIGH", + title="t", + body="b", + ) + assert f.severity == "important" + + +# --- HITL payload boundary ---------------------------------------------- + + +def _scored(severity: str) -> ScoredFinding: + return ScoredFinding( + id="f1", + dimension_id="d", + dimension_name="dim", + file_path="a.py", + line_start=1, + line_end=1, + severity=severity, + title="t", + body="b", + ) + + +def test_hitl_payload_severity_is_canonical_even_from_stray_label(): + # Build a finding, then force a stray severity past validation to prove the + # _finding_payload boundary still normalizes (defense in depth). + finding = _scored("critical") + object.__setattr__(finding, "severity", "high") + payload = build_review_payload(pr_intent="x", findings=[finding], title="t") + assert payload["findings"][0]["severity"] == "important"