Agent-Field · AbirAbbas · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/src/pr_af/hitl/client.py b/src/pr_af/hitl/client.py
@@ -48,9 +48,19 @@ def approval_webhook_url(app: Any) -> str | None:
     Mirrors the URL SWE-AF's plan-approval gate uses
     (``{cp_base_url}/api/v1/webhooks/approval-response``). Returns ``None`` when
     no control-plane URL can be resolved.
+
+    ``AGENTFIELD_PUBLIC_URL`` takes precedence: this callback is invoked by
+    hax-sdk, which lives in a *separate* Railway project, so it must be a
+    publicly reachable URL — not the ``AGENTFIELD_SERVER`` internal address
+    (e.g. ``control-plane.railway.internal``) the agent uses to call the CP.
+    Conflating the two is what made an answered review undeliverable: the
+    webhook ``fetch failed`` because the internal host doesn't resolve
+    cross-project. Falls back to ``AGENTFIELD_SERVER`` for single-project /
+    local setups where the two are the same.
     """
     cp_base = (
-        getattr(app, "agentfield_server", None)
+        os.environ.get("AGENTFIELD_PUBLIC_URL")
+        or getattr(app, "agentfield_server", None)
         or os.environ.get("AGENTFIELD_SERVER")
         or ""
     ).rstrip("/")

diff --git a/src/pr_af/hitl/review_gate.py b/src/pr_af/hitl/review_gate.py
@@ -18,6 +18,7 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
 
+from ..schemas.severity import normalize_severity
 from .client import (
     create_hax_form_request_with_timeout,
     extract_values_from_raw,
@@ -98,7 +99,10 @@ def _finding_payload(finding: ScoredFinding) -> dict[str, Any]:
     """One finding entry for the hax template (camelCase per its zod schema)."""
     entry: dict[str, Any] = {
         "id": finding.id,
-        "severity": finding.severity,
+        # Last line of defense before the hax-sdk zod enum: a finding built off
+        # the validated path could still carry a stray label. Normalize so the
+        # create never 422s on severity.
+        "severity": normalize_severity(finding.severity),
         "title": finding.title,
         # All findings start checked, matching the prior "submit posts all" default.
         "defaultSelected": True,

diff --git a/src/pr_af/orchestrator.py b/src/pr_af/orchestrator.py
@@ -60,6 +60,7 @@
     ReviewFinding,
     ReviewPlan,
 )
+from .schemas.severity import normalize_severity
 from .scoring import deduplicate_exact, determine_review_event, score_findings
 
 
@@ -206,11 +207,20 @@ async def run(self) -> ReviewResult:
 
             # reject, expire/error, or a re-review past the revision cap → no post.
             reason = "revision cap reached" if decision.is_rerun else decision.action
+            # Surface the underlying cause loudly: a create_request/pause failure
+            # carries its exception text in `instructions`, and dropping a whole
+            # computed review on a transient hax hiccup must not look silent.
+            detail = (decision.instructions or "").strip()
+            detail_suffix = f": {detail}" if detail else ""
             self.app.note(
-                f"hitl: not posting review ({reason}; raw={decision.decision_raw})",
+                f"hitl: not posting review ({reason}; raw={decision.decision_raw}){detail_suffix}",
                 tags=["hitl", "no-post", decision.action],
             )
-            print(f"[PR-AF] HITL: not posting review ({reason})", flush=True)
+            print(
+                f"[PR-AF] HITL: NOT posting {len(scored_findings)} finding(s) "
+                f"({reason}{detail_suffix})",
+                flush=True,
+            )
             return await self._finish(scored_findings, intake, anatomy, plan, post=False)
 
         # The loop always returns; this guards against a future edit slipping by.
@@ -534,7 +544,8 @@ async def _run_evidence_verification(
                 updated = f.model_copy(
                     update={
                         "confidence": max(0.1, vf.get("revised_confidence", 0.3)),
-                        "severity": vf.get("revised_severity", "suggestion") or "suggestion",
+                        # model_copy bypasses validators, so normalize explicitly.
+                        "severity": normalize_severity(vf.get("revised_severity")),
                     }
                 )
                 updated_findings.append(updated)
@@ -544,8 +555,9 @@ async def _run_evidence_verification(
                 if revised_conf is not None and isinstance(revised_conf, (int, float)):
                     updates["confidence"] = float(revised_conf)
                 revised_sev = vf.get("revised_severity")
-                if revised_sev and revised_sev in ("critical", "important", "suggestion", "nitpick"):
-                    updates["severity"] = revised_sev
+                if revised_sev:
+                    # model_copy bypasses validators, so normalize explicitly.
+                    updates["severity"] = normalize_severity(revised_sev)
                 if updates:
                     updated_findings.append(f.model_copy(update=updates))
                 else:

diff --git a/src/pr_af/reasoners/harnesses.py b/src/pr_af/reasoners/harnesses.py
@@ -18,6 +18,7 @@
     ReviewFinding,
     ReviewPlan,
 )
+from ..schemas.severity import Severity  # noqa: TC001 - runtime-needed pydantic field type
 from . import router
 
 
@@ -44,7 +45,7 @@ class _ReviewFindingsResult(BaseModel):
 
 class _CompoundFinding(BaseModel):
     title: str = ""
-    severity: str = "suggestion"
+    severity: Severity = "suggestion"
     file_path: str = ""
     line_start: int = 0
     line_end: int = 0
@@ -906,6 +907,9 @@ async def review_dimension(
         f"handling, test coverage gaps for specific scenarios. The code works but could be "
         f"more robust.\n"
         f"- **nitpick**: Naming, style, readability, documentation. Truly cosmetic.\n\n"
+        f"The `severity` field MUST be EXACTLY one of these four lowercase strings: "
+        f"`critical`, `important`, `suggestion`, `nitpick`. Do NOT use `high`, `medium`, "
+        f"`low`, `warning`, or any other label.\n\n"
         f"If you're unsure whether something is critical or important, provide your reasoning "
         f"in the `body` field and let the confidence score reflect your uncertainty.\n\n"
         f"## False-Positive Prevention (CRITICAL)\n\n"
@@ -956,7 +960,17 @@ async def review_dimension(
         schema=_ReviewFindingsResult,
         cwd=repo_path or None,
     )
-    parsed = result.parsed if result.parsed else _ReviewFindingsResult()
+    if result.parsed:
+        parsed = result.parsed
+    else:
+        # Schema parse failed entirely — don't silently report "0 findings",
+        # which is indistinguishable from a clean review. Make it visible.
+        print(
+            "[PR-AF] review_dimension: schema parse failed — treating as 0 "
+            f"findings for this dimension (error={getattr(result, 'error_message', None)!r})",
+            flush=True,
+        )
+        parsed = _ReviewFindingsResult()
     sub_review_dicts = []
     if can_spawn and parsed.sub_reviews:
         sub_review_dicts = [
@@ -1053,6 +1067,7 @@ async def compound_finder_phase(
         "- If a compound issue exists, emit NEW findings only. Do not repeat original findings.\n"
         "- Each output finding must include: title, severity, file_path, line_start, line_end, "
         "body, evidence, suggestion, confidence, tags, and contributing_findings.\n"
+        "- `severity` MUST be exactly one of: `critical`, `important`, `suggestion`, `nitpick`.\n"
         "- `contributing_findings` must list the exact titles from this cluster that combine.\n"
         "- Only emit findings with confidence >= 0.6 and concrete evidence.\n\n"
         + findings_ref

diff --git a/src/pr_af/schemas/output.py b/src/pr_af/schemas/output.py
@@ -7,6 +7,8 @@
 
 from pydantic import BaseModel, Field
 
+from .severity import Severity  # noqa: TC001 - runtime-needed pydantic field type
+
 
 class ScoredFinding(BaseModel):
     """A finding after scoring, dedup, and line mapping."""
@@ -19,7 +21,7 @@ class ScoredFinding(BaseModel):
     line_end: int
     diff_line: int | None = None  # Line number in the diff (for GitHub API)
     diff_side: str = "RIGHT"  # LEFT (deletion) or RIGHT (addition)
-    severity: str  # critical | important | suggestion | nitpick
+    severity: Severity  # critical | important | suggestion | nitpick (normalized)
     title: str
     body: str
     suggestion: str | None = None

diff --git a/src/pr_af/schemas/pipeline.py b/src/pr_af/schemas/pipeline.py
@@ -11,6 +11,8 @@
 
 from pydantic import BaseModel, Field
 
+from .severity import Severity  # noqa: TC001 - runtime-needed pydantic field type
+
 # ---------------------------------------------------------------------------
 # Phase 1 → Phase 3: Intake Result
 # Format: Hybrid (structured fields for routing + pr_summary string for LLM context)
@@ -169,7 +171,7 @@ class ReviewFinding(BaseModel):
     line_start: int
     line_end: int
     hunk_context: str = ""  # Code context around the finding
-    severity: str  # critical | important | suggestion | nitpick
+    severity: Severity  # critical | important | suggestion | nitpick (normalized)
     title: str
     body: str  # Detailed explanation (string — appears in GitHub comment)
     suggestion: str | None = None  # Concrete fix (code block)
@@ -191,7 +193,7 @@ class CrossRefInteraction(BaseModel):
     finding_b_title: str
     interaction_type: str  # compound_risk | assumption_violation | consistency_gap
     description: str  # How they interact
-    combined_severity: str  # The severity of the combined issue
+    combined_severity: Severity  # The severity of the combined issue (normalized)
     file_paths: list[str]
     line_references: list[str] = Field(default_factory=list)
 

diff --git a/src/pr_af/schemas/severity.py b/src/pr_af/schemas/severity.py
@@ -0,0 +1,95 @@
+"""Canonical finding-severity vocabulary — the single source of truth.
+
+PR-AF findings must use exactly one of four severities. This is a *cross-repo
+contract*: the hax-sdk ``pr-af-review-v1`` template enforces the same set as a
+zod enum, and rejects the whole HITL request if a finding carries anything else
+(this is what silently swallowed a real review — a model-emitted ``"high"``
+failed the template's enum, the create 422'd, and the gate treated it as a
+reject → nothing posted).
+
+To keep the model honest *and* the pipeline robust we do two things with this
+module:
+
+1. Type finding-severity fields as :data:`Severity` (an ``Annotated`` ``Literal``)
+   so the JSON schema handed to the model advertises the enum, **and** so a
+   :func:`normalize_severity` ``BeforeValidator`` coerces stray values
+   (``high`` → ``important``) instead of failing validation — deterministic and
+   cheap, with no retry storms or dropped-findings fallout.
+2. Re-apply :func:`normalize_severity` at the hax-sdk boundary as defense in
+   depth, so a finding constructed off the validated path can never break the
+   create.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal
+
+from pydantic import BeforeValidator
+
+#: The four canonical severities, in *decreasing* order of urgency.
+SeverityLiteral = Literal["critical", "important", "suggestion", "nitpick"]
+
+#: Tuple form for membership checks / iteration.
+VALID_SEVERITIES: tuple[str, ...] = (
+    "critical",
+    "important",
+    "suggestion",
+    "nitpick",
+)
+
+#: Where unknown / empty labels land. "suggestion" is the mid-low rung — it
+#: keeps the finding visible without inflating it to a blocker.
+DEFAULT_SEVERITY: SeverityLiteral = "suggestion"
+
+# Common labels models reach for, mapped onto the 4-level scale by rank.
+# critical > important > suggestion > nitpick.
+_SEVERITY_ALIASES: dict[str, SeverityLiteral] = {
+    # canonical (identity)
+    "critical": "critical",
+    "important": "important",
+    "suggestion": "suggestion",
+    "nitpick": "nitpick",
+    # critical-tier synonyms
+    "blocker": "critical",
+    "fatal": "critical",
+    "severe": "critical",
+    # important-tier synonyms (the "high" that caused the incident)
+    "high": "important",
+    "error": "important",
+    "major": "important",
+    # suggestion-tier synonyms
+    "medium": "suggestion",
+    "moderate": "suggestion",
+    "warning": "suggestion",
+    "warn": "suggestion",
+    # nitpick-tier synonyms
+    "low": "nitpick",
+    "minor": "nitpick",
+    "nit": "nitpick",
+    "info": "nitpick",
+    "informational": "nitpick",
+    "trivial": "nitpick",
+}
+
+
+def normalize_severity(
+    value: object, default: SeverityLiteral = DEFAULT_SEVERITY
+) -> SeverityLiteral:
+    """Coerce an arbitrary severity label to the canonical vocabulary.
+
+    Case-insensitive and whitespace-tolerant. Known synonyms map by rank
+    (``high`` → ``important``, ``medium`` → ``suggestion``, ``low`` →
+    ``nitpick``); empty, non-string, or unrecognized input falls back to
+    ``default``.
+    """
+    if not isinstance(value, str):
+        return default
+    key = value.strip().lower()
+    if not key:
+        return default
+    return _SEVERITY_ALIASES.get(key, default)
+
+
+#: Field type for finding severities: advertises the enum in the JSON schema
+#: (so the model is guided) while coercing stray labels before validation.
+Severity = Annotated[SeverityLiteral, BeforeValidator(normalize_severity)]
diff --git a/tests/test_hitl_review_gate.py b/tests/test_hitl_review_gate.py
@@ -17,6 +17,7 @@
     ACTION_REJECT,
     ACTION_RERUN,
     HAX_REVIEW_TEMPLATE,
+    approval_webhook_url,
     build_hax_client_from_env,
     build_review_payload,
     clean_intent,
@@ -167,6 +168,45 @@ def test_values_nested_under_response_key_are_found():
     assert decision.selected_finding_ids == {"f2"}
 
 
+# --- callback URL resolution ---------------------------------------------
+#
+# Contract: the approval callback is invoked by hax-sdk (a separate Railway
+# project), so a public URL must win over the internal AGENTFIELD_SERVER the
+# agent uses to reach the CP. Conflating them made answered reviews
+# undeliverable (the webhook "fetch failed" on an internal hostname).
+
+
+def test_webhook_url_prefers_public_env_over_internal_server(monkeypatch):
+    monkeypatch.setenv("AGENTFIELD_PUBLIC_URL", "https://cp.public.example")
+    monkeypatch.setenv("AGENTFIELD_SERVER", "http://control-plane.railway.internal:8080")
+    app = SimpleNamespace(agentfield_server="http://control-plane.railway.internal:8080")
+    assert (
+        approval_webhook_url(app)
+        == "https://cp.public.example/api/v1/webhooks/approval-response"
+    )
+
+
+def test_webhook_url_falls_back_to_server_when_no_public(monkeypatch):
+    monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False)
+    monkeypatch.delenv("AGENTFIELD_SERVER", raising=False)
+    app = SimpleNamespace(agentfield_server="http://cp.local:8080")
+    assert approval_webhook_url(app) == "http://cp.local:8080/api/v1/webhooks/approval-response"
+
+
+def test_webhook_url_uses_server_env_when_app_attr_missing(monkeypatch):
+    monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False)
+    monkeypatch.setenv("AGENTFIELD_SERVER", "http://cp.env:8080")
+    app = SimpleNamespace(agentfield_server=None)
+    assert approval_webhook_url(app) == "http://cp.env:8080/api/v1/webhooks/approval-response"
+
+
+def test_webhook_url_none_when_nothing_configured(monkeypatch):
+    monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False)
+    monkeypatch.delenv("AGENTFIELD_SERVER", raising=False)
+    app = SimpleNamespace(agentfield_server=None)
+    assert approval_webhook_url(app) is None
+
+
 # --- watchdog safety -----------------------------------------------------