Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/pr_af/hitl/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,19 @@ def approval_webhook_url(app: Any) -> str | None:
Mirrors the URL SWE-AF's plan-approval gate uses
(``{cp_base_url}/api/v1/webhooks/approval-response``). Returns ``None`` when
no control-plane URL can be resolved.

``AGENTFIELD_PUBLIC_URL`` takes precedence: this callback is invoked by
hax-sdk, which lives in a *separate* Railway project, so it must be a
publicly reachable URL — not the ``AGENTFIELD_SERVER`` internal address
(e.g. ``control-plane.railway.internal``) the agent uses to call the CP.
Conflating the two is what made an answered review undeliverable: the
webhook ``fetch failed`` because the internal host doesn't resolve
cross-project. Falls back to ``AGENTFIELD_SERVER`` for single-project /
local setups where the two are the same.
"""
cp_base = (
getattr(app, "agentfield_server", None)
os.environ.get("AGENTFIELD_PUBLIC_URL")
or getattr(app, "agentfield_server", None)
or os.environ.get("AGENTFIELD_SERVER")
or ""
).rstrip("/")
Expand Down
6 changes: 5 additions & 1 deletion src/pr_af/hitl/review_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

from ..schemas.severity import normalize_severity
from .client import (
create_hax_form_request_with_timeout,
extract_values_from_raw,
Expand Down Expand Up @@ -98,7 +99,10 @@ def _finding_payload(finding: ScoredFinding) -> dict[str, Any]:
"""One finding entry for the hax template (camelCase per its zod schema)."""
entry: dict[str, Any] = {
"id": finding.id,
"severity": finding.severity,
# Last line of defense before the hax-sdk zod enum: a finding built off
# the validated path could still carry a stray label. Normalize so the
# create never 422s on severity.
"severity": normalize_severity(finding.severity),
"title": finding.title,
# All findings start checked, matching the prior "submit posts all" default.
"defaultSelected": True,
Expand Down
22 changes: 17 additions & 5 deletions src/pr_af/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
ReviewFinding,
ReviewPlan,
)
from .schemas.severity import normalize_severity
from .scoring import deduplicate_exact, determine_review_event, score_findings


Expand Down Expand Up @@ -206,11 +207,20 @@ async def run(self) -> ReviewResult:

# reject, expire/error, or a re-review past the revision cap → no post.
reason = "revision cap reached" if decision.is_rerun else decision.action
# Surface the underlying cause loudly: a create_request/pause failure
# carries its exception text in `instructions`, and dropping a whole
# computed review on a transient hax hiccup must not look silent.
detail = (decision.instructions or "").strip()
detail_suffix = f": {detail}" if detail else ""
self.app.note(
f"hitl: not posting review ({reason}; raw={decision.decision_raw})",
f"hitl: not posting review ({reason}; raw={decision.decision_raw}){detail_suffix}",
tags=["hitl", "no-post", decision.action],
)
print(f"[PR-AF] HITL: not posting review ({reason})", flush=True)
print(
f"[PR-AF] HITL: NOT posting {len(scored_findings)} finding(s) "
f"({reason}{detail_suffix})",
flush=True,
)
return await self._finish(scored_findings, intake, anatomy, plan, post=False)

# The loop always returns; this guards against a future edit slipping by.
Expand Down Expand Up @@ -534,7 +544,8 @@ async def _run_evidence_verification(
updated = f.model_copy(
update={
"confidence": max(0.1, vf.get("revised_confidence", 0.3)),
"severity": vf.get("revised_severity", "suggestion") or "suggestion",
# model_copy bypasses validators, so normalize explicitly.
"severity": normalize_severity(vf.get("revised_severity")),
}
)
updated_findings.append(updated)
Expand All @@ -544,8 +555,9 @@ async def _run_evidence_verification(
if revised_conf is not None and isinstance(revised_conf, (int, float)):
updates["confidence"] = float(revised_conf)
revised_sev = vf.get("revised_severity")
if revised_sev and revised_sev in ("critical", "important", "suggestion", "nitpick"):
updates["severity"] = revised_sev
if revised_sev:
# model_copy bypasses validators, so normalize explicitly.
updates["severity"] = normalize_severity(revised_sev)
if updates:
updated_findings.append(f.model_copy(update=updates))
else:
Expand Down
19 changes: 17 additions & 2 deletions src/pr_af/reasoners/harnesses.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
ReviewFinding,
ReviewPlan,
)
from ..schemas.severity import Severity # noqa: TC001 - runtime-needed pydantic field type
from . import router


Expand All @@ -44,7 +45,7 @@ class _ReviewFindingsResult(BaseModel):

class _CompoundFinding(BaseModel):
title: str = ""
severity: str = "suggestion"
severity: Severity = "suggestion"
file_path: str = ""
line_start: int = 0
line_end: int = 0
Expand Down Expand Up @@ -906,6 +907,9 @@ async def review_dimension(
f"handling, test coverage gaps for specific scenarios. The code works but could be "
f"more robust.\n"
f"- **nitpick**: Naming, style, readability, documentation. Truly cosmetic.\n\n"
f"The `severity` field MUST be EXACTLY one of these four lowercase strings: "
f"`critical`, `important`, `suggestion`, `nitpick`. Do NOT use `high`, `medium`, "
f"`low`, `warning`, or any other label.\n\n"
f"If you're unsure whether something is critical or important, provide your reasoning "
f"in the `body` field and let the confidence score reflect your uncertainty.\n\n"
f"## False-Positive Prevention (CRITICAL)\n\n"
Expand Down Expand Up @@ -956,7 +960,17 @@ async def review_dimension(
schema=_ReviewFindingsResult,
cwd=repo_path or None,
)
parsed = result.parsed if result.parsed else _ReviewFindingsResult()
if result.parsed:
parsed = result.parsed
else:
# Schema parse failed entirely — don't silently report "0 findings",
# which is indistinguishable from a clean review. Make it visible.
print(
"[PR-AF] review_dimension: schema parse failed — treating as 0 "
f"findings for this dimension (error={getattr(result, 'error_message', None)!r})",
flush=True,
)
parsed = _ReviewFindingsResult()
sub_review_dicts = []
if can_spawn and parsed.sub_reviews:
sub_review_dicts = [
Expand Down Expand Up @@ -1053,6 +1067,7 @@ async def compound_finder_phase(
"- If a compound issue exists, emit NEW findings only. Do not repeat original findings.\n"
"- Each output finding must include: title, severity, file_path, line_start, line_end, "
"body, evidence, suggestion, confidence, tags, and contributing_findings.\n"
"- `severity` MUST be exactly one of: `critical`, `important`, `suggestion`, `nitpick`.\n"
"- `contributing_findings` must list the exact titles from this cluster that combine.\n"
"- Only emit findings with confidence >= 0.6 and concrete evidence.\n\n"
+ findings_ref
Expand Down
4 changes: 3 additions & 1 deletion src/pr_af/schemas/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from pydantic import BaseModel, Field

from .severity import Severity # noqa: TC001 - runtime-needed pydantic field type


class ScoredFinding(BaseModel):
"""A finding after scoring, dedup, and line mapping."""
Expand All @@ -19,7 +21,7 @@ class ScoredFinding(BaseModel):
line_end: int
diff_line: int | None = None # Line number in the diff (for GitHub API)
diff_side: str = "RIGHT" # LEFT (deletion) or RIGHT (addition)
severity: str # critical | important | suggestion | nitpick
severity: Severity # critical | important | suggestion | nitpick (normalized)
title: str
body: str
suggestion: str | None = None
Expand Down
6 changes: 4 additions & 2 deletions src/pr_af/schemas/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

from pydantic import BaseModel, Field

from .severity import Severity # noqa: TC001 - runtime-needed pydantic field type

# ---------------------------------------------------------------------------
# Phase 1 → Phase 3: Intake Result
# Format: Hybrid (structured fields for routing + pr_summary string for LLM context)
Expand Down Expand Up @@ -169,7 +171,7 @@ class ReviewFinding(BaseModel):
line_start: int
line_end: int
hunk_context: str = "" # Code context around the finding
severity: str # critical | important | suggestion | nitpick
severity: Severity # critical | important | suggestion | nitpick (normalized)
title: str
body: str # Detailed explanation (string — appears in GitHub comment)
suggestion: str | None = None # Concrete fix (code block)
Expand All @@ -191,7 +193,7 @@ class CrossRefInteraction(BaseModel):
finding_b_title: str
interaction_type: str # compound_risk | assumption_violation | consistency_gap
description: str # How they interact
combined_severity: str # The severity of the combined issue
combined_severity: Severity # The severity of the combined issue (normalized)
file_paths: list[str]
line_references: list[str] = Field(default_factory=list)

Expand Down
95 changes: 95 additions & 0 deletions src/pr_af/schemas/severity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""Canonical finding-severity vocabulary — the single source of truth.

PR-AF findings must use exactly one of four severities. This is a *cross-repo
contract*: the hax-sdk ``pr-af-review-v1`` template enforces the same set as a
zod enum, and rejects the whole HITL request if a finding carries anything else
(this is what silently swallowed a real review — a model-emitted ``"high"``
failed the template's enum, the create 422'd, and the gate treated it as a
reject → nothing posted).

To keep the model honest *and* the pipeline robust we do two things with this
module:

1. Type finding-severity fields as :data:`Severity` (an ``Annotated`` ``Literal``)
so the JSON schema handed to the model advertises the enum, **and** so a
:func:`normalize_severity` ``BeforeValidator`` coerces stray values
(``high`` → ``important``) instead of failing validation — deterministic and
cheap, with no retry storms or dropped-findings fallout.
2. Re-apply :func:`normalize_severity` at the hax-sdk boundary as defense in
depth, so a finding constructed off the validated path can never break the
create.
"""

from __future__ import annotations

from typing import Annotated, Literal

from pydantic import BeforeValidator

#: The four canonical severities, in *decreasing* order of urgency.
SeverityLiteral = Literal["critical", "important", "suggestion", "nitpick"]

#: Tuple form for membership checks / iteration.
VALID_SEVERITIES: tuple[str, ...] = (
"critical",
"important",
"suggestion",
"nitpick",
)

#: Where unknown / empty labels land. "suggestion" is the mid-low rung — it
#: keeps the finding visible without inflating it to a blocker.
DEFAULT_SEVERITY: SeverityLiteral = "suggestion"

# Common labels models reach for, mapped onto the 4-level scale by rank.
# critical > important > suggestion > nitpick.
_SEVERITY_ALIASES: dict[str, SeverityLiteral] = {
# canonical (identity)
"critical": "critical",
"important": "important",
"suggestion": "suggestion",
"nitpick": "nitpick",
# critical-tier synonyms
"blocker": "critical",
"fatal": "critical",
"severe": "critical",
# important-tier synonyms (the "high" that caused the incident)
"high": "important",
"error": "important",
"major": "important",
# suggestion-tier synonyms
"medium": "suggestion",
"moderate": "suggestion",
"warning": "suggestion",
"warn": "suggestion",
# nitpick-tier synonyms
"low": "nitpick",
"minor": "nitpick",
"nit": "nitpick",
"info": "nitpick",
"informational": "nitpick",
"trivial": "nitpick",
}


def normalize_severity(
value: object, default: SeverityLiteral = DEFAULT_SEVERITY
) -> SeverityLiteral:
"""Coerce an arbitrary severity label to the canonical vocabulary.

Case-insensitive and whitespace-tolerant. Known synonyms map by rank
(``high`` → ``important``, ``medium`` → ``suggestion``, ``low`` →
``nitpick``); empty, non-string, or unrecognized input falls back to
``default``.
"""
if not isinstance(value, str):
return default
key = value.strip().lower()
if not key:
return default
return _SEVERITY_ALIASES.get(key, default)


#: Field type for finding severities: advertises the enum in the JSON schema
#: (so the model is guided) while coercing stray labels before validation.
Severity = Annotated[SeverityLiteral, BeforeValidator(normalize_severity)]
40 changes: 40 additions & 0 deletions tests/test_hitl_review_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
ACTION_REJECT,
ACTION_RERUN,
HAX_REVIEW_TEMPLATE,
approval_webhook_url,
build_hax_client_from_env,
build_review_payload,
clean_intent,
Expand Down Expand Up @@ -167,6 +168,45 @@ def test_values_nested_under_response_key_are_found():
assert decision.selected_finding_ids == {"f2"}


# --- callback URL resolution ---------------------------------------------
#
# Contract: the approval callback is invoked by hax-sdk (a separate Railway
# project), so a public URL must win over the internal AGENTFIELD_SERVER the
# agent uses to reach the CP. Conflating them made answered reviews
# undeliverable (the webhook "fetch failed" on an internal hostname).


def test_webhook_url_prefers_public_env_over_internal_server(monkeypatch):
monkeypatch.setenv("AGENTFIELD_PUBLIC_URL", "https://cp.public.example")
monkeypatch.setenv("AGENTFIELD_SERVER", "http://control-plane.railway.internal:8080")
app = SimpleNamespace(agentfield_server="http://control-plane.railway.internal:8080")
assert (
approval_webhook_url(app)
== "https://cp.public.example/api/v1/webhooks/approval-response"
)


def test_webhook_url_falls_back_to_server_when_no_public(monkeypatch):
monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False)
monkeypatch.delenv("AGENTFIELD_SERVER", raising=False)
app = SimpleNamespace(agentfield_server="http://cp.local:8080")
assert approval_webhook_url(app) == "http://cp.local:8080/api/v1/webhooks/approval-response"


def test_webhook_url_uses_server_env_when_app_attr_missing(monkeypatch):
monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False)
monkeypatch.setenv("AGENTFIELD_SERVER", "http://cp.env:8080")
app = SimpleNamespace(agentfield_server=None)
assert approval_webhook_url(app) == "http://cp.env:8080/api/v1/webhooks/approval-response"


def test_webhook_url_none_when_nothing_configured(monkeypatch):
monkeypatch.delenv("AGENTFIELD_PUBLIC_URL", raising=False)
monkeypatch.delenv("AGENTFIELD_SERVER", raising=False)
app = SimpleNamespace(agentfield_server=None)
assert approval_webhook_url(app) is None


# --- watchdog safety -----------------------------------------------------


Expand Down
Loading
Loading