diff --git a/.env.example b/.env.example index a2e7f4e8..d797aa1d 100644 --- a/.env.example +++ b/.env.example @@ -24,4 +24,4 @@ # # ANTHROPIC_API_KEY=sk-ant-api03-REPLACE_ME # Optional model pin -# ANTHROPIC_MODEL=claude-sonnet-4-7-20260101 +# ANTHROPIC_MODEL=claude-sonnet-4-6 diff --git a/README.md b/README.md index 9db43e94..2de132a0 100644 --- a/README.md +++ b/README.md @@ -224,7 +224,7 @@ max_budget_usd = 5.0 # per-pipeline budget cap (>= 0) persona = "falcor" # CLI output persona [models] # bring your own provider key — strings live here -reasoning = "claude-opus-4-7" # researcher, reviewer, synthesizer, analyst +reasoning = "claude-opus-4-8" # researcher, reviewer, synthesizer, analyst fast = "claude-haiku-4-5" # tester, implementer, verifier, publisher, closer balanced = "claude-sonnet-4-6" # fallback for unknown role strings diff --git a/src/bonfire/dispatch/pydantic_ai_backend.py b/src/bonfire/dispatch/pydantic_ai_backend.py index 5c6b3bdf..fe6c704b 100644 --- a/src/bonfire/dispatch/pydantic_ai_backend.py +++ b/src/bonfire/dispatch/pydantic_ai_backend.py @@ -35,7 +35,7 @@ class PydanticAIBackend: Parameters ---------- model - The model identifier string (e.g. ``"claude-sonnet-4-20250514"``). + The model identifier string (e.g. ``"claude-sonnet-4-6"``). """ def __init__(self, model: str = "") -> None: diff --git a/src/bonfire/models/config.py b/src/bonfire/models/config.py index e7cf8260..534d018c 100644 --- a/src/bonfire/models/config.py +++ b/src/bonfire/models/config.py @@ -90,7 +90,7 @@ class ModelsConfig(BaseModel): configured backend. 
""" - reasoning: str = "claude-opus-4-7" + reasoning: str = "claude-opus-4-8" fast: str = "claude-haiku-4-5" balanced: str = "claude-sonnet-4-6" diff --git a/tests/e2e/prompts/runner-prompt.md b/tests/e2e/prompts/runner-prompt.md index 4bd2bb29..bac64ad9 100644 --- a/tests/e2e/prompts/runner-prompt.md +++ b/tests/e2e/prompts/runner-prompt.md @@ -18,7 +18,7 @@ The project at /workspace/target has one deliberately broken test. Your job is t Append two JSONL lines (one JSON object per line, newline-terminated). Use today's unix timestamp for `timestamp` (call `date +%s.%N`). Use the SESSION_ID env var for `session_id`. Approximate the cost and duration based on what you observed. Line 1 (DispatchRecord — your work counts as one dispatch): -{"type":"dispatch","timestamp":1714564800.123,"session_id":"","agent_name":"claude-cli","cost_usd":0.42,"duration_seconds":187.5,"model":"claude-sonnet-4-7-20260101"} +{"type":"dispatch","timestamp":1714564800.123,"session_id":"","agent_name":"claude-cli","cost_usd":0.42,"duration_seconds":187.5,"model":"claude-sonnet-4-6"} Line 2 (PipelineRecord — your work counts as a one-stage pipeline): {"type":"pipeline","timestamp":1714564800.456,"session_id":"","total_cost_usd":0.42,"duration_seconds":187.5,"stages_completed":1} diff --git a/tests/integration/test_budget_enforcement.py b/tests/integration/test_budget_enforcement.py index 72ce3ea5..f677df91 100644 --- a/tests/integration/test_budget_enforcement.py +++ b/tests/integration/test_budget_enforcement.py @@ -309,9 +309,9 @@ async def test_max_budget_usd_reaches_claude_agent_options(self) -> None: ): backend = ClaudeSDKBackend() await backend.execute( - Envelope(task="t", agent_name="scout-agent", model="claude-opus-4-7"), + Envelope(task="t", agent_name="scout-agent", model="claude-opus-4-8"), options=DispatchOptions( - model="claude-opus-4-7", + model="claude-opus-4-8", max_budget_usd=3.14, max_turns=7, ), @@ -334,7 +334,7 @@ async def 
test_max_budget_usd_zero_default_preserved(self) -> None: # Default DispatchOptions.max_budget_usd is 0.0 per protocols.py:62. await backend.execute( Envelope(task="t", agent_name="scout-agent"), - options=DispatchOptions(model="claude-opus-4-7"), + options=DispatchOptions(model="claude-opus-4-8"), ) assert captured["max_budget_usd"] == pytest.approx(0.0) @@ -359,9 +359,9 @@ async def test_max_turns_reaches_claude_agent_options(self) -> None: ): backend = ClaudeSDKBackend() await backend.execute( - Envelope(task="t", agent_name="knight-agent", model="claude-opus-4-7"), + Envelope(task="t", agent_name="knight-agent", model="claude-opus-4-8"), options=DispatchOptions( - model="claude-opus-4-7", + model="claude-opus-4-8", max_turns=42, max_budget_usd=1.0, ), @@ -381,7 +381,7 @@ async def test_max_turns_default_reaches_sdk(self) -> None: backend = ClaudeSDKBackend() await backend.execute( Envelope(task="t", agent_name="scout-agent"), - options=DispatchOptions(model="claude-opus-4-7"), + options=DispatchOptions(model="claude-opus-4-8"), ) assert captured["max_turns"] == 10 diff --git a/tests/unit/test_bard_handler.py b/tests/unit/test_bard_handler.py index b489fc40..6f760fd1 100644 --- a/tests/unit/test_bard_handler.py +++ b/tests/unit/test_bard_handler.py @@ -1151,7 +1151,7 @@ async def test_custom_config_does_not_override_slug_constants( github_client, ) -> None: """slug_max_len + suffix_chars are module-scope constants, not config knobs.""" - cfg = PipelineConfig(model="claude-opus-4-7") + cfg = PipelineConfig(model="claude-opus-4-8") handler = BardHandler( git_workflow=git_workflow, github_client=github_client, diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index b5e3082b..7912eb67 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -83,13 +83,13 @@ def test_default_construction(self): def test_custom_values(self): p = PipelineConfig( tier="pro", - model="claude-opus-4", + model="claude-opus-4-8", max_turns=20, 
max_budget_usd=50.0, persona="anta", ) assert p.tier == "pro" - assert p.model == "claude-opus-4" + assert p.model == "claude-opus-4-8" assert p.max_turns == 20 assert p.max_budget_usd == 50.0 assert p.persona == "anta" @@ -380,7 +380,7 @@ def test_agents_from_toml(self, tmp_path, monkeypatch): # Locks the ``[models]`` TOML schema delivered by BON-350: # # * §D5 — three string fields (``reasoning``/``fast``/``balanced``) -# defaulting to ``claude-opus-4-7``/``claude-haiku-4-5``/``claude-sonnet-4-6``. +# defaulting to ``claude-opus-4-8``/``claude-haiku-4-5``/``claude-sonnet-4-6``. # * §D5 — TOML ``[models]`` section loads onto ``BonfireSettings.models``; # missing section falls back to defaults (backward-compatible). # * §D5 — Arbitrary strings accepted (BYOK passthrough). @@ -399,7 +399,7 @@ def test_default_construction_recommends_anthropic(self): from bonfire.models.config import ModelsConfig m = ModelsConfig() - assert m.reasoning == "claude-opus-4-7" + assert m.reasoning == "claude-opus-4-8" assert m.fast == "claude-haiku-4-5" assert m.balanced == "claude-sonnet-4-6" @@ -453,13 +453,13 @@ class TestModelsConfigTomlPartialOverride: ), ( '[models]\nfast = "Y"\n', - "claude-opus-4-7", + "claude-opus-4-8", "Y", "claude-sonnet-4-6", ), ( '[models]\nbalanced = "Z"\n', - "claude-opus-4-7", + "claude-opus-4-8", "claude-haiku-4-5", "Z", ), diff --git a/tests/unit/test_cost_analyzer.py b/tests/unit/test_cost_analyzer.py index 2c4f492d..3092e000 100644 --- a/tests/unit/test_cost_analyzer.py +++ b/tests/unit/test_cost_analyzer.py @@ -576,7 +576,7 @@ def test_groups_records_by_model(self, ledger_path: Path) -> None: agent_name="a", cost_usd=0.10, duration_seconds=1.0, - model="claude-opus-4-7", + model="claude-opus-4-8", ), DispatchRecord( timestamp=2.0, @@ -584,7 +584,7 @@ def test_groups_records_by_model(self, ledger_path: Path) -> None: agent_name="b", cost_usd=0.05, duration_seconds=2.0, - model="claude-opus-4-7", + model="claude-opus-4-8", ), DispatchRecord( 
timestamp=3.0, @@ -601,7 +601,7 @@ def test_groups_records_by_model(self, ledger_path: Path) -> None: assert len(results) == 2 assert all(isinstance(r, ModelCost) for r in results) models_to_records = {r.model: r for r in results} - assert set(models_to_records) == {"claude-opus-4-7", "claude-haiku-4-5"} + assert set(models_to_records) == {"claude-opus-4-8", "claude-haiku-4-5"} def test_sort_descending_by_cost(self, ledger_path: Path) -> None: """Sage memo D8 — sort key is ``total_cost_usd``, descending. Same @@ -659,7 +659,7 @@ def test_legacy_empty_model_grouped_visible(self, ledger_path: Path) -> None: agent_name="modern-b", cost_usd=0.10, duration_seconds=1.0, - model="claude-opus-4-7", + model="claude-opus-4-8", ), ] _write_records(ledger_path, recs) @@ -668,7 +668,7 @@ def test_legacy_empty_model_grouped_visible(self, ledger_path: Path) -> None: assert len(results) == 2 models = {r.model for r in results} assert "" in models - assert "claude-opus-4-7" in models + assert "claude-opus-4-8" in models def test_dispatch_count_correct(self, ledger_path: Path) -> None: """Sage memo D8 — ``dispatch_count`` reflects the number of records @@ -704,7 +704,7 @@ def test_total_duration_summed(self, ledger_path: Path) -> None: agent_name="a", cost_usd=0.01, duration_seconds=12.5, - model="claude-opus-4-7", + model="claude-opus-4-8", ), DispatchRecord( timestamp=2.0, @@ -712,7 +712,7 @@ def test_total_duration_summed(self, ledger_path: Path) -> None: agent_name="b", cost_usd=0.02, duration_seconds=7.5, - model="claude-opus-4-7", + model="claude-opus-4-8", ), ] _write_records(ledger_path, recs) @@ -765,7 +765,7 @@ def test_pathological_corpus_groups_correctly_and_sorts(self, ledger_path: Path) spec'd dict-based group, it stays fast. 
Cost layout (deterministic): - claude-opus-4-7 -> 4000 records * 0.04 = 160.00 + claude-opus-4-8 -> 4000 records * 0.04 = 160.00 claude-sonnet-4-6 -> 3000 records * 0.03 = 90.00 claude-haiku-4-5 -> 2000 records * 0.02 = 40.00 claude-extra-1 -> 900 records * 0.05 = 45.00 @@ -778,7 +778,7 @@ def test_pathological_corpus_groups_correctly_and_sorts(self, ledger_path: Path) n_total = 10_000 layout: list[tuple[str, int, float]] = [ - ("claude-opus-4-7", 4000, 0.04), + ("claude-opus-4-8", 4000, 0.04), ("claude-sonnet-4-6", 3000, 0.03), ("claude-haiku-4-5", 2000, 0.02), ("claude-extra-1", 900, 0.05), @@ -811,7 +811,7 @@ def test_pathological_corpus_groups_correctly_and_sorts(self, ledger_path: Path) # Descending by total_cost_usd. names = [m.model for m in results] assert names == [ - "claude-opus-4-7", + "claude-opus-4-8", "claude-sonnet-4-6", "claude-extra-1", "claude-haiku-4-5", @@ -819,8 +819,8 @@ def test_pathological_corpus_groups_correctly_and_sorts(self, ledger_path: Path) ] # Spot-check totals. by_name = {m.model: m for m in results} - assert by_name["claude-opus-4-7"].total_cost_usd == pytest.approx(160.0) - assert by_name["claude-opus-4-7"].dispatch_count == 4000 + assert by_name["claude-opus-4-8"].total_cost_usd == pytest.approx(160.0) + assert by_name["claude-opus-4-8"].dispatch_count == 4000 assert by_name["claude-haiku-4-5"].total_cost_usd == pytest.approx(40.0) assert by_name["claude-extra-2"].dispatch_count == 100 @@ -855,7 +855,7 @@ def test_mixed_legacy_and_new_rows_round_trip(self, ledger_path: Path) -> None: agent_name="a", cost_usd=0.30, duration_seconds=2.0, - model="claude-opus-4-7", + model="claude-opus-4-8", ) new_b = DispatchRecord( timestamp=5.0, @@ -873,7 +873,7 @@ def test_mixed_legacy_and_new_rows_round_trip(self, ledger_path: Path) -> None: # Three buckets: "", opus, haiku. 
by_name = {m.model: m for m in results} - assert set(by_name) == {"", "claude-opus-4-7", "claude-haiku-4-5"} + assert set(by_name) == {"", "claude-opus-4-8", "claude-haiku-4-5"} # Legacy bucket is preserved as a visible "" key with summed cost. legacy = by_name[""] @@ -881,15 +881,15 @@ def test_mixed_legacy_and_new_rows_round_trip(self, ledger_path: Path) -> None: assert legacy.total_cost_usd == pytest.approx(0.85) # New rows attributed to their model strings. - assert by_name["claude-opus-4-7"].dispatch_count == 1 - assert by_name["claude-opus-4-7"].total_cost_usd == pytest.approx(0.30) + assert by_name["claude-opus-4-8"].dispatch_count == 1 + assert by_name["claude-opus-4-8"].total_cost_usd == pytest.approx(0.30) assert by_name["claude-haiku-4-5"].dispatch_count == 1 assert by_name["claude-haiku-4-5"].total_cost_usd == pytest.approx(0.20) # Sort descending by cost: legacy (0.85) > opus (0.30) > haiku (0.20). assert [m.model for m in results] == [ "", - "claude-opus-4-7", + "claude-opus-4-8", "claude-haiku-4-5", ] diff --git a/tests/unit/test_cost_analyzer_memoization.py b/tests/unit/test_cost_analyzer_memoization.py index 16b2f377..b556eb1d 100644 --- a/tests/unit/test_cost_analyzer_memoization.py +++ b/tests/unit/test_cost_analyzer_memoization.py @@ -156,7 +156,7 @@ def test_cache_invalidates_when_ledger_modified( agent_name="warrior", cost_usd=0.50, duration_seconds=60.0, - model="claude-opus-4-7", + model="claude-opus-4-8", ) _write_records(ledger_path, [*sample_records, new_record]) @@ -313,7 +313,7 @@ def test_same_mtime_inode_change_busts_cache( agent_name="sage", cost_usd=0.77, duration_seconds=42.0, - model="claude-opus-4-7", + model="claude-opus-4-8", ) _write_records(replacement, [*sample_records, extra_record]) os.replace(replacement, ledger_path) diff --git a/tests/unit/test_cost_models.py b/tests/unit/test_cost_models.py index 5f2550b0..13da95d7 100644 --- a/tests/unit/test_cost_models.py +++ b/tests/unit/test_cost_models.py @@ -363,9 +363,9 @@ def 
test_dispatch_record_accepts_model(self) -> None: agent_name="a", cost_usd=0.0, duration_seconds=0.0, - model="claude-opus-4-7", + model="claude-opus-4-8", ) - assert record.model == "claude-opus-4-7" + assert record.model == "claude-opus-4-8" def test_legacy_jsonl_row_without_model_loads(self) -> None: """Sage memo D5 — pre-BON-351 ledger rows omit ``model`` entirely. @@ -397,12 +397,12 @@ class TestModelCost: def test_model_cost_construction(self) -> None: """Sage memo D5 — four required fields, all positional-by-name.""" m = ModelCost( - model="claude-opus-4-7", + model="claude-opus-4-8", total_cost_usd=1.0, dispatch_count=3, total_duration_seconds=12.5, ) - assert m.model == "claude-opus-4-7" + assert m.model == "claude-opus-4-8" assert m.total_cost_usd == 1.0 assert m.dispatch_count == 3 assert m.total_duration_seconds == 12.5 diff --git a/tests/unit/test_dispatch_options_role.py b/tests/unit/test_dispatch_options_role.py index 6730ed51..faebc43e 100644 --- a/tests/unit/test_dispatch_options_role.py +++ b/tests/unit/test_dispatch_options_role.py @@ -159,13 +159,13 @@ def test_role_with_leading_slash_accepted(self) -> None: def test_role_paired_with_tools_and_model(self) -> None: """Sage D4 + D7 — role travels alongside tools and model.""" opts = DispatchOptions( - model="claude-opus-4-7", + model="claude-opus-4-8", tools=["Read", "Write", "Edit", "Bash", "Grep", "Glob"], role="warrior", ) assert opts.role == "warrior" assert opts.tools == ["Read", "Write", "Edit", "Bash", "Grep", "Glob"] - assert opts.model == "claude-opus-4-7" + assert opts.model == "claude-opus-4-8" # =========================================================================== diff --git a/tests/unit/test_dispatch_pydantic_ai_backend.py b/tests/unit/test_dispatch_pydantic_ai_backend.py index 82e63e35..09fc5251 100644 --- a/tests/unit/test_dispatch_pydantic_ai_backend.py +++ b/tests/unit/test_dispatch_pydantic_ai_backend.py @@ -96,7 +96,7 @@ def test_import_from_module(self): assert _P is not 
None def test_construct_with_model_kwarg(self): - backend = PydanticAIBackend(model="claude-sonnet-4") + backend = PydanticAIBackend(model="claude-sonnet-4-6") assert backend is not None def test_construct_with_no_args(self): diff --git a/tests/unit/test_dispatch_runner.py b/tests/unit/test_dispatch_runner.py index a4c93fa3..028c5e4d 100644 --- a/tests/unit/test_dispatch_runner.py +++ b/tests/unit/test_dispatch_runner.py @@ -802,12 +802,12 @@ async def test_started_event_carries_agent_and_model(self): env = _envelope(agent_name="knight") backend = ScriptedBackend([env.with_result("ok", cost_usd=0.01)]) await execute_with_retry( - backend, env, _options(model="claude-opus-4"), event_bus=bus, retry_delay=0.0 + backend, env, _options(model="claude-opus-4-8"), event_bus=bus, retry_delay=0.0 ) started = capture.of_type(DispatchStarted) assert len(started) == 1 assert started[0].agent_name == "knight" # type: ignore[attr-defined] - assert started[0].model == "claude-opus-4" # type: ignore[attr-defined] + assert started[0].model == "claude-opus-4-8" # type: ignore[attr-defined] async def test_completed_event_carries_cost_and_duration(self): bus, capture = _bus_with_capture(DispatchCompleted) @@ -974,7 +974,7 @@ async def health_check(self) -> bool: env = _envelope() opts = DispatchOptions( - model="claude-opus-4", + model="claude-opus-4-8", max_turns=7, max_budget_usd=2.5, thinking_depth="ultrathink", @@ -982,7 +982,7 @@ async def health_check(self) -> bool: await execute_with_retry(ObservingBackend(), env, opts, retry_delay=0.0) assert observed["options"] is opts - assert observed["options"].model == "claude-opus-4" + assert observed["options"].model == "claude-opus-4-8" assert observed["options"].thinking_depth == "ultrathink" async def test_envelope_reaches_backend_unchanged(self): @@ -1088,7 +1088,7 @@ async def execute(self, envelope: Envelope, *, options: DispatchOptions) -> Enve async def health_check(self) -> bool: return True - opts = 
DispatchOptions(model="claude-opus-4-7", max_budget_usd=1.0) + opts = DispatchOptions(model="claude-opus-4-8", max_budget_usd=1.0) result = await execute_with_retry( _ObservingBackend(), env, opts, max_retries=3, retry_delay=0.0 ) @@ -1097,7 +1097,7 @@ async def health_check(self) -> bool: assert result.retries == 2 assert len(observed_models) == 3 # Same model on every attempt — runner does NOT mutate options.model. - assert observed_models == ["claude-opus-4-7"] * 3 + assert observed_models == ["claude-opus-4-8"] * 3 async def test_completed_model_equals_started_model(self): """`DispatchCompleted.model` equals `DispatchStarted.model` for one dispatch. diff --git a/tests/unit/test_engine_executor.py b/tests/unit/test_engine_executor.py index 62f2ccf9..66baebe1 100644 --- a/tests/unit/test_engine_executor.py +++ b/tests/unit/test_engine_executor.py @@ -1180,7 +1180,7 @@ async def test_wizard_and_reviewer_resolve_same( Locks Sage D1 line 55: ``AgentRole.REVIEWER.value == "reviewer"``, the gamified alias ``"wizard"`` ALSO maps to AgentRole.REVIEWER. Both routes resolve to ``DEFAULT_ROLE_TIER[REVIEWER] == REASONING`` - -> ``settings.models.reasoning`` -> ``"claude-opus-4-7"`` by default. + -> ``settings.models.reasoning`` -> ``"claude-opus-4-8"`` by default. 
This is the dispatch-side guarantee that the wizard handler can safely pass the canonical string while workflow stages pass the diff --git a/tests/unit/test_events.py b/tests/unit/test_events.py index 82b29715..5023b4bb 100644 --- a/tests/unit/test_events.py +++ b/tests/unit/test_events.py @@ -279,10 +279,10 @@ def test_dispatch_completed_accepts_model(self): agent_name="x", cost_usd=0.0, duration_seconds=0.0, - model="claude-opus-4-7", + model="claude-opus-4-8", **SESSION, ) - assert e.model == "claude-opus-4-7" + assert e.model == "claude-opus-4-8" # --------------------------------------------------------------------------- diff --git a/tests/unit/test_model_id_allowlist_sweep.py b/tests/unit/test_model_id_allowlist_sweep.py new file mode 100644 index 00000000..43070c77 --- /dev/null +++ b/tests/unit/test_model_id_allowlist_sweep.py @@ -0,0 +1,116 @@ +"""Sweep test — every Claude model id in src/ + config is a real, current id. + +Walks every ``.py`` file under ``src/bonfire/`` plus the user-facing config +surfaces (``.env.example``, ``README.md``, ``pyproject.toml``) and extracts +every token shaped like a Claude *model* id. Each one must be a member of +the explicit allowlist of real, currently-served model ids below. + +Why this exists: model ids rot silently. A deprecated tier keeps working +until the provider retires it, then every call 404s at once; worse, an id +that was never real (a hallucinated date-suffixed variant) 404s on first +use while looking perfectly plausible in review. This sweep makes every +model-id literal in the shipped surface cost an explicit allowlist entry, +so a model ratchet is a deliberate, reviewed event instead of drift. + +Scope notes: + +* Only *model-family* tokens are checked (``claude-opus*``, ``claude-sonnet*``, + ``claude-haiku*``, ``claude-fable*``, ``claude-mythos*``). 
Other ``claude-*`` + tokens — package names (``claude-agent-sdk``), tool names (``claude-cli``, + ``claude-code``), editor dirs (``claude-dev``, ``claude-plugin``) — are not + model ids and are exempt. +* ``tests/`` is intentionally out of scope: fixtures use deliberately fake + tier placeholders (``claude-sonnet``, ``claude-opus``) that never reach a + provider. ``docs/audit/`` is frozen decision history and is never edited. + +Reads files on disk only — no subprocess, no network. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +# Repo root = ``repo/tests/unit/`` → ``repo/``. +_REPO_ROOT = Path(__file__).resolve().parents[2] +_SRC_DIR = _REPO_ROOT / "src" / "bonfire" + +# User-facing config surfaces that carry live defaults. +_CONFIG_FILES: tuple[str, ...] = ( + ".env.example", + "README.md", + "pyproject.toml", +) + +# Tokens shaped like a Claude id. The follow-up family filter decides +# whether a token is a *model* id (vs a package/tool name). +_CLAUDE_TOKEN = re.compile(r"claude-[a-z0-9][a-z0-9-]*") +_MODEL_FAMILY = re.compile(r"^claude-(opus|sonnet|haiku|fable|mythos)(-|$)") + +# --------------------------------------------------------------------------- +# Allowlist — real, currently-served model ids ONLY. +# --------------------------------------------------------------------------- +# Extending this set is a deliberate act: add an id only when the provider +# actually serves it. Dated variants are allowed only when they are real +# published snapshot ids (none are currently needed in this repo). 
+_ALLOWED_MODEL_IDS: frozenset[str] = frozenset(
+    {
+        "claude-fable-5",
+        "claude-opus-4-8",
+        "claude-sonnet-4-6",
+        "claude-haiku-4-5",
+    }
+)
+
+
+def _iter_scanned_files() -> list[Path]:
+    """All files in scope: src/bonfire/**/*.py + the config surfaces."""
+    files = [p for p in sorted(_SRC_DIR.rglob("*.py")) if "__pycache__" not in p.parts]
+    for name in _CONFIG_FILES:
+        candidate = _REPO_ROOT / name
+        if candidate.is_file():
+            files.append(candidate)
+    return files
+
+
+def _model_tokens(line: str) -> list[str]:
+    """Extract model-family claude tokens from one line of text."""
+    return [token for token in _CLAUDE_TOKEN.findall(line) if _MODEL_FAMILY.match(token)]
+
+
+def test_all_model_ids_in_src_and_config_are_allowlisted() -> None:
+    """Every model-id literal in src/ + config must be a real current id.
+
+    Failure message lists every offender as ``path:line: token`` so the
+    fixer can navigate directly. A new model generation means bumping the
+    literals AND the allowlist in the same change.
+    """
+    offenders: list[str] = []
+    for path in _iter_scanned_files():
+        rel = path.relative_to(_REPO_ROOT).as_posix()
+        text = path.read_text(encoding="utf-8")
+        for i, line in enumerate(text.splitlines(), start=1):
+            for token in _model_tokens(line):
+                if token not in _ALLOWED_MODEL_IDS:
+                    offenders.append(f" {rel}:{i}: {token}")
+
+    assert not offenders, (
+        "Found Claude model ids outside the current-id allowlist.\n"
+        "Each is deprecated, retired, or was never a real model id. "
+        "Bump it to a current id (or, for a genuinely new real id, extend "
+        "_ALLOWED_MODEL_IDS in this test with rationale).\n"
+        + "\n".join(offenders)  # "+" so the header is not the join separator
+    )
+
+
+def test_allowlist_ids_look_like_model_ids() -> None:
+    """Self-check: every allowlist entry matches the model-id shape.
+
+    Guards against typos in the allowlist itself (an entry that the
+    extractor could never produce would silently never match anything).
+ """ + malformed = [ + model_id + for model_id in _ALLOWED_MODEL_IDS + if not (_CLAUDE_TOKEN.fullmatch(model_id) and _MODEL_FAMILY.match(model_id)) + ] + assert not malformed, f"Malformed allowlist entries: {malformed}" diff --git a/tests/unit/test_onboard_scanner_claude_memory.py b/tests/unit/test_onboard_scanner_claude_memory.py index 785fb6c8..a135dd2e 100644 --- a/tests/unit/test_onboard_scanner_claude_memory.py +++ b/tests/unit/test_onboard_scanner_claude_memory.py @@ -145,7 +145,7 @@ async def test_panel_is_always_claude_memory(tmp_path): async def test_reads_settings_model(tmp_path): """Reports model override from settings.json.""" home = tmp_path / "home" - _build_claude_dir(home, settings={"model": "claude-sonnet-4-20250514"}) + _build_claude_dir(home, settings={"model": "claude-sonnet-4-6"}) project = tmp_path / "project" project.mkdir() @@ -157,7 +157,7 @@ async def test_reads_settings_model(tmp_path): events = _events(emit) model_events = [e for e in events if e.label == "model"] assert len(model_events) == 1 - assert model_events[0].value == "claude-sonnet-4-20250514" + assert model_events[0].value == "claude-sonnet-4-6" async def test_reads_settings_permissions(tmp_path): diff --git a/tests/unit/test_persona_cli.py b/tests/unit/test_persona_cli.py index 1f94296e..7d599c56 100644 --- a/tests/unit/test_persona_cli.py +++ b/tests/unit/test_persona_cli.py @@ -107,7 +107,7 @@ def test_set_preserves_existing_config( """persona set preserves other keys in bonfire.toml.""" monkeypatch.chdir(tmp_path) toml_path = tmp_path / "bonfire.toml" - toml_path.write_text('[bonfire]\nmodel = "claude-opus-4"\npersona = "falcor"\n') + toml_path.write_text('[bonfire]\nmodel = "claude-opus-4-8"\npersona = "falcor"\n') result = cli_runner.invoke(app, ["persona", "set", "minimal"]) assert result.exit_code == 0 @@ -115,7 +115,7 @@ def test_set_preserves_existing_config( with toml_path.open("rb") as f: data = tomllib.load(f) assert data["bonfire"]["persona"] == "minimal" - 
assert data["bonfire"]["model"] == "claude-opus-4" + assert data["bonfire"]["model"] == "claude-opus-4-8" def test_set_only_replaces_bonfire_section( self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path diff --git a/tests/unit/test_protocols.py b/tests/unit/test_protocols.py index 1029aad3..4d400c29 100644 --- a/tests/unit/test_protocols.py +++ b/tests/unit/test_protocols.py @@ -535,8 +535,8 @@ def test_max_budget_usd_default_is_zero(self): assert isinstance(opts.max_budget_usd, float) def test_override_model(self): - opts = DispatchOptions(model="claude-sonnet-4-20250514") - assert opts.model == "claude-sonnet-4-20250514" + opts = DispatchOptions(model="claude-sonnet-4-6") + assert opts.model == "claude-sonnet-4-6" def test_override_max_turns(self): opts = DispatchOptions(max_turns=3) diff --git a/tests/unit/test_sdk_backend_tool_presence.py b/tests/unit/test_sdk_backend_tool_presence.py index 2fcd553f..295f6961 100644 --- a/tests/unit/test_sdk_backend_tool_presence.py +++ b/tests/unit/test_sdk_backend_tool_presence.py @@ -80,7 +80,7 @@ async def _empty_query(*, prompt: str = "", options: Any = None): # type: ignor def _envelope(agent: str = "warrior-agent") -> Envelope: - return Envelope(task="do work", agent_name=agent, model="claude-opus-4-7") + return Envelope(task="do work", agent_name=agent, model="claude-opus-4-8") # =========================================================================== @@ -101,7 +101,7 @@ async def test_tools_kwarg_present_and_equals_options_tools(self) -> None: ): backend = ClaudeSDKBackend() options = DispatchOptions( - model="claude-opus-4-7", + model="claude-opus-4-8", tools=["Read", "Write", "Edit", "Bash", "Grep", "Glob"], role="warrior", ) @@ -127,7 +127,7 @@ async def test_allowed_tools_kwarg_remains_unchanged(self) -> None: ): backend = ClaudeSDKBackend() options = DispatchOptions( - model="claude-opus-4-7", + model="claude-opus-4-8", tools=["Read", "Write", "Edit", "Bash", "Grep", "Glob"], ) await 
backend.execute(_envelope(), options=options) @@ -295,10 +295,10 @@ async def test_model_kwarg_preserved(self) -> None: patch("bonfire.dispatch.sdk_backend.query", _empty_query), ): backend = ClaudeSDKBackend() - options = DispatchOptions(model="claude-opus-4-7", tools=["Read"]) + options = DispatchOptions(model="claude-opus-4-8", tools=["Read"]) await backend.execute(_envelope(), options=options) - assert captured.get("model") == "claude-opus-4-7" + assert captured.get("model") == "claude-opus-4-8" async def test_max_turns_preserved(self) -> None: captured, FakeOptions = _make_capture() @@ -443,7 +443,7 @@ async def test_role_kwarg_not_passed_to_claude_agent_options(self) -> None: ): backend = ClaudeSDKBackend() options = DispatchOptions( - model="claude-opus-4-7", + model="claude-opus-4-8", tools=["Read"], role="warrior", ) diff --git a/tests/unit/test_wizard_handler.py b/tests/unit/test_wizard_handler.py index 7aafc98e..9515fe86 100644 --- a/tests/unit/test_wizard_handler.py +++ b/tests/unit/test_wizard_handler.py @@ -246,7 +246,7 @@ async def emit(self, event: Any) -> None: def _make_config( *, - model: str = "claude-opus-4-7", + model: str = "claude-opus-4-8", max_turns: int = 10, max_budget_usd: float = 5.0, dispatch_timeout_seconds: float | None = None, @@ -497,14 +497,14 @@ async def test_setting_sources_empty(self) -> None: @pytest.mark.asyncio async def test_model_uses_config_when_no_override(self) -> None: """model = config.model when stage.model_override is None.""" - handler, backend, _ = _make_handler(config=_make_config(model="claude-opus-4-7")) + handler, backend, _ = _make_handler(config=_make_config(model="claude-opus-4-8")) await handler.handle(_make_stage(model_override=None), _make_envelope(), {}) - assert backend.captured_options.model == "claude-opus-4-7" + assert backend.captured_options.model == "claude-opus-4-8" @pytest.mark.asyncio async def test_model_uses_override_when_set(self) -> None: """stage.model_override wins over 
config.model.""" - handler, backend, _ = _make_handler(config=_make_config(model="claude-opus-4-7")) + handler, backend, _ = _make_handler(config=_make_config(model="claude-opus-4-8")) await handler.handle( _make_stage(model_override="claude-sonnet-4-6"), _make_envelope(), @@ -694,10 +694,10 @@ async def test_shows_reason_enum(self) -> None: @pytest.mark.asyncio async def test_shows_model_and_cost(self) -> None: - handler, _, gh = _make_handler(canned="", config=_make_config(model="claude-opus-4-7")) + handler, _, gh = _make_handler(canned="", config=_make_config(model="claude-opus-4-8")) await handler.handle(_make_stage(), _make_envelope(), {}) body = gh.actions[-1]["body"] - assert "claude-opus-4-7" in body + assert "claude-opus-4-8" in body assert "$0." in body @pytest.mark.asyncio