From 79a725737da92dc5b8305024f3814c6cc79a9ed5 Mon Sep 17 00:00:00 2001 From: garnet Date: Sat, 23 May 2026 15:09:39 -0500 Subject: [PATCH 1/2] fix(synthbench): refuse display-label model recs from --best-model-for (sy-kh3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SynthBench product/ensemble leaderboard rows can carry a human-readable display label (e.g. "SynthPanel (Gemini Flash Lite)") in their model field rather than a runnable provider model id. --best-model-for stamped that label straight onto --model, deferring the failure to call time as an opaque provider "not a valid model ID" error (gh-519). - Add is_runnable_model_id(): structural check rejecting whitespace/paren display labels while accepting bare openai-compat/local ids. - recommend() now sets a runnable flag on Recommendation, and only adopts a config_id-derived base model when it's recognized (known provider prefix or registered alias) — never a hash fragment from a hyphenated config_id split. - _apply_best_model_for() refuses a non-runnable recommendation with an actionable message and falls back to the existing --model/default instead of stamping the label; --dry-run surfaces this upfront. Closes #519. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 7 +++ src/synth_panel/cli/commands.py | 20 ++++++ src/synth_panel/synthbench.py | 99 ++++++++++++++++++++++++++++-- tests/test_best_model_for_cli.py | 92 +++++++++++++++++++++++++++ tests/test_synthbench_recommend.py | 81 ++++++++++++++++++++++++ 5 files changed, 294 insertions(+), 5 deletions(-) create mode 100644 tests/test_best_model_for_cli.py diff --git a/README.md b/README.md index 5aea750..bc6524b 100644 --- a/README.md +++ b/README.md @@ -998,6 +998,13 @@ The leaderboard is cached for 24 hours at [docs/recommended-models.md](docs/recommended-models.md) for the full rules, offline behaviour, and a use-case → top-model table. +If the top-ranked entry is a product/ensemble row that only exposes a +display label (e.g. `SynthPanel (Gemini Flash Lite)`) rather than a +runnable provider model id, `--best-model-for` refuses to stamp it onto +`--model` — it prints an actionable message and falls back to your +existing `--model`/default. Pair with `--dry-run` to see the picked +model (and any such refusal) before any LLM call is made. + ### Model packs (agent guidance) `--best-model-for` picks one model; **model packs** pick a model *mix* diff --git a/src/synth_panel/cli/commands.py b/src/synth_panel/cli/commands.py index 374fe57..b44e5b9 100644 --- a/src/synth_panel/cli/commands.py +++ b/src/synth_panel/cli/commands.py @@ -195,6 +195,26 @@ def _apply_best_model_for(args: argparse.Namespace, spec: str) -> str | None: return None print(rec.format_line(), file=sys.stderr) + + # gh-519: SynthBench product/ensemble rows can surface a display label + # (e.g. "SynthPanel (Gemini Flash Lite)") in the model field instead of + # a runnable provider model id. Stamping that onto --model defers the + # failure to call time as an opaque provider "not a valid model ID" + # error. Refuse here with an actionable message and fall back to the + # existing --model / default selection (matching the no-recommendation + # path), so --dry-run also makes the non-runnable pick obvious upfront. + if not rec.runnable: + print( + f"synthbench: recommendation '{rec.model}' is a display label, not a " + f"runnable model id — skipping. The top entry for '{spec}' is a " + f"product/ensemble config with no resolvable base model. Pass --model " + f"with a provider model id (e.g. claude-sonnet-4-6, gemini-2.5-flash) " + f"explicitly, or choose a different topic. Continuing with existing " + f"model selection.", + file=sys.stderr, + ) + return None + if rec.source == "bundled-snapshot": print( "synthbench: note — recommendation derived from the package-bundled " diff --git a/src/synth_panel/synthbench.py b/src/synth_panel/synthbench.py index d2f6559..32aee0b 100644 --- a/src/synth_panel/synthbench.py +++ b/src/synth_panel/synthbench.py @@ -93,6 +93,19 @@ class Recommendation: fetched_at: datetime cache_age_hours: float low_confidence: bool + runnable: bool = True + """Whether :attr:`model` is a runnable provider model id (gh-519). + + SynthBench product/ensemble rows can surface a human-readable display + label (e.g. ``"SynthPanel (Gemini Flash Lite)"``) in their ``model`` + field rather than a provider-runnable id. Stamping such a label onto + ``--model`` fails at call time with a provider "not a valid model ID" + error. ``recommend`` sets this to ``False`` when neither the entry's + model field nor an inferred config base resolves to a runnable id, so + the CLI can refuse with an actionable message instead of producing a + non-runnable selection. Defaults to ``True`` so test fixtures that + construct :class:`Recommendation` directly keep their prior shape. + """ source: str = "live" """Provenance discriminator — one of :data:`RECOMMENDATION_SOURCES`. @@ -546,6 +559,73 @@ def _resolve_model_string(raw: str) -> str: return raw +def is_runnable_model_id(model: str) -> bool: + """Return ``True`` if *model* looks like a runnable provider model id. + + SynthBench leaderboard rows — especially product/ensemble configs — + sometimes carry a human-readable display label in their ``model`` + field (e.g. ``"SynthPanel (Gemini Flash Lite)"``) rather than a + provider-runnable id. A label like that, stamped onto ``--model``, + fails at call time with a provider "not a valid model ID" error + (gh-519). + + The discriminator is structural rather than an allow-list: real model + ids (``claude-haiku-4-5-20251001``, ``gemini-2.5-flash``, ``grok-3``, + ``gpt-4o-mini``, ``openrouter/...``, short aliases like ``haiku``, and + ``ollama:``/``local:`` specs) are single whitespace-free tokens, while + display labels contain spaces and/or parentheses. The OpenAI-compatible + and local backends accept arbitrary bare ids, so enumerating known + prefixes would wrongly reject valid local/self-hosted models — instead + we reject anything empty, whitespace-bearing, or paren-bearing and + accept the rest. + """ + if not model: + return False + text = model.strip() + if not text: + return False + if any(ch.isspace() for ch in text): + return False + return "(" not in text and ")" not in text + + +# Canonical provider id prefixes (mirrors the ``model_prefixes`` declared on +# each provider's ProviderConfig in synth_panel.llm.providers.*). Kept local so +# resolving a recommendation doesn't have to import the full LLM client stack; +# only used as a *recognition* heuristic, never for routing. +_KNOWN_PROVIDER_PREFIXES: tuple[str, ...] = ( + "claude-", + "gemini-", + "grok-", + "gpt-", + "openrouter/", +) + + +def _is_recognized_model_id(model: str) -> bool: + """Stricter than :func:`is_runnable_model_id` — used to gate config-id-derived + base models (gh-519). + + ``_underlying_base`` extracts a token from ``config_id`` by splitting on + separators, which for hyphenated ids like + ``synthpanel-gemini-flash-lite-ba37570c`` yields a meaningless hash + fragment (``ba37570c``). That fragment is whitespace-free so it passes + :func:`is_runnable_model_id`, yet it is not a real model id. We only adopt + a config-derived base when it matches a known provider prefix or is a + registered short alias — otherwise we keep the original (display) label so + the caller refuses rather than swapping in garbage. + """ + if not is_runnable_model_id(model): + return False + text = model.strip() + if any(text.startswith(prefix) for prefix in _KNOWN_PROVIDER_PREFIXES): + return True + # Registered short alias (resolve_alias maps it to a different canonical id). + from synth_panel.llm.aliases import resolve_alias + + return resolve_alias(text) != text + + def _underlying_base(entry: dict[str, Any]) -> str | None: """If an entry is an ensemble/product config, try to surface a base model.""" config_id = entry.get("config_id") @@ -602,18 +682,26 @@ def recommend( framework = entry.get("framework") if isinstance(entry.get("framework"), str) else None is_ensemble = bool(entry.get("is_ensemble")) - # Product/ensemble configs aren't runnable as-is — fall back to an - # inferred base model from config_id when possible. - if (is_ensemble or framework == "product") and not resolved_model: + # Product/ensemble configs aren't runnable as-is. When the entry's + # model field is empty OR a non-runnable display label (gh-519: e.g. + # "SynthPanel (Gemini Flash Lite)"), try to infer a runnable base model + # from config_id instead. Only adopt the inferred base when it itself + # resolves to a runnable id — otherwise keep the original string so the + # caller can report it verbatim and refuse rather than silently swap in + # a config-id hash fragment. + if (is_ensemble or framework == "product") and not is_runnable_model_id(resolved_model): base = _underlying_base(entry) if base: - resolved_model = _resolve_model_string(base) + candidate = _resolve_model_string(base) + if _is_recognized_model_id(candidate): + resolved_model = candidate + final_model = resolved_model or raw_model run_count = _as_int(entry, "run_count") or 0 cache_age_hours = max(0.0, (datetime.now(timezone.utc) - fetched_at).total_seconds() / 3600.0) return Recommendation( - model=resolved_model or raw_model, + model=final_model, raw_model=raw_model, provider=str(entry.get("provider") or ""), dataset=dataset, @@ -628,5 +716,6 @@ def recommend( fetched_at=fetched_at, cache_age_hours=cache_age_hours, low_confidence=run_count > 0 and run_count < MIN_RUN_COUNT, + runnable=is_runnable_model_id(final_model), source=source, ) diff --git a/tests/test_best_model_for_cli.py b/tests/test_best_model_for_cli.py new file mode 100644 index 0000000..0452177 --- /dev/null +++ b/tests/test_best_model_for_cli.py @@ -0,0 +1,92 @@ +"""CLI-level tests for ``--best-model-for`` model stamping (gh-519). + +These exercise ``_apply_best_model_for``: the bridge between a SynthBench +:class:`Recommendation` and ``args.model``. The core regression guarded here +is that a non-runnable recommendation (a product/ensemble display label like +``"SynthPanel (Gemini Flash Lite)"``) must NOT be stamped onto ``--model`` — +it has to be refused with an actionable message and fall back to the existing +selection, instead of deferring an opaque provider "not a valid model ID" +failure to call time. +""" + +from __future__ import annotations + +import argparse +from datetime import datetime, timezone + +import pytest + +from synth_panel import synthbench +from synth_panel.cli.commands import _apply_best_model_for +from synth_panel.synthbench import Recommendation + + +def _rec(*, model: str, runnable: bool, is_ensemble: bool = False, framework: str | None = None) -> Recommendation: + return Recommendation( + model=model, + raw_model=model, + provider="", + dataset="globalopinionqa", + topic="Technology & Digital Life", + sps=0.9, + jsd=0.1, + n=100, + cost_per_100q=0.5, + run_count=10, + framework=framework, + is_ensemble=is_ensemble, + fetched_at=datetime.now(timezone.utc), + cache_age_hours=0.0, + low_confidence=False, + runnable=runnable, + source="live", + ) + + +def test_runnable_recommendation_is_stamped(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + synthbench, "recommend", lambda spec: _rec(model="gemini-2.5-flash", runnable=True) + ) + args = argparse.Namespace(model=None) + picked = _apply_best_model_for(args, "Technology & Digital Life") + assert picked == "gemini-2.5-flash" + assert args.model == "gemini-2.5-flash" + + +def test_nonrunnable_recommendation_is_refused_not_stamped( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setattr( + synthbench, + "recommend", + lambda spec: _rec( + model="SynthPanel (Gemini Flash Lite)", + runnable=False, + is_ensemble=True, + framework="product", + ), + ) + args = argparse.Namespace(model="sonnet") + picked = _apply_best_model_for(args, "Technology & Digital Life") + + # Refused: returns None and leaves the prior selection untouched so the + # caller falls through to --model / the default. + assert picked is None + assert args.model == "sonnet" + + err = capsys.readouterr().err + # The display label is surfaced verbatim and the message is actionable. + assert "SynthPanel (Gemini Flash Lite)" in err + assert "display label" in err + assert "--model" in err + + +def test_no_recommendation_falls_through( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setattr(synthbench, "recommend", lambda spec: None) + args = argparse.Namespace(model="haiku") + picked = _apply_best_model_for(args, "Whatever") + assert picked is None + assert args.model == "haiku" + assert "no recommendation" in capsys.readouterr().err diff --git a/tests/test_synthbench_recommend.py b/tests/test_synthbench_recommend.py index 79ec88d..c63d775 100644 --- a/tests/test_synthbench_recommend.py +++ b/tests/test_synthbench_recommend.py @@ -22,6 +22,7 @@ SYNTHBENCH_REFRESH_ENV, SYNTHBENCH_URL_ENV, cache_path, + is_runnable_model_id, load_leaderboard, parse_target, rank_entries, @@ -240,6 +241,86 @@ def test_recommend_ensemble_falls_back_to_config_base(monkeypatch: pytest.Monkey assert rec is not None assert rec.is_ensemble is True assert rec.model == "claude-haiku-4-5-20251001" + assert rec.runnable is True + + +@pytest.mark.parametrize( + "model, expected", + [ + ("claude-haiku-4-5-20251001", True), + ("gemini-2.5-flash", True), + ("grok-3", True), + ("gpt-4o-mini", True), + ("openrouter/anthropic/claude-3.5", True), + ("haiku", True), + ("ollama:llama3", True), + ("llama-3.3-70b-instruct", True), + ("", False), + (" ", False), + ("SynthPanel (Gemini Flash Lite)", False), + ("Gemini Flash Lite", False), + ("claude (sonnet)", False), + ], +) +def test_is_runnable_model_id(model: str, expected: bool) -> None: + assert is_runnable_model_id(model) is expected + + +def test_recommend_display_label_marked_not_runnable() -> None: + """gh-519: a product/ensemble row whose model field is a display label + (and whose config_id yields no runnable base) must surface as + runnable=False rather than stamping a bogus model id.""" + board = { + "entries": [ + _entry( + model="SynthPanel (Gemini Flash Lite)", + sps=0.95, + framework="product", + is_ensemble=True, + # config_id tail is a hash fragment, not a resolvable base + config_id="synthpanel-gemini-flash-lite-tdefault-ba37570c", + ) + ] + } + rec = recommend(":globalopinionqa", leaderboard=board) + assert rec is not None + assert rec.is_ensemble is True + assert rec.runnable is False + # The original label is preserved verbatim so the caller can report it. + assert rec.model == "SynthPanel (Gemini Flash Lite)" + + +def test_recommend_display_label_resolves_runnable_base_from_config_id( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When a product/ensemble row carries a display label but the config_id + encodes a resolvable base model, the recommendation becomes runnable.""" + from synth_panel.llm import aliases + + monkeypatch.setattr(aliases, "_ALIASES_FILE", Path("/nonexistent/aliases.yaml")) + monkeypatch.delenv("SYNTHPANEL_MODEL_ALIASES", raising=False) + aliases._reset_cache() + board = { + "entries": [ + _entry( + model="SynthPanel (Haiku)", + sps=0.95, + framework="product", + is_ensemble=True, + config_id="synthpanel:haiku", + ) + ] + } + rec = recommend(":globalopinionqa", leaderboard=board) + assert rec is not None + assert rec.runnable is True + assert rec.model == "claude-haiku-4-5-20251001" + + +def test_recommend_plain_model_is_runnable() -> None: + rec = recommend("Economy & Work", leaderboard=SAMPLE) + assert rec is not None + assert rec.runnable is True def test_recommend_low_confidence_flag() -> None: From 0839f8e4b5532601d4675641fb94bffcbff03c44 Mon Sep 17 00:00:00 2001 From: garnet Date: Sat, 23 May 2026 15:24:53 -0500 Subject: [PATCH 2/2] style: ruff format test_best_model_for_cli.py (sy-kh3) Collapse multi-line calls that fit the project's line length so `ruff format --check` passes in CI. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_best_model_for_cli.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_best_model_for_cli.py b/tests/test_best_model_for_cli.py index 0452177..433c78b 100644 --- a/tests/test_best_model_for_cli.py +++ b/tests/test_best_model_for_cli.py @@ -44,9 +44,7 @@ def _rec(*, model: str, runnable: bool, is_ensemble: bool = False, framework: st def test_runnable_recommendation_is_stamped(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr( - synthbench, "recommend", lambda spec: _rec(model="gemini-2.5-flash", runnable=True) - ) + monkeypatch.setattr(synthbench, "recommend", lambda spec: _rec(model="gemini-2.5-flash", runnable=True)) args = argparse.Namespace(model=None) picked = _apply_best_model_for(args, "Technology & Digital Life") assert picked == "gemini-2.5-flash" @@ -81,9 +79,7 @@ def test_nonrunnable_recommendation_is_refused_not_stamped( assert "--model" in err -def test_no_recommendation_falls_through( - monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] -) -> None: +def test_no_recommendation_falls_through(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: monkeypatch.setattr(synthbench, "recommend", lambda spec: None) args = argparse.Namespace(model="haiku") picked = _apply_best_model_for(args, "Whatever")