Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -998,6 +998,13 @@ The leaderboard is cached for 24 hours at
[docs/recommended-models.md](docs/recommended-models.md) for the full
rules, offline behaviour, and a use-case → top-model table.

If the top-ranked entry is a product/ensemble row that only exposes a
display label (e.g. `SynthPanel (Gemini Flash Lite)`) rather than a
runnable provider model id, `--best-model-for` refuses to stamp it onto
`--model` — it prints an actionable message and falls back to your
existing `--model`/default. Pair with `--dry-run` to see the picked
model (and any such refusal) before any LLM call is made.

### Model packs (agent guidance)

`--best-model-for` picks one model; **model packs** pick a model *mix*
Expand Down
20 changes: 20 additions & 0 deletions src/synth_panel/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,26 @@ def _apply_best_model_for(args: argparse.Namespace, spec: str) -> str | None:
return None

print(rec.format_line(), file=sys.stderr)

# gh-519: SynthBench product/ensemble rows can surface a display label
# (e.g. "SynthPanel (Gemini Flash Lite)") in the model field instead of
# a runnable provider model id. Stamping that onto --model defers the
# failure to call time as an opaque provider "not a valid model ID"
# error. Refuse here with an actionable message and fall back to the
# existing --model / default selection (matching the no-recommendation
# path), so --dry-run also makes the non-runnable pick obvious upfront.
if not rec.runnable:
print(
f"synthbench: recommendation '{rec.model}' is a display label, not a "
f"runnable model id — skipping. The top entry for '{spec}' is a "
f"product/ensemble config with no resolvable base model. Pass --model "
f"with a provider model id (e.g. claude-sonnet-4-6, gemini-2.5-flash) "
f"explicitly, or choose a different topic. Continuing with existing "
f"model selection.",
file=sys.stderr,
)
return None

if rec.source == "bundled-snapshot":
print(
"synthbench: note — recommendation derived from the package-bundled "
Expand Down
99 changes: 94 additions & 5 deletions src/synth_panel/synthbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,19 @@ class Recommendation:
fetched_at: datetime
cache_age_hours: float
low_confidence: bool
runnable: bool = True
"""Whether :attr:`model` is a runnable provider model id (gh-519).

SynthBench product/ensemble rows can surface a human-readable display
label (e.g. ``"SynthPanel (Gemini Flash Lite)"``) in their ``model``
field rather than a provider-runnable id. Stamping such a label onto
``--model`` fails at call time with a provider "not a valid model ID"
error. ``recommend`` sets this to ``False`` when neither the entry's
model field nor an inferred config base resolves to a runnable id, so
the CLI can refuse with an actionable message instead of producing a
non-runnable selection. Defaults to ``True`` so test fixtures that
construct :class:`Recommendation` directly keep their prior shape.
"""
source: str = "live"
"""Provenance discriminator — one of :data:`RECOMMENDATION_SOURCES`.

Expand Down Expand Up @@ -546,6 +559,73 @@ def _resolve_model_string(raw: str) -> str:
return raw


def is_runnable_model_id(model: str) -> bool:
"""Return ``True`` if *model* looks like a runnable provider model id.

SynthBench leaderboard rows — especially product/ensemble configs —
sometimes carry a human-readable display label in their ``model``
field (e.g. ``"SynthPanel (Gemini Flash Lite)"``) rather than a
provider-runnable id. A label like that, stamped onto ``--model``,
fails at call time with a provider "not a valid model ID" error
(gh-519).

The discriminator is structural rather than an allow-list: real model
ids (``claude-haiku-4-5-20251001``, ``gemini-2.5-flash``, ``grok-3``,
``gpt-4o-mini``, ``openrouter/...``, short aliases like ``haiku``, and
``ollama:``/``local:`` specs) are single whitespace-free tokens, while
display labels contain spaces and/or parentheses. The OpenAI-compatible
and local backends accept arbitrary bare ids, so enumerating known
prefixes would wrongly reject valid local/self-hosted models — instead
we reject anything empty, whitespace-bearing, or paren-bearing and
accept the rest.
"""
if not model:
return False
text = model.strip()
if not text:
return False
if any(ch.isspace() for ch in text):
return False
return "(" not in text and ")" not in text


# Canonical provider id prefixes (mirrors the ``model_prefixes`` declared on
# each provider's ProviderConfig in synth_panel.llm.providers.*). Kept local so
# resolving a recommendation doesn't have to import the full LLM client stack;
# only used as a *recognition* heuristic, never for routing.
_KNOWN_PROVIDER_PREFIXES: tuple[str, ...] = (
"claude-",
"gemini-",
"grok-",
"gpt-",
"openrouter/",
)


def _is_recognized_model_id(model: str) -> bool:
"""Stricter than :func:`is_runnable_model_id` — used to gate config-id-derived
base models (gh-519).

``_underlying_base`` extracts a token from ``config_id`` by splitting on
separators, which for hyphenated ids like
``synthpanel-gemini-flash-lite-ba37570c`` yields a meaningless hash
fragment (``ba37570c``). That fragment is whitespace-free so it passes
:func:`is_runnable_model_id`, yet it is not a real model id. We only adopt
a config-derived base when it matches a known provider prefix or is a
registered short alias — otherwise we keep the original (display) label so
the caller refuses rather than swapping in garbage.
"""
if not is_runnable_model_id(model):
return False
text = model.strip()
if any(text.startswith(prefix) for prefix in _KNOWN_PROVIDER_PREFIXES):
return True
# Registered short alias (resolve_alias maps it to a different canonical id).
from synth_panel.llm.aliases import resolve_alias

return resolve_alias(text) != text


def _underlying_base(entry: dict[str, Any]) -> str | None:
"""If an entry is an ensemble/product config, try to surface a base model."""
config_id = entry.get("config_id")
Expand Down Expand Up @@ -602,18 +682,26 @@ def recommend(
framework = entry.get("framework") if isinstance(entry.get("framework"), str) else None
is_ensemble = bool(entry.get("is_ensemble"))

# Product/ensemble configs aren't runnable as-is — fall back to an
# inferred base model from config_id when possible.
if (is_ensemble or framework == "product") and not resolved_model:
# Product/ensemble configs aren't runnable as-is. When the entry's
# model field is empty OR a non-runnable display label (gh-519: e.g.
# "SynthPanel (Gemini Flash Lite)"), try to infer a runnable base model
# from config_id instead. Only adopt the inferred base when it itself
# resolves to a runnable id — otherwise keep the original string so the
# caller can report it verbatim and refuse rather than silently swap in
# a config-id hash fragment.
if (is_ensemble or framework == "product") and not is_runnable_model_id(resolved_model):
base = _underlying_base(entry)
if base:
resolved_model = _resolve_model_string(base)
candidate = _resolve_model_string(base)
if _is_recognized_model_id(candidate):
resolved_model = candidate

final_model = resolved_model or raw_model
run_count = _as_int(entry, "run_count") or 0
cache_age_hours = max(0.0, (datetime.now(timezone.utc) - fetched_at).total_seconds() / 3600.0)

return Recommendation(
model=resolved_model or raw_model,
model=final_model,
raw_model=raw_model,
provider=str(entry.get("provider") or ""),
dataset=dataset,
Expand All @@ -628,5 +716,6 @@ def recommend(
fetched_at=fetched_at,
cache_age_hours=cache_age_hours,
low_confidence=run_count > 0 and run_count < MIN_RUN_COUNT,
runnable=is_runnable_model_id(final_model),
source=source,
)
88 changes: 88 additions & 0 deletions tests/test_best_model_for_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""CLI-level tests for ``--best-model-for`` model stamping (gh-519).

These exercise ``_apply_best_model_for``: the bridge between a SynthBench
:class:`Recommendation` and ``args.model``. The core regression guarded here
is that a non-runnable recommendation (a product/ensemble display label like
``"SynthPanel (Gemini Flash Lite)"``) must NOT be stamped onto ``--model`` —
it has to be refused with an actionable message and fall back to the existing
selection, instead of deferring an opaque provider "not a valid model ID"
failure to call time.
"""

from __future__ import annotations

import argparse
from datetime import datetime, timezone

import pytest

from synth_panel import synthbench
from synth_panel.cli.commands import _apply_best_model_for
from synth_panel.synthbench import Recommendation


def _rec(*, model: str, runnable: bool, is_ensemble: bool = False, framework: str | None = None) -> Recommendation:
return Recommendation(
model=model,
raw_model=model,
provider="",
dataset="globalopinionqa",
topic="Technology & Digital Life",
sps=0.9,
jsd=0.1,
n=100,
cost_per_100q=0.5,
run_count=10,
framework=framework,
is_ensemble=is_ensemble,
fetched_at=datetime.now(timezone.utc),
cache_age_hours=0.0,
low_confidence=False,
runnable=runnable,
source="live",
)


def test_runnable_recommendation_is_stamped(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(synthbench, "recommend", lambda spec: _rec(model="gemini-2.5-flash", runnable=True))
args = argparse.Namespace(model=None)
picked = _apply_best_model_for(args, "Technology & Digital Life")
assert picked == "gemini-2.5-flash"
assert args.model == "gemini-2.5-flash"


def test_nonrunnable_recommendation_is_refused_not_stamped(
monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
) -> None:
monkeypatch.setattr(
synthbench,
"recommend",
lambda spec: _rec(
model="SynthPanel (Gemini Flash Lite)",
runnable=False,
is_ensemble=True,
framework="product",
),
)
args = argparse.Namespace(model="sonnet")
picked = _apply_best_model_for(args, "Technology & Digital Life")

# Refused: returns None and leaves the prior selection untouched so the
# caller falls through to --model / the default.
assert picked is None
assert args.model == "sonnet"

err = capsys.readouterr().err
# The display label is surfaced verbatim and the message is actionable.
assert "SynthPanel (Gemini Flash Lite)" in err
assert "display label" in err
assert "--model" in err


def test_no_recommendation_falls_through(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None:
monkeypatch.setattr(synthbench, "recommend", lambda spec: None)
args = argparse.Namespace(model="haiku")
picked = _apply_best_model_for(args, "Whatever")
assert picked is None
assert args.model == "haiku"
assert "no recommendation" in capsys.readouterr().err
81 changes: 81 additions & 0 deletions tests/test_synthbench_recommend.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
SYNTHBENCH_REFRESH_ENV,
SYNTHBENCH_URL_ENV,
cache_path,
is_runnable_model_id,
load_leaderboard,
parse_target,
rank_entries,
Expand Down Expand Up @@ -240,6 +241,86 @@ def test_recommend_ensemble_falls_back_to_config_base(monkeypatch: pytest.Monkey
assert rec is not None
assert rec.is_ensemble is True
assert rec.model == "claude-haiku-4-5-20251001"
assert rec.runnable is True


@pytest.mark.parametrize(
"model, expected",
[
("claude-haiku-4-5-20251001", True),
("gemini-2.5-flash", True),
("grok-3", True),
("gpt-4o-mini", True),
("openrouter/anthropic/claude-3.5", True),
("haiku", True),
("ollama:llama3", True),
("llama-3.3-70b-instruct", True),
("", False),
(" ", False),
("SynthPanel (Gemini Flash Lite)", False),
("Gemini Flash Lite", False),
("claude (sonnet)", False),
],
)
def test_is_runnable_model_id(model: str, expected: bool) -> None:
assert is_runnable_model_id(model) is expected


def test_recommend_display_label_marked_not_runnable() -> None:
"""gh-519: a product/ensemble row whose model field is a display label
(and whose config_id yields no runnable base) must surface as
runnable=False rather than stamping a bogus model id."""
board = {
"entries": [
_entry(
model="SynthPanel (Gemini Flash Lite)",
sps=0.95,
framework="product",
is_ensemble=True,
# config_id tail is a hash fragment, not a resolvable base
config_id="synthpanel-gemini-flash-lite-tdefault-ba37570c",
)
]
}
rec = recommend(":globalopinionqa", leaderboard=board)
assert rec is not None
assert rec.is_ensemble is True
assert rec.runnable is False
# The original label is preserved verbatim so the caller can report it.
assert rec.model == "SynthPanel (Gemini Flash Lite)"


def test_recommend_display_label_resolves_runnable_base_from_config_id(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""When a product/ensemble row carries a display label but the config_id
encodes a resolvable base model, the recommendation becomes runnable."""
from synth_panel.llm import aliases

monkeypatch.setattr(aliases, "_ALIASES_FILE", Path("/nonexistent/aliases.yaml"))
monkeypatch.delenv("SYNTHPANEL_MODEL_ALIASES", raising=False)
aliases._reset_cache()
board = {
"entries": [
_entry(
model="SynthPanel (Haiku)",
sps=0.95,
framework="product",
is_ensemble=True,
config_id="synthpanel:haiku",
)
]
}
rec = recommend(":globalopinionqa", leaderboard=board)
assert rec is not None
assert rec.runnable is True
assert rec.model == "claude-haiku-4-5-20251001"


def test_recommend_plain_model_is_runnable() -> None:
rec = recommend("Economy & Work", leaderboard=SAMPLE)
assert rec is not None
assert rec.runnable is True


def test_recommend_low_confidence_flag() -> None:
Expand Down
Loading