From 79a725737da92dc5b8305024f3814c6cc79a9ed5 Mon Sep 17 00:00:00 2001
From: garnet <openclaw@dataviking.tech>
Date: Sat, 23 May 2026 15:09:39 -0500
Subject: [PATCH 1/2] fix(synthbench): refuse display-label model recs from
 --best-model-for (sy-kh3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SynthBench product/ensemble leaderboard rows can carry a human-readable
display label (e.g. "SynthPanel (Gemini Flash Lite)") in their model
field rather than a runnable provider model id. --best-model-for stamped
that label straight onto --model, deferring the failure to call time as
an opaque provider "not a valid model ID" error (gh-519).

- Add is_runnable_model_id(): structural check rejecting whitespace/paren
  display labels while accepting bare openai-compat/local ids.
- recommend() now sets a runnable flag on Recommendation, and only adopts
  a config_id-derived base model when it's recognized (known provider
  prefix or registered alias) — never a hash fragment from a hyphenated
  config_id split.
- _apply_best_model_for() refuses a non-runnable recommendation with an
  actionable message and falls back to the existing --model/default
  instead of stamping the label; --dry-run surfaces this upfront.

Closes #519.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md                          |  7 +++
 src/synth_panel/cli/commands.py    | 20 ++++++
 src/synth_panel/synthbench.py      | 99 ++++++++++++++++++++++++++++--
 tests/test_best_model_for_cli.py   | 92 +++++++++++++++++++++++++++
 tests/test_synthbench_recommend.py | 81 ++++++++++++++++++++++++
 5 files changed, 294 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_best_model_for_cli.py

diff --git a/README.md b/README.md
index 5aea750..bc6524b 100644
--- a/README.md
+++ b/README.md
@@ -998,6 +998,13 @@ The leaderboard is cached for 24 hours at
 [docs/recommended-models.md](docs/recommended-models.md) for the full
 rules, offline behaviour, and a use-case → top-model table.
 
+If the top-ranked entry is a product/ensemble row that only exposes a
+display label (e.g. `SynthPanel (Gemini Flash Lite)`) rather than a
+runnable provider model id, `--best-model-for` refuses to stamp it onto
+`--model` — it prints an actionable message and falls back to your
+existing `--model`/default. Pair with `--dry-run` to see the picked
+model (and any such refusal) before any LLM call is made.
+
 ### Model packs (agent guidance)
 
 `--best-model-for` picks one model; **model packs** pick a model *mix*
diff --git a/src/synth_panel/cli/commands.py b/src/synth_panel/cli/commands.py
index 374fe57..b44e5b9 100644
--- a/src/synth_panel/cli/commands.py
+++ b/src/synth_panel/cli/commands.py
@@ -195,6 +195,26 @@ def _apply_best_model_for(args: argparse.Namespace, spec: str) -> str | None:
         return None
 
     print(rec.format_line(), file=sys.stderr)
+
+    # gh-519: SynthBench product/ensemble rows can surface a display label
+    # (e.g. "SynthPanel (Gemini Flash Lite)") in the model field instead of
+    # a runnable provider model id. Stamping that onto --model defers the
+    # failure to call time as an opaque provider "not a valid model ID"
+    # error. Refuse here with an actionable message and fall back to the
+    # existing --model / default selection (matching the no-recommendation
+    # path), so --dry-run also makes the non-runnable pick obvious upfront.
+    if not rec.runnable:
+        print(
+            f"synthbench: recommendation '{rec.model}' is a display label, not a "
+            f"runnable model id — skipping. The top entry for '{spec}' is a "
+            f"product/ensemble config with no resolvable base model. Pass --model "
+            f"with a provider model id (e.g. claude-sonnet-4-6, gemini-2.5-flash) "
+            f"explicitly, or choose a different topic. Continuing with existing "
+            f"model selection.",
+            file=sys.stderr,
+        )
+        return None
+
     if rec.source == "bundled-snapshot":
         print(
             "synthbench: note — recommendation derived from the package-bundled "
diff --git a/src/synth_panel/synthbench.py b/src/synth_panel/synthbench.py
index d2f6559..32aee0b 100644
--- a/src/synth_panel/synthbench.py
+++ b/src/synth_panel/synthbench.py
@@ -93,6 +93,19 @@ class Recommendation:
     fetched_at: datetime
     cache_age_hours: float
     low_confidence: bool
+    runnable: bool = True
+    """Whether :attr:`model` is a runnable provider model id (gh-519).
+
+    SynthBench product/ensemble rows can surface a human-readable display
+    label (e.g. ``"SynthPanel (Gemini Flash Lite)"``) in their ``model``
+    field rather than a provider-runnable id. Stamping such a label onto
+    ``--model`` fails at call time with a provider "not a valid model ID"
+    error. ``recommend`` sets this to ``False`` when neither the entry's
+    model field nor an inferred config base resolves to a runnable id, so
+    the CLI can refuse with an actionable message instead of producing a
+    non-runnable selection. Defaults to ``True`` so test fixtures that
+    construct :class:`Recommendation` directly keep their prior shape.
+    """
     source: str = "live"
     """Provenance discriminator — one of :data:`RECOMMENDATION_SOURCES`.
 
@@ -546,6 +559,73 @@ def _resolve_model_string(raw: str) -> str:
     return raw
 
 
+def is_runnable_model_id(model: str) -> bool:
+    """Return ``True`` if *model* looks like a runnable provider model id.
+
+    SynthBench leaderboard rows — especially product/ensemble configs —
+    sometimes carry a human-readable display label in their ``model``
+    field (e.g. ``"SynthPanel (Gemini Flash Lite)"``) rather than a
+    provider-runnable id. A label like that, stamped onto ``--model``,
+    fails at call time with a provider "not a valid model ID" error
+    (gh-519).
+
+    The discriminator is structural rather than an allow-list: real model
+    ids (``claude-haiku-4-5-20251001``, ``gemini-2.5-flash``, ``grok-3``,
+    ``gpt-4o-mini``, ``openrouter/...``, short aliases like ``haiku``, and
+    ``ollama:``/``local:`` specs) are single whitespace-free tokens, while
+    display labels contain spaces and/or parentheses. The OpenAI-compatible
+    and local backends accept arbitrary bare ids, so enumerating known
+    prefixes would wrongly reject valid local/self-hosted models — instead
+    we reject anything empty, whitespace-bearing, or paren-bearing and
+    accept the rest.
+    """
+    if not model:
+        return False
+    text = model.strip()
+    if not text:
+        return False
+    if any(ch.isspace() for ch in text):
+        return False
+    return "(" not in text and ")" not in text
+
+
+# Canonical provider id prefixes (mirrors the ``model_prefixes`` declared on
+# each provider's ProviderConfig in synth_panel.llm.providers.*). Kept local so
+# resolving a recommendation doesn't have to import the full LLM client stack;
+# only used as a *recognition* heuristic, never for routing.
+_KNOWN_PROVIDER_PREFIXES: tuple[str, ...] = (
+    "claude-",
+    "gemini-",
+    "grok-",
+    "gpt-",
+    "openrouter/",
+)
+
+
+def _is_recognized_model_id(model: str) -> bool:
+    """Stricter than :func:`is_runnable_model_id` — used to gate config-id-derived
+    base models (gh-519).
+
+    ``_underlying_base`` extracts a token from ``config_id`` by splitting on
+    separators, which for hyphenated ids like
+    ``synthpanel-gemini-flash-lite-ba37570c`` yields a meaningless hash
+    fragment (``ba37570c``). That fragment is whitespace-free so it passes
+    :func:`is_runnable_model_id`, yet it is not a real model id. We only adopt
+    a config-derived base when it matches a known provider prefix or is a
+    registered short alias — otherwise we keep the original (display) label so
+    the caller refuses rather than swapping in garbage.
+    """
+    if not is_runnable_model_id(model):
+        return False
+    text = model.strip()
+    if any(text.startswith(prefix) for prefix in _KNOWN_PROVIDER_PREFIXES):
+        return True
+    # Registered short alias (resolve_alias maps it to a different canonical id).
+    from synth_panel.llm.aliases import resolve_alias
+
+    return resolve_alias(text) != text
+
+
 def _underlying_base(entry: dict[str, Any]) -> str | None:
     """If an entry is an ensemble/product config, try to surface a base model."""
     config_id = entry.get("config_id")
@@ -602,18 +682,26 @@ def recommend(
     framework = entry.get("framework") if isinstance(entry.get("framework"), str) else None
     is_ensemble = bool(entry.get("is_ensemble"))
 
-    # Product/ensemble configs aren't runnable as-is — fall back to an
-    # inferred base model from config_id when possible.
-    if (is_ensemble or framework == "product") and not resolved_model:
+    # Product/ensemble configs aren't runnable as-is. When the entry's
+    # model field is empty OR a non-runnable display label (gh-519: e.g.
+    # "SynthPanel (Gemini Flash Lite)"), try to infer a runnable base model
+    # from config_id instead. Only adopt the inferred base when it itself
+    # resolves to a runnable id — otherwise keep the original string so the
+    # caller can report it verbatim and refuse rather than silently swap in
+    # a config-id hash fragment.
+    if (is_ensemble or framework == "product") and not is_runnable_model_id(resolved_model):
         base = _underlying_base(entry)
         if base:
-            resolved_model = _resolve_model_string(base)
+            candidate = _resolve_model_string(base)
+            if _is_recognized_model_id(candidate):
+                resolved_model = candidate
 
+    final_model = resolved_model or raw_model
     run_count = _as_int(entry, "run_count") or 0
     cache_age_hours = max(0.0, (datetime.now(timezone.utc) - fetched_at).total_seconds() / 3600.0)
 
     return Recommendation(
-        model=resolved_model or raw_model,
+        model=final_model,
         raw_model=raw_model,
         provider=str(entry.get("provider") or ""),
         dataset=dataset,
@@ -628,5 +716,6 @@ def recommend(
         fetched_at=fetched_at,
         cache_age_hours=cache_age_hours,
         low_confidence=run_count > 0 and run_count < MIN_RUN_COUNT,
+        runnable=is_runnable_model_id(final_model),
         source=source,
     )
diff --git a/tests/test_best_model_for_cli.py b/tests/test_best_model_for_cli.py
new file mode 100644
index 0000000..0452177
--- /dev/null
+++ b/tests/test_best_model_for_cli.py
@@ -0,0 +1,92 @@
+"""CLI-level tests for ``--best-model-for`` model stamping (gh-519).
+
+These exercise ``_apply_best_model_for``: the bridge between a SynthBench
+:class:`Recommendation` and ``args.model``. The core regression guarded here
+is that a non-runnable recommendation (a product/ensemble display label like
+``"SynthPanel (Gemini Flash Lite)"``) must NOT be stamped onto ``--model`` —
+it has to be refused with an actionable message and fall back to the existing
+selection, instead of deferring an opaque provider "not a valid model ID"
+failure to call time.
+"""
+
+from __future__ import annotations
+
+import argparse
+from datetime import datetime, timezone
+
+import pytest
+
+from synth_panel import synthbench
+from synth_panel.cli.commands import _apply_best_model_for
+from synth_panel.synthbench import Recommendation
+
+
+def _rec(*, model: str, runnable: bool, is_ensemble: bool = False, framework: str | None = None) -> Recommendation:
+    return Recommendation(
+        model=model,
+        raw_model=model,
+        provider="",
+        dataset="globalopinionqa",
+        topic="Technology & Digital Life",
+        sps=0.9,
+        jsd=0.1,
+        n=100,
+        cost_per_100q=0.5,
+        run_count=10,
+        framework=framework,
+        is_ensemble=is_ensemble,
+        fetched_at=datetime.now(timezone.utc),
+        cache_age_hours=0.0,
+        low_confidence=False,
+        runnable=runnable,
+        source="live",
+    )
+
+
+def test_runnable_recommendation_is_stamped(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        synthbench, "recommend", lambda spec: _rec(model="gemini-2.5-flash", runnable=True)
+    )
+    args = argparse.Namespace(model=None)
+    picked = _apply_best_model_for(args, "Technology & Digital Life")
+    assert picked == "gemini-2.5-flash"
+    assert args.model == "gemini-2.5-flash"
+
+
+def test_nonrunnable_recommendation_is_refused_not_stamped(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    monkeypatch.setattr(
+        synthbench,
+        "recommend",
+        lambda spec: _rec(
+            model="SynthPanel (Gemini Flash Lite)",
+            runnable=False,
+            is_ensemble=True,
+            framework="product",
+        ),
+    )
+    args = argparse.Namespace(model="sonnet")
+    picked = _apply_best_model_for(args, "Technology & Digital Life")
+
+    # Refused: returns None and leaves the prior selection untouched so the
+    # caller falls through to --model / the default.
+    assert picked is None
+    assert args.model == "sonnet"
+
+    err = capsys.readouterr().err
+    # The display label is surfaced verbatim and the message is actionable.
+    assert "SynthPanel (Gemini Flash Lite)" in err
+    assert "display label" in err
+    assert "--model" in err
+
+
+def test_no_recommendation_falls_through(
+    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+    monkeypatch.setattr(synthbench, "recommend", lambda spec: None)
+    args = argparse.Namespace(model="haiku")
+    picked = _apply_best_model_for(args, "Whatever")
+    assert picked is None
+    assert args.model == "haiku"
+    assert "no recommendation" in capsys.readouterr().err
diff --git a/tests/test_synthbench_recommend.py b/tests/test_synthbench_recommend.py
index 79ec88d..c63d775 100644
--- a/tests/test_synthbench_recommend.py
+++ b/tests/test_synthbench_recommend.py
@@ -22,6 +22,7 @@
     SYNTHBENCH_REFRESH_ENV,
     SYNTHBENCH_URL_ENV,
     cache_path,
+    is_runnable_model_id,
     load_leaderboard,
     parse_target,
     rank_entries,
@@ -240,6 +241,86 @@ def test_recommend_ensemble_falls_back_to_config_base(monkeypatch: pytest.Monkey
     assert rec is not None
     assert rec.is_ensemble is True
     assert rec.model == "claude-haiku-4-5-20251001"
+    assert rec.runnable is True
+
+
+@pytest.mark.parametrize(
+    "model, expected",
+    [
+        ("claude-haiku-4-5-20251001", True),
+        ("gemini-2.5-flash", True),
+        ("grok-3", True),
+        ("gpt-4o-mini", True),
+        ("openrouter/anthropic/claude-3.5", True),
+        ("haiku", True),
+        ("ollama:llama3", True),
+        ("llama-3.3-70b-instruct", True),
+        ("", False),
+        ("   ", False),
+        ("SynthPanel (Gemini Flash Lite)", False),
+        ("Gemini Flash Lite", False),
+        ("claude (sonnet)", False),
+    ],
+)
+def test_is_runnable_model_id(model: str, expected: bool) -> None:
+    assert is_runnable_model_id(model) is expected
+
+
+def test_recommend_display_label_marked_not_runnable() -> None:
+    """gh-519: a product/ensemble row whose model field is a display label
+    (and whose config_id yields no runnable base) must surface as
+    runnable=False rather than stamping a bogus model id."""
+    board = {
+        "entries": [
+            _entry(
+                model="SynthPanel (Gemini Flash Lite)",
+                sps=0.95,
+                framework="product",
+                is_ensemble=True,
+                # config_id tail is a hash fragment, not a resolvable base
+                config_id="synthpanel-gemini-flash-lite-tdefault-ba37570c",
+            )
+        ]
+    }
+    rec = recommend(":globalopinionqa", leaderboard=board)
+    assert rec is not None
+    assert rec.is_ensemble is True
+    assert rec.runnable is False
+    # The original label is preserved verbatim so the caller can report it.
+    assert rec.model == "SynthPanel (Gemini Flash Lite)"
+
+
+def test_recommend_display_label_resolves_runnable_base_from_config_id(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """When a product/ensemble row carries a display label but the config_id
+    encodes a resolvable base model, the recommendation becomes runnable."""
+    from synth_panel.llm import aliases
+
+    monkeypatch.setattr(aliases, "_ALIASES_FILE", Path("/nonexistent/aliases.yaml"))
+    monkeypatch.delenv("SYNTHPANEL_MODEL_ALIASES", raising=False)
+    aliases._reset_cache()
+    board = {
+        "entries": [
+            _entry(
+                model="SynthPanel (Haiku)",
+                sps=0.95,
+                framework="product",
+                is_ensemble=True,
+                config_id="synthpanel:haiku",
+            )
+        ]
+    }
+    rec = recommend(":globalopinionqa", leaderboard=board)
+    assert rec is not None
+    assert rec.runnable is True
+    assert rec.model == "claude-haiku-4-5-20251001"
+
+
+def test_recommend_plain_model_is_runnable() -> None:
+    rec = recommend("Economy & Work", leaderboard=SAMPLE)
+    assert rec is not None
+    assert rec.runnable is True
 
 
 def test_recommend_low_confidence_flag() -> None:

From 0839f8e4b5532601d4675641fb94bffcbff03c44 Mon Sep 17 00:00:00 2001
From: garnet <openclaw@dataviking.tech>
Date: Sat, 23 May 2026 15:24:53 -0500
Subject: [PATCH 2/2] style: ruff format test_best_model_for_cli.py (sy-kh3)

Collapse multi-line calls that fit the project's line length so
`ruff format --check` passes in CI.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_best_model_for_cli.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/test_best_model_for_cli.py b/tests/test_best_model_for_cli.py
index 0452177..433c78b 100644
--- a/tests/test_best_model_for_cli.py
+++ b/tests/test_best_model_for_cli.py
@@ -44,9 +44,7 @@ def _rec(*, model: str, runnable: bool, is_ensemble: bool = False, framework: st
 
 
 def test_runnable_recommendation_is_stamped(monkeypatch: pytest.MonkeyPatch) -> None:
-    monkeypatch.setattr(
-        synthbench, "recommend", lambda spec: _rec(model="gemini-2.5-flash", runnable=True)
-    )
+    monkeypatch.setattr(synthbench, "recommend", lambda spec: _rec(model="gemini-2.5-flash", runnable=True))
     args = argparse.Namespace(model=None)
     picked = _apply_best_model_for(args, "Technology & Digital Life")
     assert picked == "gemini-2.5-flash"
@@ -81,9 +79,7 @@ def test_nonrunnable_recommendation_is_refused_not_stamped(
     assert "--model" in err
 
 
-def test_no_recommendation_falls_through(
-    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
-) -> None:
+def test_no_recommendation_falls_through(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None:
     monkeypatch.setattr(synthbench, "recommend", lambda spec: None)
     args = argparse.Namespace(model="haiku")
     picked = _apply_best_model_for(args, "Whatever")