diff --git a/codec_cookbook/__init__.py b/codec_cookbook/__init__.py new file mode 100644 index 0000000..aec93b0 --- /dev/null +++ b/codec_cookbook/__init__.py @@ -0,0 +1,21 @@ +"""CODEC Cookbook — local-model lifecycle management (scan → recommend → +download → serve → list → stop) for the M1 Ultra workstation. + +This is the NON-skill helper package. All OS / subprocess / PM2 / network work +lives here so the six `skills/cookbook_*.py` skill files stay thin (import only +this package + stdlib-safe modules) and therefore pass the `SkillRegistry` +load-time AST safety gate (`codec_config.is_dangerous_skill_code`, which +forbids `os`/`subprocess`/`socket`/... in skill files). + +HARD SAFETY CONTRACT (enforced in serve.py): + * Cookbook only ever stops a PM2 process it started, in the `cookbook-` + namespace, after explicit confirm=True. + * It never binds to or stops the protected ports (8083/8090/8094/9223/5678) + or any port currently bound by a non-cookbook process (live `pm2 jlist` + + socket probe at call time). + * Its own serve range is 8110-8119. + * It never issues docker stop/rm, never changes an existing service's port, + never restarts a running service. +""" + +__all__ = ["catalog", "probe", "fit", "serve", "download"] diff --git a/codec_cookbook/args.py b/codec_cookbook/args.py new file mode 100644 index 0000000..7571dce --- /dev/null +++ b/codec_cookbook/args.py @@ -0,0 +1,66 @@ +"""Tiny argument parser shared by the thin cookbook_* skills. + +Skills receive a single `task` string (CODEC's `run(task, app, ctx)` contract), +so structured options are parsed out of it here. `re` only — keeps the skill +files AST-safe (no os/subprocess) and DRY. +""" +from __future__ import annotations + +import re +from typing import Optional + +from . import catalog + + +def parse_model_id(task: str) -> Optional[str]: + """First known catalog id that appears as a whole token in `task`.""" + if not task: + return None + tokens = re.findall(r"[A-Za-z0-9_.\-]+", task.lower()) + known = set(catalog.ids()) + for tok in tokens: + if tok in known: + return tok + return None + + +def parse_context(task: str, default: int = 8192) -> int: + """`context 8192`, `context=8192`, `ctx 4096`, `context_length: 16384`.""" + m = re.search(r"(?:context|ctx)(?:[ _]?length)?\s*[=:]?\s*(\d{3,7})", (task or "").lower()) + if m: + try: + return int(m.group(1)) + except ValueError: + pass + return default + + +def parse_flag(task: str, flag: str) -> bool: + """True if `flag` appears as a whole word, or `flag=true`/`flag:yes`.""" + low = (task or "").lower() + if re.search(rf"\b{re.escape(flag)}\b\s*[=:]\s*(?:true|yes|1|on)\b", low): + return True + if re.search(rf"\b{re.escape(flag)}=(?:true|yes|1|on)\b", low): + return True + return bool(re.search(rf"\b{re.escape(flag)}\b", low)) + + +def parse_port(task: str) -> Optional[int]: + """A bare port number in the Cookbook range (8110-8119) mentioned in `task`.""" + for m in re.findall(r"\b(\d{4,5})\b", task or ""): + try: + p = int(m) + except ValueError: + continue + if 8110 <= p <= 8119: + return p + return None + + +def parse_role(task: str) -> Optional[str]: + """A recommendation role mentioned in `task` (chat/reason/code/max/fast/tiny).""" + low = (task or "").lower() + for role in ("chat", "reason", "code", "max", "fast", "tiny"): + if re.search(rf"\b{role}\b", low): + return role + return None diff --git a/codec_cookbook/catalog.json b/codec_cookbook/catalog.json new file mode 100644 index 0000000..a123cc7 --- /dev/null +++ b/codec_cookbook/catalog.json @@ -0,0 +1,7 @@ +[ + {"id":"qwen3-30b-a3b","hf_repo":"mlx-community/Qwen3-30B-A3B-Instruct-2507-4bit","backend":"mlx","roles":["chat","reason"],"anchor_gb":17.2}, + {"id":"qwen3-coder-30b","hf_repo":"mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit","backend":"mlx","roles":["code"],"anchor_gb":17.2}, + {"id":"qwen3-next-80b","hf_repo":"mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit","backend":"mlx","roles":["max"],"anchor_gb":42.0}, + {"id":"qwen3-4b","hf_repo":"mlx-community/Qwen3-4B-Instruct-2507-4bit","backend":"mlx","roles":["fast"]}, + {"id":"llama32-3b","hf_repo":"mlx-community/Llama-3.2-3B-Instruct-4bit","backend":"mlx","roles":["tiny"]} +] diff --git a/codec_cookbook/catalog.py b/codec_cookbook/catalog.py new file mode 100644 index 0000000..ba597d8 --- /dev/null +++ b/codec_cookbook/catalog.py @@ -0,0 +1,57 @@ +"""Cookbook model catalog — verified, downloadable options. + +Loaded from `catalog.json` (next to this file). The primary `qwen3.6@8083` +model that the live stack serves is intentionally NOT in here — Cookbook never +manages it. +""" +from __future__ import annotations + +import json +import os +from functools import lru_cache +from typing import Optional + +_CATALOG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "catalog.json") + + +@lru_cache(maxsize=1) +def _load() -> list[dict]: + with open(_CATALOG_PATH, encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, list): + raise ValueError("catalog.json must be a JSON array") + return data + + +def all_entries() -> list[dict]: + """Return a copy of every catalog entry.""" + return [dict(e) for e in _load()] + + +def ids() -> list[str]: + """Return the list of known model ids.""" + return [e["id"] for e in _load()] + + +def get(model_id: Optional[str]) -> dict: + """Return the catalog entry for `model_id`. Raises KeyError if unknown so + callers fail loud rather than serving a mystery repo.""" + if not model_id: + raise KeyError("no model_id given") + for e in _load(): + if e["id"] == model_id: + return dict(e) + raise KeyError(f"unknown model id: {model_id!r} (known: {', '.join(ids())})") + + +def find(model_id: Optional[str]) -> Optional[dict]: + """Like get() but returns None instead of raising — for arg-parsing paths.""" + try: + return get(model_id) + except KeyError: + return None + + +def by_role(role: str) -> list[dict]: + """All entries advertising a given role (chat/reason/code/max/fast/tiny).""" + return [dict(e) for e in _load() if role in e.get("roles", [])] diff --git a/codec_cookbook/download.py b/codec_cookbook/download.py new file mode 100644 index 0000000..b21c81f --- /dev/null +++ b/codec_cookbook/download.py @@ -0,0 +1,130 @@ +"""Cookbook model downloads — detached Hugging Face jobs + status polling. + +Each download runs as a DETACHED subprocess (start_new_session) so it survives +across skill calls (every skill `run()` is a fresh, short-lived call). The child +writes its own per-repo status file with stdlib only (no dependency on this repo +being importable in the child), and status() reconciles a dead pid to +'interrupted'. Downloads land in the standard HF cache — they never touch the +running stack. +""" +from __future__ import annotations + +import json +import logging +import os +import subprocess +import sys +import tempfile +import time + +log = logging.getLogger("codec_cookbook.download") + +DL_DIR = os.path.expanduser("~/.codec/cookbook/downloads") + +# Self-contained child: writes {repo,state,pid,...} to argv[2] via tmp+replace. +_CHILD = r""" +import sys, json, os, time, tempfile +repo, sf = sys.argv[1], sys.argv[2] +def w(state, **kw): + d = {"repo": repo, "state": state, "pid": os.getpid(), "updated_at": time.time()} + d.update(kw) + fd, tmp = tempfile.mkstemp(dir=os.path.dirname(sf), suffix=".tmp") + with os.fdopen(fd, "w") as f: + json.dump(d, f) + os.replace(tmp, sf) +w("running", started_at=time.time()) +try: + from huggingface_hub import snapshot_download + p = snapshot_download(repo) + w("done", path=p, finished_at=time.time()) +except Exception as e: + w("error", error=str(e)[:500], finished_at=time.time()) +""" + + +def _slug(repo: str) -> str: + return repo.replace("/", "__").replace(":", "_") + + +def _status_file(repo: str) -> str: + return os.path.join(DL_DIR, _slug(repo) + ".json") + + +def _read_status(repo: str) -> dict | None: + try: + with open(_status_file(repo), encoding="utf-8") as f: + return json.load(f) + except (OSError, json.JSONDecodeError): + return None + + +def _pid_alive(pid) -> bool: + try: + os.kill(int(pid), 0) + return True + except (OSError, ValueError, TypeError): + return False + + +def _write_initial(repo: str, pid: int | None = None) -> None: + os.makedirs(DL_DIR, exist_ok=True) + sf = _status_file(repo) + data = {"repo": repo, "state": "starting", "pid": pid, "updated_at": time.time()} + fd, tmp = tempfile.mkstemp(dir=DL_DIR, suffix=".tmp") + with os.fdopen(fd, "w") as f: + json.dump(data, f) + os.replace(tmp, sf) + + +def start(repo: str) -> dict: + """Begin (or report) a download of `repo`. Idempotent: if a job is already + running, returns its current status instead of spawning a duplicate.""" + if not repo: + return {"state": "error", "error": "no repo given"} + cur = status(repo) + if cur.get("state") in ("starting", "running"): + return cur # already in flight + os.makedirs(DL_DIR, exist_ok=True) + _write_initial(repo) + try: + proc = subprocess.Popen( + [sys.executable, "-c", _CHILD, repo, _status_file(repo)], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + start_new_session=True, # detach: survives the parent skill call + ) + except Exception as e: + return {"repo": repo, "state": "error", "error": f"spawn failed: {e}"} + _write_initial(repo, pid=proc.pid) + return {"repo": repo, "state": "starting", "pid": proc.pid} + + +def status(repo: str) -> dict: + """Current download state: not_started | starting | running | done | error | + interrupted. Reconciles a dead pid (child crashed/killed) to 'interrupted'.""" + rec = _read_status(repo) + if rec is None: + return {"repo": repo, "state": "not_started"} + state = rec.get("state") + if state in ("starting", "running"): + pid = rec.get("pid") + if pid is not None and not _pid_alive(pid): + return {**rec, "state": "interrupted", + "detail": "download process is no longer running"} + return rec + + +def list_downloads() -> list[dict]: + """All known download jobs (one per status file in DL_DIR).""" + out = [] + try: + for fn in os.listdir(DL_DIR): + if fn.endswith(".json"): + try: + with open(os.path.join(DL_DIR, fn), encoding="utf-8") as f: + rec = json.load(f) + out.append(status(rec.get("repo", ""))) + except (OSError, json.JSONDecodeError): + continue + except OSError: + pass + return out diff --git a/codec_cookbook/fit.py b/codec_cookbook/fit.py new file mode 100644 index 0000000..ad1f265 --- /dev/null +++ b/codec_cookbook/fit.py @@ -0,0 +1,124 @@ +"""Cookbook unified-memory fit math. + +Weight size comes from the model's ACTUAL Hugging Face file sizes; KV-cache +geometry from its real `config.json`. No fabricated architecture constants — +the only tunables are the conservative over-estimate (×1.10 + 1.5 GB) and the +deployment safety figures (os_reserve=24, margin=8), which are policy, not +architecture. + +MoE note: footprint = TOTAL params (every expert is resident in unified +memory); the active-param count affects speed, not memory. So the anchors are +the full-model resident sizes (Qwen3-30B-A3B-4bit ≈ 17.2 GB, +Qwen3-Next-80B-A3B-4bit ≈ 42 GB). + +Network boundary: weight_gb_from_hub() and load_config() touch the Hub. +estimate_footprint_gb() accepts `anchor_gb` (skip the weight call) and `cfg` +(skip the config fetch) so callers — and tests — can run fully offline. +""" +from __future__ import annotations + +import json +import logging +from typing import Optional + +log = logging.getLogger("codec_cookbook.fit") + +WEIGHT_EXT = (".safetensors", ".gguf", ".npz") +DEFAULT_OS_RESERVE_GB = 24 +DEFAULT_MARGIN_GB = 8 +_OVERHEAD_MULT = 1.10 # +10% for activations / fragmentation +_OVERHEAD_FLAT_GB = 1.5 # runtime / framework baseline + + +def weight_gb_from_hub(repo: str) -> float: + """Sum of the model's weight-file sizes (GB) from the Hub. Network.""" + from huggingface_hub import HfApi + info = HfApi().model_info(repo, files_metadata=True) + total = sum((f.size or 0) for f in (info.siblings or []) + if f.rfilename.endswith(WEIGHT_EXT)) + return total / 1e9 + + +def load_config(repo: str) -> dict: + """Fetch + parse the model's config.json from the Hub. Network.""" + from huggingface_hub import hf_hub_download + path = hf_hub_download(repo, "config.json") + with open(path, encoding="utf-8") as f: + return json.load(f) + + +def kv_cache_gb(cfg: dict, ctx: int) -> float: + """fp16 K+V cache size (GB) for `ctx` tokens, from real config geometry. + + GQA-aware: uses num_key_value_heads when present (falls back to + num_attention_heads for MHA models). head_dim falls back to + hidden_size / num_attention_heads. + """ + nl = cfg["num_hidden_layers"] + nkv = cfg.get("num_key_value_heads", cfg["num_attention_heads"]) + hd = cfg.get("head_dim", cfg["hidden_size"] // cfg["num_attention_heads"]) + return (2 * nl * nkv * hd * ctx * 2) / 1e9 # 2 (K+V) * ... * 2 bytes (fp16) + + +def estimate_footprint_gb(repo: str, ctx: int, + anchor_gb: Optional[float] = None, + cfg: Optional[dict] = None) -> float: + """Conservative resident footprint (GB) = (weight + KV) × 1.10 + 1.5. + + weight: `anchor_gb` if given, else weight_gb_from_hub(repo). + KV: from `cfg` if given, else load_config(repo). If the config can't + be obtained (offline + no anchor cfg), KV is omitted and a warning + is logged — the estimate then under-counts KV, so callers relying + on the conservative guarantee should supply anchor_gb + cfg. + """ + w = anchor_gb if anchor_gb is not None else weight_gb_from_hub(repo) + kv = 0.0 + try: + c = cfg if cfg is not None else load_config(repo) + kv = kv_cache_gb(c, ctx) + except Exception as e: # offline / missing config — best-effort + log.warning("KV geometry unavailable for %s (%s) — footprint omits KV", repo, e) + return (w + kv) * _OVERHEAD_MULT + _OVERHEAD_FLAT_GB + + +def available_gb(unified_total_gb: float, resident_gb, + os_reserve_gb: int = DEFAULT_OS_RESERVE_GB) -> float: + """Unified memory free for a new model = total − os_reserve − Σ resident.""" + return unified_total_gb - os_reserve_gb - sum(resident_gb) + + +def fits(need_gb: float, avail_gb: float, + margin_gb: int = DEFAULT_MARGIN_GB) -> tuple[bool, float]: + """(ok, headroom). ok requires headroom ≥ margin_gb. headroom may be + negative (returned for the refusal message).""" + headroom = avail_gb - need_gb + return headroom >= margin_gb, headroom + + +def quick_need_gb(entry: dict, ctx: int) -> float: + """Best-effort footprint for a catalog entry, preferring its anchor_gb so + ranking doesn't require a weight download. KV is added when config is + reachable (best-effort). Used by the recommend skill.""" + return estimate_footprint_gb(entry["hf_repo"], ctx, anchor_gb=entry.get("anchor_gb")) + + +def recommend(entries: list[dict], avail_gb: float, ctx: int, + margin_gb: int = DEFAULT_MARGIN_GB) -> list[dict]: + """Rank catalog entries against available memory. Returns a list of + {entry, need_gb, fits, headroom_gb} sorted fit-first then largest-that-fits + (more capable), so the top recommendation is the biggest model that fits.""" + scored = [] + for e in entries: + need = quick_need_gb(e, ctx) + ok, headroom = fits(need, avail_gb, margin_gb) + scored.append({ + "entry": e, + "need_gb": round(need, 1), + "fits": ok, + "headroom_gb": round(headroom, 1), + }) + # fits first; within fits, biggest need (most capable) first; within + # non-fits, smallest need (closest to fitting) first. + scored.sort(key=lambda s: (not s["fits"], + -s["need_gb"] if s["fits"] else s["need_gb"])) + return scored diff --git a/codec_cookbook/probe.py b/codec_cookbook/probe.py new file mode 100644 index 0000000..89b2b44 --- /dev/null +++ b/codec_cookbook/probe.py @@ -0,0 +1,219 @@ +"""Cookbook hardware + process probe — STRICTLY READ-ONLY. + +Nothing here ever starts, stops, binds, or mutates anything. It reads: + * chip + total unified memory (sysctl / system_profiler) + * free memory (vm_stat) + * running PM2 processes + RSS (pm2 jlist) + * whether a TCP port is bound (socket connect probe) + * mlx-lm version (Qwen3 MoE needs >= 0.25.2) + +Every function is defensive: on any failure it returns a safe default +(None / 0 / [] / False) rather than raising, so a probe never breaks a skill. +""" +from __future__ import annotations + +import json +import logging +import socket +import subprocess +from functools import lru_cache +from typing import Optional + +log = logging.getLogger("codec_cookbook.probe") + +# Protected ports Cookbook must never bind to or stop. Mirrored by serve.py. +PROTECTED_PORTS = frozenset({8083, 8090, 8094, 9223, 5678}) +# Cookbook's own serve range. +SERVE_RANGE = range(8110, 8120) # 8110-8119 inclusive +OS_RESERVE_GB = 24 + + +def _run(cmd: list[str], timeout: int = 10) -> Optional[str]: + """Run a read-only command, return stdout or None. Never raises.""" + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + if r.returncode == 0: + return r.stdout + log.debug("probe cmd %s rc=%s: %s", cmd[:2], r.returncode, (r.stderr or "")[:200]) + return None + except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as e: + log.debug("probe cmd %s failed: %s", cmd[:2], e) + return None + + +@lru_cache(maxsize=1) +def chip() -> str: + """Apple Silicon chip string (e.g. 'Apple M1 Ultra'), or '' if unknown.""" + out = _run(["sysctl", "-n", "machdep.cpu.brand_string"]) + return (out or "").strip() + + +@lru_cache(maxsize=1) +def unified_total_gb() -> float: + """Total unified memory in GB (hw.memsize). 0.0 if unreadable.""" + out = _run(["sysctl", "-n", "hw.memsize"]) + if out: + try: + return int(out.strip()) / 1e9 + except ValueError: + pass + return 0.0 + + +def vm_free_gb() -> float: + """Free + inactive memory in GB from vm_stat (a live, fluctuating figure). + Used for the scan report only — fit uses the deterministic resident-sum + formula in available_gb().""" + out = _run(["vm_stat"]) + if not out: + return 0.0 + page_size = 16384 # Apple Silicon default; corrected below if vm_stat reports it + free_pages = inactive_pages = 0 + for line in out.splitlines(): + low = line.lower() + if "page size of" in low: + for tok in low.replace("(", " ").replace(")", " ").split(): + if tok.isdigit(): + page_size = int(tok) + break + elif low.startswith("pages free:"): + free_pages = _trailing_int(line) + elif low.startswith("pages inactive:"): + inactive_pages = _trailing_int(line) + return ((free_pages + inactive_pages) * page_size) / 1e9 + + +def _trailing_int(line: str) -> int: + digits = "".join(c for c in line if c.isdigit()) + return int(digits) if digits else 0 + + +def pm2_jlist() -> list[dict]: + """Raw `pm2 jlist` output as a list of dicts. [] on any failure.""" + out = _run(["pm2", "jlist"]) + if not out: + return [] + try: + data = json.loads(out) + return data if isinstance(data, list) else [] + except (json.JSONDecodeError, ValueError): + return [] + + +def pm2_processes() -> list[dict]: + """Parsed PM2 process summaries: name, status, rss_gb, pm_id, port (if the + process declares one via its PORT env or args).""" + procs = [] + for p in pm2_jlist(): + env = p.get("pm2_env", {}) or {} + monit = p.get("monit", {}) or {} + procs.append({ + "name": p.get("name", "?"), + "pm_id": p.get("pm_id"), + "status": env.get("status", "?"), + "rss_gb": round((monit.get("memory", 0) or 0) / 1e9, 3), + "port": _proc_declared_port(p), + }) + return procs + + +def _proc_declared_port(p: dict) -> Optional[int]: + """Best-effort port a PM2 proc declares (PORT env or a --port/-port arg). + Not authoritative for binding — that's what is_port_bound() is for.""" + env = p.get("pm2_env", {}) or {} + port_env = env.get("PORT") or env.get("env", {}).get("PORT") if isinstance(env.get("env"), dict) else env.get("PORT") + if port_env: + try: + return int(port_env) + except (ValueError, TypeError): + pass + # scan args for --port N / -port N / --port=N + args = env.get("args") or [] + if isinstance(args, str): + args = args.split() + for i, a in enumerate(args): + a = str(a) + if a in ("--port", "-port", "--listen-port") and i + 1 < len(args): + try: + return int(args[i + 1]) + except (ValueError, TypeError): + pass + if a.startswith("--port="): + try: + return int(a.split("=", 1)[1]) + except (ValueError, TypeError): + pass + return None + + +def resident_gb_total() -> float: + """Sum of RSS (GB) across all online PM2 processes — the live stack's + footprint, used by available_gb().""" + return round(sum(p["rss_gb"] for p in pm2_processes() + if p.get("status") == "online"), 3) + + +def available_gb(os_reserve_gb: int = OS_RESERVE_GB) -> float: + """Unified memory available for a NEW model: + total - os_reserve - sum(resident PM2 RSS). + Deterministic (doesn't depend on the fluctuating vm_stat free figure).""" + total = unified_total_gb() + if total <= 0: + return 0.0 + return round(total - os_reserve_gb - resident_gb_total(), 3) + + +def is_port_bound(port: int, host: str = "127.0.0.1") -> bool: + """True if something is listening on host:port right now (socket probe).""" + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(0.4) + return s.connect_ex((host, port)) == 0 + except OSError: + return False + + +def bound_ports_in_range(lo: int, hi: int) -> set[int]: + """Subset of [lo, hi) that is currently bound (socket probe).""" + return {p for p in range(lo, hi) if is_port_bound(p)} + + +@lru_cache(maxsize=1) +def mlx_version() -> Optional[str]: + try: + import mlx_lm + return getattr(mlx_lm, "__version__", None) + except Exception: + return None + + +def mlx_version_ok(minimum: tuple = (0, 25, 2)) -> bool: + """True if mlx-lm meets the Qwen3-MoE minimum. None version → False (warn).""" + v = mlx_version() + if not v: + return False + try: + parts = tuple(int(x) for x in v.split(".")[:3]) + return parts >= minimum + except ValueError: + return False + + +def snapshot() -> dict: + """Full read-only state for the scan skill.""" + procs = pm2_processes() + return { + "chip": chip(), + "unified_total_gb": round(unified_total_gb(), 1), + "vm_free_gb": round(vm_free_gb(), 1), + "resident_gb_total": resident_gb_total(), + "available_gb": available_gb(), + "os_reserve_gb": OS_RESERVE_GB, + "pm2_process_count": len(procs), + "pm2_processes": procs, + "mlx_version": mlx_version(), + "mlx_version_ok": mlx_version_ok(), + "protected_ports": sorted(PROTECTED_PORTS), + "serve_range": [SERVE_RANGE.start, SERVE_RANGE.stop - 1], + "serve_ports_bound": sorted(bound_ports_in_range(SERVE_RANGE.start, SERVE_RANGE.stop)), + } diff --git a/codec_cookbook/serve.py b/codec_cookbook/serve.py new file mode 100644 index 0000000..8b61363 --- /dev/null +++ b/codec_cookbook/serve.py @@ -0,0 +1,262 @@ +"""Cookbook PM2 serve/stop + port allocation + health — the SAFETY-CRITICAL core. + +Structural guarantees (the whole point of Cookbook): + * launch() only ever allocates a port in 8110-8119 that a live socket probe + + pm2 jlist + our own served.json all agree is free. Protected ports + (8083/8090/8094/9223/5678) are never in range and are skipped anyway. + * Every process we start is named `cookbook--`. + * stop() will ONLY delete a process that (a) we recorded in served.json, + (b) is named `cookbook-…`, and (c) is NOT on a protected port — and only + when confirm=True. Anything else returns a refusal (or a dry-run). + * Nothing here issues docker stop/rm, changes an existing service's port, or + restarts/stops a non-cookbook process. +""" +from __future__ import annotations + +import json +import logging +import os +import subprocess +import sys +import time +from datetime import datetime, timezone +from typing import Optional, Union + +from codec_jsonstore import atomic_write_json, file_lock +from . import probe + +log = logging.getLogger("codec_cookbook.serve") + +COOKBOOK_PREFIX = "cookbook-" +PROTECTED_PORTS = probe.PROTECTED_PORTS # 8083/8090/8094/9223/5678 +SERVE_RANGE = probe.SERVE_RANGE # range(8110, 8120) +SERVED_PATH = os.path.expanduser("~/.codec/cookbook/served.json") +_CONFIG_PATH = os.path.expanduser("~/.codec/config.json") +_HEALTH_TIMEOUT_S = 90 +_PM2_TIMEOUT_S = 30 + + +# ── persistence ───────────────────────────────────────────────────────────── + +def _load_served() -> list[dict]: + try: + with open(SERVED_PATH, encoding="utf-8") as f: + data = json.load(f) + return data if isinstance(data, list) else [] + except (OSError, json.JSONDecodeError): + return [] + + +def _save_served(records: list[dict]) -> None: + with file_lock(SERVED_PATH): + atomic_write_json(SERVED_PATH, records, default=str) + + +def _record_served(rec: dict) -> None: + with file_lock(SERVED_PATH): + records = _load_served() + records = [r for r in records if r.get("pm2_name") != rec["pm2_name"]] + records.append(rec) + atomic_write_json(SERVED_PATH, records, default=str) + + +def _forget_served(pm2_name: str) -> None: + with file_lock(SERVED_PATH): + records = [r for r in _load_served() if r.get("pm2_name") != pm2_name] + atomic_write_json(SERVED_PATH, records, default=str) + + +# ── interpreter discovery (no hardcoded venv) ─────────────────────────────── + +def resolve_mlx_python() -> str: + """Python that has mlx-lm: config.json:cookbook.mlx_python if set, else + sys.executable (the interpreter already running CODEC, which serves + qwen3.6 and therefore has MLX).""" + try: + with open(_CONFIG_PATH, encoding="utf-8") as f: + cfg = json.load(f) + py = (cfg.get("cookbook") or {}).get("mlx_python") + if py and os.path.exists(py): + return py + except (OSError, json.JSONDecodeError): + pass + return sys.executable + + +def resolve_llama_server() -> Optional[str]: + """`llama-server` resolved from PATH, or None if absent.""" + import shutil + return shutil.which("llama-server") + + +# ── port allocation ───────────────────────────────────────────────────────── + +def allocate_port() -> Optional[int]: + """First free port in 8110-8119. Skips: protected ports, ports a live + socket probe says are bound, ports any PM2 process declares, and ports we + already recorded in served.json. Returns None if the range is exhausted.""" + bound = probe.bound_ports_in_range(SERVE_RANGE.start, SERVE_RANGE.stop) + pm2_ports = {p["port"] for p in probe.pm2_processes() if p.get("port")} + ours = {r["port"] for r in _load_served() if r.get("port")} + for port in SERVE_RANGE: + if port in PROTECTED_PORTS: # belt-and-suspenders (none in range) + continue + if port in bound or port in pm2_ports or port in ours: + continue + return port + return None + + +# ── launch ─────────────────────────────────────────────────────────────────── + +def _build_command(entry: dict, port: int, context_length: int) -> tuple[Optional[list[str]], Optional[str]]: + """Return (pm2_argv, error). Builds the corrected serve command for the + entry's backend.""" + mid = entry["id"] + pm2_name = f"{COOKBOOK_PREFIX}{mid}-{port}" + backend = entry.get("backend", "mlx") + if backend == "mlx": + py = resolve_mlx_python() + # NOTE: `python -m mlx_lm server` (subcommand form) — NOT `-m mlx_lm.server`. + # --max-tokens is mandatory: mlx-lm defaults to 512 and silently truncates. + argv = ["pm2", "start", py, "--name", pm2_name, "--", + "-m", "mlx_lm", "server", "--model", entry["hf_repo"], + "--host", "127.0.0.1", "--port", str(port), "--max-tokens", "16384"] + return argv, None + if backend in ("gguf", "llama", "llama.cpp"): + server = resolve_llama_server() + if not server: + return None, "llama-server not found on PATH" + gguf = entry.get("gguf_path") or entry.get("hf_repo") + # Metal is on by default on Apple Silicon; -ngl 999 forces full GPU offload. + argv = ["pm2", "start", server, "--name", pm2_name, "--", + "-m", gguf, "--host", "127.0.0.1", "--port", str(port), + "-ngl", "999", "-c", str(context_length)] + return argv, None + return None, f"unknown backend: {backend!r}" + + +def _health_ok(port: int, timeout_s: int = _HEALTH_TIMEOUT_S) -> bool: + """Poll GET /v1/models until 200 or timeout.""" + import urllib.request + deadline = time.monotonic() + timeout_s + url = f"http://127.0.0.1:{port}/v1/models" + while time.monotonic() < deadline: + try: + with urllib.request.urlopen(url, timeout=3) as r: + if r.status == 200: + return True + except Exception: + pass + time.sleep(2) + return False + + +def launch(entry: dict, context_length: int = 8192, + wait_health: bool = True) -> dict: + """Allocate a free 8110-8119 port and start the model under PM2. Returns a + status dict. Never touches an existing service.""" + port = allocate_port() + if port is None: + return {"status": "error", "reason": "no_free_port", + "range": [SERVE_RANGE.start, SERVE_RANGE.stop - 1]} + argv, err = _build_command(entry, port, context_length) + if err: + return {"status": "error", "reason": err} + pm2_name = f"{COOKBOOK_PREFIX}{entry['id']}-{port}" + try: + r = subprocess.run(argv, capture_output=True, text=True, timeout=_PM2_TIMEOUT_S) + except FileNotFoundError: + return {"status": "error", "reason": "pm2_not_found"} + except subprocess.TimeoutExpired: + return {"status": "error", "reason": "pm2_start_timeout", "pm2_name": pm2_name} + if r.returncode != 0: + return {"status": "error", "reason": "pm2_start_failed", + "detail": (r.stderr or r.stdout or "")[:300], "pm2_name": pm2_name} + + rec = { + "id": entry["id"], + "port": port, + "pm2_name": pm2_name, + "backend": entry.get("backend", "mlx"), + "hf_repo": entry.get("hf_repo"), + "context": context_length, + "started_at": datetime.now(timezone.utc).isoformat(), + } + _record_served(rec) + + healthy = _health_ok(port) if wait_health else None + return {"status": "serving" if healthy or not wait_health else "started_unhealthy", + "healthy": healthy, **rec} + + +# ── stop — the guard ───────────────────────────────────────────────────────── + +def _resolve_target(target: Union[str, int]) -> Optional[dict]: + """Resolve a stop target (a cookbook pm2 name OR a port) to OUR served + record. Returns None if it isn't a process we started — the caller treats + None as a hard refusal, so we never delete anything we don't own.""" + served = _load_served() + s = str(target).strip() + if s.isdigit(): + port = int(s) + for r in served: + if r.get("port") == port: + return r + return None + for r in served: + if r.get("pm2_name") == s: + return r + return None + + +def stop(target: Union[str, int], confirm: bool = False) -> dict: + """Stop a Cookbook-started model. Layered refusals (in order): + 1. target not a process WE started (served.json) → refused + 2. resolved port is a protected port → refused + 3. resolved pm2 name not in the `cookbook-` namespace → refused + 4. confirm is not True → dry-run (would_stop) + Only after all four pass do we `pm2 delete `.""" + rec = _resolve_target(target) + if rec is None: + return {"status": "refused", "reason": "not_a_cookbook_process", + "target": str(target), + "detail": "Cookbook only stops models it started (recorded in served.json)."} + name, port = rec.get("pm2_name", ""), rec.get("port") + if port in PROTECTED_PORTS: + return {"status": "refused", "reason": "protected_port", "port": port} + if not name.startswith(COOKBOOK_PREFIX): + return {"status": "refused", "reason": "not_cookbook_namespace", "pm2_name": name} + if not confirm: + return {"status": "would_stop", "pm2_name": name, "port": port, + "hint": "re-run with confirm=true to actually stop it"} + try: + r = subprocess.run(["pm2", "delete", name], + capture_output=True, text=True, timeout=_PM2_TIMEOUT_S) + except FileNotFoundError: + return {"status": "error", "reason": "pm2_not_found", "pm2_name": name} + except subprocess.TimeoutExpired: + return {"status": "error", "reason": "pm2_delete_timeout", "pm2_name": name} + if r.returncode != 0: + return {"status": "error", "reason": "pm2_delete_failed", + "detail": (r.stderr or r.stdout or "")[:300], "pm2_name": name} + _forget_served(name) + return {"status": "stopped", "pm2_name": name, "port": port} + + +# ── list ────────────────────────────────────────────────────────────────────── + +def list_served() -> list[dict]: + """Cookbook-served models with live status (online/stopped via pm2) + + a current health probe. Read-only.""" + live = {p["name"]: p for p in probe.pm2_processes()} + out = [] + for r in _load_served(): + name = r.get("pm2_name", "") + proc = live.get(name) + out.append({ + **r, + "pm2_status": proc.get("status") if proc else "absent", + "healthy": probe.is_port_bound(r["port"]) if r.get("port") else None, + }) + return out diff --git a/skills/.manifest.json b/skills/.manifest.json index d5aa99c..c3305b3 100644 --- a/skills/.manifest.json +++ b/skills/.manifest.json @@ -26,6 +26,12 @@ "chrome_tabs.py": "bf971798a8455215655d37edb6bb326d908f4d33c0e3b232eb37d267fec68d7a", "clipboard.py": "f5ef9cc501fe38a3de95bf0b49896b928250c0e272060173668f6b195728d131", "clipboard_url_fetch.py": "c2733a92d6e99a0346b91c67bb70698e491be9570305377a82096a0ceb153488", + "cookbook_download.py": "8fd9c8a5b82f8e910cc0721904b908f2d201908ff0ac6373994be922598c382c", + "cookbook_list.py": "23c19b92742bfd88a801e74bd79e8bbd70f88d49ac1951f1c30e4857cc693a79", + "cookbook_recommend.py": "66b09eac0510ca9ca1e19f5619ab10a167bda6e34d17be13bda77ba7ffe0b5e3", + "cookbook_scan.py": "9bc1fe32073267c45c20d6dca67e293ed07f5c04dfff0109b480f778ffeaa3bb", + "cookbook_serve.py": "8a08bb5deecd988431da421b1b4825cd411c7e9c7658bc547aa16ce74c0e1ddd", + "cookbook_stop.py": "8b51e04ef08e08899df24d032af7a609bef7bf02ad72be5badf95eb0a2b1ce64", "create_skill.py": "28070280abfe2179d5c9b33eee74b4db5a5dd085f76a09f02d516302ba330036", "delegate.py": "7c595d5605cd9913a8afa331ae0013861d6996f378f8a59aca0249f1b2f3a474", "fact_extract.py": "a43ed03b8c51704415f4135de46a44e097f757305af3921dd389b8dfd0540906", diff --git a/skills/cookbook_download.py b/skills/cookbook_download.py new file mode 100644 index 0000000..18d31eb --- /dev/null +++ b/skills/cookbook_download.py @@ -0,0 +1,43 @@ +"""CODEC Skill: Download a catalog model from Hugging Face (background job).""" +from codec_cookbook import args, catalog, download + +SKILL_NAME = "cookbook_download" +SKILL_DESCRIPTION = ( + "Download a Cookbook catalog model from Hugging Face into the local cache " + "as a background job, or report an in-flight download's status." +) +SKILL_TAGS = ["cookbook", "models", "download", "huggingface", "local-llm"] +SKILL_TRIGGERS = [ + "cookbook download", "download the model", "fetch a model", "download model status", + "cookbook fetch", +] +SKILL_MCP_EXPOSE = False # spawns network download jobs — local/dashboard/voice only + + +def run(task, app="", ctx=""): + model_id = args.parse_model_id(task) + if not model_id: + return ("Which model? Say e.g. 'cookbook download qwen3-coder-30b'. Known: " + + ", ".join(catalog.ids())) + try: + entry = catalog.get(model_id) + except KeyError as e: + return str(e) + repo = entry["hf_repo"] + + # "status" / "progress" → report only; otherwise start (idempotent). + want_status = args.parse_flag(task, "status") or "progress" in (task or "").lower() + res = download.status(repo) if want_status else download.start(repo) + + state = res.get("state") + if state == "not_started": + return f"No download in progress for {model_id}. Say 'cookbook download {model_id}' to start." + if state in ("starting", "running"): + return f"⏳ Downloading {model_id} ({repo}) — state: {state} (pid {res.get('pid')})." + if state == "done": + return f"✅ {model_id} downloaded → {res.get('path', 'HF cache')}." + if state == "interrupted": + return f"⚠ Download of {model_id} was interrupted. Re-run 'cookbook download {model_id}'." + if state == "error": + return f"❌ Download of {model_id} failed: {res.get('error', 'unknown')}" + return f"{model_id}: {state}" diff --git a/skills/cookbook_list.py b/skills/cookbook_list.py new file mode 100644 index 0000000..5029995 --- /dev/null +++ b/skills/cookbook_list.py @@ -0,0 +1,29 @@ +"""CODEC Skill: List Cookbook-served models (read-only).""" +from codec_cookbook import serve + +SKILL_NAME = "cookbook_list" +SKILL_DESCRIPTION = ( + "List the local models Cookbook is currently serving (port, PM2 name, " + "live status, health). Only shows Cookbook-managed processes. Read-only." +) +SKILL_TAGS = ["cookbook", "models", "list", "local-llm"] +SKILL_TRIGGERS = [ + "cookbook list", "list served models", "cookbook models", "what models are running", + "cookbook ps", +] +SKILL_MCP_EXPOSE = True # read-only, safe to expose + + +def run(task, app="", ctx=""): + served = serve.list_served() + if not served: + return "Cookbook isn't serving any models. Use 'cookbook serve ' to start one." + lines = [f"Cookbook-served models ({len(served)}):"] + for r in served: + health = "healthy" if r.get("healthy") else "no response" + lines.append( + f" • {r.get('id', '?'):<16} port {r.get('port')} " + f"[{r.get('pm2_status', '?')}/{health}] {r.get('pm2_name')} " + f"(ctx {r.get('context')}, {r.get('backend')})" + ) + return "\n".join(lines) diff --git a/skills/cookbook_recommend.py b/skills/cookbook_recommend.py new file mode 100644 index 0000000..76853f0 --- /dev/null +++ b/skills/cookbook_recommend.py @@ -0,0 +1,42 @@ +"""CODEC Skill: Cookbook model recommendation (read-only).""" +from codec_cookbook import args, catalog, fit, probe + +SKILL_NAME = "cookbook_recommend" +SKILL_DESCRIPTION = ( + "Recommend which catalog models fit this Mac's current unified-memory " + "headroom, ranked biggest-that-fits first. Optionally filter by role " + "(chat/reason/code/max/fast/tiny). Read-only." +) +SKILL_TAGS = ["cookbook", "models", "recommend", "fit", "local-llm"] +SKILL_TRIGGERS = [ + "cookbook recommend", "recommend a model", "which model fits", "what model can i run", + "cookbook suggest", "best model for", +] +SKILL_MCP_EXPOSE = True # read-only, safe to expose + + +def run(task, app="", ctx=""): + ctx_len = args.parse_context(task) + role = args.parse_role(task) + entries = catalog.by_role(role) if role else catalog.all_entries() + if not entries: + return f"No catalog models with role '{role}'. Roles: chat, reason, code, max, fast, tiny." + avail = probe.available_gb() + ranked = fit.recommend(entries, avail, ctx_len) + header = (f"Models for ~{round(avail, 1)} GB headroom @ {ctx_len}-token context" + + (f" (role: {role})" if role else "") + ":") + lines = [header] + for r in ranked: + e = r["entry"] + mark = "✅" if r["fits"] else "✗" + lines.append( + f" {mark} {e['id']:<16} ~{r['need_gb']} GB " + f"(headroom {r['headroom_gb']:+} GB) [{','.join(e.get('roles', []))}]" + ) + top = next((r for r in ranked if r["fits"]), None) + if top: + lines.append(f"\n→ Best fit: {top['entry']['id']}. " + f"Serve it with: cookbook serve {top['entry']['id']}") + else: + lines.append("\n→ Nothing fits the current headroom. Free memory or pick a smaller context.") + return "\n".join(lines) diff --git a/skills/cookbook_scan.py b/skills/cookbook_scan.py new file mode 100644 index 0000000..7d572eb --- /dev/null +++ b/skills/cookbook_scan.py @@ -0,0 +1,37 @@ +"""CODEC Skill: Cookbook hardware scan (read-only).""" +from codec_cookbook import probe + +SKILL_NAME = "cookbook_scan" +SKILL_DESCRIPTION = ( + "Scan this Mac's hardware + live PM2 stack and report unified-memory " + "headroom available for serving additional local models. Read-only." +) +SKILL_TAGS = ["cookbook", "models", "scan", "hardware", "local-llm"] +SKILL_TRIGGERS = [ + "cookbook scan", "scan hardware", "cookbook hardware", "model memory headroom", + "how much memory for models", "cookbook status", +] +SKILL_MCP_EXPOSE = True # read-only, safe to expose + + +def run(task, app="", ctx=""): + s = probe.snapshot() + lines = [ + f"🖥 {s['chip'] or 'Apple Silicon'} — {s['unified_total_gb']} GB unified", + f" resident (PM2): {s['resident_gb_total']} GB · free (vm_stat): {s['vm_free_gb']} GB", + f" available for a new model: ~{s['available_gb']} GB " + f"(total − {s['os_reserve_gb']} GB OS reserve − resident)", + f" PM2 processes: {s['pm2_process_count']}", + ] + if not s["mlx_version_ok"]: + lines.append(f" ⚠ mlx-lm {s['mlx_version'] or 'missing'} — Qwen3 MoE needs ≥ 0.25.2") + else: + lines.append(f" mlx-lm {s['mlx_version']} ✓") + busy = s["serve_ports_bound"] + rng = s["serve_range"] + lines.append( + f" Cookbook serve range {rng[0]}-{rng[1]}: " + + (f"{len(busy)} busy ({', '.join(map(str, busy))})" if busy else "all free") + ) + lines.append(f" protected ports (never touched): {', '.join(map(str, s['protected_ports']))}") + return "\n".join(lines) diff --git a/skills/cookbook_serve.py b/skills/cookbook_serve.py new file mode 100644 index 0000000..8dc9b16 --- /dev/null +++ b/skills/cookbook_serve.py @@ -0,0 +1,49 @@ +"""CODEC Skill: Serve a local model on a dedicated Cookbook port (8110-8119).""" +from codec_cookbook import args, catalog, fit, probe, serve + +SKILL_NAME = "cookbook_serve" +SKILL_DESCRIPTION = ( + "Serve a local MLX or GGUF model on a dedicated Cookbook port (8110-8119) " + "under PM2, after a unified-memory fit check. Never touches existing services." +) +SKILL_TAGS = ["cookbook", "models", "serve", "mlx", "local-llm"] +SKILL_TRIGGERS = [ + "cookbook serve", "serve the model", "spin up a local model", "load model on cookbook", + "start a local model", +] +SKILL_MCP_EXPOSE = False # starts PM2 processes — local/dashboard/voice only (cf. pm2_control) + + +def run(task, app="", ctx=""): + model_id = args.parse_model_id(task) + if not model_id: + return ("Which model? Say e.g. 'cookbook serve qwen3-30b-a3b'. Known: " + + ", ".join(catalog.ids())) + try: + entry = catalog.get(model_id) + except KeyError as e: + return str(e) + + context_length = args.parse_context(task) + force = args.parse_flag(task, "force") + + # Fit check (conservative over-estimate vs. live headroom). + need = fit.estimate_footprint_gb(entry["hf_repo"], context_length, entry.get("anchor_gb")) + avail = probe.available_gb() + ok, headroom = fit.fits(need, avail) + if not ok and not force: + return (f"⚠ Refused to serve {model_id}: insufficient memory.\n" + f" need ~{round(need, 1)} GB, headroom {round(headroom, 1)} GB " + f"(margin {fit.DEFAULT_MARGIN_GB} GB).\n" + f" Free memory, lower the context, or re-run with 'force' to override.") + + res = serve.launch(entry, context_length) + status = res.get("status") + if status == "serving": + return (f"✅ Serving {model_id} on port {res['port']} " + f"(pm2: {res['pm2_name']}, ctx {context_length}, ~{round(need, 1)} GB). " + f"OpenAI-compatible at http://127.0.0.1:{res['port']}/v1") + if status == "started_unhealthy": + return (f"⏳ Started {res['pm2_name']} on port {res['port']} but it didn't pass the " + f"/v1/models health check in time. Check: pm2 logs {res['pm2_name']}") + return f"❌ Could not serve {model_id}: {res.get('reason')} {res.get('detail', '')}".strip() diff --git a/skills/cookbook_stop.py b/skills/cookbook_stop.py new file mode 100644 index 0000000..9cdc1d0 --- /dev/null +++ b/skills/cookbook_stop.py @@ -0,0 +1,75 @@ +"""CODEC Skill: Stop a Cookbook-served model (guarded; confirm required).""" +import re + +from codec_cookbook import args, serve + +SKILL_NAME = "cookbook_stop" +SKILL_DESCRIPTION = ( + "Stop a model Cookbook started (by id, PM2 name, or port). Refuses anything " + "outside the cookbook- namespace, any protected/non-cookbook port, and " + "requires explicit confirmation. NEVER stops an existing service." +) +SKILL_TAGS = ["cookbook", "models", "stop", "local-llm"] +SKILL_TRIGGERS = [ + "cookbook stop", "stop the model", "stop cookbook model", "unload model", + "shut down local model", +] +SKILL_MCP_EXPOSE = False # stops PM2 processes — local/dashboard/voice only (cf. pm2_control) + + +def _resolve_target(task): + """Figure out what the user means: an explicit cookbook- pm2 name, a Cookbook + port (8110-8119), or a model id mapped via served.json.""" + # explicit full pm2 name + m = re.search(r"\bcookbook-[A-Za-z0-9_.\-]+\b", task or "") + if m: + return m.group(0), None + # explicit Cookbook port + port = args.parse_port(task) + if port is not None: + return port, None + # model id → look up our served record(s) + model_id = args.parse_model_id(task) + if model_id: + matches = [r for r in serve.list_served() if r.get("id") == model_id] + if len(matches) == 1: + return matches[0]["pm2_name"], None + if len(matches) > 1: + names = ", ".join(r["pm2_name"] for r in matches) + return None, (f"Multiple {model_id} instances are served ({names}). " + f"Specify the port or full pm2 name.") + return None, (f"Cookbook isn't serving '{model_id}'. " + f"Use 'cookbook list' to see what's running.") + return None, None + + +def run(task, app="", ctx=""): + target, msg = _resolve_target(task) + if msg: + return msg + if target is None: + return ("Which model? Say e.g. 'cookbook stop llama32-3b confirm', " + "or give a port (8110-8119) or the full cookbook- pm2 name. " + "Use 'cookbook list' to see running models.") + + confirm = args.parse_flag(task, "confirm") + res = serve.stop(target, confirm=confirm) + status = res.get("status") + + if status == "would_stop": + return (f"About to stop {res['pm2_name']} (port {res['port']}). " + f"Re-run with 'confirm' to actually stop it: " + f"cookbook stop {res['pm2_name']} confirm") + if status == "stopped": + return f"🛑 Stopped {res['pm2_name']} (port {res['port']}) and freed the Cookbook port." + if status == "refused": + reason = res.get("reason") + if reason == "protected_port": + return f"⛔ Refused: port {res['port']} is a protected core service — Cookbook never touches it." + if reason == "not_a_cookbook_process": + return (f"⛔ Refused: '{res['target']}' isn't a model Cookbook started. " + f"Cookbook only stops its own cookbook- processes.") + if reason == "not_cookbook_namespace": + return f"⛔ Refused: '{res['pm2_name']}' is not in the cookbook- namespace." + return f"⛔ Refused: {reason}" + return f"❌ {res.get('reason', 'unknown error')}: {res.get('detail', '')}".strip() diff --git a/tests/test_cookbook.py b/tests/test_cookbook.py new file mode 100644 index 0000000..4fa688d --- /dev/null +++ b/tests/test_cookbook.py @@ -0,0 +1,312 @@ +"""CODEC Cookbook — local-model lifecycle tests. + +Coverage matches the build brief: + * fit.py — golden anchors (30B→17.2, 80B→42), KV math, offline anchor fallback + * serve.py — port allocation stays in 8110-8119 + skips bound ports + * STOP-GUARD (highest priority) — refuses protected ports, non-cookbook ports, + non-cookbook PM2 names; requires confirm=True + * integration — serve → served.json → list → stop(confirm), PM2/health mocked + * skills — the six thin skills parse args + format helper output + +Real PM2 / MLX / Hub calls are mocked so the suite runs offline + side-effect-free. +""" +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +_REPO = Path(__file__).resolve().parents[1] +if str(_REPO) not in sys.path: + sys.path.insert(0, str(_REPO)) + +from codec_cookbook import args, catalog, fit, probe, serve # noqa: E402 + +# A representative Qwen3-MoE-ish config for offline KV math. +_CFG = {"num_hidden_layers": 48, "num_attention_heads": 32, + "num_key_value_heads": 4, "head_dim": 128, "hidden_size": 4096} + + +# ── catalog ────────────────────────────────────────────────────────────── +class TestCatalog: + def test_known_ids(self): + ids = catalog.ids() + assert {"qwen3-30b-a3b", "qwen3-next-80b", "llama32-3b"} <= set(ids) + + def test_get_unknown_raises(self): + with pytest.raises(KeyError): + catalog.get("does-not-exist") + + def test_find_unknown_is_none(self): + assert catalog.find("nope") is None + + def test_by_role(self): + assert any(e["id"] == "qwen3-coder-30b" for e in catalog.by_role("code")) + + def test_primary_model_not_in_catalog(self): + # the live qwen3.6@8083 must never be Cookbook-managed + assert not any("Qwen3.6" in e["hf_repo"] for e in catalog.all_entries()) + + +# ── fit math ───────────────────────────────────────────────────────────── +class TestFit: + def test_golden_anchors_present(self): + assert catalog.get("qwen3-30b-a3b")["anchor_gb"] == 17.2 + assert catalog.get("qwen3-next-80b")["anchor_gb"] == 42.0 + + def test_kv_cache_math(self): + # 2 * nl * nkv * hd * ctx * 2 bytes / 1e9 + expected = (2 * 48 * 4 * 128 * 8192 * 2) / 1e9 + assert abs(fit.kv_cache_gb(_CFG, 8192) - expected) < 1e-9 + + def test_kv_gqa_falls_back_to_attention_heads(self): + mha = {"num_hidden_layers": 4, "num_attention_heads": 8, "hidden_size": 512} + # no num_key_value_heads → uses num_attention_heads; head_dim = 512/8 = 64 + expected = (2 * 4 * 8 * 64 * 1024 * 2) / 1e9 + assert abs(fit.kv_cache_gb(mha, 1024) - expected) < 1e-9 + + def test_footprint_uses_anchor_and_overhead(self): + need = fit.estimate_footprint_gb("x", 8192, anchor_gb=17.2, cfg=_CFG) + kv = fit.kv_cache_gb(_CFG, 8192) + expected = (17.2 + kv) * 1.10 + 1.5 + assert abs(need - expected) < 1e-6 + + def test_offline_anchor_fallback_no_network(self, monkeypatch): + # simulate Hub unreachable: load_config raises → KV omitted, no crash, + # and weight_gb_from_hub must NOT be called (anchor provided). + monkeypatch.setattr(fit, "load_config", + lambda r: (_ for _ in ()).throw(RuntimeError("offline"))) + called = {"hub": False} + monkeypatch.setattr(fit, "weight_gb_from_hub", + lambda r: called.__setitem__("hub", True) or 999.0) + need = fit.estimate_footprint_gb("x", 8192, anchor_gb=17.2) + assert called["hub"] is False, "anchor given → must not hit the Hub for weights" + assert abs(need - (17.2 * 1.10 + 1.5)) < 1e-6 + + def test_available_gb_formula(self): + assert fit.available_gb(192, [17.2, 42.0], os_reserve_gb=24) == pytest.approx(108.8) + + def test_fits_margin(self): + ok, hr = fit.fits(need_gb=50, avail_gb=60, margin_gb=8) + assert ok and hr == 10 + ok2, hr2 = fit.fits(need_gb=55, avail_gb=60, margin_gb=8) + assert not ok2 and hr2 == 5 + + def test_recommend_orders_fits_first_biggest(self): + entries = [ + {"id": "tiny", "hf_repo": "r/tiny", "anchor_gb": 2.0, "roles": ["tiny"]}, + {"id": "big", "hf_repo": "r/big", "anchor_gb": 40.0, "roles": ["max"]}, + {"id": "huge", "hf_repo": "r/huge", "anchor_gb": 200.0, "roles": ["max"]}, + ] + # KV unavailable offline → footprint ≈ anchor*1.1+1.5 + with patch.object(fit, "load_config", side_effect=RuntimeError("offline")): + ranked = fit.recommend(entries, avail_gb=60, ctx=8192) + ids = [r["entry"]["id"] for r in ranked] + assert ids[0] == "big", "biggest model that fits should rank first" + assert ids[-1] == "huge", "non-fitting model ranks last" + assert ranked[-1]["fits"] is False + + +# ── args parsing ─────────────────────────────────────────────────────────── +class TestArgs: + def test_model_id(self): + assert args.parse_model_id("cookbook serve qwen3-30b-a3b now") == "qwen3-30b-a3b" + assert args.parse_model_id("nothing here") is None + + def test_context(self): + assert args.parse_context("serve x context 16384") == 16384 + assert args.parse_context("serve x ctx=4096") == 4096 + assert args.parse_context("serve x") == 8192 + + def test_flags(self): + assert args.parse_flag("serve x force", "force") + assert args.parse_flag("stop x confirm=true", "confirm") + assert not args.parse_flag("serve x", "force") + + def test_port(self): + assert args.parse_port("stop port 8112") == 8112 + assert args.parse_port("stop 8083") is None # not in cookbook range + assert args.parse_port("stop x") is None + + def test_role(self): + assert args.parse_role("recommend a code model") == "code" + assert args.parse_role("recommend something") is None + + +# ── port allocation ───────────────────────────────────────────────────────── +class TestPortAllocation: + def test_stays_in_range_and_skips(self, tmp_path, monkeypatch): + monkeypatch.setattr(serve, "SERVED_PATH", str(tmp_path / "served.json")) + monkeypatch.setattr(probe, "bound_ports_in_range", lambda lo, hi: {8110, 8111}) + monkeypatch.setattr(probe, "pm2_processes", + lambda: [{"port": 8112, "name": "x", "status": "online", "rss_gb": 0}]) + serve._save_served([{"id": "a", "port": 8113, "pm2_name": "cookbook-a-8113"}]) + port = serve.allocate_port() + assert port == 8114 + assert 8110 <= port <= 8119 + + def test_exhausted_returns_none(self, tmp_path, monkeypatch): + monkeypatch.setattr(serve, "SERVED_PATH", str(tmp_path / "served.json")) + monkeypatch.setattr(probe, "bound_ports_in_range", + lambda lo, hi: set(range(8110, 8120))) + monkeypatch.setattr(probe, "pm2_processes", lambda: []) + assert serve.allocate_port() is None + + def test_never_allocates_protected_port(self, tmp_path, monkeypatch): + monkeypatch.setattr(serve, "SERVED_PATH", str(tmp_path / "served.json")) + monkeypatch.setattr(probe, "bound_ports_in_range", lambda lo, hi: set()) + monkeypatch.setattr(probe, "pm2_processes", lambda: []) + for _ in range(20): + p = serve.allocate_port() + assert p not in probe.PROTECTED_PORTS + + +# ── STOP-GUARD (highest priority) ──────────────────────────────────────────── +class TestStopGuard: + @pytest.fixture + def served(self, tmp_path, monkeypatch): + monkeypatch.setattr(serve, "SERVED_PATH", str(tmp_path / "served.json")) + serve._save_served([ + {"id": "llama32-3b", "port": 8112, + "pm2_name": "cookbook-llama32-3b-8112", "backend": "mlx"}, + ]) + return serve + + @pytest.mark.parametrize("port", [8083, 8090, 8094, 9223, 5678]) + def test_refuses_protected_ports(self, served, port): + r = served.stop(port, confirm=True) + assert r["status"] == "refused" + + def test_refuses_protected_port_even_if_in_served(self, served): + # defense-in-depth: a (hypothetical) cookbook record on a protected port + served._save_served([{"id": "x", "port": 8090, "pm2_name": "cookbook-x-8090"}]) + r = served.stop(8090, confirm=True) + assert r["status"] == "refused" and r["reason"] == "protected_port" + + @pytest.mark.parametrize("name", ["qwen3.6", "codec-dashboard", "pilot-runner", "n8n"]) + def test_refuses_non_cookbook_names(self, served, name): + r = served.stop(name, confirm=True) + assert r["status"] == "refused" and r["reason"] == "not_a_cookbook_process" + + def test_refuses_non_cookbook_namespace_record(self, served): + # a served record whose name isn't cookbook- prefixed → guard 3 + served._save_served([{"id": "x", "port": 8115, "pm2_name": "evil-proc-8115"}]) + r = served.stop("evil-proc-8115", confirm=True) + assert r["status"] == "refused" and r["reason"] == "not_cookbook_namespace" + + def test_refuses_bound_non_cookbook_port(self, served): + # a port we never served (not in served.json) → refused, never stopped + r = served.stop(8085, confirm=True) + assert r["status"] == "refused" and r["reason"] == "not_a_cookbook_process" + + def test_dry_run_without_confirm(self, served): + r = served.stop("cookbook-llama32-3b-8112", confirm=False) + assert r["status"] == "would_stop" + assert r["pm2_name"] == "cookbook-llama32-3b-8112" and r["port"] == 8112 + + def test_dry_run_by_port_without_confirm(self, served): + assert served.stop(8112, confirm=False)["status"] == "would_stop" + + def test_confirmed_stop_calls_pm2_delete(self, served, monkeypatch): + captured = {} + + def fake_run(argv, **kw): + captured["argv"] = argv + return MagicMock(returncode=0, stdout="ok", stderr="") + + monkeypatch.setattr(serve.subprocess, "run", fake_run) + r = served.stop("cookbook-llama32-3b-8112", confirm=True) + assert r["status"] == "stopped" + assert captured["argv"] == ["pm2", "delete", "cookbook-llama32-3b-8112"] + # removed from served.json + assert served.stop(8112, confirm=False)["status"] == "refused" + + +# ── integration: serve → list → stop (PM2 + health mocked) ────────────────── +class TestIntegration: + def test_serve_then_list_then_stop(self, tmp_path, monkeypatch): + monkeypatch.setattr(serve, "SERVED_PATH", str(tmp_path / "served.json")) + monkeypatch.setattr(probe, "bound_ports_in_range", lambda lo, hi: set()) + monkeypatch.setattr(probe, "pm2_processes", lambda: []) + monkeypatch.setattr(serve, "resolve_mlx_python", lambda: "/usr/bin/python3") + monkeypatch.setattr(serve, "_health_ok", lambda port, timeout_s=90: True) + monkeypatch.setattr(serve.subprocess, "run", + lambda argv, **kw: MagicMock(returncode=0, stdout="ok", stderr="")) + + entry = catalog.get("llama32-3b") + res = serve.launch(entry, context_length=8192) + assert res["status"] == "serving" + assert 8110 <= res["port"] <= 8119 + assert res["pm2_name"].startswith("cookbook-llama32-3b-") + + # appears in list + monkeypatch.setattr(probe, "is_port_bound", lambda port, host="127.0.0.1": True) + listed = serve.list_served() + assert any(r["id"] == "llama32-3b" for r in listed) + + # stop it + stopped = serve.stop(res["pm2_name"], confirm=True) + assert stopped["status"] == "stopped" + assert serve.list_served() == [] + + def test_serve_command_is_corrected_mlx_form(self, tmp_path, monkeypatch): + monkeypatch.setattr(serve, "SERVED_PATH", str(tmp_path / "served.json")) + monkeypatch.setattr(serve, "resolve_mlx_python", lambda: "/PY") + argv, err = serve._build_command(catalog.get("llama32-3b"), 8112, 8192) + assert err is None + # `-m mlx_lm server` subcommand form (NOT `-m mlx_lm.server`); --max-tokens present + assert argv[:5] == ["pm2", "start", "/PY", "--name", "cookbook-llama32-3b-8112"] + assert "mlx_lm" in argv and "server" in argv + assert "mlx_lm.server" not in argv + assert "--max-tokens" in argv + assert "--port" in argv and "8112" in argv + + +# ── skills smoke ───────────────────────────────────────────────────────────── +class TestSkills: + def test_all_six_discovered_with_expected_exposure(self): + import codec_dispatch + codec_dispatch.load_skills() + reg = codec_dispatch.registry + names = set(reg.names()) + expected = {"cookbook_scan", "cookbook_recommend", "cookbook_list", + "cookbook_serve", "cookbook_download", "cookbook_stop"} + assert expected <= names + # read-only exposed, mutating not + assert reg.get_mcp_expose("cookbook_scan") is True + assert reg.get_mcp_expose("cookbook_serve") is False + assert reg.get_mcp_expose("cookbook_stop") is False + + def test_serve_skill_refuses_on_insufficient_memory(self, monkeypatch): + import skills.cookbook_serve as cs + monkeypatch.setattr(cs.probe, "available_gb", lambda *a, **k: 10.0) + monkeypatch.setattr(cs.fit, "estimate_footprint_gb", lambda *a, **k: 42.0) + out = cs.run("cookbook serve qwen3-next-80b") + assert "Refused" in out and "insufficient memory" in out + + def test_serve_skill_force_overrides(self, monkeypatch): + import skills.cookbook_serve as cs + monkeypatch.setattr(cs.probe, "available_gb", lambda *a, **k: 10.0) + monkeypatch.setattr(cs.fit, "estimate_footprint_gb", lambda *a, **k: 42.0) + monkeypatch.setattr(cs.serve, "launch", + lambda e, c: {"status": "serving", "port": 8112, + "pm2_name": "cookbook-x-8112"}) + out = cs.run("cookbook serve qwen3-next-80b force") + assert "Serving" in out + + def test_stop_skill_requires_confirm(self, monkeypatch): + import skills.cookbook_stop as ck + monkeypatch.setattr(ck.serve, "list_served", + lambda: [{"id": "llama32-3b", "port": 8112, + "pm2_name": "cookbook-llama32-3b-8112"}]) + monkeypatch.setattr(ck.serve, "stop", + lambda t, confirm=False: ({"status": "would_stop", + "pm2_name": "cookbook-llama32-3b-8112", + "port": 8112} if not confirm + else {"status": "stopped", + "pm2_name": "cookbook-llama32-3b-8112", + "port": 8112})) + assert "confirm" in ck.run("cookbook stop llama32-3b").lower() + assert "Stopped" in ck.run("cookbook stop llama32-3b confirm")