diff --git a/README.md b/README.md index faf6f1fd..253dd0cb 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,23 @@ skilld author assemble `skilld author assemble` auto-discovers skills with pending output files. `skilld update` re-exports prompts for outdated packages. +### Local Models (Ollama) + +If [Ollama](https://ollama.com) is running, skilld auto-detects your locally-pulled text models and lists them in `skilld config` (and accepts them via `-m ollama:<model>`): + +```bash +skilld add npm:vue -m ollama:qwen2.5:14b-instruct +``` + +Generation runs locally: free, offline, no API key. Unlike the CLI and API backends, the Ollama path is one-shot (it does not explore reference files with tools), so all source material is inlined into a single prompt. Best paired with a capable instruct model. + +| Env var | Default | Purpose | +|---------|---------|---------| +| `OLLAMA_HOST` | `http://localhost:11434` | Ollama daemon address | +| `OLLAMA_NUM_CTX` | `32768` | Context window; sized to fit the full prompt + output | + +The large default context can exceed memory for big models on constrained hardware (Ollama returns a 500). Lower `OLLAMA_NUM_CTX` or pick a smaller model if the model fails to load. 
+ ### Eject Export a skill as a portable, self-contained directory for sharing via git repos: diff --git a/src/agent/clis/executors.ts b/src/agent/clis/executors.ts index 185743dc..4bfaceb9 100644 --- a/src/agent/clis/executors.ts +++ b/src/agent/clis/executors.ts @@ -13,6 +13,7 @@ import type { SectionExecutor } from './runner.ts' import type { OptimizeModel } from './types.ts' import { getSkillReferenceDirs } from '../../cache/index.ts' import { CLI_ADAPTERS, CLI_MODELS } from './index.ts' +import { isOllamaModel, ollamaExecutor } from './ollama.ts' import { getAvailablePiAiModels, isPiAiModel, optimizeSectionPiAi } from './pi-ai.ts' import { spawnCliAndStream } from './runner.ts' @@ -64,5 +65,7 @@ function piAiExecutor(model: OptimizeModel): SectionExecutor | { error: string } /** Resolve `model` to an executor, or an error if the model is unavailable/unmapped. */ export function selectExecutor(model: OptimizeModel): SectionExecutor | { error: string } { + if (isOllamaModel(model)) + return ollamaExecutor(model) return isPiAiModel(model) ? 
piAiExecutor(model) : cliExecutor(model) } diff --git a/src/agent/clis/index.ts b/src/agent/clis/index.ts index 40b04120..7dde3fbf 100644 --- a/src/agent/clis/index.ts +++ b/src/agent/clis/index.ts @@ -13,6 +13,7 @@ import { agents } from '../targets/index.ts' import { adapter as claudeAdapter } from './claude.ts' import { adapter as codexAdapter } from './codex.ts' import { adapter as geminiAdapter } from './gemini.ts' +import { getAvailableOllamaModels, isOllamaModel, parseOllamaModelId } from './ollama.ts' import { getAvailablePiAiModels, isPiAiModel, parsePiAiModelId } from './pi-ai.ts' export { buildAllSectionPrompts, buildSectionPrompt, SECTION_MERGE_ORDER, SECTION_OUTPUT_FILES } from '../prompts/index.ts' @@ -62,6 +63,8 @@ export const CLI_MODELS: Partial> = Object // ── Model helpers ──────────────────────────────────────────────────── export function getModelName(id: OptimizeModel): string { + if (isOllamaModel(id)) + return parseOllamaModelId(id) ?? id if (isPiAiModel(id)) { const parsed = parsePiAiModelId(id) return parsed?.modelId ?? id @@ -70,6 +73,8 @@ export function getModelName(id: OptimizeModel): string { } export function getModelLabel(id: OptimizeModel): string { + if (isOllamaModel(id)) + return `Ollama · ${parseOllamaModelId(id) ?? id}` if (isPiAiModel(id)) { const parsed = parsePiAiModelId(id) return parsed ? `${PI_PROVIDER_NAMES[parsed.provider] ?? 
parsed.provider} · ${parsed.modelId}` : id @@ -85,6 +90,10 @@ export async function getAvailableModels(): Promise agents[id].cli) @@ -137,5 +146,18 @@ export async function getAvailableModels(): Promise ({ + id: m.id, + name: m.name, + hint: m.hint, + agentId: 'ollama', + agentName: 'Ollama (local)', + provider: 'ollama', + providerName: 'Ollama (local)', + vendorGroup: 'Ollama', + })) + + return [...cliModels, ...piAiEntries, ...ollamaEntries] } diff --git a/src/agent/clis/ollama.ts b/src/agent/clis/ollama.ts new file mode 100644 index 00000000..72c40f4c --- /dev/null +++ b/src/agent/clis/ollama.ts @@ -0,0 +1,200 @@ +/** + * Ollama executor — one-shot local completions via Ollama's `/api/chat`. + * + * Unlike the CLI adapters (claude/codex/gemini) this is NOT an agentic loop: + * no tools, no file exploration, no multi-turn transcript. The caller supplies + * the full prompt and we return the single completion. That makes it free + * (local) and cheap on tokens — a good fit for guide synthesis where all the + * source material is already inlined into the prompt. + * + * Ollama has no first-class provider in pi-ai, and pi's OpenAI-compatible path + * can't set Ollama's `num_ctx` option — the knob that prevents silent prompt + * truncation below — so we talk to `/api/chat` directly. + * + * Model ids are `ollama:`, e.g. `ollama:qwen2.5:14b-instruct`. The host + * defaults to http://localhost:11434, override with `OLLAMA_HOST`. + */ + +import type { SectionExecutor } from './runner.ts' +import type { OptimizeModel } from './types.ts' + +const OLLAMA_PREFIX = 'ollama:' + +export function isOllamaModel(model: string): boolean { + return model.startsWith(OLLAMA_PREFIX) +} + +/** Parse `ollama:qwen2.5:14b-instruct` → `qwen2.5:14b-instruct`. */ +export function parseOllamaModelId(model: string): string | null { + return isOllamaModel(model) ? 
model.slice(OLLAMA_PREFIX.length) : null +} + +const HAS_SCHEME_RE = /^https?:\/\// +const TRAILING_SLASH_RE = /\/$/ + +function ollamaHost(): string { + const raw = process.env.OLLAMA_HOST || 'http://localhost:11434' + const withScheme = HAS_SCHEME_RE.test(raw) ? raw : `http://${raw}` + return withScheme.replace(TRAILING_SLASH_RE, '') +} + +interface OllamaChatChunk { + message?: { content?: string } + done?: boolean + prompt_eval_count?: number + eval_count?: number + error?: string +} + +export interface OllamaModelInfo { + id: OptimizeModel + name: string + hint: string +} + +interface OllamaTagsResponse { + models?: Array<{ + name: string + size?: number + details?: { parameter_size?: string, quantization_level?: string } + }> +} + +interface OllamaShowResponse { + capabilities?: string[] +} + +/** + * Whether a model can generate text. `/api/tags` can't tell chat models from + * embedding-only ones (both report e.g. family `gemma3`), and an embedding + * model 500s on `/api/chat` — so probe `/api/show` for capabilities. Fail open: + * a probe error or an older Ollama that omits `capabilities` keeps the model. + */ +async function isCompletionModel(name: string): Promise { + const res = await fetch(`${ollamaHost()}/api/show`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ model: name }), + signal: AbortSignal.timeout(1500), + }).catch(() => null) + if (!res?.ok) + return true + + const data = await res.json().catch(() => null) as OllamaShowResponse | null + const caps = data?.capabilities + return !caps?.length || caps.includes('completion') || caps.includes('insert') +} + +/** + * Enumerate locally-pulled, text-capable Ollama models via `/api/tags`. Returns + * `[]` when the daemon is unreachable (not installed / not running) — discovery + * must never block or throw, it just contributes nothing to the model list. 
+ */ +export async function getAvailableOllamaModels(): Promise { + const res = await fetch(`${ollamaHost()}/api/tags`, { signal: AbortSignal.timeout(1500) }) + .catch(() => null) + if (!res?.ok) + return [] + + const data = await res.json().catch(() => null) as OllamaTagsResponse | null + if (!data?.models?.length) + return [] + + const checked = await Promise.all(data.models.map(async (m): Promise => { + if (!(await isCompletionModel(m.name))) + return null + const params = m.details?.parameter_size + const quant = m.details?.quantization_level + const sizeGb = m.size ? `${(m.size / 1e9).toFixed(1)}GB` : undefined + const detail = params ? `${params}${quant ? ` ${quant}` : ''}` : sizeGb + return { + id: `ollama:${m.name}`, + name: m.name, + hint: detail ? `local · ${detail}` : 'local', + } + })) + + return checked.filter((m): m is OllamaModelInfo => m !== null) +} + +export function ollamaExecutor(model: OptimizeModel): SectionExecutor | { error: string } { + const modelId = parseOllamaModelId(model) + if (!modelId) + return { error: `Not an Ollama model: ${model}` } + + return { + cliCleanup: false, + run: async ({ section, prompt, timeout, onProgress }) => { + const ac = new AbortController() + const timer = setTimeout(() => ac.abort(), timeout) + onProgress?.({ chunk: '[ollama]', type: 'reasoning', text: '', reasoning: 'Generating locally…', section }) + + const res = await fetch(`${ollamaHost()}/api/chat`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ + model: modelId, + messages: [{ role: 'user', content: prompt }], + stream: true, + // Ollama defaults num_ctx to ~4k, which silently truncates our + // ~16k-token prompt (release notes) and starves synthesis. Size it to + // hold the full material + output; override with OLLAMA_NUM_CTX. 
+ options: { temperature: 0.2, num_ctx: Number(process.env.OLLAMA_NUM_CTX) || 32768 }, + }), + signal: ac.signal, + }).catch((err: Error) => ({ ok: false, statusText: err.message, body: null } as unknown as Response)) + + if (!res.ok || !res.body) { + clearTimeout(timer) + return { + text: '', + stderr: `Ollama request failed: ${res.statusText}. Is \`ollama serve\` running at ${ollamaHost()}?`, + exitCode: 1, + } + } + + let text = '' + let usage: { input: number, output: number } | undefined + + // Ollama streams NDJSON: one JSON object per line, the final one carrying + // `done: true` plus token counts. Accumulate content and surface deltas. + const drain = async (): Promise => { + const reader = res.body!.getReader() + const decoder = new TextDecoder() + let buffer = '' + for (;;) { + const { done, value } = await reader.read() + if (done) + break + buffer += decoder.decode(value, { stream: true }) + const lines = buffer.split('\n') + buffer = lines.pop() || '' + for (const line of lines) { + if (!line.trim()) + continue + const chunk = JSON.parse(line) as OllamaChatChunk + if (chunk.error) + throw new Error(chunk.error) + const delta = chunk.message?.content + if (delta) { + text += delta + onProgress?.({ chunk: delta, type: 'text', text, reasoning: '', section }) + } + if (chunk.done) + usage = { input: chunk.prompt_eval_count ?? 0, output: chunk.eval_count ?? 
0 } + } + } + } + + const streamError = await drain().then(() => undefined).catch((err: Error) => err.message) + clearTimeout(timer) + + if (streamError) + return { text: '', stderr: `Ollama stream error: ${streamError}`, exitCode: 1 } + if (!text.trim()) + return { text: '', stderr: 'Ollama returned no content', exitCode: 1 } + + return { text: text.trim(), usage, cost: 0 } + }, + } +} diff --git a/src/agent/clis/types.ts b/src/agent/clis/types.ts index 98cfedb4..d46bf754 100644 --- a/src/agent/clis/types.ts +++ b/src/agent/clis/types.ts @@ -64,6 +64,8 @@ export type OptimizeModel | 'gpt-5.2-codex' // pi-ai direct API models — dynamic from pi-ai's model registry | `pi:${string}` + // Local Ollama models, one-shot completions — e.g. `ollama:qwen2.5:14b-instruct` + | `ollama:${string}` export interface ModelInfo { id: OptimizeModel