skilld-dev · harlan-zw · May 29, 2026 · May 29, 2026
diff --git a/README.md b/README.md
@@ -222,6 +222,23 @@ skilld author assemble
 
 `skilld author assemble` auto-discovers skills with pending output files. `skilld update` re-exports prompts for outdated packages.
 
+### Local Models (Ollama)
+
+If [Ollama](https://ollama.com) is running, skilld auto-detects your locally pulled text models, lists them in `skilld config`, and accepts them via `-m ollama:<name>`:
+
+```bash
+skilld add npm:vue -m ollama:qwen2.5:14b-instruct
+```
+
+Generation runs locally: free, offline, no API key. Unlike the CLI and API backends, the Ollama path is one-shot (it does not explore reference files with tools), so all source material is inlined into a single prompt. Best paired with a capable instruct model.
+
+| Env var | Default | Purpose |
+|---------|---------|---------|
+| `OLLAMA_HOST` | `http://localhost:11434` | Ollama daemon address |
+| `OLLAMA_NUM_CTX` | `32768` | Context window; sized to fit the full prompt + output |
+
+The large default context can exceed available memory for big models on constrained hardware, in which case Ollama responds with an HTTP 500. If the model fails to load, lower `OLLAMA_NUM_CTX` or pick a smaller model.
+
 ### Eject
 
 Export a skill as a portable, self-contained directory for sharing via git repos:

diff --git a/src/agent/clis/executors.ts b/src/agent/clis/executors.ts
@@ -13,6 +13,7 @@ import type { SectionExecutor } from './runner.ts'
 import type { OptimizeModel } from './types.ts'
 import { getSkillReferenceDirs } from '../../cache/index.ts'
 import { CLI_ADAPTERS, CLI_MODELS } from './index.ts'
+import { isOllamaModel, ollamaExecutor } from './ollama.ts'
 import { getAvailablePiAiModels, isPiAiModel, optimizeSectionPiAi } from './pi-ai.ts'
 import { spawnCliAndStream } from './runner.ts'
 
@@ -64,5 +65,7 @@ function piAiExecutor(model: OptimizeModel): SectionExecutor | { error: string }
 
 /** Resolve `model` to an executor, or an error if the model is unavailable/unmapped. */
 export function selectExecutor(model: OptimizeModel): SectionExecutor | { error: string } {
+  // `ollama:` ids are routed first; every other id falls through to the
+  // pi-ai / CLI split below.
+  if (isOllamaModel(model))
+    return ollamaExecutor(model)
   return isPiAiModel(model) ? piAiExecutor(model) : cliExecutor(model)
 }
diff --git a/src/agent/clis/index.ts b/src/agent/clis/index.ts
@@ -13,6 +13,7 @@ import { agents } from '../targets/index.ts'
 import { adapter as claudeAdapter } from './claude.ts'
 import { adapter as codexAdapter } from './codex.ts'
 import { adapter as geminiAdapter } from './gemini.ts'
+import { getAvailableOllamaModels, isOllamaModel, parseOllamaModelId } from './ollama.ts'
 import { getAvailablePiAiModels, isPiAiModel, parsePiAiModelId } from './pi-ai.ts'
 
 export { buildAllSectionPrompts, buildSectionPrompt, SECTION_MERGE_ORDER, SECTION_OUTPUT_FILES } from '../prompts/index.ts'
@@ -62,6 +63,8 @@ export const CLI_MODELS: Partial<Record<OptimizeModel, CliModelConfig>> = Object
 // ── Model helpers ────────────────────────────────────────────────────
 
 export function getModelName(id: OptimizeModel): string {
+  if (isOllamaModel(id))
+    return parseOllamaModelId(id) ?? id
   if (isPiAiModel(id)) {
     const parsed = parsePiAiModelId(id)
     return parsed?.modelId ?? id
@@ -70,6 +73,8 @@ export function getModelName(id: OptimizeModel): string {
 }
 
 export function getModelLabel(id: OptimizeModel): string {
+  if (isOllamaModel(id))
+    return `Ollama · ${parseOllamaModelId(id) ?? id}`
   if (isPiAiModel(id)) {
     const parsed = parsePiAiModelId(id)
     return parsed ? `${PI_PROVIDER_NAMES[parsed.provider] ?? parsed.provider} · ${parsed.modelId}` : id
@@ -85,6 +90,10 @@ export async function getAvailableModels(): Promise<import('./types.ts').ModelIn
   const execAsync = promisify(exec)
   const lookupCmd = isWindows ? 'where' : 'which'
 
+  // Kick off Ollama discovery concurrently with the CLI lookups below; it has
+  // its own short timeout and resolves to [] when the daemon is unreachable.
+  const ollamaModelsPromise = getAvailableOllamaModels()
+
   const installedAgents = detectInstalledAgents()
   const agentsWithCli = installedAgents.filter(id => agents[id].cli)
 
@@ -137,5 +146,18 @@ export async function getAvailableModels(): Promise<import('./types.ts').ModelIn
     }
   })
 
-  return [...cliModels, ...piAiEntries]
+  // Append locally-pulled Ollama models (one-shot, no auth, free).
+  const ollamaModels = await ollamaModelsPromise
+  const ollamaEntries = ollamaModels.map(m => ({
+    id: m.id,
+    name: m.name,
+    hint: m.hint,
+    agentId: 'ollama',
+    agentName: 'Ollama (local)',
+    provider: 'ollama',
+    providerName: 'Ollama (local)',
+    vendorGroup: 'Ollama',
+  }))
+
+  return [...cliModels, ...piAiEntries, ...ollamaEntries]
 }
diff --git a/src/agent/clis/ollama.ts b/src/agent/clis/ollama.ts
@@ -0,0 +1,200 @@
+/**
+ * Ollama executor — one-shot local completions via Ollama's `/api/chat`.
+ *
+ * Unlike the CLI adapters (claude/codex/gemini) this is NOT an agentic loop:
+ * no tools, no file exploration, no multi-turn transcript. The caller supplies
+ * the full prompt and we return the single completion. That makes it free
+ * (local) and cheap on tokens — a good fit for guide synthesis where all the
+ * source material is already inlined into the prompt.
+ *
+ * Ollama has no first-class provider in pi-ai, and pi's OpenAI-compatible path
+ * can't set Ollama's `num_ctx` option — the knob that prevents silent prompt
+ * truncation below — so we talk to `/api/chat` directly.
+ *
+ * Model ids are `ollama:<name>`, e.g. `ollama:qwen2.5:14b-instruct`. The host
+ * defaults to http://localhost:11434, override with `OLLAMA_HOST`.
+ */
+
+import type { SectionExecutor } from './runner.ts'
+import type { OptimizeModel } from './types.ts'
+
+/** Prefix that marks a model id as an Ollama model (`ollama:<name>`). */
+const OLLAMA_PREFIX = 'ollama:'
+
+/** True when `model` begins with the `ollama:` prefix. */
+export function isOllamaModel(model: string): boolean {
+  const head = model.slice(0, OLLAMA_PREFIX.length)
+  return head === OLLAMA_PREFIX
+}
+
+/**
+ * Strip the `ollama:` prefix from a model id, e.g.
+ * `ollama:qwen2.5:14b-instruct` → `qwen2.5:14b-instruct`.
+ * Returns `null` when `model` is not an Ollama id.
+ */
+export function parseOllamaModelId(model: string): string | null {
+  if (!isOllamaModel(model))
+    return null
+  return model.slice(OLLAMA_PREFIX.length)
+}
+
+// Matches an explicit `http://` or `https://` scheme at the start of a string.
+const HAS_SCHEME_RE = /^https?:\/\//
+// Matches a single `/` at the very end of a string.
+const TRAILING_SLASH_RE = /\/$/
+
+/** Port assumed when a scheme-less `OLLAMA_HOST` does not name one. */
+const DEFAULT_OLLAMA_PORT = 11434
+/** Matches a `:<digits>` port at the end of a `host[:port]` string. */
+const HAS_PORT_RE = /:\d+$/
+
+/**
+ * Base URL of the Ollama daemon, without a trailing slash.
+ *
+ * Reads `OLLAMA_HOST` (whitespace-trimmed) and falls back to
+ * `http://localhost:11434` when it is unset or empty.
+ *
+ * - A value with an explicit `http://` / `https://` scheme is used as written.
+ * - A scheme-less value (`127.0.0.1`, `0.0.0.0:11434`, `:11434`, `myhost`) gets
+ *   `http://` prepended, the default port appended when none is given, and
+ *   `127.0.0.1` substituted for a missing host. This mirrors how Ollama itself
+ *   interprets the variable, so a bare host no longer resolves to port 80.
+ *
+ * IPv6 literals must be bracketed (`[::1]` or `[::1]:11434`).
+ */
+function ollamaHost(): string {
+  const raw = process.env.OLLAMA_HOST?.trim() || `http://localhost:${DEFAULT_OLLAMA_PORT}`
+
+  // An explicit scheme means the user supplied a full URL: keep it as given.
+  if (HAS_SCHEME_RE.test(raw))
+    return raw.replace(TRAILING_SLASH_RE, '')
+
+  const bare = raw.replace(TRAILING_SLASH_RE, '')
+  const slashAt = bare.indexOf('/')
+  const hostPort = slashAt === -1 ? bare : bare.slice(0, slashAt)
+  const path = slashAt === -1 ? '' : bare.slice(slashAt)
+
+  // `:11434`-style values omit the host entirely; default it to loopback.
+  const host = hostPort === '' || hostPort.startsWith(':') ? `127.0.0.1${hostPort}` : hostPort
+  const port = HAS_PORT_RE.test(hostPort) ? '' : `:${DEFAULT_OLLAMA_PORT}`
+  return `http://${host}${port}${path}`
+}
+
+/** One NDJSON line of the streamed `/api/chat` response (only the fields read in this file). */
+interface OllamaChatChunk {
+  /** Incremental assistant output; `content` is appended to the accumulated text. */
+  message?: { content?: string }
+  /** Set on the final chunk, which is where the token counts below are read from. */
+  done?: boolean
+  /** Reported to the caller as `usage.input`. */
+  prompt_eval_count?: number
+  /** Reported to the caller as `usage.output`. */
+  eval_count?: number
+  /** When present, the stream is treated as failed with this message. */
+  error?: string
+}
+
+/** A locally-pulled Ollama model as returned by `getAvailableOllamaModels`. */
+export interface OllamaModelInfo {
+  /** Model id in `ollama:<name>` form. */
+  id: OptimizeModel
+  /** Model name exactly as listed by `/api/tags`. */
+  name: string
+  /** Short descriptor: `local`, or `local · <detail>` when size details are known. */
+  hint: string
+}
+
+/** Subset of the `/api/tags` response that model discovery reads. */
+interface OllamaTagsResponse {
+  models?: Array<{
+    /** Model name; also used to probe `/api/show`. */
+    name: string
+    /** Model size — presumably bytes; it is rendered as GB by dividing by 1e9. */
+    size?: number
+    /** Optional descriptors preferred over `size` when building the hint. */
+    details?: { parameter_size?: string, quantization_level?: string }
+  }>
+}
+
+/** Subset of the `/api/show` response used to filter out non-generating models. */
+interface OllamaShowResponse {
+  /** Capability names; `completion` or `insert` marks the model as text-capable. */
+  capabilities?: string[]
+}
+
+/**
+ * Decide whether `name` should be offered as a text-generation model.
+ *
+ * `/api/tags` does not distinguish chat models from embedding-only ones (both
+ * report e.g. family `gemma3`), and an embedding model 500s on `/api/chat`, so
+ * `/api/show` is asked for the model's capabilities instead. The check fails
+ * open: a failed probe, a non-OK response, an unreadable body, or a missing or
+ * empty `capabilities` list (older Ollama) all keep the model.
+ */
+async function isCompletionModel(name: string): Promise<boolean> {
+  const pending = fetch(`${ollamaHost()}/api/show`, {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify({ model: name }),
+    signal: AbortSignal.timeout(1500),
+  })
+
+  let response: Response
+  try {
+    response = await pending
+  }
+  catch {
+    return true
+  }
+  if (!response.ok)
+    return true
+
+  let payload: OllamaShowResponse | null
+  try {
+    payload = await response.json() as OllamaShowResponse | null
+  }
+  catch {
+    payload = null
+  }
+
+  const capabilities = payload?.capabilities
+  if (!capabilities?.length)
+    return true
+  return capabilities.includes('completion') || capabilities.includes('insert')
+}
+
+/**
+ * List the locally-pulled Ollama models that can generate text.
+ *
+ * Queries `/api/tags`, then probes each listed model concurrently with
+ * `isCompletionModel` and drops the ones that fail the check. Resolves to `[]`
+ * when the daemon cannot be reached, answers with a non-OK status, or returns
+ * no models — discovery only ever contributes entries, it never blocks the
+ * caller's model list.
+ */
+export async function getAvailableOllamaModels(): Promise<OllamaModelInfo[]> {
+  const pending = fetch(`${ollamaHost()}/api/tags`, { signal: AbortSignal.timeout(1500) })
+
+  let response: Response
+  try {
+    response = await pending
+  }
+  catch {
+    return []
+  }
+  if (!response.ok)
+    return []
+
+  let listing: OllamaTagsResponse | null
+  try {
+    listing = await response.json() as OllamaTagsResponse | null
+  }
+  catch {
+    listing = null
+  }
+
+  const pulled = listing?.models
+  if (!pulled?.length)
+    return []
+
+  // Probe every model in parallel; results line up with `pulled` by index.
+  const usable = await Promise.all(pulled.map(entry => isCompletionModel(entry.name)))
+
+  const available: OllamaModelInfo[] = []
+  pulled.forEach((entry, index) => {
+    if (!usable[index])
+      return
+
+    // Prefer "<params> <quant>" and fall back to the size in GB.
+    const paramSize = entry.details?.parameter_size
+    const quantLevel = entry.details?.quantization_level
+    let detail: string | undefined
+    if (paramSize)
+      detail = quantLevel ? `${paramSize} ${quantLevel}` : `${paramSize}`
+    else if (entry.size)
+      detail = `${(entry.size / 1e9).toFixed(1)}GB`
+
+    available.push({
+      id: `ollama:${entry.name}`,
+      name: entry.name,
+      hint: detail ? `local · ${detail}` : 'local',
+    })
+  })
+
+  return available
+}
+
+/** Context window used when `OLLAMA_NUM_CTX` is unset or not a positive integer. */
+const DEFAULT_NUM_CTX = 32768
+
+/** Read `OLLAMA_NUM_CTX`, ignoring anything that is not a positive integer. */
+function resolveNumCtx(): number {
+  const parsed = Number(process.env.OLLAMA_NUM_CTX)
+  return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_NUM_CTX
+}
+
+/** Turn an unknown thrown value into a printable message. */
+function describeError(err: unknown): string {
+  return err instanceof Error ? err.message : String(err)
+}
+
+/**
+ * Best-effort extraction of the failure reason from a non-OK response body:
+ * the JSON `error` field when present, otherwise the raw body capped at 500
+ * characters, otherwise `''`.
+ */
+async function readOllamaError(res: Response): Promise<string> {
+  const raw = (await res.text().catch(() => '')).trim()
+  if (!raw)
+    return ''
+  try {
+    const parsed = JSON.parse(raw) as { error?: unknown } | null
+    if (typeof parsed?.error === 'string')
+      return parsed.error
+  }
+  catch {
+    // Not JSON (e.g. a proxy error page) — fall through to the raw body.
+  }
+  return raw.slice(0, 500)
+}
+
+/**
+ * Consume an NDJSON response body, calling `onChunk` once per parsed line.
+ * Rejects when a line is not valid JSON, when a chunk carries `error`, or when
+ * reading the body fails (including an abort). The reader is always cancelled
+ * on exit so an early failure does not leave the connection open.
+ */
+async function drainChatStream(
+  body: ReadableStream<Uint8Array>,
+  onChunk: (chunk: OllamaChatChunk) => void,
+): Promise<void> {
+  const reader = body.getReader()
+  const decoder = new TextDecoder()
+  let buffer = ''
+
+  const handleLine = (line: string): void => {
+    if (!line.trim())
+      return
+    const chunk = JSON.parse(line) as OllamaChatChunk
+    if (chunk.error)
+      throw new Error(chunk.error)
+    onChunk(chunk)
+  }
+
+  try {
+    for (;;) {
+      const { done, value } = await reader.read()
+      if (done)
+        break
+      buffer += decoder.decode(value, { stream: true })
+      const lines = buffer.split('\n')
+      // The last element is an incomplete line (or '') — keep it for the next read.
+      buffer = lines.pop() ?? ''
+      for (const line of lines)
+        handleLine(line)
+    }
+    // Flush the decoder and parse a final line that arrived without a trailing
+    // newline; otherwise the closing chunk (token counts, or an error) is lost.
+    buffer += decoder.decode()
+    handleLine(buffer)
+  }
+  finally {
+    await reader.cancel().catch(() => {})
+  }
+}
+
+/**
+ * Build a one-shot executor for an `ollama:<name>` model.
+ *
+ * Returns `{ error }` when `model` is not an Ollama id (or has an empty name).
+ * Otherwise returns an executor whose `run` posts the whole prompt to
+ * `/api/chat` as a single user message, streams the reply through
+ * `onProgress`, and resolves to either:
+ * - `{ text, usage, cost: 0 }` on success (`usage` is set only if the final
+ *   chunk was seen), or
+ * - `{ text: '', stderr, exitCode: 1 }` when the request fails, the server
+ *   answers with a non-OK status, the stream errors or times out, or no
+ *   content was produced.
+ *
+ * `timeout` is passed to `setTimeout`, so it is in milliseconds; when it
+ * elapses the request is aborted.
+ */
+export function ollamaExecutor(model: OptimizeModel): SectionExecutor | { error: string } {
+  const modelId = parseOllamaModelId(model)
+  if (!modelId)
+    return { error: `Not an Ollama model: ${model}` }
+
+  return {
+    cliCleanup: false,
+    run: async ({ section, prompt, timeout, onProgress }) => {
+      const ac = new AbortController()
+      const timer = setTimeout(() => ac.abort(), timeout)
+      // An abort can only come from our own timer, so report it as a timeout
+      // instead of the generic "operation was aborted" message.
+      const reasonFor = (err: unknown): string =>
+        ac.signal.aborted ? `timed out after ${timeout}ms` : describeError(err)
+
+      try {
+        onProgress?.({ chunk: '[ollama]', type: 'reasoning', text: '', reasoning: 'Generating locally…', section })
+
+        let res: Response
+        try {
+          res = await fetch(`${ollamaHost()}/api/chat`, {
+            method: 'POST',
+            headers: { 'content-type': 'application/json' },
+            body: JSON.stringify({
+              model: modelId,
+              messages: [{ role: 'user', content: prompt }],
+              stream: true,
+              // Ollama defaults num_ctx to ~4k, which silently truncates our
+              // ~16k-token prompt (release notes) and starves synthesis. Size it to
+              // hold the full material + output; override with OLLAMA_NUM_CTX.
+              options: { temperature: 0.2, num_ctx: resolveNumCtx() },
+            }),
+            signal: ac.signal,
+          })
+        }
+        catch (err) {
+          // No response at all: the daemon is unreachable or we timed out.
+          return {
+            text: '',
+            stderr: `Ollama request failed: ${reasonFor(err)}. Is \`ollama serve\` running at ${ollamaHost()}?`,
+            exitCode: 1,
+          }
+        }
+
+        if (!res.ok || !res.body) {
+          // The daemon answered, so surface its own explanation (model not
+          // found, model failed to load, …) rather than blaming connectivity.
+          const detail = res.body ? await readOllamaError(res) : 'empty response body'
+          const status = `HTTP ${res.status}${res.statusText ? ` ${res.statusText}` : ''}`
+          return {
+            text: '',
+            stderr: `Ollama request failed: ${status}${detail ? ` — ${detail}` : ''}`,
+            exitCode: 1,
+          }
+        }
+
+        let text = ''
+        let usage: { input: number, output: number } | undefined
+
+        try {
+          // Ollama streams NDJSON: one JSON object per line, the final one
+          // carrying `done: true` plus token counts.
+          await drainChatStream(res.body, (chunk) => {
+            const delta = chunk.message?.content
+            if (delta) {
+              text += delta
+              onProgress?.({ chunk: delta, type: 'text', text, reasoning: '', section })
+            }
+            if (chunk.done)
+              usage = { input: chunk.prompt_eval_count ?? 0, output: chunk.eval_count ?? 0 }
+          })
+        }
+        catch (err) {
+          return { text: '', stderr: `Ollama stream error: ${reasonFor(err)}`, exitCode: 1 }
+        }
+
+        if (!text.trim())
+          return { text: '', stderr: 'Ollama returned no content', exitCode: 1 }
+
+        return { text: text.trim(), usage, cost: 0 }
+      }
+      finally {
+        // Runs on every exit path, including a throwing `onProgress`.
+        clearTimeout(timer)
+      }
+    },
+  }
+}
diff --git a/src/agent/clis/types.ts b/src/agent/clis/types.ts
@@ -64,6 +64,8 @@ export type OptimizeModel
     | 'gpt-5.2-codex'
     // pi-ai direct API models — dynamic from pi-ai's model registry
     | `pi:${string}`
+    // Local Ollama models, one-shot completions — e.g. `ollama:qwen2.5:14b-instruct`
+    | `ollama:${string}`
 
 export interface ModelInfo {
   id: OptimizeModel