From f534552633ea47ecf27cd0c91de1570449e37492 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 14 May 2026 23:38:33 +0100 Subject: [PATCH 1/4] fix: marked v15 + marked-terminal v7 incompat in markdown-renderer marked v15's use() iterates 'for (prop in pack.renderer)' and validates every enumerable key against its known renderer method list, throwing "renderer 'o' does not exist" at module init. The legacy 'new TerminalRenderer(opts)' route assigns config to own enumerable properties (this.o, this.tab, ...), so the first iteration hits an unknown key and crashes. This broke the agent on every 'just start' since PR #135 landed; CI never noticed because no test imports the module. Switch to the modern markedTerminal() factory which returns a clean MarkedExtension containing only renderer method keys, and add a regression test that import-loads the module and smoke-tests rendering so a future bump can't reintroduce this class of crash. Signed-off-by: Simon Davies --- src/agent/markdown-renderer.ts | 39 ++++++++------ tests/markdown-renderer.test.ts | 95 +++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 16 deletions(-) create mode 100644 tests/markdown-renderer.test.ts diff --git a/src/agent/markdown-renderer.ts b/src/agent/markdown-renderer.ts index d9bbc13..b92cd57 100644 --- a/src/agent/markdown-renderer.ts +++ b/src/agent/markdown-renderer.ts @@ -8,8 +8,8 @@ // import { renderMarkdown } from "./markdown-renderer.js"; // console.log(renderMarkdown("# Hello\n**bold** and `code`")); -import { Marked, type MarkedOptions } from "marked"; -import TerminalRenderer from "marked-terminal"; +import { Marked, type MarkedExtension } from "marked"; +import { markedTerminal } from "marked-terminal"; import { createRequire } from "node:module"; import { resolve } from "node:path"; import kqlLanguage from "./hljs-kql.js"; @@ -28,20 +28,27 @@ hljsInstance.registerLanguage("kql", kqlLanguage); // Use a local Marked instance so we don't mutate the global marked // singleton — other code importing marked won't accidentally get // terminal-rendered output instead of HTML. -const terminalRenderer = new TerminalRenderer({ - // Indent code blocks for visual separation - tab: 2, - // Show URLs inline rather than as footnotes - showSectionPrefix: true, - // Convert HTML entities back to characters - unescape: true, -}); - -// marked-terminal's renderer type doesn't match marked v15's _Renderer -// exactly, but it works at runtime. Cast to satisfy the type checker. -const localMarked = new Marked({ - renderer: terminalRenderer as unknown as MarkedOptions["renderer"], -}); +// +// `markedTerminal()` returns a MarkedExtension (`{ renderer, useNewRenderer }`) +// suitable for marked v15's strict `use()` validation. Constructing a +// `TerminalRenderer` instance directly and passing it as `{ renderer }` no +// longer works in marked >=15 because TerminalRenderer's constructor sets +// own enumerable properties (`this.o`, `this.tab`, …) and marked iterates +// every enumerable key of the renderer object, rejecting anything that +// isn't a known renderer method. +// +// The @types/marked-terminal declaration mis-types the return as +// `TerminalRenderer`; cast through `unknown` to the correct shape. +const localMarked = new Marked( + markedTerminal({ + // Indent code blocks for visual separation + tab: 2, + // Show URLs inline rather than as footnotes + showSectionPrefix: true, + // Convert HTML entities back to characters + unescape: true, + }) as unknown as MarkedExtension, +); /** * Render a markdown string as ANSI-formatted terminal output. diff --git a/tests/markdown-renderer.test.ts b/tests/markdown-renderer.test.ts new file mode 100644 index 0000000..f6e5216 --- /dev/null +++ b/tests/markdown-renderer.test.ts @@ -0,0 +1,95 @@ +// ── Markdown Renderer Tests ──────────────────────────────────────── +// +// These tests exist primarily to detect *module-load crashes* in +// `markdown-renderer.ts`. The module wires `marked` + `marked-terminal` +// at import time and any incompatibility between the two pinned versions +// will throw on first require — long before any test exercises the +// rendered output. +// +// Regression: in marked v15 + marked-terminal v7, passing a +// `TerminalRenderer` *instance* via `new Marked({ renderer })` throws +// "renderer 'o' does not exist" because marked enumerates every key on +// the renderer object and the legacy `Renderer` constructor stores +// config as own properties (`this.o`, `this.tab`, …). The fix is to +// use the `markedTerminal()` factory which returns a clean +// `MarkedExtension`. These tests would have caught that bug. +// +// ──────────────────────────────────────────────────────────────────── + +import { describe, it, expect } from "vitest"; +import { + renderMarkdown, + looksLikeMarkdown, +} from "../src/agent/markdown-renderer.js"; + +describe("renderMarkdown", () => { + it("renders plain text without crashing", () => { + // The smoke test that matters most — if module init fails this + // throws before reaching the assertion. + const out = renderMarkdown("hello world"); + expect(typeof out).toBe("string"); + expect(out.length).toBeGreaterThan(0); + }); + + it("renders headings, code, lists and code fences", () => { + const md = [ + "# Title", + "", + "**bold** and `inline code`", + "", + "- one", + "- two", + "", + "```js", + "const x = 1;", + "```", + ].join("\n"); + + const out = renderMarkdown(md); + + // We don't assert on ANSI codes (they vary by terminal capability) + // — just that the meaningful content survived rendering. + expect(out).toContain("Title"); + expect(out).toContain("bold"); + expect(out).toContain("inline code"); + expect(out).toContain("one"); + expect(out).toContain("two"); + expect(out).toContain("const x = 1"); + }); + + it("trims trailing newlines", () => { + const out = renderMarkdown("hello\n\n"); + expect(out.endsWith("\n")).toBe(false); + }); + + it("handles empty input", () => { + expect(renderMarkdown("")).toBe(""); + }); +}); + +describe("looksLikeMarkdown", () => { + it("detects headings", () => { + expect(looksLikeMarkdown("# Heading")).toBe(true); + expect(looksLikeMarkdown("###### h6")).toBe(true); + }); + + it("detects fenced code blocks", () => { + expect(looksLikeMarkdown("```js\nconst x = 1;\n```")).toBe(true); + }); + + it("detects tables", () => { + expect(looksLikeMarkdown("| a | b |\n| - | - |\n| 1 | 2 |")).toBe(true); + }); + + it("detects ordered lists", () => { + expect(looksLikeMarkdown("1. first\n2. second")).toBe(true); + }); + + it("rejects plain text and weak signals", () => { + // Bold markers and unordered bullets alone are too noisy to count + // as markdown (false positives on git output, log lines, etc.). + expect(looksLikeMarkdown("just a sentence")).toBe(false); + expect(looksLikeMarkdown("**not strong enough**")).toBe(false); + expect(looksLikeMarkdown("- bullet alone")).toBe(false); + }); +}); From 2a6fb86a49eedc3c40d9eac8f358f6b1238a97ee Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 14 May 2026 23:39:54 +0100 Subject: [PATCH 2/4] feat: user-generated skills from session learnings Lets users persist what the agent learned in a session as a reusable skill at ~/.hyperagent/skills//SKILL.md, surviving upgrades and overriding system skills with the same name. Triggered via: - /save-skill [name] - slash command that builds a synthetic prompt from session context (tool history, MCP servers, modules registered, recent errors) and asks the LLM to call generate_skill() - 'save this as a skill' (natural language) - system message documents the generate_skill tool so the LLM can call it directly Components added: - src/agent/skill-writer.ts: validation + CRUD for user skills, with HYPERAGENT_USER_SKILLS_DIR env override for tests - src/agent/session-context.ts: pure extractor that rolls up tool history, MCP servers, modules registered, and recent errors into a prompt-ready string - generate_skill tool: registered in all three gating points (tools[], ALLOWED_TOOLS, availableTools[]) with interactive approval - /skills enhanced with 'info ', 'edit ', 'delete ', override-detection badge for user skills - skill-loader now supports loading from multiple directories with override semantics (later dirs win) - state.ts tracks toolCallHistory (capped FIFO), mcpServersUsed, modulesRegistered, pendingPrompt; populated by onPostToolUse hook and registerModuleImpl - system-message.ts documents the saving workflow for the LLM - docs/SKILLS.md adds 'User Skills (Persist What You Learn)' section Tests: 39 new (skill-writer 22, session-context 9, skill-loader +8). All 2443 TS tests pass; 124 Rust tests pass; lint clean. Signed-off-by: Simon Davies --- docs/SKILLS.md | 59 ++++++ src/agent/approach-resolver.ts | 13 +- src/agent/commands.ts | 20 +- src/agent/index.ts | 333 ++++++++++++++++++++++++++++++- src/agent/session-context.ts | 152 ++++++++++++++ src/agent/skill-loader.ts | 37 +++- src/agent/skill-writer.ts | 342 ++++++++++++++++++++++++++++++++ src/agent/slash-commands.ts | 271 +++++++++++++++++++++---- src/agent/state.ts | 44 ++++ src/agent/system-message.ts | 8 + src/agent/tool-gating.ts | 1 + tests/approach-resolver.test.ts | 8 + tests/intent-matcher.test.ts | 2 + tests/session-context.test.ts | 198 ++++++++++++++++++ tests/skill-loader.test.ts | 90 ++++++++- tests/skill-writer.test.ts | 308 ++++++++++++++++++++++++++++ 16 files changed, 1839 insertions(+), 47 deletions(-) create mode 100644 src/agent/session-context.ts create mode 100644 src/agent/skill-writer.ts create mode 100644 tests/session-context.test.ts create mode 100644 tests/skill-writer.test.ts diff --git a/docs/SKILLS.md b/docs/SKILLS.md index aae636b..f0be8b4 100644 --- a/docs/SKILLS.md +++ b/docs/SKILLS.md @@ -304,6 +304,65 @@ They work well together: hyperagent --skill web-scraper --profile web-research ``` +## User Skills (Persist What You Learn) + +Hyperagent ships with a curated set of *system skills* (the table above) — and +also lets you grow your own library of *user skills* during real work. When you +and the agent have just spent ten minutes figuring out how to do something +non-obvious, save the lesson so the next session starts where you left off. + +### Saving a Skill + +In any REPL session, run: + +``` +/save-skill # let the LLM pick a name +/save-skill teams-transcript-finder +``` + +What happens: + +1. Hyperagent collects a structured summary of the session — tool calls, + MCP servers used, modules registered, recent errors. +2. That summary plus instructions is sent to the LLM as a synthetic user + turn. +3. The LLM calls the `generate_skill` tool with a proposed SKILL.md (and + optionally a companion module). You approve before anything is written. +4. The skill is persisted to `~/.hyperagent/skills//SKILL.md`. + +User skills are loaded automatically on every startup. If a user skill has +the same name as a system skill, the user version wins (overrides are +flagged with `👤 (overrides built-in)` in `/skills`). + +### Managing User Skills + +``` +/skills # list system + user skills (👤 = user) +/skills info # show the SKILL.md contents +/skills edit # print the path for your $EDITOR +/skills delete # remove a user skill (system ones are immutable) +``` + +User skills live in `~/.hyperagent/skills//SKILL.md` and follow the +same format as system skills documented above. Edit them in your editor of +choice — changes apply on the next `/suggest_approach` invocation. + +### When to Save vs Not Save + +Save a skill when: + +- The workflow took non-trivial effort to figure out and is likely to recur. +- The lesson would be lost between sessions (modules + skill capture it + together). +- A few well-chosen triggers will reliably match future prompts on the same + topic. + +Don't save a skill when: + +- The task was one-off (no expected recurrence). +- The lesson is generic ("use the right tool"). Skills are for *specific* + domain knowledge. + ## See Also - [PATTERNS.md](PATTERNS.md) - Code generation patterns diff --git a/src/agent/approach-resolver.ts b/src/agent/approach-resolver.ts index d989ca0..1023053 100644 --- a/src/agent/approach-resolver.ts +++ b/src/agent/approach-resolver.ts @@ -8,7 +8,7 @@ import type { Skill } from "./skill-loader.js"; import type { Pattern } from "./pattern-loader.js"; import { matchIntent } from "./intent-matcher.js"; -import { loadSkills } from "./skill-loader.js"; +import { loadSkills, loadSkillsFromDirs } from "./skill-loader.js"; import { loadPatterns } from "./pattern-loader.js"; import { loadModule, type ModuleHints } from "./module-store.js"; @@ -323,20 +323,25 @@ export function formatGuidance(guidance: MaterialisedGuidance): string { * * @param prompt - The user's prompt text * @param preLoadedSkills - Pre-loaded skill names (from --skill flag) - * @param skillsDir - Path to skills/ directory + * @param skillsDir - Path to skills directory(ies). A single string loads + * only system skills; pass an array of `{ dir, source }` records to load + * user skills alongside system skills (user wins on name collision). * @param patternsDir - Path to patterns/ directory * @param debugLog - Optional debug logger */ export function runSuggestApproach( prompt: string, preLoadedSkills: string[], - skillsDir: string, + skillsDir: string | Array<{ dir: string; source: "system" | "user" }>, patternsDir: string, debugLog?: (msg: string) => void, ): SuggestApproachResult { const log = debugLog ?? (() => {}); - const skills = loadSkills(skillsDir); + const skills = + typeof skillsDir === "string" + ? loadSkills(skillsDir) + : loadSkillsFromDirs(skillsDir); const patterns = loadPatterns(patternsDir); let matchedSkillNames: string[]; diff --git a/src/agent/commands.ts b/src/agent/commands.ts index ad6b189..ac697d1 100644 --- a/src/agent/commands.ts +++ b/src/agent/commands.ts @@ -488,12 +488,28 @@ const COMMANDS: readonly CommandEntry[] = Object.freeze([ help: "List and invoke available skills", group: "General", detail: - "/skills — list all available skills\n" + + "/skills — list system + user skills (user-authored marked 👤)\n" + "/skills — invoke a skill (injects domain expertise)\n" + + "/skills info — show the full SKILL.md\n" + + "/skills edit — print path of user skill for $EDITOR\n" + + "/skills delete — remove a user skill (system ones are immutable)\n" + "Skills are SKILL.md files in the skills/ directory.\n" + - "Invoke a skill to get specialised instructions for a task.\n" + + "User skills live in ~/.hyperagent/skills/ and override system ones.\n" + "Example: /skills pptx-expert — expert at building PPTX presentations.", }, + { + completion: "/save-skill", + help: "Save session learnings as a reusable skill", + group: "General", + detail: + "/save-skill — capture what we learned this session as a SKILL.md\n" + + "/save-skill — same, but suggest a name to the LLM\n" + + "Sends a structured summary of the session's tool activity, MCP\n" + + "servers used, modules registered, and errors hit to the LLM,\n" + + "which then calls the generate_skill tool to write SKILL.md to\n" + + "~/.hyperagent/skills//. The skill is loaded on next start\n" + + "and triggered automatically by /suggest_approach.", + }, { completion: "/help", help: "Show this help (or /help for details)", diff --git a/src/agent/index.ts b/src/agent/index.ts index f78faa3..1852fa7 100644 --- a/src/agent/index.ts +++ b/src/agent/index.ts @@ -100,12 +100,18 @@ import { formatCompact, } from "./format-exports.js"; import { loadPatterns } from "./pattern-loader.js"; -import { loadSkills } from "./skill-loader.js"; +import { loadSkillsFromDirs } from "./skill-loader.js"; import { runSuggestApproach, formatGuidance, type MCPServerStatus, } from "./approach-resolver.js"; +import { + getUserSkillsDir, + writeUserSkill, + userSkillExists, + type SkillData, +} from "./skill-writer.js"; import { validatePath } from "../../plugins/shared/path-jail.js"; import { validateJavaScript as validateJavaScriptGuest, @@ -690,6 +696,14 @@ const DEFAULT_SEND_TIMEOUT_MS = 300_000; const SEND_TIMEOUT_MS: number = parseInt(cli.sendTimeout, 10) || DEFAULT_SEND_TIMEOUT_MS; +/** + * Maximum number of tool-call entries kept in `state.toolCallHistory`. + * Older entries are evicted FIFO to keep memory flat for long sessions. + * The cap is small because /save-skill only needs a representative + * sample of recent activity — not a full transcript. + */ +const MAX_TOOL_HISTORY = 200; + // ── Sandbox Setup ──────────────────────────────────────────────────── /** @@ -1117,6 +1131,9 @@ async function handleSlashCommand( drainAndWarn, mcpManager, // Real MCP manager (or null if no config) syncPlugins: syncPluginsToSandbox, + submitToLLM: (prompt: string) => { + state.pendingPrompt = prompt; + }, }; return handleSlashCommandImpl(rawInput, rl, slashDeps); } @@ -4952,6 +4969,12 @@ async function registerModuleImpl(params: { ` ${C.ok("📦 Module registered:")} ${params.name} (${exports.length} exports, ${params.source.length} bytes)`, ); + // Record for /save-skill — a module registered this session is a + // candidate companion module when the lesson learned is saved. + if (!state.modulesRegistered.includes(params.name)) { + state.modulesRegistered.push(params.name); + } + return { success: true, name: info.name, @@ -5394,6 +5417,252 @@ const deleteModuleTool = defineTool("delete_module", { }, }); +// ── generate_skill ─────────────────────────────────────────────────── +// +// Persist a user-generated skill to `~/.hyperagent/skills//SKILL.md` +// so the lesson learned in this session is available — and automatically +// surfaced via /suggest_approach trigger matching — in future sessions. +// +// Optionally writes a companion ES module (saved like /register_module) +// so the skill can reference a reusable bundle of helper functions. +// +// Skill names collide deterministically: a user skill with the same name +// as a built-in skill overrides the built-in (see loadSkillsFromDirs). + +const generateSkillTool = defineTool("generate_skill", { + description: [ + "Save a reusable SKILL.md (and optional companion module) to the user's", + "skill library based on what was learned in the current session.", + "", + "Use this when the user asks to 'save what we learned' or runs", + "/save-skill. The skill is persisted to ~/.hyperagent/skills//", + "and is automatically loaded on next start; its triggers are matched", + "by /suggest_approach so it's surfaced again when relevant.", + "", + "Choose triggers carefully — short, distinctive keywords/phrases that", + "will appear in future prompts about the same task. Guidance should be", + "concise, action-oriented domain knowledge, NOT a session transcript.", + "", + "If the workflow uses a non-trivial bundle of helpers, include a", + "companionModule so future sessions get the code too.", + ].join("\n"), + parameters: { + type: "object", + properties: { + name: { + type: "string", + description: + "Skill name in kebab-case, used as the directory name (e.g. 'teams-transcript-finder')", + }, + description: { + type: "string", + description: + "One-line summary (≤280 chars) shown in `/skill list` and surfaced to suggest_approach", + }, + triggers: { + type: "array", + items: { type: "string" }, + description: + "Short keywords/phrases that match future prompts about this topic (≤50)", + }, + guidance: { + type: "string", + description: + "Markdown body — domain knowledge, workflow steps, gotchas. Concise + action-oriented.", + }, + patterns: { + type: "array", + items: { type: "string" }, + description: + "Optional: names of built-in patterns (in patterns/) this skill draws on", + }, + antiPatterns: { + type: "array", + items: { type: "string" }, + description: + "Optional: 'DO NOT' rules surfaced first when this skill activates", + }, + requiresMcp: { + type: "array", + items: { type: "string" }, + description: + "Optional: MCP server names this skill depends on (e.g. ['workiq'])", + }, + allowedTools: { + type: "array", + items: { type: "string" }, + description: + "Tools this skill is permitted to invoke (must be a subset of the agent's allowed tools)", + }, + companionModule: { + type: "object", + description: + "Optional: a reusable ES module to save alongside the skill", + properties: { + name: { + type: "string", + description: "Module name in kebab-case", + }, + source: { + type: "string", + description: "ES module JavaScript source code (must use `export`)", + }, + description: { + type: "string", + description: "One-line description of the module", + }, + }, + required: ["name", "source", "description"], + }, + overwrite: { + type: "boolean", + description: + "Set true to overwrite an existing user skill with the same name", + }, + }, + required: ["name", "description", "triggers", "guidance", "allowedTools"], + }, + handler: async (params: { + name: string; + description: string; + triggers: string[]; + guidance: string; + patterns?: string[]; + antiPatterns?: string[]; + requiresMcp?: string[]; + allowedTools: string[]; + companionModule?: { name: string; source: string; description: string }; + overwrite?: boolean; + }) => { + const release = await acquireInteractiveLock(); + try { + const rl = state.readlineInstance; + if (!rl) { + return { success: false, error: "No readline available" }; + } + + // Refuse silent overwrite — the LLM must opt in explicitly. + if (!params.overwrite && userSkillExists(params.name)) { + return { + success: false, + error: `Skill "${params.name}" already exists. Set overwrite=true to replace it.`, + }; + } + + // Build the typed payload — strip undefined fields so the YAML stays clean. + const skillData: SkillData = { + name: params.name, + description: params.description, + triggers: params.triggers, + guidance: params.guidance, + allowedTools: params.allowedTools, + ...(params.patterns && params.patterns.length > 0 + ? { patterns: params.patterns } + : {}), + ...(params.antiPatterns && params.antiPatterns.length > 0 + ? { antiPatterns: params.antiPatterns } + : {}), + ...(params.requiresMcp && params.requiresMcp.length > 0 + ? { requiresMcp: params.requiresMcp } + : {}), + }; + + // Show the user what we're about to save and require approval. + spinner.stop(); + const triggerPreview = params.triggers.slice(0, 5).join(", "); + const triggerSuffix = + params.triggers.length > 5 + ? ` (+${params.triggers.length - 5} more)` + : ""; + console.log(`\n ${C.warn("📚 Save skill:")} ${C.tool(params.name)}`); + console.log(` ${params.description}`); + console.log(` Triggers: ${triggerPreview}${triggerSuffix}`); + console.log( + ` Tools: ${params.allowedTools.join(", ")} ` + + `(${params.guidance.length} bytes of guidance)`, + ); + if (params.companionModule) { + console.log( + ` ${C.warn("+ module:")} ${params.companionModule.name} ` + + `(${params.companionModule.source.length} bytes)`, + ); + } + + await drainAndWarn(rl); + const approval = state.autoApprove + ? "y" + : await promptUser(rl, ` ${C.dim("Save skill? [y/n] ")}`); + if (approval.trim().toLowerCase() !== "y") { + console.log(` ${C.dim("Denied by user.")}`); + return { success: false, error: "Skill save denied by user" }; + } + + // Write the skill — validation happens inside writeUserSkill(). + let skillPath: string; + try { + skillPath = writeUserSkill(skillData, join(CONTENT_ROOT, "patterns")); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.error(` ${C.err("❌ " + msg)}`); + return { success: false, error: msg }; + } + + // Optionally save a companion module (delegated to register_module + // so we get the same validation pipeline + sandbox cache update). + let moduleResult: + | { name: string; importAs: string; sizeBytes?: number } + | undefined; + if (params.companionModule) { + const m = params.companionModule; + const moduleSaveResult = (await registerModuleImpl({ + name: m.name, + source: m.source, + description: m.description, + })) as { + success: boolean; + error?: string; + name?: string; + importAs?: string; + sourceSize?: number; + }; + if (!moduleSaveResult.success) { + // The skill itself saved fine — surface the module failure but + // don't roll back the skill (the user can re-run the module step). + console.error( + ` ${C.warn("⚠️")} Skill saved, but companion module failed: ${moduleSaveResult.error}`, + ); + return { + success: true, + skill: { name: params.name, filePath: skillPath }, + moduleError: moduleSaveResult.error, + }; + } + moduleResult = { + name: moduleSaveResult.name ?? m.name, + importAs: + moduleSaveResult.importAs ?? `import { ... } from "ha:${m.name}"`, + sizeBytes: moduleSaveResult.sourceSize, + }; + } + + console.error( + ` ${C.ok("📚 Skill saved:")} ${params.name} → ${skillPath}`, + ); + return { + success: true, + skill: { + name: params.name, + filePath: skillPath, + triggers: params.triggers, + }, + module: moduleResult, + }; + } finally { + release(); + } + }, +}); + // ── Startup Module Loading ─────────────────────────────────────────── // // Load builtin system modules from builtin-modules/ directory and @@ -5514,7 +5783,9 @@ function buildSessionConfig() { workingDirectory: process.cwd(), // Skills — markdown instruction files that inject expertise on demand. // The LLM can invoke /pptx-expert to get PPTX building best practices. - skillDirectories: [join(CONTENT_ROOT, "skills")], + // System skills ship with the binary; user skills live in + // ~/.hyperagent/skills/ and override system skills with the same name. + skillDirectories: [join(CONTENT_ROOT, "skills"), getUserSkillsDir()], systemMessage: { mode: "replace" as const, content: fullSystemMessage, @@ -5539,6 +5810,7 @@ function buildSessionConfig() { listModulesTool, moduleInfoTool, deleteModuleTool, + generateSkillTool, writeOutputTool, readInputTool, readOutputTool, @@ -5574,6 +5846,7 @@ function buildSessionConfig() { "list_modules", "module_info", "delete_module", + "generate_skill", "write_output", "read_input", "read_output", @@ -5746,7 +6019,10 @@ function buildSessionConfig() { const result = runSuggestApproach( input.prompt, state.preLoadedSkills, - join(CONTENT_ROOT, "skills"), + [ + { dir: join(CONTENT_ROOT, "skills"), source: "system" }, + { dir: getUserSkillsDir(), source: "user" }, + ], join(CONTENT_ROOT, "patterns"), debugLog, ); @@ -5847,6 +6123,35 @@ function buildSessionConfig() { }) => { const { toolName, toolResult } = input; + // ── Session learning tracking ──────────────────────────── + // Capture lightweight per-tool history for /save-skill to + // mine when authoring SKILL.md. Bounded FIFO so memory + // stays flat over long sessions. + const success = toolResult.resultType !== "failure"; + const errorSummary = + !success && toolResult.error + ? toolResult.error.split("\n")[0].slice(0, 200) + : undefined; + state.toolCallHistory.push({ + tool: toolName, + success, + errorSummary, + timestamp: Date.now(), + }); + if (state.toolCallHistory.length > MAX_TOOL_HISTORY) { + state.toolCallHistory.splice( + 0, + state.toolCallHistory.length - MAX_TOOL_HISTORY, + ); + } + // Tools whose name looks like `mcp____` count + // their server as "used" — that's how the SDK exposes MCP + // tools to the LLM. See manage_mcp + listMCPServersTool. + if (success && toolName.startsWith("mcp__")) { + const server = toolName.split("__")[1]; + if (server) state.mcpServersUsed.add(server); + } + // For sandbox memory errors, tell the LLM about the current // heap size and how to suggest an increase to the user. if ( @@ -6146,10 +6451,19 @@ async function main(): Promise { loadAllModules(); // ── Load skills and patterns for suggest_approach ────────────── - const allSkills = loadSkills(join(CONTENT_ROOT, "skills")); + // User skills (in ~/.hyperagent/skills/) are loaded after system skills, + // so a user skill with the same name overrides the built-in version. + const userSkillsDir = getUserSkillsDir(); + const allSkills = loadSkillsFromDirs([ + { dir: join(CONTENT_ROOT, "skills"), source: "system" }, + { dir: userSkillsDir, source: "user" }, + ]); const allPatterns = loadPatterns(join(CONTENT_ROOT, "patterns")); + const userSkillCount = Array.from(allSkills.values()).filter( + (s) => s.source === "user", + ).length; debugLog( - `Loaded ${allSkills.size} skills, ${allPatterns.size} patterns for suggest_approach`, + `Loaded ${allSkills.size} skills (${userSkillCount} user), ${allPatterns.size} patterns for suggest_approach`, ); // Wire --skill CLI flag → preLoadedSkills @@ -6581,6 +6895,15 @@ async function main(): Promise { console.log( `${ANSI.bold}${ANSI.cyan}You: ${ANSI.reset}${C.dim("(continuing after config change…)")}`, ); + } else if (state.pendingPrompt) { + // A slash command (e.g. /save-skill) queued a synthetic prompt + // for the LLM. Drain it before reading stdin so it reaches + // the model exactly once. + trimmed = state.pendingPrompt; + state.pendingPrompt = null; + console.log( + `${ANSI.bold}${ANSI.cyan}You: ${ANSI.reset}${C.dim("(submitting queued prompt…)")}`, + ); } else { const userInput = await questionCapturingPaste( rl, diff --git a/src/agent/session-context.ts b/src/agent/session-context.ts new file mode 100644 index 0000000..565de6a --- /dev/null +++ b/src/agent/session-context.ts @@ -0,0 +1,152 @@ +// ── agent/session-context.ts — Session learning summariser ────────── +// +// Extracts a concise, LLM-friendly summary of "what happened in this +// session" from `AgentState`. The summary is fed back into the LLM +// via the `/save-skill` slash command so the model can author a +// well-grounded SKILL.md instead of guessing. +// +// Design goals: +// • Bounded output (no transcripts; just structured rollups). +// • Deterministic ordering so test snapshots stay stable. +// • Zero side effects — pure function of `AgentState`. +// +// ───────────────────────────────────────────────────────────────────── + +import type { AgentState } from "./state.js"; + +// ── Constants ──────────────────────────────────────────────────────── + +/** + * Maximum number of distinct error summaries surfaced in the context. + * Beyond this the user is unlikely to remember individual failures and + * the LLM gets enough signal from the leading examples to know that + * "errors happened repeatedly" — adding more dilutes the prompt. + */ +const MAX_ERRORS_REPORTED = 8; + +/** + * Maximum tool names shown in the leading-tools list. Keeps the + * prompt focused on the dominant verbs of the workflow. + */ +const MAX_TOP_TOOLS = 10; + +// ── Types ──────────────────────────────────────────────────────────── + +/** + * A summary of the agent's activity in the current conversation, + * suitable for inclusion in an LLM prompt that asks the model to + * author a SKILL.md. + */ +export interface SessionContext { + /** + * The user's most recent prompt — anchors the LLM on the task the + * user wants captured. May be empty in tests or fresh sessions. + */ + userPrompt: string; + /** + * Tool names sorted by call count (descending), each with a count. + * Capped at `MAX_TOP_TOOLS` to keep the prompt focused. + */ + topTools: Array<{ tool: string; count: number }>; + /** MCP server names whose tools the LLM actually invoked. */ + mcpServersUsed: string[]; + /** Module names the LLM registered via register_module. */ + modulesRegistered: string[]; + /** + * First N distinct error messages (deduplicated by summary text) + * to give the LLM signal about what dead-ends were hit. + */ + errorsSeen: string[]; + /** Total number of tool invocations in the bounded history. */ + totalToolCalls: number; + /** Count of failed tool invocations in the bounded history. */ + failedToolCalls: number; +} + +// ── Public API ─────────────────────────────────────────────────────── + +/** + * Build a {@link SessionContext} from the current {@link AgentState}. + * + * Pure function — does NOT mutate state. Safe to call multiple times. + * + * @param state - The agent's runtime state (typically `state` in index.ts). + */ +export function extractSessionContext(state: AgentState): SessionContext { + // Count tool invocations. Map preserves first-seen order, which we + // use as a stable tiebreaker when counts are equal. + const toolCounts = new Map(); + const errorSet = new Set(); + let failedToolCalls = 0; + + for (const entry of state.toolCallHistory) { + toolCounts.set(entry.tool, (toolCounts.get(entry.tool) ?? 0) + 1); + if (!entry.success) { + failedToolCalls++; + if (entry.errorSummary && errorSet.size < MAX_ERRORS_REPORTED) { + errorSet.add(entry.errorSummary); + } + } + } + + // Convert to sorted array — count descending, then first-seen order. + const topTools = Array.from(toolCounts.entries()) + .map(([tool, count]) => ({ tool, count })) + .sort((a, b) => b.count - a.count) + .slice(0, MAX_TOP_TOOLS); + + return { + userPrompt: state.currentUserPrompt, + topTools, + mcpServersUsed: Array.from(state.mcpServersUsed).sort(), + modulesRegistered: [...state.modulesRegistered].sort(), + errorsSeen: Array.from(errorSet), + totalToolCalls: state.toolCallHistory.length, + failedToolCalls, + }; +} + +/** + * Render a {@link SessionContext} as a markdown block suitable for + * dropping into an LLM prompt. Sections with no data are omitted so + * the LLM doesn't see "Errors: (none)" style noise. + */ +export function renderSessionContext(ctx: SessionContext): string { + const lines: string[] = []; + + if (ctx.userPrompt.trim().length > 0) { + lines.push(`Original prompt: ${ctx.userPrompt.trim()}`); + lines.push(""); + } + + lines.push( + `Tool activity: ${ctx.totalToolCalls} calls (${ctx.failedToolCalls} failed)`, + ); + + if (ctx.topTools.length > 0) { + lines.push("Top tools used:"); + for (const { tool, count } of ctx.topTools) { + lines.push(` • ${tool} ×${count}`); + } + } + + if (ctx.mcpServersUsed.length > 0) { + lines.push(""); + lines.push(`MCP servers used: ${ctx.mcpServersUsed.join(", ")}`); + } + + if (ctx.modulesRegistered.length > 0) { + lines.push(""); + lines.push(`Modules registered: ${ctx.modulesRegistered.join(", ")}`); + } + + if (ctx.errorsSeen.length > 0) { + lines.push(""); + lines.push("Errors / dead-ends encountered:"); + for (const e of ctx.errorsSeen) { + lines.push(` • ${e}`); + } + } + + return lines.join("\n"); +} diff --git a/src/agent/skill-loader.ts b/src/agent/skill-loader.ts index 0002d85..4ff7769 100644 --- a/src/agent/skill-loader.ts +++ b/src/agent/skill-loader.ts @@ -32,6 +32,10 @@ export interface Skill { requiresMcp: string[]; /** The markdown body — domain-specific guidance text. */ guidance: string; + /** Origin of this skill ("system" = bundled, "user" = ~/.hyperagent/skills/). */ + source: "system" | "user"; + /** Absolute path to the SKILL.md file (for /skill edit, /skill delete). */ + filePath: string; } // ── YAML Frontmatter Parser ────────────────────────────────────────── @@ -128,9 +132,13 @@ function parseFrontmatter(content: string): { * Each subdirectory should contain a SKILL.md file. * * @param dir - Path to skills directory (e.g. "./skills") + * @param source - Origin of these skills ("system" or "user"); defaults to "system" * @returns Map of skill name → Skill */ -export function loadSkills(dir: string): Map { +export function loadSkills( + dir: string, + source: "system" | "user" = "system", +): Map { const skills = new Map(); if (!existsSync(dir)) return skills; @@ -169,8 +177,35 @@ export function loadSkills(dir: string): Map { antiPatterns, requiresMcp, guidance: body, + source, + filePath: skillFile, }); } return skills; } + +/** + * Load skills from multiple directories and merge them. + * + * The first entry in `dirs` is the lowest precedence (typically the system + * skills directory); later entries override earlier ones when names collide. + * The last directory in the list is conventionally the user skills directory + * (`~/.hyperagent/skills/`), which allows users to override or extend the + * built-in skill catalogue. + * + * @param dirs - Ordered list of `{ dir, source }` pairs, lowest precedence first. + * @returns Merged map of skill name → Skill (user entries win on collision). + */ +export function loadSkillsFromDirs( + dirs: Array<{ dir: string; source: "system" | "user" }>, +): Map { + const merged = new Map(); + for (const { dir, source } of dirs) { + const loaded = loadSkills(dir, source); + for (const [name, skill] of loaded) { + merged.set(name, skill); + } + } + return merged; +} diff --git a/src/agent/skill-writer.ts b/src/agent/skill-writer.ts new file mode 100644 index 0000000..3933ccd --- /dev/null +++ b/src/agent/skill-writer.ts @@ -0,0 +1,342 @@ +// ── agent/skill-writer.ts — User skill persistence ────────────────── +// +// Validates and persists user-created skills to ~/.hyperagent/skills/. +// User skills follow the same SKILL.md format as built-in (system) skills +// — see skill-loader.ts for the parsed shape and .github/instructions/ +// skills.instructions.md for authoring guidelines. +// +// User skills override system skills with the same name (see +// loadSkillsFromDirs in skill-loader.ts). +// +// ───────────────────────────────────────────────────────────────────── + +import { + existsSync, + mkdirSync, + readdirSync, + readFileSync, + rmSync, + statSync, + writeFileSync, +} from "node:fs"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import { loadPatterns } from "./pattern-loader.js"; +import { ALLOWED_TOOLS } from "./tool-gating.js"; + +// ── Constants ──────────────────────────────────────────────────────── + +/** + * Root directory for user-created skills. + * + * Defaults to `~/.hyperagent/skills/`. The `HYPERAGENT_USER_SKILLS_DIR` + * environment variable overrides the default — tests use this to point + * at a temporary directory without polluting the real user library. + */ +const DEFAULT_USER_SKILLS_DIR = + process.env.HYPERAGENT_USER_SKILLS_DIR ?? + join(homedir(), ".hyperagent", "skills"); + +/** Maximum total size of a SKILL.md file (frontmatter + body). */ +const MAX_SKILL_SIZE_BYTES = 64 * 1024; + +/** Maximum length of the description field. */ +const MAX_DESCRIPTION_LENGTH = 280; + +/** Maximum number of triggers per skill. */ +const MAX_TRIGGERS = 50; + +/** Kebab-case name pattern (lowercase letters, digits, hyphens). */ +const VALID_NAME_RE = /^[a-z][a-z0-9-]*$/; + +// ── Types ──────────────────────────────────────────────────────────── + +/** Input data for a new skill, mirroring SKILL.md frontmatter fields. */ +export interface SkillData { + /** Skill identifier (kebab-case, used as directory name). */ + name: string; + /** One-line description shown in `/skill list`. */ + description: string; + /** Keyword triggers for intent matching. */ + triggers: string[]; + /** Markdown body — domain knowledge, workflow steps, tips. */ + guidance: string; + /** Optional list of built-in pattern names this skill references. */ + patterns?: string[]; + /** Optional list of "DO NOT" rules surfaced to the LLM first. */ + antiPatterns?: string[]; + /** Optional list of MCP server names required (e.g. ["fabric-rti-mcp"]). */ + requiresMcp?: string[]; + /** Tools this skill can invoke (must be a subset of ALLOWED_TOOLS). */ + allowedTools: string[]; +} + +/** Metadata about a stored user skill. */ +export interface UserSkillInfo { + /** Skill identifier. */ + name: string; + /** Absolute path to the SKILL.md file. */ + filePath: string; + /** ISO timestamp of last modification (file mtime). */ + modified: string; + /** File size in bytes. */ + sizeBytes: number; +} + +// ── Directory Helpers ──────────────────────────────────────────────── + +/** + * Get (and create on first call) the user skills directory path. + * The directory is created lazily so test environments can override + * the path before the first call. + */ +export function getUserSkillsDir(): string { + mkdirSync(DEFAULT_USER_SKILLS_DIR, { recursive: true }); + return DEFAULT_USER_SKILLS_DIR; +} + +// ── Validation ─────────────────────────────────────────────────────── + +/** + * Validate a skill name. Returns an error message string, or null if valid. + * + * Rules: kebab-case (lowercase letters, digits, hyphens; must start with a + * letter), ≤64 characters, no path traversal characters. + */ +export function validateSkillName(name: string): string | null { + if (!name) return "Skill name must not be empty"; + if (name.length > 64) return "Skill name must be ≤64 characters"; + if (!VALID_NAME_RE.test(name)) { + return "Skill name must be lowercase letters, digits, and hyphens (e.g. 'teams-transcript', 'kql-expert')"; + } + if (name.includes("..") || name.includes("/") || name.includes("\\")) { + return "Skill name must not contain path traversal characters"; + } + return null; +} + +/** + * Validate the structured fields of a skill payload. + * Returns a combined error message string, or null when the payload is + * well-formed. Each individual problem is surfaced so the LLM (or user) + * can fix them all at once instead of round-tripping for every issue. + * + * @param data - The skill payload to validate. + * @param builtinPatternNames - Set of valid built-in pattern names. Used to + * validate `patterns[]` references. Pass an empty set to skip the check + * (only useful in tests). + */ +export function validateSkillData( + data: SkillData, + builtinPatternNames: Set, +): string | null { + const errors: string[] = []; + + const nameError = validateSkillName(data.name); + if (nameError) errors.push(nameError); + + if (!data.description || data.description.trim().length === 0) { + errors.push("Skill description must not be empty"); + } else if (data.description.length > MAX_DESCRIPTION_LENGTH) { + errors.push( + `Skill description must be ≤${MAX_DESCRIPTION_LENGTH} characters`, + ); + } + + if (!Array.isArray(data.triggers) || data.triggers.length === 0) { + errors.push("Skill must have at least one trigger"); + } else if (data.triggers.length > MAX_TRIGGERS) { + errors.push(`Skill must have ≤${MAX_TRIGGERS} triggers`); + } else { + for (const t of data.triggers) { + if (typeof t !== "string" || t.trim().length === 0) { + errors.push("Triggers must be non-empty strings"); + break; + } + } + } + + if (!data.guidance || data.guidance.trim().length === 0) { + errors.push("Skill guidance body must not be empty"); + } + + if (!Array.isArray(data.allowedTools) || data.allowedTools.length === 0) { + errors.push("Skill must declare at least one allowed-tool"); + } else { + for (const tool of data.allowedTools) { + if (!ALLOWED_TOOLS.has(tool)) { + errors.push(`Unknown tool in allowed-tools: '${tool}'`); + } + } + } + + if ( + data.patterns && + data.patterns.length > 0 && + builtinPatternNames.size > 0 + ) { + for (const p of data.patterns) { + if (!builtinPatternNames.has(p)) { + errors.push( + `Unknown pattern: '${p}' (must reference a built-in pattern)`, + ); + } + } + } + + return errors.length > 0 ? errors.join("; ") : null; +} + +// ── Serialisation ──────────────────────────────────────────────────── + +/** + * Render a SkillData payload to SKILL.md format (YAML frontmatter + body). + * + * The output is intentionally minimal — we only emit fields that are present + * — so generated skills stay easy for humans to read and edit. + */ +export function renderSkillMarkdown(data: SkillData): string { + const lines: string[] = ["---"]; + lines.push(`name: ${data.name}`); + lines.push(`description: ${data.description.trim()}`); + + lines.push("triggers:"); + for (const t of data.triggers) lines.push(` - ${t}`); + + if (data.patterns && data.patterns.length > 0) { + lines.push("patterns:"); + for (const p of data.patterns) lines.push(` - ${p}`); + } + + if (data.antiPatterns && data.antiPatterns.length > 0) { + lines.push("antiPatterns:"); + for (const a of data.antiPatterns) lines.push(` - ${a}`); + } + + if (data.requiresMcp && data.requiresMcp.length > 0) { + lines.push("requires-mcp:"); + for (const m of data.requiresMcp) lines.push(` - ${m}`); + } + + lines.push("allowed-tools:"); + for (const tool of data.allowedTools) lines.push(` - ${tool}`); + + lines.push("---", ""); + lines.push(data.guidance.trim(), ""); + + return lines.join("\n"); +} + +// ── Public API ─────────────────────────────────────────────────────── + +/** + * Persist a user skill to `~/.hyperagent/skills//SKILL.md`. + * + * Validates the payload (name, fields, pattern references, tool gating), + * writes the SKILL.md, and returns the absolute path on success. + * + * @param data - The skill payload to persist. + * @param patternsDir - Path to the built-in patterns directory; used to + * validate `patterns[]` references against real patterns on disk. + * @returns The absolute path of the written SKILL.md file. + * @throws Error if validation fails or the skill exceeds the size limit. + */ +export function writeUserSkill(data: SkillData, patternsDir: string): string { + const builtinPatterns = loadPatterns(patternsDir); + const validationError = validateSkillData( + data, + new Set(builtinPatterns.keys()), + ); + if (validationError) { + throw new Error(`Skill validation failed: ${validationError}`); + } + + const rendered = renderSkillMarkdown(data); + if (rendered.length > MAX_SKILL_SIZE_BYTES) { + throw new Error( + `SKILL.md exceeds maximum size (${rendered.length} bytes > ${MAX_SKILL_SIZE_BYTES} bytes)`, + ); + } + + const userSkillsDir = getUserSkillsDir(); + const skillDir = join(userSkillsDir, data.name); + mkdirSync(skillDir, { recursive: true }); + + const filePath = join(skillDir, "SKILL.md"); + writeFileSync(filePath, rendered, "utf-8"); + return filePath; +} + +/** + * List all user skills currently stored on disk. + * + * @returns Sorted (by name) array of `UserSkillInfo` records. + */ +export function listUserSkills(): UserSkillInfo[] { + const dir = getUserSkillsDir(); + if (!existsSync(dir)) return []; + + const entries = readdirSync(dir, { withFileTypes: true }); + const result: UserSkillInfo[] = []; + + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const filePath = join(dir, entry.name, "SKILL.md"); + if (!existsSync(filePath)) continue; + + const stat = statSync(filePath); + result.push({ + name: entry.name, + filePath, + modified: stat.mtime.toISOString(), + sizeBytes: stat.size, + }); + } + + result.sort((a, b) => a.name.localeCompare(b.name)); + return result; +} + +/** + * Read the raw SKILL.md contents for a user skill. + * + * @param name - Skill identifier. + * @returns Full file contents, or null if the skill does not exist. + */ +export function readUserSkill(name: string): string | null { + const nameError = validateSkillName(name); + if (nameError) return null; + + const filePath = join(getUserSkillsDir(), name, "SKILL.md"); + if (!existsSync(filePath)) return null; + return readFileSync(filePath, "utf-8"); +} + +/** + * Delete a user skill directory (and its SKILL.md). + * + * @param name - Skill identifier. + * @throws Error if the skill name is invalid or the skill does not exist. + */ +export function deleteUserSkill(name: string): void { + const nameError = validateSkillName(name); + if (nameError) throw new Error(nameError); + + const skillDir = join(getUserSkillsDir(), name); + if (!existsSync(skillDir)) { + throw new Error(`User skill not found: '${name}'`); + } + + rmSync(skillDir, { recursive: true, force: true }); +} + +/** + * Check whether a user skill with the given name exists. + * + * @param name - Skill identifier. + */ +export function userSkillExists(name: string): boolean { + const nameError = validateSkillName(name); + if (nameError) return false; + return existsSync(join(getUserSkillsDir(), name, "SKILL.md")); +} diff --git a/src/agent/slash-commands.ts b/src/agent/slash-commands.ts index f70967d..7939f00 100644 --- a/src/agent/slash-commands.ts +++ b/src/agent/slash-commands.ts @@ -47,6 +47,17 @@ import { createMCPPluginAdapter, generateMCPDeclarations, } from "./mcp/plugin-adapter.js"; +import { + listUserSkills, + readUserSkill, + deleteUserSkill, + userSkillExists, + getUserSkillsDir, +} from "./skill-writer.js"; +import { + extractSessionContext, + renderSessionContext, +} from "./session-context.js"; // ── Constants ──────────────────────────────────────────────────────── @@ -96,6 +107,12 @@ export interface SlashCommandDeps { mcpManager: MCPClientManager | null; /** Callback to sync plugins to sandbox after MCP changes. */ syncPlugins: () => Promise; + /** + * Queue a synthetic prompt for the LLM (used by `/save-skill` and + * similar commands that want to inject a user-turn-shaped message). + * The prompt is drained at the top of the next REPL iteration. + */ + submitToLLM: (prompt: string) => void; } // ── Handler ────────────────────────────────────────────────────────── @@ -557,6 +574,11 @@ export async function handleSlashCommand( ...buildSessionConfig(), } as any); registerEventHandler(state.activeSession); + // Reset session-learning fields — the new conversation starts + // with no history of what we did last time. + state.toolCallHistory = []; + state.mcpServersUsed = new Set(); + state.modulesRegistered = []; console.log( ` ${C.ok("🆕 New session started.")} Conversation history cleared.`, ); @@ -2091,56 +2113,237 @@ export async function handleSlashCommand( } case "/skills": { - // Resolve skills dir: in dev mode go up two levels (src/agent/ → repo root), - // in binary mode runtime content is alongside the bundle. + // Resolve built-in skills dir: in dev mode go up two levels + // (src/agent/ → repo root), in binary mode runtime content is + // alongside the bundle. const __scDirname = dirname(new URL(import.meta.url).pathname); const skillsDir = existsSync(join(__scDirname, "skills")) ? join(__scDirname, "skills") : resolve(__scDirname, "../..", "skills"); + + const sub = parts[1]?.trim()?.toLowerCase() ?? ""; + const arg = parts[2]?.trim(); + + // /skills info — show full SKILL.md (user > system precedence) + if (sub === "info") { + if (!arg) { + console.log(` ${C.dim("Usage: /skills info ")}`); + return true; + } + const userContent = readUserSkill(arg); + if (userContent) { + console.log( + ` ${C.label("📚 User skill:")} ${C.tool(arg)} ${C.dim("(👤)")}\n`, + ); + console.log(userContent); + return true; + } + const systemFile = join(skillsDir, arg, "SKILL.md"); + if (existsSync(systemFile)) { + const { readFileSync } = await import("node:fs"); + console.log(` ${C.label("📚 System skill:")} ${C.tool(arg)}\n`); + console.log(readFileSync(systemFile, "utf8")); + return true; + } + console.log(` ${C.err("❌ Skill not found:")} ${arg}`); + return true; + } + + // /skills edit — print the user-skill path for $EDITOR. + if (sub === "edit") { + if (!arg) { + console.log(` ${C.dim("Usage: /skills edit ")}`); + return true; + } + if (!userSkillExists(arg)) { + console.log( + ` ${C.err("❌ User skill not found:")} ${arg} ` + + `${C.dim("(system skills cannot be edited from the REPL)")}`, + ); + return true; + } + const filePath = join(getUserSkillsDir(), arg, "SKILL.md"); + console.log(` ${C.label("📝 Edit:")} ${filePath}`); + console.log( + ` ${C.dim("Open in your editor, save, then the change applies on next /suggest_approach.")}`, + ); + return true; + } + + // /skills delete — remove a user skill (system ones are immutable). + if (sub === "delete") { + if (!arg) { + console.log(` ${C.dim("Usage: /skills delete ")}`); + return true; + } + if (!userSkillExists(arg)) { + console.log( + ` ${C.err("❌ User skill not found:")} ${arg} ` + + `${C.dim("(system skills cannot be deleted)")}`, + ); + return true; + } + // Confirm before deletion — destructive, no undo. + await drainAndWarn(rl); + const confirmed = state.autoApprove + ? true + : ( + await rl.question( + ` ${C.warn("🗑️ Delete user skill")} ${C.tool(arg)}? [y/n] `, + ) + ) + .trim() + .toLowerCase() === "y"; + if (!confirmed) { + console.log(` ${C.dim("Cancelled.")}`); + return true; + } + deleteUserSkill(arg); + console.log(` ${C.ok("🗑️ Deleted user skill:")} ${arg}`); + return true; + } + + // /skills — invoke a skill (delegates to SDK). const skillArg = parts[1]?.trim(); + if (skillArg && sub !== "list") { + console.log(` ${C.info("📚")} Invoking skill: ${C.tool(skillArg)}`); + return false; + } - if (!skillArg) { - // List available skills - try { - const { readdirSync, readFileSync, existsSync } = - await import("node:fs"); - if (!existsSync(skillsDir)) { - console.log(" No skills directory found."); - return true; - } + // /skills (no args) or /skills list — list system + user skills, + // tagging user-authored ones with the 👤 badge. Merged so the user + // sees a single namespace and any user override is obvious. + try { + const { readdirSync, readFileSync } = await import("node:fs"); + type Row = { + name: string; + desc: string; + source: "system" | "user"; + overridden: boolean; + }; + const rows: Map = new Map(); + + if (existsSync(skillsDir)) { const dirs = readdirSync(skillsDir, { withFileTypes: true }).filter( (d) => d.isDirectory(), ); - if (dirs.length === 0) { - console.log(" No skills found."); - return true; - } - console.log( - ` ${C.label("📚 Available skills")} (${dirs.length}):\n`, - ); for (const dir of dirs) { const skillFile = join(skillsDir, dir.name, "SKILL.md"); - if (existsSync(skillFile)) { - const content = readFileSync(skillFile, "utf8"); - // Extract description from YAML frontmatter - const descMatch = content.match(/^description:\s*(.+)$/m); - const desc = descMatch ? descMatch[1] : "(no description)"; - console.log(` /${dir.name}`); - console.log(` ${C.dim(desc)}\n`); - } + if (!existsSync(skillFile)) continue; + const content = readFileSync(skillFile, "utf8"); + const descMatch = content.match(/^description:\s*(.+)$/m); + rows.set(dir.name, { + name: dir.name, + desc: descMatch ? descMatch[1] : "(no description)", + source: "system", + overridden: false, + }); } - console.log(` ${C.dim("Invoke: /skills or /")}`); - } catch { - console.log(" Error reading skills directory."); } + + for (const info of listUserSkills()) { + const content = readUserSkill(info.name); + const descMatch = content?.match(/^description:\s*(.+)$/m); + const wasSystem = rows.has(info.name); + rows.set(info.name, { + name: info.name, + desc: descMatch ? descMatch[1] : "(no description)", + source: "user", + overridden: wasSystem, + }); + } + + if (rows.size === 0) { + console.log(" No skills found."); + return true; + } + + const sorted = Array.from(rows.values()).sort((a, b) => + a.name.localeCompare(b.name), + ); + console.log( + ` ${C.label("📚 Available skills")} (${sorted.length}):\n`, + ); + for (const row of sorted) { + const badge = + row.source === "user" + ? row.overridden + ? " 👤 (overrides built-in)" + : " 👤" + : ""; + console.log(` /${row.name}${badge}`); + console.log(` ${C.dim(row.desc)}\n`); + } + console.log( + ` ${C.dim("Invoke: /skills · Manage: /skills info|edit|delete ")}`, + ); + } catch { + console.log(" Error reading skills directory."); + } + return true; + } + + case "/save-skill": { + // Capture session learnings as a user-authored SKILL.md. + // + // We don't author the SKILL.md ourselves — we hand a structured + // summary of "what happened this session" to the LLM and ask it + // to call the generate_skill tool with the right shape. The + // model has all the context (it's been the agent the whole time), + // so it's better placed than the REPL to write the guidance text. + const desiredName = parts[1]?.trim(); + if (!state.activeSession) { + console.log( + ` ${C.err("❌ No active session — start the agent first.")}`, + ); return true; } - // Invoke a specific skill by name — delegate to the SDK's - // skill handling via the slash command - console.log(` ${C.info("📚")} Invoking skill: ${C.tool(skillArg)}`); - // The SDK handles /skill-name natively — just fall through - return false; + const ctx = extractSessionContext(state); + if (ctx.totalToolCalls === 0 && !ctx.userPrompt) { + console.log( + ` ${C.warn("⚠️ Nothing to learn from yet — run a task first, then /save-skill.")}`, + ); + return true; + } + + const rendered = renderSessionContext(ctx); + const nameDirective = desiredName + ? `Use the name "${desiredName}" for the skill.` + : "Pick a kebab-case name that summarises the workflow (e.g. 'teams-transcript-finder')."; + + const promptLines = [ + "The user has asked me to save what we learned in this session as a reusable skill.", + "Call the `generate_skill` tool with a SKILL.md that captures the workflow.", + "", + nameDirective, + "", + "Authoring rules:", + " • description: ≤280 chars, action-oriented (what the skill helps the user do).", + " • triggers: 3–10 distinctive phrases that future related prompts are likely to contain.", + " • guidance: concise markdown — workflow steps, gotchas, do/don't. NOT a transcript.", + " • requiresMcp: only include MCP servers actually needed by the workflow.", + " • allowedTools: the smallest set of tools needed to execute this workflow.", + " • companionModule: include ONLY if there's reusable JS that should ship with the skill.", + "", + "Session context to draw on:", + "", + rendered, + "", + "After calling `generate_skill`, briefly tell me what you saved and what it'll be triggered by.", + ]; + const synthetic = promptLines.join("\n"); + + console.log( + ` ${C.label("📝 Capturing session learnings…")}` + + (desiredName ? ` ${C.dim("(name: " + desiredName + ")")}` : ""), + ); + console.log( + ` ${C.dim("Context: " + ctx.totalToolCalls + " tool calls, " + ctx.topTools.length + " distinct tools.")}`, + ); + + deps.submitToLLM(synthetic); + return true; } case "/clear": { diff --git a/src/agent/state.ts b/src/agent/state.ts index a5a7a78..2f4ce6a 100644 --- a/src/agent/state.ts +++ b/src/agent/state.ts @@ -255,6 +255,44 @@ export interface AgentState { */ lastGuidance: string | null; + // ── Session Learning Context (feeds /save-skill) ──────────────── + + /** + * Recent tool invocations in this session. Used by /save-skill to + * extract patterns about what the agent actually did so the LLM + * can author an accurate SKILL.md. + * + * Capped at MAX_TOOL_HISTORY entries (oldest evicted FIFO) to keep + * memory bounded for long-running sessions. Reset on /new session. + */ + toolCallHistory: Array<{ + tool: string; + success: boolean; + errorSummary?: string; + timestamp: number; + }>; + + /** + * Names of MCP servers whose tools the LLM actually invoked during + * this session. Surfaces real dependencies for `requires-mcp:` in + * skills authored via /save-skill. + */ + mcpServersUsed: Set; + + /** + * Names of modules the LLM registered (via register_module) during + * this session. Candidates for `companionModule` in saved skills. + */ + modulesRegistered: string[]; + + /** + * A prompt queued by a slash command (e.g. /save-skill) to be sent + * as the NEXT user turn. The main REPL loop drains this before + * waiting on stdin so the synthetic prompt reaches the LLM exactly + * once. Null when nothing is queued. + */ + pendingPrompt: string | null; + // ── Token Tracking ─────────────────────────────────────────────── /** Cumulative input tokens across all LLM requests this session. */ @@ -354,6 +392,12 @@ export function createAgentState( modulesInspected: new Set(), lastGuidance: null, + // Session learning context + toolCallHistory: [], + mcpServersUsed: new Set(), + modulesRegistered: [], + pendingPrompt: null, + // Token tracking totalInputTokens: 0, totalOutputTokens: 0, diff --git a/src/agent/system-message.ts b/src/agent/system-message.ts index 3d500a6..eb20e13 100644 --- a/src/agent/system-message.ts +++ b/src/agent/system-message.ts @@ -142,6 +142,14 @@ BUILDING REUSABLE MODULES: it. Modules persist across sessions and compound in value over time. Import your module with: import { fn } from "ha:" +SAVING WHAT YOU LEARN: + When the user asks to "save what we learned", "remember this for next time", + or runs /save-skill, call generate_skill to persist a SKILL.md to + ~/.hyperagent/skills/. Triggers in the SKILL.md let /suggest_approach + auto-surface the lesson in future related sessions. If the workflow needs + reusable JavaScript helpers, include a companionModule so future sessions + pick up the code too. + PLUGINS: Require explicit enable via manage_plugin. Host plugin functions return values directly (not Promises). You CAN use async/await — it works — but await on a plugin call diff --git a/src/agent/tool-gating.ts b/src/agent/tool-gating.ts index 3a5f00e..1b08f42 100644 --- a/src/agent/tool-gating.ts +++ b/src/agent/tool-gating.ts @@ -29,6 +29,7 @@ export const ALLOWED_TOOLS = new Set([ "list_modules", // List available modules (system + user) "module_info", // Detailed module info + exports "delete_module", // Delete a user module + "generate_skill", // Persist a SKILL.md learned this session "write_output", // Write text content directly to fs-write base directory "read_input", // Read text content directly from fs-read base directory "read_output", // Read content from a previously written output diff --git a/tests/approach-resolver.test.ts b/tests/approach-resolver.test.ts index 9de4bf8..9f5cb1f 100644 --- a/tests/approach-resolver.test.ts +++ b/tests/approach-resolver.test.ts @@ -22,6 +22,8 @@ function makeSkill( antiPatterns, requiresMcp: [], guidance, + source: "system", + filePath: `/tmp/test/${name}/SKILL.md`, }; } @@ -186,6 +188,8 @@ describe("approach-resolver", () => { antiPatterns: [], requiresMcp: ["fabric-rti-mcp"], guidance: "", + source: "system", + filePath: "/tmp/test/kql/SKILL.md", }; const skills = new Map([["kql", skill]]); const patterns = new Map(); @@ -205,6 +209,8 @@ describe("approach-resolver", () => { antiPatterns: [], requiresMcp: ["fabric-rti-mcp"], guidance: "", + source: "system", + filePath: "/tmp/test/s1/SKILL.md", }; const s2: Skill = { name: "s2", @@ -214,6 +220,8 @@ describe("approach-resolver", () => { antiPatterns: [], requiresMcp: ["fabric-rti-mcp", "other-mcp"], guidance: "", + source: "system", + filePath: "/tmp/test/s2/SKILL.md", }; const skills = new Map([ ["s1", s1], diff --git a/tests/intent-matcher.test.ts b/tests/intent-matcher.test.ts index b4c58fb..6d29c45 100644 --- a/tests/intent-matcher.test.ts +++ b/tests/intent-matcher.test.ts @@ -17,6 +17,8 @@ function makeSkill( antiPatterns: [], requiresMcp: [], guidance: "", + source: "system", + filePath: `/tmp/test/${name}/SKILL.md`, }; } diff --git a/tests/session-context.test.ts b/tests/session-context.test.ts new file mode 100644 index 0000000..844a9ff --- /dev/null +++ b/tests/session-context.test.ts @@ -0,0 +1,198 @@ +// ── Session Context Tests ─────────────────────────────────────────── +// +// Covers extractSessionContext + renderSessionContext. Both are pure +// functions of `AgentState`, so we build a minimal hand-rolled state +// object rather than spinning up a real session. +// +// ───────────────────────────────────────────────────────────────────── + +import { describe, it, expect } from "vitest"; +import { + extractSessionContext, + renderSessionContext, +} from "../src/agent/session-context.js"; +import type { AgentState } from "../src/agent/state.js"; + +/** + * Build a minimal `AgentState` that satisfies the structural fields the + * session-context functions read. Everything else is left as a + * placeholder cast — never accessed by the code under test. + */ +function makeState( + overrides: Partial< + Pick< + AgentState, + | "toolCallHistory" + | "mcpServersUsed" + | "modulesRegistered" + | "currentUserPrompt" + > + > = {}, +): AgentState { + // Cast through unknown so we don't have to fill in every unrelated + // field of AgentState — `extractSessionContext` only reads the four + // fields above. + return { + toolCallHistory: [], + mcpServersUsed: new Set(), + modulesRegistered: [], + currentUserPrompt: "", + ...overrides, + } as unknown as AgentState; +} + +describe("extractSessionContext", () => { + it("returns empty context for an unused session", () => { + const ctx = extractSessionContext(makeState()); + expect(ctx.totalToolCalls).toBe(0); + expect(ctx.failedToolCalls).toBe(0); + expect(ctx.topTools).toEqual([]); + expect(ctx.mcpServersUsed).toEqual([]); + expect(ctx.modulesRegistered).toEqual([]); + expect(ctx.errorsSeen).toEqual([]); + expect(ctx.userPrompt).toBe(""); + }); + + it("aggregates tool counts and sorts descending by count", () => { + const ctx = extractSessionContext( + makeState({ + toolCallHistory: [ + { tool: "execute_javascript", success: true, timestamp: 1 }, + { tool: "execute_javascript", success: true, timestamp: 2 }, + { tool: "execute_javascript", success: true, timestamp: 3 }, + { tool: "register_handler", success: true, timestamp: 4 }, + { tool: "list_modules", success: true, timestamp: 5 }, + { tool: "register_handler", success: true, timestamp: 6 }, + ], + }), + ); + expect(ctx.topTools).toEqual([ + { tool: "execute_javascript", count: 3 }, + { tool: "register_handler", count: 2 }, + { tool: "list_modules", count: 1 }, + ]); + expect(ctx.totalToolCalls).toBe(6); + expect(ctx.failedToolCalls).toBe(0); + }); + + it("counts failures and deduplicates error summaries", () => { + const ctx = extractSessionContext( + makeState({ + toolCallHistory: [ + { + tool: "execute_javascript", + success: false, + errorSummary: "Out of memory", + timestamp: 1, + }, + { + tool: "execute_javascript", + success: false, + errorSummary: "Out of memory", + timestamp: 2, + }, + { + tool: "execute_bash", + success: false, + errorSummary: "Permission denied", + timestamp: 3, + }, + { tool: "list_modules", success: true, timestamp: 4 }, + ], + }), + ); + expect(ctx.failedToolCalls).toBe(3); + expect(ctx.errorsSeen).toEqual(["Out of memory", "Permission denied"]); + }); + + it("sorts mcpServersUsed and modulesRegistered alphabetically", () => { + const ctx = extractSessionContext( + makeState({ + mcpServersUsed: new Set(["zebra", "alpha", "mike"]), + modulesRegistered: ["zebra-mod", "alpha-mod", "mike-mod"], + }), + ); + expect(ctx.mcpServersUsed).toEqual(["alpha", "mike", "zebra"]); + expect(ctx.modulesRegistered).toEqual([ + "alpha-mod", + "mike-mod", + "zebra-mod", + ]); + }); + + it("propagates currentUserPrompt verbatim", () => { + const ctx = extractSessionContext( + makeState({ currentUserPrompt: "Find the Teams transcript" }), + ); + expect(ctx.userPrompt).toBe("Find the Teams transcript"); + }); + + it("caps the topTools list at 10 entries", () => { + // Build 15 distinct tools each called once, plus one called 20 times. + const history: AgentState["toolCallHistory"] = []; + for (let i = 0; i < 20; i++) { + history.push({ tool: "popular", success: true, timestamp: i }); + } + for (let i = 0; i < 15; i++) { + history.push({ tool: `tool-${i}`, success: true, timestamp: 100 + i }); + } + const ctx = extractSessionContext(makeState({ toolCallHistory: history })); + expect(ctx.topTools).toHaveLength(10); + expect(ctx.topTools[0].tool).toBe("popular"); + expect(ctx.topTools[0].count).toBe(20); + }); + + it("does not mutate the input state", () => { + const state = makeState({ + toolCallHistory: [ + { tool: "execute_javascript", success: true, timestamp: 1 }, + ], + mcpServersUsed: new Set(["server-a"]), + modulesRegistered: ["mod-a"], + }); + const beforeSize = state.mcpServersUsed.size; + extractSessionContext(state); + extractSessionContext(state); + expect(state.toolCallHistory).toHaveLength(1); + expect(state.mcpServersUsed.size).toBe(beforeSize); + expect(state.modulesRegistered).toEqual(["mod-a"]); + }); +}); + +describe("renderSessionContext", () => { + it("omits sections that have no data", () => { + const text = renderSessionContext({ + userPrompt: "", + topTools: [], + mcpServersUsed: [], + modulesRegistered: [], + errorsSeen: [], + totalToolCalls: 0, + failedToolCalls: 0, + }); + expect(text).not.toMatch(/Top tools used/); + expect(text).not.toMatch(/MCP servers used/); + expect(text).not.toMatch(/Modules registered/); + expect(text).not.toMatch(/Errors/); + expect(text).toMatch(/Tool activity: 0 calls/); + }); + + it("emits each populated section in a stable order", () => { + const text = renderSessionContext({ + userPrompt: "Original task", + topTools: [{ tool: "execute_javascript", count: 5 }], + mcpServersUsed: ["workiq"], + modulesRegistered: ["teams-transcript"], + errorsSeen: ["Permission denied"], + totalToolCalls: 10, + failedToolCalls: 1, + }); + expect(text).toMatch(/Original prompt: Original task/); + expect(text).toMatch(/Tool activity: 10 calls \(1 failed\)/); + expect(text).toMatch(/Top tools used:\n {2}• execute_javascript ×5/); + expect(text).toMatch(/MCP servers used: workiq/); + expect(text).toMatch(/Modules registered: teams-transcript/); + expect(text).toMatch(/Errors \/ dead-ends/); + expect(text).toMatch(/• Permission denied/); + }); +}); diff --git a/tests/skill-loader.test.ts b/tests/skill-loader.test.ts index d16f695..a29bf06 100644 --- a/tests/skill-loader.test.ts +++ b/tests/skill-loader.test.ts @@ -9,7 +9,7 @@ import { describe, it, expect, beforeEach, afterEach } from "vitest"; import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from "fs"; import { join } from "path"; import { tmpdir } from "os"; -import { loadSkills } from "../src/agent/skill-loader.js"; +import { loadSkills, loadSkillsFromDirs } from "../src/agent/skill-loader.js"; let tempDir: string; @@ -181,4 +181,92 @@ description: Unnamed skill expect(skills.size).toBe(0); }); }); + + describe("source + filePath fields", () => { + it("defaults source to 'system' and records the absolute SKILL.md path", () => { + writeSkill( + "src-default", + `---\nname: src-default\ndescription: x\ntriggers:\n - x\n---\nBody.\n`, + ); + const skills = loadSkills(tempDir); + const s = skills.get("src-default"); + expect(s).toBeDefined(); + expect(s!.source).toBe("system"); + expect(s!.filePath).toBe(join(tempDir, "src-default", "SKILL.md")); + }); + + it("propagates an explicit source argument", () => { + writeSkill( + "src-user", + `---\nname: src-user\ndescription: x\ntriggers:\n - x\n---\nBody.\n`, + ); + const skills = loadSkills(tempDir, "user"); + expect(skills.get("src-user")!.source).toBe("user"); + }); + }); + + describe("loadSkillsFromDirs", () => { + let userDir: string; + beforeEach(() => { + userDir = mkdtempSync(join(tmpdir(), "skill-loader-user-")); + }); + afterEach(() => { + rmSync(userDir, { recursive: true, force: true }); + }); + + function writeUserSkill(name: string, content: string): void { + const dir = join(userDir, name); + mkdirSync(dir, { recursive: true }); + writeFileSync(join(dir, "SKILL.md"), content, "utf-8"); + } + + it("merges skills from multiple directories", () => { + writeSkill( + "system-only", + `---\nname: system-only\ndescription: sys\ntriggers:\n - x\n---\nA\n`, + ); + writeUserSkill( + "user-only", + `---\nname: user-only\ndescription: usr\ntriggers:\n - x\n---\nB\n`, + ); + const skills = loadSkillsFromDirs([ + { dir: tempDir, source: "system" }, + { dir: userDir, source: "user" }, + ]); + expect(skills.size).toBe(2); + expect(skills.get("system-only")!.source).toBe("system"); + expect(skills.get("user-only")!.source).toBe("user"); + }); + + it("user skills override system skills with the same name", () => { + writeSkill( + "shared", + `---\nname: shared\ndescription: SYSTEM\ntriggers:\n - x\n---\nA\n`, + ); + writeUserSkill( + "shared", + `---\nname: shared\ndescription: USER\ntriggers:\n - x\n---\nB\n`, + ); + const skills = loadSkillsFromDirs([ + { dir: tempDir, source: "system" }, + { dir: userDir, source: "user" }, + ]); + const s = skills.get("shared")!; + expect(s.source).toBe("user"); + expect(s.description).toBe("USER"); + }); + + it("ignores nonexistent directories without throwing", () => { + writeSkill( + "exists", + `---\nname: exists\ndescription: x\ntriggers:\n - x\n---\nA\n`, + ); + const skills = loadSkillsFromDirs([ + { dir: tempDir, source: "system" }, + { dir: "/nonexistent/path/zzz", source: "user" }, + ]); + expect(skills.size).toBe(1); + expect(skills.get("exists")).toBeDefined(); + }); + }); }); diff --git a/tests/skill-writer.test.ts b/tests/skill-writer.test.ts new file mode 100644 index 0000000..9b4e564 --- /dev/null +++ b/tests/skill-writer.test.ts @@ -0,0 +1,308 @@ +// ── Skill Writer Tests ────────────────────────────────────────────── +// +// Covers validation, SKILL.md serialisation, and the persistence CRUD +// surface (writeUserSkill, listUserSkills, readUserSkill, deleteUserSkill, +// userSkillExists). +// +// Each test fixes HYPERAGENT_USER_SKILLS_DIR to a tmpdir so the real +// ~/.hyperagent/skills/ library is never touched. Because skill-writer +// reads the env var at module load time, we re-import the module fresh +// inside each `beforeEach` after setting the env var. +// ───────────────────────────────────────────────────────────────────── + +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; +import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; + +/** + * Module-under-test, re-imported per test so it reads the freshly set + * `HYPERAGENT_USER_SKILLS_DIR` env var. Vitest caches modules across + * the file by default; `vi.resetModules()` plus a fresh dynamic import + * gives each test a clean slate. + */ +type SkillWriter = typeof import("../src/agent/skill-writer.js"); + +let tempUserSkillsDir: string; +let tempPatternsDir: string; +let writer: SkillWriter; +let savedEnv: string | undefined; + +beforeEach(async () => { + tempUserSkillsDir = mkdtempSync(join(tmpdir(), "skill-writer-test-")); + tempPatternsDir = mkdtempSync(join(tmpdir(), "skill-writer-patterns-")); + // Seed a pattern so we can exercise pattern-reference validation. + const patternDir = join(tempPatternsDir, "two-handler-pipeline"); + mkdirSync(patternDir, { recursive: true }); + writeFileSync( + join(patternDir, "PATTERN.md"), + `---\nname: two-handler-pipeline\ndescription: Test pattern\nmodules: []\nheapMb: 256\ncpuTimeoutMs: 10000\nwallTimeoutMs: 30000\n---\nBody.\n`, + "utf-8", + ); + + savedEnv = process.env.HYPERAGENT_USER_SKILLS_DIR; + process.env.HYPERAGENT_USER_SKILLS_DIR = tempUserSkillsDir; + // Reset module cache so the constants re-evaluate against the env var. + vi.resetModules(); + writer = await import("../src/agent/skill-writer.js"); +}); + +afterEach(() => { + if (savedEnv === undefined) delete process.env.HYPERAGENT_USER_SKILLS_DIR; + else process.env.HYPERAGENT_USER_SKILLS_DIR = savedEnv; + rmSync(tempUserSkillsDir, { recursive: true, force: true }); + rmSync(tempPatternsDir, { recursive: true, force: true }); +}); + +// ── validateSkillName ──────────────────────────────────────────────── + +describe("validateSkillName", () => { + it("accepts kebab-case names", () => { + expect(writer.validateSkillName("teams-transcript-finder")).toBeNull(); + expect(writer.validateSkillName("a")).toBeNull(); + expect(writer.validateSkillName("foo123")).toBeNull(); + }); + + it("rejects empty names", () => { + expect(writer.validateSkillName("")).toMatch(/empty/); + }); + + it("rejects names with uppercase, spaces, or symbols", () => { + expect(writer.validateSkillName("Teams-Skill")).toMatch(/kebab|lowercase/i); + expect(writer.validateSkillName("teams skill")).toMatch(/kebab|lowercase/i); + expect(writer.validateSkillName("teams_skill")).toMatch(/kebab|lowercase/i); + }); + + it("rejects names with path traversal characters", () => { + // The regex blocks these before the traversal check fires, but the + // outcome we care about is "rejected". + expect(writer.validateSkillName("../evil")).not.toBeNull(); + expect(writer.validateSkillName("foo/bar")).not.toBeNull(); + expect(writer.validateSkillName("foo\\bar")).not.toBeNull(); + }); + + it("rejects names longer than 64 characters", () => { + expect(writer.validateSkillName("a".repeat(65))).toMatch(/64/); + }); +}); + +// ── validateSkillData ──────────────────────────────────────────────── + +describe("validateSkillData", () => { + const validData = () => ({ + name: "test-skill", + description: "A test skill", + triggers: ["test"], + guidance: "Do the thing.", + allowedTools: ["execute_javascript"], + }); + + it("accepts a minimal valid payload", () => { + expect(writer.validateSkillData(validData(), new Set())).toBeNull(); + }); + + it("rejects missing description", () => { + const err = writer.validateSkillData( + { ...validData(), description: "" }, + new Set(), + ); + expect(err).toMatch(/description/); + }); + + it("rejects empty triggers", () => { + const err = writer.validateSkillData( + { ...validData(), triggers: [] }, + new Set(), + ); + expect(err).toMatch(/trigger/); + }); + + it("rejects empty allowedTools", () => { + const err = writer.validateSkillData( + { ...validData(), allowedTools: [] }, + new Set(), + ); + expect(err).toMatch(/allowed-tool/); + }); + + it("rejects unknown tools", () => { + const err = writer.validateSkillData( + { ...validData(), allowedTools: ["nuclear_launch"] }, + new Set(), + ); + expect(err).toMatch(/nuclear_launch/); + }); + + it("rejects descriptions over 280 chars", () => { + const err = writer.validateSkillData( + { ...validData(), description: "x".repeat(281) }, + new Set(), + ); + expect(err).toMatch(/280/); + }); + + it("rejects unknown pattern references", () => { + const err = writer.validateSkillData( + { ...validData(), patterns: ["not-real"] }, + new Set(["two-handler-pipeline"]), + ); + expect(err).toMatch(/not-real/); + }); + + it("accepts known pattern references", () => { + expect( + writer.validateSkillData( + { ...validData(), patterns: ["two-handler-pipeline"] }, + new Set(["two-handler-pipeline"]), + ), + ).toBeNull(); + }); +}); + +// ── renderSkillMarkdown ────────────────────────────────────────────── + +describe("renderSkillMarkdown", () => { + it("emits all required fields", () => { + const md = writer.renderSkillMarkdown({ + name: "demo", + description: "Demo skill", + triggers: ["demo", "example"], + guidance: "Run the demo.", + allowedTools: ["execute_javascript"], + }); + expect(md).toMatch(/^---\n/); + expect(md).toMatch(/name: demo\n/); + expect(md).toMatch(/description: Demo skill\n/); + expect(md).toMatch(/triggers:\n {2}- demo\n {2}- example\n/); + expect(md).toMatch(/allowed-tools:\n {2}- execute_javascript\n/); + expect(md).toMatch(/\n---\n\nRun the demo\.\n$/); + }); + + it("omits optional sections when empty", () => { + const md = writer.renderSkillMarkdown({ + name: "demo", + description: "Demo skill", + triggers: ["demo"], + guidance: "Body.", + allowedTools: ["execute_javascript"], + }); + expect(md).not.toMatch(/patterns:/); + expect(md).not.toMatch(/antiPatterns:/); + expect(md).not.toMatch(/requires-mcp:/); + }); + + it("includes optional sections when present", () => { + const md = writer.renderSkillMarkdown({ + name: "demo", + description: "Demo skill", + triggers: ["demo"], + guidance: "Body.", + patterns: ["two-handler-pipeline"], + antiPatterns: ["Do not delete production data"], + requiresMcp: ["workiq"], + allowedTools: ["execute_javascript"], + }); + expect(md).toMatch(/patterns:\n {2}- two-handler-pipeline\n/); + expect(md).toMatch(/antiPatterns:\n {2}- Do not delete production data\n/); + expect(md).toMatch(/requires-mcp:\n {2}- workiq\n/); + }); +}); + +// ── writeUserSkill + CRUD round-trip ───────────────────────────────── + +describe("writeUserSkill / listUserSkills / readUserSkill / deleteUserSkill", () => { + it("persists a skill to disk and round-trips through list/read", () => { + const filePath = writer.writeUserSkill( + { + name: "round-trip", + description: "Round trip test", + triggers: ["round"], + guidance: "Do round trips.", + allowedTools: ["execute_javascript"], + }, + tempPatternsDir, + ); + expect(filePath).toContain("round-trip"); + expect(filePath).toMatch(/SKILL\.md$/); + + expect(writer.userSkillExists("round-trip")).toBe(true); + const listed = writer.listUserSkills(); + expect(listed).toHaveLength(1); + expect(listed[0].name).toBe("round-trip"); + expect(listed[0].sizeBytes).toBeGreaterThan(0); + + const content = writer.readUserSkill("round-trip"); + expect(content).not.toBeNull(); + expect(content).toMatch(/name: round-trip/); + expect(content).toMatch(/Do round trips\./); + }); + + it("listUserSkills returns names sorted alphabetically", () => { + for (const name of ["zebra", "alpha", "mike"]) { + writer.writeUserSkill( + { + name, + description: "x", + triggers: ["x"], + guidance: "x", + allowedTools: ["execute_javascript"], + }, + tempPatternsDir, + ); + } + expect(writer.listUserSkills().map((s) => s.name)).toEqual([ + "alpha", + "mike", + "zebra", + ]); + }); + + it("deleteUserSkill removes the directory and marks skill absent", () => { + writer.writeUserSkill( + { + name: "doomed", + description: "x", + triggers: ["x"], + guidance: "x", + allowedTools: ["execute_javascript"], + }, + tempPatternsDir, + ); + expect(writer.userSkillExists("doomed")).toBe(true); + writer.deleteUserSkill("doomed"); + expect(writer.userSkillExists("doomed")).toBe(false); + expect(writer.readUserSkill("doomed")).toBeNull(); + }); + + it("rejects invalid skill names before writing anything", () => { + expect(() => + writer.writeUserSkill( + { + name: "Bad Name", + description: "x", + triggers: ["x"], + guidance: "x", + allowedTools: ["execute_javascript"], + }, + tempPatternsDir, + ), + ).toThrow(/validation/i); + expect(writer.listUserSkills()).toEqual([]); + }); + + it("rejects unknown pattern references against the patterns dir", () => { + expect(() => + writer.writeUserSkill( + { + name: "bad-pattern", + description: "x", + triggers: ["x"], + guidance: "x", + allowedTools: ["execute_javascript"], + patterns: ["not-a-real-pattern"], + }, + tempPatternsDir, + ), + ).toThrow(/not-a-real-pattern/); + }); +}); From 9ed18e149fa9e5eb2ee284ce86d95f0963817b9a Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 14 May 2026 23:57:36 +0100 Subject: [PATCH 3/4] docs: add hand-off test plan for user-generated skills Standalone walkthrough at docs/TESTING-USER-SKILLS.md covering smoke test, full workout, override behaviour, boundary cases, and likely failure modes. Intended to be passed to reviewers / testers who want to exercise the feature without reading the implementation. Signed-off-by: Simon Davies --- docs/TESTING-USER-SKILLS.md | 193 ++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 docs/TESTING-USER-SKILLS.md diff --git a/docs/TESTING-USER-SKILLS.md b/docs/TESTING-USER-SKILLS.md new file mode 100644 index 0000000..326213b --- /dev/null +++ b/docs/TESTING-USER-SKILLS.md @@ -0,0 +1,193 @@ +# Testing the User-Generated Skills Feature + +A walkthrough for verifying the **user skills** feature end-to-end. The +feature lets a user persist what HyperAgent learned in a session as a +reusable skill at `~/.hyperagent/skills//SKILL.md`, surviving +upgrades and overriding system skills with the same name. + +Implemented on branch `feat-user-skills` (atop fix +`fix-marked-v15-renderer` for the markdown renderer crash). + +--- + +## Prerequisites + +- A working HyperAgent checkout on the `feat-user-skills` branch +- `just setup` already run (Rust addons built, deps installed) — see the + project [README](../README.md) and [DEVELOPMENT.md](DEVELOPMENT.md) +- A terminal where `just start` launches the agent successfully +- A working GitHub Copilot login for the agent's LLM calls + +--- + +## 1. Smoke Test (~2 minutes) + +This is the minimum bar — if this works, the feature is wired up +end-to-end. + +```bash +# Use a throwaway skills dir so you don't pollute ~/.hyperagent/skills/ +export HYPERAGENT_USER_SKILLS_DIR=/tmp/ha-skills-test +mkdir -p "$HYPERAGENT_USER_SKILLS_DIR" + +just start +``` + +In the agent REPL: + +```text +> /skills +``` + +Confirms baseline — only **system** skills should appear, none with the +👤 (user) badge. + +Now do some work the agent will remember: + +```text +> use the fetch plugin to grab https://example.com and tell me the title +``` + +Let it run to completion. Then ask the agent to save what it learned: + +```text +> /save-skill fetch-page-title +``` + +**Expected behaviour:** + +1. The agent receives a synthetic prompt summarising the session + context (tools used, MCP servers, modules registered, recent errors) +2. The LLM calls the `generate_skill(...)` tool +3. You see an interactive approval prompt with a preview of the + `SKILL.md` content +4. Hit `y` to approve + +Verify the file landed on disk: + +```bash +cat /tmp/ha-skills-test/fetch-page-title/SKILL.md +``` + +You should see a valid SKILL.md with YAML frontmatter (`name`, +`description`, `triggers`, etc.) and a markdown guidance body. + +If that file exists, **the feature works.** 🎉 + +--- + +## 2. Full Workout + +Exercise every command path. From a fresh `just start`: + +```text +> /skills # list both system + user skills +> /skills info code-review # show full detail for a system skill +> /save-skill # no name → LLM picks one +> /skills # user skill now shows with 👤 +> /skills info fetch-page-title # user skill detail +> /skills edit fetch-page-title # opens $EDITOR for hand-tuning +> exit +``` + +Then restart the agent and repeat the original task — the matching +`/suggest_approach` should surface the saved skill via its triggers. + +--- + +## 3. Override Test + +User skills must override system skills with the same name. Drop a user +skill that shadows an existing system one: + +```bash +mkdir -p "$HYPERAGENT_USER_SKILLS_DIR/code-review" +cat > "$HYPERAGENT_USER_SKILLS_DIR/code-review/SKILL.md" << 'EOF' +--- +name: code-review +description: My customised code review skill +triggers: [review, audit] +--- +This overrides the system version. +EOF + +just start +``` + +In the REPL: + +```text +> /skills info code-review +``` + +**Expected:** the **user** description ("My customised code review +skill") appears, and an **override** flag/note is present. + +--- + +## 4. Negative / Boundary Tests + +Validation should reject bad input cleanly without crashing the agent: + +| Input | Expected outcome | +|-------|------------------| +| `/save-skill BadName` | Rejected — not kebab-case | +| `/save-skill ../escape` | Rejected — path traversal | +| `/save-skill thisnameisreallylongandshouldfailitsbeyondsixtyfourcharactersnowforsure` | Rejected — exceeds 64 chars | +| `/save-skill fetch-page-title` (second time) | Overwrite confirmation prompt | + +--- + +## 5. Cleanup + +```bash +rm -rf /tmp/ha-skills-test +unset HYPERAGENT_USER_SKILLS_DIR +``` + +--- + +## Verification Checklist + +| Symptom | Confirms | +|---------|----------| +| `generate_skill` appears in the tool log after `/save-skill` | LLM picked up the system-message guidance ✅ | +| Approval prompt shows a skill preview | Tool handler validation working ✅ | +| `.md` file lands on disk under `$HYPERAGENT_USER_SKILLS_DIR` | `writeUserSkill()` working ✅ | +| `/skills` shows the 👤 badge for the new skill | Multi-dir loader + `source` field working ✅ | +| `/skills info ` shows the override flag for shadowed system skills | Name-collision detection working ✅ | +| Restarting the agent matches the skill on similar prompts | `loadSkillsFromDirs` + boot wiring working ✅ | + +--- + +## Likely Failure Modes & Where to Look + +- **`/save-skill` runs but the LLM never calls `generate_skill`** — the + synthetic prompt from `submitToLLM` may be too weak. See + [src/agent/slash-commands.ts](../src/agent/slash-commands.ts) (the + `/save-skill` handler) and + [src/agent/system-message.ts](../src/agent/system-message.ts) + ("SAVING WHAT YOU LEARN" section). +- **Tool not allowed** — every new tool needs registration at THREE + points: `tools[]` array, `ALLOWED_TOOLS` in + [src/agent/tool-gating.ts](../src/agent/tool-gating.ts), and + `availableTools[]` in the session config. Triple-check. +- **File written but `/skills` doesn't list it** — + `loadSkillsFromDirs()` in + [src/agent/skill-loader.ts](../src/agent/skill-loader.ts) may not be + reading the user dir. Verify `skillDirectories` in + [src/agent/index.ts](../src/agent/index.ts) includes + `getUserSkillsDir()`. + +--- + +## Reporting Results + +If something doesn't work, please capture: + +1. The full agent REPL transcript +2. Contents of `$HYPERAGENT_USER_SKILLS_DIR` after the test (`ls -laR`) +3. The agent's debug log (`~/.hyperagent/logs/debug-*.log`) +4. The output of `just check` from the same checkout + +…and share with the implementer. Good hunting. 🎯 From 7944070a3ea16e1d141b1ebbf4ac638e728f8780 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Fri, 15 May 2026 00:30:55 +0100 Subject: [PATCH 4/4] fix: address PR #139 review feedback (18 issues) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security & correctness - skill-writer: cap on UTF-8 byte length (not String.length) so a multi-byte payload can't bypass the 64 KB limit - skill-writer: reject reserved /skills subcommand names (info, edit, delete, list) to prevent shadowing the CLI surface - skill-writer: reject description/triggers containing newlines or a bare '---' line so they can't break out of YAML frontmatter - slash-commands /skills info|edit|delete: validate via validateSkillName before any filesystem join — closes the path traversal vector pointed out by the reviewer UX correctness - index.ts generate_skill: surface an 'Overwrite existing user skill?' confirmation when overwrite=true and the file already exists - slash-commands /save-skill: pass skipAutoSuggest=true so the synthetic prompt's scaffolding terms don't trigger unrelated skills via runSuggestApproach - slash-commands /new: also reset currentUserPrompt + lastGuidance - slash-commands /resume: reset toolCallHistory, mcpServersUsed, modulesRegistered, currentUserPrompt, lastGuidance — local session-learning state can't be reconstructed from a resumed remote session - slash-commands /save-skill: fix 'distinct tools' status line to count the full tool history, not the bounded topTools view - session-context: truncate currentUserPrompt to 2000 chars with an ellipsis so a giant paste can't dominate the prompt MCP session-learning correctness - mcp/plugin-adapter: add optional onCall observer; agent wires it to state.mcpServersUsed so calls made from inside execute_javascript via host:mcp- imports are now tracked - state.ts: add skipNextAutoSuggest flag (consumed in onUserPromptSubmitted) Documentation - docs/TESTING-USER-SKILLS.md: drop branch-name reference, switch override example from non-existent 'code-review' to bundled 'kql-expert', clarify '/skills edit' prints a path (no $EDITOR), describe the now-correct overwrite confirmation flow, note that the override badge surfaces in '/skills' list view, fix approval prompt wording (summary, not full content) Tests - Reserved-name rejection - YAML-unsafe newline rejection (description + trigger) - UTF-8 byte-length cap (32 KB of 4-byte chars) - User-prompt truncation contract Quality gate: 2448 TS tests pass (+5), 124 Rust tests pass. Signed-off-by: Simon Davies --- docs/TESTING-USER-SKILLS.md | 46 +++++++++++++++----------- src/agent/index.ts | 48 +++++++++++++++++++++++---- src/agent/mcp/plugin-adapter.ts | 25 ++++++++++++++ src/agent/session-context.ts | 19 ++++++++++- src/agent/skill-writer.ts | 52 +++++++++++++++++++++++++---- src/agent/slash-commands.ts | 55 ++++++++++++++++++++++++++++--- src/agent/state.ts | 11 +++++++ tests/session-context.test.ts | 15 +++++++++ tests/skill-writer.test.ts | 58 +++++++++++++++++++++++++++++++++ 9 files changed, 292 insertions(+), 37 deletions(-) diff --git a/docs/TESTING-USER-SKILLS.md b/docs/TESTING-USER-SKILLS.md index 326213b..3b95a7f 100644 --- a/docs/TESTING-USER-SKILLS.md +++ b/docs/TESTING-USER-SKILLS.md @@ -5,14 +5,11 @@ feature lets a user persist what HyperAgent learned in a session as a reusable skill at `~/.hyperagent/skills//SKILL.md`, surviving upgrades and overriding system skills with the same name. -Implemented on branch `feat-user-skills` (atop fix -`fix-marked-v15-renderer` for the markdown renderer crash). - --- ## Prerequisites -- A working HyperAgent checkout on the `feat-user-skills` branch +- A working HyperAgent checkout - `just setup` already run (Rust addons built, deps installed) — see the project [README](../README.md) and [DEVELOPMENT.md](DEVELOPMENT.md) - A terminal where `just start` launches the agent successfully @@ -59,8 +56,10 @@ Let it run to completion. Then ask the agent to save what it learned: 1. The agent receives a synthetic prompt summarising the session context (tools used, MCP servers, modules registered, recent errors) 2. The LLM calls the `generate_skill(...)` tool -3. You see an interactive approval prompt with a preview of the - `SKILL.md` content +3. You see an interactive approval prompt showing a **summary** — the + skill name, the one-line description, a preview of the first few + triggers, the allowed-tools list, and a byte count for the guidance + body. (The full content is *not* echoed to stdout.) 4. Hit `y` to approve Verify the file landed on disk: @@ -82,14 +81,19 @@ Exercise every command path. From a fresh `just start`: ```text > /skills # list both system + user skills -> /skills info code-review # show full detail for a system skill +> /skills info kql-expert # show full detail for a bundled system skill > /save-skill # no name → LLM picks one > /skills # user skill now shows with 👤 > /skills info fetch-page-title # user skill detail -> /skills edit fetch-page-title # opens $EDITOR for hand-tuning +> /skills edit fetch-page-title # prints the user-skill path; open it in your editor > exit ``` +> `/skills edit ` does **not** spawn `$EDITOR`. It just prints +> the absolute path to the user-skill `SKILL.md` so you can open it +> in your own editor of choice. Save the file, then restart (or run +> `/suggest_approach`) and the change takes effect. + Then restart the agent and repeat the original task — the matching `/suggest_approach` should surface the saved skill via its triggers. @@ -98,15 +102,17 @@ Then restart the agent and repeat the original task — the matching ## 3. Override Test User skills must override system skills with the same name. Drop a user -skill that shadows an existing system one: +skill that shadows an existing system one (pick any skill that `ls +skills/` shows — here we use `kql-expert`): ```bash -mkdir -p "$HYPERAGENT_USER_SKILLS_DIR/code-review" -cat > "$HYPERAGENT_USER_SKILLS_DIR/code-review/SKILL.md" << 'EOF' +mkdir -p "$HYPERAGENT_USER_SKILLS_DIR/kql-expert" +cat > "$HYPERAGENT_USER_SKILLS_DIR/kql-expert/SKILL.md" << 'EOF' --- -name: code-review -description: My customised code review skill -triggers: [review, audit] +name: kql-expert +description: My customised KQL skill +triggers: [kql, kusto, query] +allowed-tools: [execute_javascript] --- This overrides the system version. EOF @@ -117,11 +123,12 @@ just start In the REPL: ```text -> /skills info code-review +> /skills ``` -**Expected:** the **user** description ("My customised code review -skill") appears, and an **override** flag/note is present. +**Expected:** the `kql-expert` row appears with the **`👤 (overrides +built-in)`** badge in the list view. Running `/skills info kql-expert` +then shows the **user** description ("My customised KQL skill"). --- @@ -134,7 +141,8 @@ Validation should reject bad input cleanly without crashing the agent: | `/save-skill BadName` | Rejected — not kebab-case | | `/save-skill ../escape` | Rejected — path traversal | | `/save-skill thisnameisreallylongandshouldfailitsbeyondsixtyfourcharactersnowforsure` | Rejected — exceeds 64 chars | -| `/save-skill fetch-page-title` (second time) | Overwrite confirmation prompt | +| `/save-skill info` | Rejected — reserved subcommand name | +| `/save-skill fetch-page-title` (second time, fresh session) | `generate_skill` first errors with "already exists — set overwrite=true"; the LLM retries with `overwrite=true`, and you get an **"Overwrite existing user skill?"** confirmation before the file is replaced | --- @@ -155,7 +163,7 @@ unset HYPERAGENT_USER_SKILLS_DIR | Approval prompt shows a skill preview | Tool handler validation working ✅ | | `.md` file lands on disk under `$HYPERAGENT_USER_SKILLS_DIR` | `writeUserSkill()` working ✅ | | `/skills` shows the 👤 badge for the new skill | Multi-dir loader + `source` field working ✅ | -| `/skills info ` shows the override flag for shadowed system skills | Name-collision detection working ✅ | +| `/skills` shows `👤 (overrides built-in)` for shadowed system skills | Name-collision detection working ✅ | | Restarting the agent matches the skill on similar prompts | `loadSkillsFromDirs` + boot wiring working ✅ | --- diff --git a/src/agent/index.ts b/src/agent/index.ts index 1852fa7..68291ef 100644 --- a/src/agent/index.ts +++ b/src/agent/index.ts @@ -1016,6 +1016,14 @@ async function syncPluginsToSandbox(): Promise { conn, mcpManager, mcpWriteSafetyGate, + // Session-learning: record any MCP server the LLM + // actually exercised — including calls made from inside + // `execute_javascript` via `host:mcp-` imports + // (which never surface as a top-level `mcp__*` tool name + // and so are invisible to onPostToolUse). + (serverName: string) => { + state.mcpServersUsed.add(serverName); + }, ); registrations.push(adapter); } @@ -1131,8 +1139,11 @@ async function handleSlashCommand( drainAndWarn, mcpManager, // Real MCP manager (or null if no config) syncPlugins: syncPluginsToSandbox, - submitToLLM: (prompt: string) => { + submitToLLM: (prompt: string, options?: { skipAutoSuggest?: boolean }) => { state.pendingPrompt = prompt; + if (options?.skipAutoSuggest) { + state.skipNextAutoSuggest = true; + } }, }; return handleSlashCommandImpl(rawInput, rl, slashDeps); @@ -5574,7 +5585,18 @@ const generateSkillTool = defineTool("generate_skill", { params.triggers.length > 5 ? ` (+${params.triggers.length - 5} more)` : ""; - console.log(`\n ${C.warn("📚 Save skill:")} ${C.tool(params.name)}`); + // Surface the overwrite path explicitly — the LLM passing + // `overwrite=true` is necessary but not sufficient. The user + // gets a chance to refuse before we replace existing content. + const isOverwrite = + params.overwrite === true && userSkillExists(params.name); + if (isOverwrite) { + console.log( + `\n ${C.warn("⚠️ Overwrite existing user skill:")} ${C.tool(params.name)}`, + ); + } else { + console.log(`\n ${C.warn("📚 Save skill:")} ${C.tool(params.name)}`); + } console.log(` ${params.description}`); console.log(` Triggers: ${triggerPreview}${triggerSuffix}`); console.log( @@ -5589,9 +5611,12 @@ const generateSkillTool = defineTool("generate_skill", { } await drainAndWarn(rl); + const promptLabel = isOverwrite + ? ` ${C.dim("Overwrite skill? [y/n] ")}` + : ` ${C.dim("Save skill? [y/n] ")}`; const approval = state.autoApprove ? "y" - : await promptUser(rl, ` ${C.dim("Save skill? [y/n] ")}`); + : await promptUser(rl, promptLabel); if (approval.trim().toLowerCase() !== "y") { console.log(` ${C.dim("Denied by user.")}`); return { success: false, error: "Skill save denied by user" }; @@ -6013,9 +6038,16 @@ function buildSessionConfig() { state.currentUserPrompt = input.prompt; state.hasCalledListModules = false; + // Slash commands that queue a synthetic prompt (e.g. /save-skill) + // set state.skipNextAutoSuggest so the auto-suggest pass doesn't + // match unrelated skills on scaffolding terms like "MCP" or + // "SKILL.md". Consume the flag before the suggest pass below. + const skipAutoSuggest = state.skipNextAutoSuggest; + state.skipNextAutoSuggest = false; + // Auto-invoke suggest_approach for non-trivial prompts const isNonTrivial = input.prompt.length > 25; - if (isNonTrivial) { + if (isNonTrivial && !skipAutoSuggest) { const result = runSuggestApproach( input.prompt, state.preLoadedSkills, @@ -6144,9 +6176,11 @@ function buildSessionConfig() { state.toolCallHistory.length - MAX_TOOL_HISTORY, ); } - // Tools whose name looks like `mcp____` count - // their server as "used" — that's how the SDK exposes MCP - // tools to the LLM. See manage_mcp + listMCPServersTool. + // Top-level MCP tool fallback: when an SDK ever surfaces a + // tool as `mcp____` we still count the server. + // The primary tracking path is the MCP plugin-adapter's onCall + // observer (wired in syncPluginsToSandbox above) which catches + // calls made via `host:mcp-` imports too. if (success && toolName.startsWith("mcp__")) { const server = toolName.split("__")[1]; if (server) state.mcpServersUsed.add(server); diff --git a/src/agent/mcp/plugin-adapter.ts b/src/agent/mcp/plugin-adapter.ts index 2abfa0a..01a24ac 100644 --- a/src/agent/mcp/plugin-adapter.ts +++ b/src/agent/mcp/plugin-adapter.ts @@ -33,6 +33,22 @@ export type WriteSafetyGate = ( annotations: MCPToolAnnotations | undefined, ) => Promise; +/** + * Callback invoked at the start of every MCP tool call routed through + * the adapter. Used by the agent to track which MCP servers were + * actually exercised in this session (independent of whether the LLM + * called the tool via a top-level `mcp__*` name or imported it as a + * `host:mcp-` module from inside `execute_javascript`). + * + * Fires after the write-safety gate (if any) has approved, but before + * the call is dispatched to the manager. Synchronous on purpose so + * tracking cannot delay the call. + * + * @param serverName - MCP server name (e.g. "work-iq-mail") + * @param toolName - Tool name being invoked + */ +export type MCPCallObserver = (serverName: string, toolName: string) => void; + /** * PluginRegistration-compatible interface. * Matches the shape expected by src/sandbox/tool.js setPlugins(). @@ -55,12 +71,16 @@ export interface MCPPluginRegistration { * @param manager - The client manager for making tool calls. * @param gate - Optional write-safety gate. When provided, non-read-only * tools are checked before execution. + * @param onCall - Optional observer fired on every successful gate-pass + * immediately before the tool is dispatched. Used by the agent for + * session-learning tracking (see state.mcpServersUsed). * @returns A PluginRegistration that can be passed to setPlugins(). */ export function createMCPPluginAdapter( conn: MCPConnection, manager: MCPClientManager, gate?: WriteSafetyGate, + onCall?: MCPCallObserver, ): MCPPluginRegistration { const moduleName = `mcp-${conn.name}`; @@ -93,6 +113,11 @@ export function createMCPPluginAdapter( } } + // Notify any observer that we are about to dispatch. The + // observer is intentionally invoked AFTER the gate so denied + // calls do not pollute session-learning state. + onCall?.(conn.name, tool.name); + return manager.callTool(conn.name, tool.name, toolArgs); }; } diff --git a/src/agent/session-context.ts b/src/agent/session-context.ts index 565de6a..12fd1f3 100644 --- a/src/agent/session-context.ts +++ b/src/agent/session-context.ts @@ -30,6 +30,15 @@ const MAX_ERRORS_REPORTED = 8; */ const MAX_TOP_TOOLS = 10; +/** + * Maximum characters of the user's most-recent prompt kept in the + * session-context summary. Anything longer is truncated with an + * ellipsis — the LLM only needs the gist of the original task to + * anchor the SKILL.md it writes, and a 50-KB paste here would + * dominate the prompt and crowd out the actual session history. + */ +const MAX_USER_PROMPT_CHARS = 2000; + // ── Types ──────────────────────────────────────────────────────────── /** @@ -95,8 +104,16 @@ export function extractSessionContext(state: AgentState): SessionContext { .sort((a, b) => b.count - a.count) .slice(0, MAX_TOP_TOOLS); + // Truncate the user prompt so a giant paste doesn't dominate the + // session-context summary. We keep the head — the leading phrase + // is the strongest signal of intent. + const userPrompt = + state.currentUserPrompt.length > MAX_USER_PROMPT_CHARS + ? state.currentUserPrompt.slice(0, MAX_USER_PROMPT_CHARS) + "…" + : state.currentUserPrompt; + return { - userPrompt: state.currentUserPrompt, + userPrompt, topTools, mcpServersUsed: Array.from(state.mcpServersUsed).sort(), modulesRegistered: [...state.modulesRegistered].sort(), diff --git a/src/agent/skill-writer.ts b/src/agent/skill-writer.ts index 3933ccd..7137240 100644 --- a/src/agent/skill-writer.ts +++ b/src/agent/skill-writer.ts @@ -29,9 +29,11 @@ import { ALLOWED_TOOLS } from "./tool-gating.js"; /** * Root directory for user-created skills. * - * Defaults to `~/.hyperagent/skills/`. The `HYPERAGENT_USER_SKILLS_DIR` - * environment variable overrides the default — tests use this to point - * at a temporary directory without polluting the real user library. + * Resolved at MODULE LOAD TIME. Tests that need to redirect the path + * to a tmpdir must set `HYPERAGENT_USER_SKILLS_DIR` and then re-import + * this module via `vi.resetModules()` + dynamic `import()` — see + * `tests/skill-writer.test.ts` for the pattern. Setting the env var + * AFTER the first import has no effect. */ const DEFAULT_USER_SKILLS_DIR = process.env.HYPERAGENT_USER_SKILLS_DIR ?? @@ -49,6 +51,27 @@ const MAX_TRIGGERS = 50; /** Kebab-case name pattern (lowercase letters, digits, hyphens). */ const VALID_NAME_RE = /^[a-z][a-z0-9-]*$/; +/** + * Names that double as `/skills` subcommands — accepting them as skill + * names would let `/skills ` shadow `/skills info|edit|delete|list` + * and create confusing CLI behaviour. + */ +const RESERVED_SKILL_NAMES: ReadonlySet = new Set([ + "info", + "edit", + "delete", + "list", +]); + +/** + * Pattern matching YAML-frontmatter characters that would break a + * single-line `key: value` representation — newlines split fields and + * a literal `---` terminates the frontmatter block. Used by + * `validateSkillData` to reject payloads that would otherwise need + * heavy YAML escaping in `renderSkillMarkdown`. + */ +const YAML_UNSAFE_RE = /[\r\n]|^---$/; + // ── Types ──────────────────────────────────────────────────────────── /** Input data for a new skill, mirroring SKILL.md frontmatter fields. */ @@ -101,7 +124,8 @@ export function getUserSkillsDir(): string { * Validate a skill name. Returns an error message string, or null if valid. * * Rules: kebab-case (lowercase letters, digits, hyphens; must start with a - * letter), ≤64 characters, no path traversal characters. + * letter), ≤64 characters, no path traversal characters, and not one of + * the reserved `/skills` subcommand names. */ export function validateSkillName(name: string): string | null { if (!name) return "Skill name must not be empty"; @@ -112,6 +136,9 @@ export function validateSkillName(name: string): string | null { if (name.includes("..") || name.includes("/") || name.includes("\\")) { return "Skill name must not contain path traversal characters"; } + if (RESERVED_SKILL_NAMES.has(name)) { + return `Skill name '${name}' is reserved (collides with a /skills subcommand)`; + } return null; } @@ -141,6 +168,10 @@ export function validateSkillData( errors.push( `Skill description must be ≤${MAX_DESCRIPTION_LENGTH} characters`, ); + } else if (YAML_UNSAFE_RE.test(data.description)) { + errors.push( + "Skill description must be a single line (no newlines or '---')", + ); } if (!Array.isArray(data.triggers) || data.triggers.length === 0) { @@ -153,6 +184,12 @@ export function validateSkillData( errors.push("Triggers must be non-empty strings"); break; } + if (YAML_UNSAFE_RE.test(t)) { + errors.push( + "Triggers must be single-line strings (no newlines or '---')", + ); + break; + } } } @@ -252,9 +289,12 @@ export function writeUserSkill(data: SkillData, patternsDir: string): string { } const rendered = renderSkillMarkdown(data); - if (rendered.length > MAX_SKILL_SIZE_BYTES) { + // Size cap is on disk-bytes (UTF-8) — `.length` would under-count for + // multi-byte characters and let a payload sneak past the limit. + const renderedBytes = Buffer.byteLength(rendered, "utf-8"); + if (renderedBytes > MAX_SKILL_SIZE_BYTES) { throw new Error( - `SKILL.md exceeds maximum size (${rendered.length} bytes > ${MAX_SKILL_SIZE_BYTES} bytes)`, + `SKILL.md exceeds maximum size (${renderedBytes} bytes > ${MAX_SKILL_SIZE_BYTES} bytes)`, ); } diff --git a/src/agent/slash-commands.ts b/src/agent/slash-commands.ts index 7939f00..61e0ac0 100644 --- a/src/agent/slash-commands.ts +++ b/src/agent/slash-commands.ts @@ -53,6 +53,7 @@ import { deleteUserSkill, userSkillExists, getUserSkillsDir, + validateSkillName, } from "./skill-writer.js"; import { extractSessionContext, @@ -111,8 +112,16 @@ export interface SlashCommandDeps { * Queue a synthetic prompt for the LLM (used by `/save-skill` and * similar commands that want to inject a user-turn-shaped message). * The prompt is drained at the top of the next REPL iteration. + * + * `options.skipAutoSuggest` suppresses the automatic + * `runSuggestApproach` pass on the next turn — synthetic prompts + * contain scaffolding text (e.g. "MCP", "SKILL.md") that would + * otherwise match unrelated skill triggers. */ - submitToLLM: (prompt: string) => void; + submitToLLM: ( + prompt: string, + options?: { skipAutoSuggest?: boolean }, + ) => void; } // ── Handler ────────────────────────────────────────────────────────── @@ -575,10 +584,14 @@ export async function handleSlashCommand( } as any); registerEventHandler(state.activeSession); // Reset session-learning fields — the new conversation starts - // with no history of what we did last time. + // with no history of what we did last time, and the previous + // user-prompt should not anchor /save-skill or auto-suggest on + // the next turn either. state.toolCallHistory = []; state.mcpServersUsed = new Set(); state.modulesRegistered = []; + state.currentUserPrompt = ""; + state.lastGuidance = null; console.log( ` ${C.ok("🆕 New session started.")} Conversation history cleared.`, ); @@ -777,6 +790,14 @@ export async function handleSlashCommand( } as any, ); registerEventHandler(state.activeSession); + // Local session-learning state cannot be reconstructed from a + // resumed remote session — clear it so /save-skill doesn't + // mine stale history from the previously active conversation. + state.toolCallHistory = []; + state.mcpServersUsed = new Set(); + state.modulesRegistered = []; + state.currentUserPrompt = ""; + state.lastGuidance = null; console.log( ` ${C.ok("⏮️ Resumed session:")} ${C.val(targetId.slice(0, 12) + "…")}`, ); @@ -2130,6 +2151,13 @@ export async function handleSlashCommand( console.log(` ${C.dim("Usage: /skills info ")}`); return true; } + // Validate the name BEFORE touching the filesystem so a value + // like "../etc" can never be `join`-ed into a path we read. + const nameError = validateSkillName(arg); + if (nameError) { + console.log(` ${C.err("❌ Invalid skill name:")} ${nameError}`); + return true; + } const userContent = readUserSkill(arg); if (userContent) { console.log( @@ -2155,6 +2183,11 @@ export async function handleSlashCommand( console.log(` ${C.dim("Usage: /skills edit ")}`); return true; } + const nameError = validateSkillName(arg); + if (nameError) { + console.log(` ${C.err("❌ Invalid skill name:")} ${nameError}`); + return true; + } if (!userSkillExists(arg)) { console.log( ` ${C.err("❌ User skill not found:")} ${arg} ` + @@ -2176,6 +2209,11 @@ export async function handleSlashCommand( console.log(` ${C.dim("Usage: /skills delete ")}`); return true; } + const nameError = validateSkillName(arg); + if (nameError) { + console.log(` ${C.err("❌ Invalid skill name:")} ${nameError}`); + return true; + } if (!userSkillExists(arg)) { console.log( ` ${C.err("❌ User skill not found:")} ${arg} ` + @@ -2338,11 +2376,20 @@ export async function handleSlashCommand( ` ${C.label("📝 Capturing session learnings…")}` + (desiredName ? ` ${C.dim("(name: " + desiredName + ")")}` : ""), ); + // "distinct tools" = full-history cardinality, NOT the bounded + // topTools view (capped at MAX_TOP_TOOLS). Computing it from + // toolCallHistory keeps the status line honest. + const distinctToolCount = new Set( + state.toolCallHistory.map((e) => e.tool), + ).size; console.log( - ` ${C.dim("Context: " + ctx.totalToolCalls + " tool calls, " + ctx.topTools.length + " distinct tools.")}`, + ` ${C.dim("Context: " + ctx.totalToolCalls + " tool calls, " + distinctToolCount + " distinct tools.")}`, ); - deps.submitToLLM(synthetic); + // Bypass auto suggest_approach on the next turn — the synthetic + // prompt mentions "MCP", "SKILL.md", etc. as scaffolding and + // would otherwise match unrelated skill triggers. + deps.submitToLLM(synthetic, { skipAutoSuggest: true }); return true; } diff --git a/src/agent/state.ts b/src/agent/state.ts index 2f4ce6a..fe2244c 100644 --- a/src/agent/state.ts +++ b/src/agent/state.ts @@ -293,6 +293,16 @@ export interface AgentState { */ pendingPrompt: string | null; + /** + * When set, the next call to `onUserPromptSubmitted` skips the + * automatic `runSuggestApproach` pass. Slash commands that queue a + * synthetic prompt (e.g. `/save-skill`) set this so the agent does + * not match a skill on terms that are part of the synthetic prompt's + * scaffolding rather than the user's real intent. Cleared by the + * hook on consumption. + */ + skipNextAutoSuggest: boolean; + // ── Token Tracking ─────────────────────────────────────────────── /** Cumulative input tokens across all LLM requests this session. */ @@ -397,6 +407,7 @@ export function createAgentState( mcpServersUsed: new Set(), modulesRegistered: [], pendingPrompt: null, + skipNextAutoSuggest: false, // Token tracking totalInputTokens: 0, diff --git a/tests/session-context.test.ts b/tests/session-context.test.ts index 844a9ff..624949d 100644 --- a/tests/session-context.test.ts +++ b/tests/session-context.test.ts @@ -127,6 +127,21 @@ describe("extractSessionContext", () => { expect(ctx.userPrompt).toBe("Find the Teams transcript"); }); + it("truncates the user prompt when it exceeds the cap", () => { + // The cap is an internal constant (MAX_USER_PROMPT_CHARS = 2000), + // but the contract is: long prompts come back truncated with an + // ellipsis so a giant paste cannot dominate the saved context. + const giant = "x".repeat(5000); + const ctx = extractSessionContext(makeState({ currentUserPrompt: giant })); + expect(ctx.userPrompt.length).toBeLessThan(giant.length); + expect(ctx.userPrompt.endsWith("…")).toBe(true); + // Short prompts pass through untouched (regression guard). + const short = extractSessionContext( + makeState({ currentUserPrompt: "short prompt" }), + ); + expect(short.userPrompt).toBe("short prompt"); + }); + it("caps the topTools list at 10 entries", () => { // Build 15 distinct tools each called once, plus one called 20 times. const history: AgentState["toolCallHistory"] = []; diff --git a/tests/skill-writer.test.ts b/tests/skill-writer.test.ts index 9b4e564..6cf11ef 100644 --- a/tests/skill-writer.test.ts +++ b/tests/skill-writer.test.ts @@ -84,6 +84,14 @@ describe("validateSkillName", () => { it("rejects names longer than 64 characters", () => { expect(writer.validateSkillName("a".repeat(65))).toMatch(/64/); }); + + it("rejects reserved /skills subcommand names", () => { + // These would shadow `/skills info|edit|delete|list` if accepted. + expect(writer.validateSkillName("info")).toMatch(/reserved/i); + expect(writer.validateSkillName("edit")).toMatch(/reserved/i); + expect(writer.validateSkillName("delete")).toMatch(/reserved/i); + expect(writer.validateSkillName("list")).toMatch(/reserved/i); + }); }); // ── validateSkillData ──────────────────────────────────────────────── @@ -305,4 +313,54 @@ describe("writeUserSkill / listUserSkills / readUserSkill / deleteUserSkill", () ), ).toThrow(/not-a-real-pattern/); }); + + it("rejects descriptions containing newlines (would break YAML frontmatter)", () => { + expect(() => + writer.writeUserSkill( + { + name: "bad-desc", + description: "first line\nrogue: injection", + triggers: ["x"], + guidance: "x", + allowedTools: ["execute_javascript"], + }, + tempPatternsDir, + ), + ).toThrow(/single line|newlines/i); + }); + + it("rejects triggers containing newlines (would break YAML frontmatter)", () => { + expect(() => + writer.writeUserSkill( + { + name: "bad-trigger", + description: "ok", + triggers: ["fine", "bad\nrogue: injection"], + guidance: "x", + allowedTools: ["execute_javascript"], + }, + tempPatternsDir, + ), + ).toThrow(/single-line|newlines/i); + }); + + it("enforces size cap in UTF-8 bytes, not JS string units", () => { + // 32 KB of a 4-byte UTF-8 character ⇒ ~128 KB of bytes — well past + // MAX_SKILL_SIZE_BYTES (64 KB). If the cap counted `String.length` + // (UTF-16 code units) instead of UTF-8 bytes this would slip + // through; the byte-length test makes sure it doesn't. + const fatGuidance = "𠮷".repeat(32 * 1024); // U+20BB7, 4 bytes in UTF-8 + expect(() => + writer.writeUserSkill( + { + name: "oversized", + description: "x", + triggers: ["x"], + guidance: fatGuidance, + allowedTools: ["execute_javascript"], + }, + tempPatternsDir, + ), + ).toThrow(/exceeds maximum size/); + }); });