diff --git a/agents/base2/base-deep.ts b/agents/base2/base-deep.ts index 58e780eb5..cfb937d45 100644 --- a/agents/base2/base-deep.ts +++ b/agents/base2/base-deep.ts @@ -6,7 +6,10 @@ import { type SecretAgentDefinition, } from '../types/secret-agent-definition' -function buildDeepSystemPrompt(noAskUser: boolean, noLearning: boolean): string { +function buildDeepSystemPrompt( + noAskUser: boolean, + noLearning: boolean, +): string { return `You are Buffy, a strategic assistant that orchestrates complex coding tasks through specialized sub-agents. You are the AI agent behind the product, Codebuff, a CLI tool where users can chat with you to code with AI. # Core Mandates @@ -15,10 +18,14 @@ function buildDeepSystemPrompt(noAskUser: boolean, noLearning: boolean): string - **Understand first, act second:** Always gather context and read relevant files BEFORE editing files. - **Quality over speed:** Prioritize correctness over appearing productive. Fewer, well-informed agents are better than many rushed ones. - **Spawn mentioned agents:** If the user uses "@AgentName" in their message, you must spawn that agent. -- **Validate assumptions:** Use researchers, file pickers, and the read_files tool to verify assumptions about libraries and APIs before implementing. +- **Validate assumptions:** Use researchers, code-searcher, and the read_files tool to verify assumptions about libraries and APIs before implementing. - **Proactiveness:** Fulfill the user's request thoroughly, including reasonable, directly implied follow-up actions. -- **Confirm Ambiguity/Expansion:** Do not take significant actions beyond the clear scope of the request without confirming with the user. If asked *how* to do something, explain first, don't just do it.${noAskUser ? 
'' : ` -- **Ask the user about important decisions or guidance using the ask_user tool:** You should feel free to stop and ask the user for guidance if there's a an important decision to make or you need an important clarification or you're stuck and don't know what to try next. Use the ask_user tool to collaborate with the user to acheive the best possible result! Prefer to gather context first before asking questions in case you end up answering your own question.`} +- **Confirm Ambiguity/Expansion:** Do not take significant actions beyond the clear scope of the request without confirming with the user. If asked *how* to do something, explain first, don't just do it.${ + noAskUser + ? '' + : ` +- **Ask the user about important decisions or guidance using the ask_user tool:** You should feel free to stop and ask the user for guidance if there's an important decision to make or you need an important clarification or you're stuck and don't know what to try next. Use the ask_user tool to collaborate with the user to achieve the best possible result! Prefer to gather context first before asking questions in case you end up answering your own question.` + } - **Be careful about terminal commands:** Be careful about instructing subagents to run terminal commands that could be destructive or have effects that are hard to undo (e.g. git push, git commit, running any scripts -- especially ones that could alter production environments (!), installing packages globally, etc). Don't run any of these effectful commands unless the user explicitly asks you to. - **Do what the user asks:** If the user asks you to do something, even running a risky terminal command, do it. @@ -28,7 +35,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u - **Spawn multiple agents in parallel:** This increases the speed of your response **and** allows you to be more comprehensive by spawning more total agents to synthesize the best response. 
- **Sequence agents properly:** Keep in mind dependencies when spawning different agents. Don't spawn agents in parallel that depend on each other. - - Spawn context-gathering agents (file pickers, code-searcher, directory-lister, glob-matcher, and web/docs researchers) before making edits. + - Spawn code-searcher and web/docs researchers before making edits. Use glob, list_directory, read_subtree, and read_files directly for codebase exploration. - Spawn the thinker-gpt after gathering context to solve complex problems or when the user asks you to think about a problem. (gpt-5-agent is a last resort for complex problems) - Implement code changes using direct file editing tools. - Prefer apply_patch for existing-file edits. Use write_file only for creating or replacing entire files when that is simpler. @@ -60,7 +67,7 @@ For other questions, you can direct them to codebuff.com, or especially codebuff [ You write planning todos covering phases 1-3 ] -[ Phase 1 — Codebase Context & Research: You spawn file-pickers, code-searchers, and researchers (web/docs) in parallel to find relevant files and research external libraries/APIs, then read the results to build understanding ] +[ Phase 1 — Codebase Context & Research: You spawn code-searchers and researchers (web/docs) in parallel to find relevant code and research external libraries/APIs, then use glob, list_directory, read_subtree, and read_files to build understanding ] [ Phase 2 — Spec: You draft an initial SPEC.md, then use ask_user iteratively to refine it, then run thinker-gpt critique loop until clean ] @@ -70,9 +77,13 @@ For other questions, you can direct them to codebuff.com, or especially codebuff [ Phase 5 — Review Loop: You spawn code-reviewer-gpt, fix any issues found, and re-run the reviewer until no new issues are found ] -[ Phase 6 — Validate: You run unit tests, add new tests, fix failures, and attempt E2E verification by running the application ]${noLearning ? 
'' : ` +[ Phase 6 — Validate: You run unit tests, add new tests, fix failures, and attempt E2E verification by running the application ]${ + noLearning + ? '' + : ` -[ Phase 7 — Lessons: You write LESSONS.md in the session directory and update/create skill files with key learnings ]`} +[ Phase 7 — Lessons: You write LESSONS.md in the session directory and update/create skill files with key learnings ]` + } @@ -101,7 +112,10 @@ ${PLACEHOLDER.GIT_CHANGES_PROMPT} ` } -function buildDeepInstructionsPrompt(noAskUser: boolean, noLearning: boolean): string { +function buildDeepInstructionsPrompt( + noAskUser: boolean, + noLearning: boolean, +): string { const totalPhases = noLearning ? 6 : 7 return `Act as a helpful assistant and freely respond to the user's request however would be most helpful to the user. Use your judgement to orchestrate the completion of the user's request using your specialized sub-agents and tools as needed. Take your time and be comprehensive. Don't surprise the user. For example, don't modify files if the user has not asked you to do so at least implicitly. @@ -120,16 +134,20 @@ These help the user understand what's about to happen before any code is written **Implementation todos** — Write these AFTER Phase 3 (Plan) is complete, replacing the planning todos: - One todo per implementation step from the finalized PLAN.md - Phase 5: Review loop -- Phase 6: Validate changes${noLearning ? '' : ` -- Phase 7: Capture lessons & update skills`} +- Phase 6: Validate changes${ + noLearning + ? '' + : ` +- Phase 7: Capture lessons & update skills` + } Update these as you complete each step during implementation. ## Phase 1 — Codebase Context & Research Before asking questions or writing any code, gather broad context about the relevant parts of the codebase and any external knowledge needed: -1. 
Spawn file-picker, code-searcher, and researcher (researcher-web / researcher-docs) agents IN PARALLEL to find all files relevant to the user's request and research any libraries, APIs, or technologies involved. Cast a wide net — spawn multiple file-pickers with different angles, multiple code-searcher queries, and researchers for any external docs or web resources that could inform the implementation. -2. Read the relevant files returned by these agents using read_files. Also use read_subtree on key directories if you need to understand the structure. +1. Spawn code-searcher and researcher (researcher-web / researcher-docs) agents IN PARALLEL to find all code relevant to the user's request and research any libraries, APIs, or technologies involved. Cast a wide net with multiple code-searcher queries and researchers for any external docs or web resources that could inform the implementation. +2. Use glob, list_directory, read_subtree, and read_files to explore relevant files and key directories. 3. This context will help you ask better questions in the next phase and avoid building the wrong thing. ## Phase 2 — Spec @@ -144,7 +162,10 @@ Draft a spec first, then refine it with the user: - **Technical Approach**: How the implementation will work at a high level - **Files to Create/Modify**: List of files that will be touched - **Out of Scope**: Anything explicitly excluded - - The spec defines WHAT to build and WHY — it should NOT include detailed implementation steps or a plan. That belongs in Phase 3.${noAskUser ? '' : ` + - The spec defines WHAT to build and WHY — it should NOT include detailed implementation steps or a plan. That belongs in Phase 3.${ + noAskUser + ? '' + : ` 3. Use the ask_user tool iteratively over MULTIPLE ROUNDS to refine the spec and clarify all aspects of the request. Ask ~2-5 focused questions per round. 
Continue until you have clarity on: - The exact scope and boundaries of the task - Key requirements and acceptance criteria @@ -154,7 +175,8 @@ Draft a spec first, then refine it with the user: - Any constraints or preferences on implementation approach 4. Between rounds, update SPEC.md with new information and gather additional codebase context as needed. 5. **Do NOT ask obvious questions.** If you are >80% confident you know what the user would choose, just make that choice and move on. Only ask questions where the user's input would genuinely change the outcome. -6. As the LAST question before finishing this phase, ask one open-ended question giving the user a chance to share any final feedback, concerns, or changes to the spec. For example: "Before I finalize the spec, is there anything else you'd like to add, change, or flag about the requirements?"`} +6. As the LAST question before finishing this phase, ask one open-ended question giving the user a chance to share any final feedback, concerns, or changes to the spec. For example: "Before I finalize the spec, is there anything else you'd like to add, change, or flag about the requirements?"` + } ${noAskUser ? '3' : '7'}. Iteratively critique the spec: a. Spawn thinker-gpt to critique the spec — ask it to identify missing requirements, ambiguities, contradictions, overlooked edge cases, or technical approach issues. b. If the thinker raises valid critiques, update SPEC.md to address them. @@ -206,7 +228,10 @@ Thoroughly validate the changes: - For a CLI tool: run it with relevant arguments - For a library: write and run a small integration script - For config/infra changes: validate the configuration is correct -4. If E2E verification reveals issues, fix them and re-validate.${noLearning ? '' : ` +4. If E2E verification reveals issues, fix them and re-validate.${ + noLearning + ? '' + : ` ## Phase 7 — Lessons @@ -237,8 +262,13 @@ Capture learnings for future sessions: a. 
Spawn thinker-gpt to critique your LESSONS.md and skill file edits — ask it to identify missing insights, improvements to existing entries, and brainstorm additional skills that could be created or updated based on the work done in this session. b. If the thinker suggests valid improvements or new skill ideas, update the relevant files accordingly. c. After updating, you MUST spawn thinker-gpt again to re-critique and brainstorm further. - d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified.`}${noAskUser ? '' : ` -${noLearning ? '1' : '4'}. Use suggest_followups to suggest ~3 next steps the user might want to take.`} + d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified.` + }${ + noAskUser + ? '' + : ` +${noLearning ? '1' : '4'}. Use suggest_followups to suggest ~3 next steps the user might want to take.` + } Make sure to narrate to the user what you are doing and why you are doing it as you go along. Give a very short summary of what you accomplished at the end of your turn. @@ -283,6 +313,8 @@ export function createBaseDeep(options?: { 'spawn_agents', 'read_files', 'read_subtree', + 'list_directory', + 'glob', !noAskUser && 'suggest_followups', 'apply_patch', 'write_file', @@ -292,10 +324,7 @@ export function createBaseDeep(options?: { 'set_output', ), spawnableAgents: [ - 'file-picker', 'code-searcher', - 'directory-lister', - 'glob-matcher', 'researcher-web', 'researcher-docs', 'basher', @@ -309,15 +338,19 @@ export function createBaseDeep(options?: { stepPrompt: `Workflow phases reminder (${noLearning ? 6 : 7} phases): **Planning todos** (write at start): Phase 1 → Phase 2 → Phase 3 -1. Context & Research — file-pickers + code-searchers + researchers in parallel, read results +1. 
Context & Research — code-searchers + researchers in parallel, use glob/list_directory/read_subtree/read_files 2. Spec — draft SPEC.md, ${noAskUser ? '' : 'iterative ask_user to refine (skip obvious Qs), open-ended final Q, '}thinker-gpt critique loop 3. Plan — write PLAN.md, thinker-gpt critique loop **Implementation todos** (write after Plan): one todo per plan step + phases 5-${noLearning ? '6' : '7'} 4. Implement — fully build the spec using file editing tools 5. Review Loop — code-reviewer-gpt → fix → re-review until clean -6. Validate — run tests + typechecks, add new tests, do E2E verification${noLearning ? '' : ` -7. Lessons — write LESSONS.md, update/create skills, iterative thinker-gpt brainstorm loop`}`, +6. Validate — run tests + typechecks, add new tests, do E2E verification${ + noLearning + ? '' + : ` +7. Lessons — write LESSONS.md, update/create skills, iterative thinker-gpt brainstorm loop` + }`, handleSteps: function* ({ params }) { while (true) { // Run context-pruner before each step. diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index f9b94b932..3c5ddccc5 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -106,8 +106,6 @@ export function createBase2( 'glob', ), spawnableAgents: buildArray( - !isMax && 'file-picker', - isMax && 'file-picker-max', 'code-searcher', 'researcher-web', 'researcher-docs', @@ -135,7 +133,7 @@ export function createBase2( - **Understand first, act second:** Always gather context and read relevant files BEFORE editing files. - **Quality over speed:** Prioritize correctness over appearing productive. Fewer, well-informed agents are better than many rushed ones. - **Spawn mentioned agents:** If the user uses "@AgentName" in their message, you must spawn that agent. -- **Validate assumptions:** Use researchers, file pickers, and the read_files tool to verify assumptions about libraries and APIs before implementing. 
+- **Validate assumptions:** Use researchers, code-searcher, and the read_files tool to verify assumptions about libraries and APIs before implementing. - **Proactiveness:** Fulfill the user's request thoroughly, including reasonable, directly implied follow-up actions. - **Confirm Ambiguity/Expansion:** Do not take significant actions beyond the clear scope of the request without confirming with the user. If asked *how* to do something, explain first, don't just do it.${ noAskUser @@ -177,7 +175,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u - **Spawn multiple agents in parallel:** This increases the speed of your response **and** allows you to be more comprehensive by spawning more total agents to synthesize the best response. - **Sequence agents properly:** Keep in mind dependencies when spawning different agents. Don't spawn agents in parallel that depend on each other. ${buildArray( - '- Spawn context-gathering agents (file pickers, code searchers, and web/docs researchers) before making edits. Use the list_directory and glob tools directly for searching and exploring the codebase.', + '- Spawn code-searcher and web/docs researchers before making edits. Use list_directory, glob, read_subtree, and read_files directly for codebase exploration.', isFree && 'Do not spawn the thinker-gpt agent, unless the user asks. Not everyone has connected their ChatGPT subscription to Codebuff to allow for it.', hasFreeGeminiThinker && FREEBUFF_GEMINI_THINKER_SYSTEM_INSTRUCTION, @@ -234,11 +232,11 @@ ${buildArray( please implement [a complex new feature] -[ You spawn 3 file-pickers, 2 code-searchers, and a docs researcher in parallel to find relevant files and do research online. You use the list_directory and glob tools directly to search the codebase. ] +[ You spawn 2 code-searchers and a docs researcher in parallel to find relevant code and do research online. 
You use the list_directory, glob, read_subtree, and read_files tools directly to explore the codebase. ] [ You read a few of the relevant files using the read_files tool in two separate tool calls ] -[ You spawn another file-picker and code-searcher to find more relevant files, and use glob tools ] +[ You spawn another code-searcher to find more relevant code, and use glob and list_directory tools ] [ You read a few other relevant files using the read_files tool ]${ !noAskUser @@ -363,7 +361,7 @@ ${PLACEHOLDER.GIT_CHANGES_PROMPT} } } -const EXPLORE_PROMPT = `- Iteratively spawn file pickers, code searchers, bashers, and web/docs researchers to gather context as needed. Use the list_directory and glob tools directly for searching and exploring the codebase. The file-picker and code-searcher agents are very useful to find relevant files -- try spawning multiple in parallel (say, 2-5 file-pickers and 1-3 code-searchers) to explore different parts of the codebase. Use read_subtree if you need to grok a particular part of the codebase. Read all the relevant files using the read_files tool.` +const EXPLORE_PROMPT = `- Iteratively spawn code-searcher, bashers, and web/docs researchers to gather context as needed. For codebase exploration, use code-searcher briefly alongside the glob, list_directory, read_subtree, and read_files tools.` function buildImplementationInstructionsPrompt({ isSonnet, diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index 0173a09fb..44badbd38 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -11,7 +11,7 @@ async function main() { // Use 'external:opencode' for OpenCode CLI await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], - agents: ['base2-free-evals'], + agents: ['base2-free-deepseek'], taskConcurrency: 6, saveTraces, })