From 79ed16e3b8936d62d0714fbff7441ddb4a1fcec3 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 14:14:56 -0700 Subject: [PATCH 1/9] Make basher description clearer --- agents/basher.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agents/basher.ts b/agents/basher.ts index 671437bff1..dc9dc689d1 100644 --- a/agents/basher.ts +++ b/agents/basher.ts @@ -11,7 +11,7 @@ const basher: AgentDefinition = { model: 'google/gemini-3.1-flash-lite-preview', displayName: 'Basher', spawnerPrompt: - 'Runs a single terminal command and describes its output using an LLM. A lightweight shell command executor.', + 'Runs a single terminal command and describes its output using an LLM. A lightweight shell command executor. Requires both a shell command and a prompt.', inputSchema: { prompt: { @@ -24,7 +24,7 @@ const basher: AgentDefinition = { properties: { command: { type: 'string', - description: 'Terminal command to run', + description: 'Terminal command to run in bash shell', }, timeout_seconds: { type: 'number', From 013b7f77d0f9f7430b08a9ee2a9cb3f26f08ad88 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 14:39:15 -0700 Subject: [PATCH 2/9] freebuff: spawn thinker gemini more often --- agents/base2/base2.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index 9888a360f2..58d8c8b979 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -143,7 +143,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u ${buildArray( '- Spawn context-gathering agents (file pickers and web/docs researchers) before making edits. Use the code_search, list_directory, and glob tools directly for searching and exploring the codebase.', isFree && 'Do not spawn the thinker-gpt agent, unless the user asks. 
Not everyone has connected their ChatGPT subscription to Codebuff to allow for it.', - isFree && 'You should spawn the thinker-gemini agent whenever you encounter a complex problem or the user asks you to think about a problem. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it often!', + isFree && 'You must spawn the thinker-gemini agent to think through and plan the reponse to most requests, unless the request is trivial. This agent is extremely useful as it is very smart!', isDefault && '- Spawn the editor agent to implement the changes after you have gathered all the context you need.', (isDefault || isMax) && @@ -206,7 +206,7 @@ ${buildArray( [ You read a few other relevant files using the read_files tool ]${!noAskUser ? `\n\n[ You ask the user for important clarifications on their request or alternate implementation strategies using the ask_user tool ]` : '' - } + }${isFree ? `\n\n[ You spawn the thinker-gemini agent to plan the best response ]` : ''} ${isDefault ? `[ You implement the changes using the editor agent ]` : isFast || isFree @@ -334,7 +334,7 @@ ${buildArray( (isDefault || isMax) && `- For any task requiring 3+ steps, use the write_todos tool to write out your step-by-step implementation plan. Include ALL of the applicable tasks in the list.${isFast ? '' : ' You should include a step to review the changes after you have implemented the changes.'}:${hasNoValidation ? '' : ' You should include at least one step to validate/test your changes: be specific about whether to typecheck, run tests, run lints, etc.'} You may be able to do reviewing and validation in parallel in the same step. Skip write_todos for simple tasks like quick edits or answering questions.`, isFree && - `- For complex problems, spawn the thinker-gemini agent to help find the best solution. This agent is extremely useful as it is very smart. 
You must take advantage of it and spawn it often!`, + `- For most requests, spawn the thinker-gemini agent to think through and plan the best response. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it about once per user request. Gather all the necessary context *before* spawning the thinker-gemini agent.`, (isDefault || isMax) && `- For quick problems, briefly explain your reasoning to the user. If you need to think longer, write your thoughts within the tags. Finally, for complex problems, spawn the thinker agent to help find the best solution. (gpt-5-agent is a last resort for complex problems)`, isDefault && @@ -379,6 +379,8 @@ function buildImplementationStepPrompt({ isMax && `Keep working until the user's request is completely satisfied${!hasNoValidation ? ' and validated' : ''}, or until you require more information from the user.`, 'You must use the skill tool to load any potentially relevant skills.', + isFree && + `You must spawn the thinker-gemini agent once per user request to plan the best response.`, isMax && `You must spawn the 'editor-multi-prompt' agent to implement code changes rather than using the str_replace or write_file tools, since it will generate the best code changes.`, (isDefault || isMax) && From 5181f84170bb0006e8a3f478993b957f13629139 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 14:43:29 -0700 Subject: [PATCH 3/9] thinker-gemini: low reasoning, be concise --- agents/thinker/thinker-gemini.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/agents/thinker/thinker-gemini.ts b/agents/thinker/thinker-gemini.ts index b8ab3f1a59..015461ed29 100644 --- a/agents/thinker/thinker-gemini.ts +++ b/agents/thinker/thinker-gemini.ts @@ -7,10 +7,13 @@ const definition: SecretAgentDefinition = { id: 'thinker-gemini', model: 'google/gemini-3.1-pro-preview', providerOptions: undefined, + reasoningOptions: { + effort: 'low', + }, outputSchema: undefined, outputMode: 
'last_message', inheritParentSystemPrompt: false, - instructionsPrompt: `You are the thinker-gemini agent. Think deeply about the user request and when satisfied, write out your response. + instructionsPrompt: `You are the thinker-gemini agent. Think about the user request and when satisfied, write out a very concise response that captures the most important points. DO NOT be verbose -- say the absolute minimum needed to answer the user's question correctly. The parent agent will see your response. DO NOT call any tools. No need to spawn the thinker agent, because you are already the thinker agent. Just do the thinking work now.`, handleSteps: function* () { From 44ef1c18f65651f99be8ac61cfc796faca0bff34 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 14:54:18 -0700 Subject: [PATCH 4/9] Save traces option for buffbench --- evals/buffbench/main-hard-tasks.ts | 3 +++ evals/buffbench/main-nightly.ts | 3 +++ evals/buffbench/main-single-eval.ts | 3 +++ evals/buffbench/main.ts | 3 +++ evals/buffbench/run-buffbench.ts | 20 ++++++++++++++++++++ 5 files changed, 32 insertions(+) diff --git a/evals/buffbench/main-hard-tasks.ts b/evals/buffbench/main-hard-tasks.ts index 989d049d97..0d03c20f0d 100644 --- a/evals/buffbench/main-hard-tasks.ts +++ b/evals/buffbench/main-hard-tasks.ts @@ -13,6 +13,8 @@ function loadTaskIds(evalPath: string): string[] { } async function main() { + const saveTraces = process.argv.includes('--save-traces') + const evalPaths = [ path.join(__dirname, 'eval-codebuff2.json'), path.join(__dirname, 'eval-manifold2.json'), @@ -33,6 +35,7 @@ async function main() { agents: ['base2', 'external:claude'], taskIds: allTaskIds, taskConcurrency: 4, + saveTraces, }) process.exit(0) diff --git a/evals/buffbench/main-nightly.ts b/evals/buffbench/main-nightly.ts index ff5f89980b..df3c6f0ea5 100644 --- a/evals/buffbench/main-nightly.ts +++ b/evals/buffbench/main-nightly.ts @@ -8,6 +8,8 @@ import type { MetaAnalysisResult } from './meta-analyzer' 
import type { AgentEvalResults } from './types' async function main() { + const saveTraces = process.argv.includes('--save-traces') + console.log('Starting nightly buffbench evaluation...') console.log('Eval set: codebuff') console.log() @@ -16,6 +18,7 @@ async function main() { evalDataPaths: [ path.join(__dirname, 'eval-codebuff.json')], agents: ['base2-free'], taskConcurrency: 3, + saveTraces, }) console.log('\nNightly buffbench evaluation completed successfully!') diff --git a/evals/buffbench/main-single-eval.ts b/evals/buffbench/main-single-eval.ts index 229251932f..bae330cdcf 100644 --- a/evals/buffbench/main-single-eval.ts +++ b/evals/buffbench/main-single-eval.ts @@ -3,10 +3,13 @@ import path from 'path' import { runBuffBench } from './run-buffbench' async function main() { + const saveTraces = process.argv.includes('--save-traces') + await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], agents: ['base2'], taskIds: ['filter-system-history'], + saveTraces, }) process.exit(0) diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index c96acbe0c0..ef4e9149ed 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -3,6 +3,8 @@ import path from 'path' import { runBuffBench } from './run-buffbench' async function main() { + const saveTraces = process.argv.includes('--save-traces') + // Compare Codebuff agents against external CLI agents // Use 'external:claude' for Claude Code CLI // Use 'external:codex' for OpenAI Codex CLI @@ -10,6 +12,7 @@ async function main() { evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], agents: ['base2-free'], taskConcurrency: 5, + saveTraces, }) process.exit(0) diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts index a086f092eb..c501425dd2 100644 --- a/evals/buffbench/run-buffbench.ts +++ b/evals/buffbench/run-buffbench.ts @@ -57,6 +57,7 @@ async function runTask(options: { printEvents: boolean finalCheckCommands?: string[] disableAnalysis?: 
boolean + saveTraces?: boolean }) { const { client, @@ -74,6 +75,7 @@ async function runTask(options: { printEvents, finalCheckCommands, disableAnalysis, + saveTraces = false, } = options console.log( @@ -173,6 +175,21 @@ async function runTask(options: { finalCheckOutputs: agentResult.finalCheckOutputs, }) + // Save the agent trace to a separate file if saveTraces is enabled + if (saveTraces) { + const tracesDir = path.join(logsDir, 'traces') + if (!fs.existsSync(tracesDir)) { + fs.mkdirSync(tracesDir, { recursive: true }) + } + + // Save agent trace only (not judge traces) + const agentTracePath = path.join( + tracesDir, + `${index + 1}-${safeTaskId}-${safeAgentId}-${safeCommitShort}-agent.json`, + ) + fs.writeFileSync(agentTracePath, JSON.stringify(agentResult.trace, null, 2)) + } + fs.writeFileSync( tracePath, JSON.stringify(commitTraces[commitTraces.length - 1], null, 2), @@ -300,6 +317,7 @@ export async function runBuffBench(options: { taskIds?: string[] extractLessons?: boolean disableAnalysis?: boolean + saveTraces?: boolean }) { const { evalDataPaths, @@ -308,6 +326,7 @@ export async function runBuffBench(options: { taskIds, extractLessons = false, disableAnalysis = false, + saveTraces = false, } = options if (evalDataPaths.length === 0) { @@ -453,6 +472,7 @@ export async function runBuffBench(options: { printEvents: agents.length === 1 && taskConcurrency === 1, finalCheckCommands: evalData.finalCheckCommands, disableAnalysis, + saveTraces, }), ) }) From 4345a87c03d99b8f74eb078e6272ef33130f31ff Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 14:54:35 -0700 Subject: [PATCH 5/9] Allow loading skills dynamically --- common/src/tools/params/tool/skill.ts | 4 +- .../src/tools/handlers/tool/skill.ts | 89 ++++++++++++++++++- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/common/src/tools/params/tool/skill.ts b/common/src/tools/params/tool/skill.ts index bb8c18f7a7..a8640d6481 100644 --- a/common/src/tools/params/tool/skill.ts +++
b/common/src/tools/params/tool/skill.ts @@ -34,9 +34,11 @@ export const AVAILABLE_SKILLS_PLACEHOLDER = '{{AVAILABLE_SKILLS}}' // Base description - the full description with available skills is generated dynamically const baseDescription = `Load a skill by name to get its full instructions. Skills provide reusable behaviors and domain-specific knowledge that you can use to complete tasks. -The following are the only skills that are currently available (do not try to use any other skills): +The following are the pre-loaded skills available at session start: ${AVAILABLE_SKILLS_PLACEHOLDER} +Note: You can also load any skill that was created during this session by specifying its name. The skill will be loaded dynamically from disk. + Example: ${$getNativeToolCallExampleString({ toolName, diff --git a/packages/agent-runtime/src/tools/handlers/tool/skill.ts b/packages/agent-runtime/src/tools/handlers/tool/skill.ts index 0c2956a117..9eaf2ccb7a 100644 --- a/packages/agent-runtime/src/tools/handlers/tool/skill.ts +++ b/packages/agent-runtime/src/tools/handlers/tool/skill.ts @@ -1,4 +1,10 @@ import { jsonToolResult } from '@codebuff/common/util/messages' +import { SKILLS_DIR_NAME, SKILL_FILE_NAME } from '@codebuff/common/constants/skills' +import { SkillFrontmatterSchema, type SkillDefinition } from '@codebuff/common/types/skill' +import fs from 'fs' +import path from 'path' +import os from 'os' +import matter from 'gray-matter' import type { CodebuffToolHandlerFunction } from '../handler-function-type' import type { @@ -7,6 +13,73 @@ import type { } from '@codebuff/common/tools/list' import type { ProjectFileContext } from '@codebuff/common/util/file' +/** + * Dynamically load a single skill from disk. + * Used when a skill is not found in the pre-loaded cache but may have been created during the session. 
+ */ +async function loadSkillFromDisk( + projectRoot: string, + skillName: string, +): Promise { + const home = os.homedir() + const skillsDirs = [ + // Global directories first + path.join(home, '.agents', SKILLS_DIR_NAME), + path.join(home, '.claude', SKILLS_DIR_NAME), + // Project directories (checked after global ones; the first valid match is returned) + path.join(projectRoot, '.agents', SKILLS_DIR_NAME), + path.join(projectRoot, '.claude', SKILLS_DIR_NAME), + ] + + for (const skillsDir of skillsDirs) { + const skillDir = path.join(skillsDir, skillName) + const skillFilePath = path.join(skillDir, SKILL_FILE_NAME) + + try { + // Check if the skill directory and file exist + const stat = fs.statSync(skillDir) + if (!stat.isDirectory()) continue + + fs.statSync(skillFilePath) // Will throw if file doesn't exist + + // Read and parse the skill file + const content = fs.readFileSync(skillFilePath, 'utf8') + const parsed = matter(content) + + if (!parsed.data || Object.keys(parsed.data).length === 0) { + continue + } + + // Validate frontmatter + const result = SkillFrontmatterSchema.safeParse(parsed.data) + if (!result.success) { + continue + } + + const frontmatter = result.data + + // Verify name matches directory name + if (frontmatter.name !== skillName) { + continue + } + + return { + name: frontmatter.name, + description: frontmatter.description, + content, + license: frontmatter.license, + filePath: skillFilePath, + metadata: frontmatter.metadata, + } + } catch { + // Skill doesn't exist in this directory, try the next one + continue + } + } + + return null +} + type ToolName = 'skill' export const handleSkill = (async (params: { @@ -20,14 +93,24 @@ export const handleSkill = (async (params: { await previousToolCallFinished const skills = fileContext.skills ?? {} - const skill = skills[name] + const cachedSkill = skills[name] + + // If skill not in cache, try to load it dynamically from disk + // This supports skills created during the session + const diskSkill = cachedSkill + ?
null + : fileContext.projectRoot + ? await loadSkillFromDisk(fileContext.projectRoot, name) + : null + + const skill = cachedSkill ?? diskSkill if (!skill) { const availableSkills = Object.keys(skills) const suggestion = availableSkills.length > 0 - ? ` Available skills: ${availableSkills.join(', ')}` - : ' No skills are currently available.' + ? ` Available skills: ${availableSkills.join(', ')}. You can also load skills created during this session by name.` + : ' No skills are currently available. You can load skills created during this session by name.' return { output: jsonToolResult({ From 4347ca035d3673caf6bcfed9037e3ab066579fec Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 14:56:30 -0700 Subject: [PATCH 6/9] buffbench: base2-free-evals --- evals/buffbench/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index ef4e9149ed..aeb462abe3 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -10,7 +10,7 @@ async function main() { // Use 'external:codex' for OpenAI Codex CLI await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], - agents: ['base2-free'], + agents: ['base2-free-evals'], taskConcurrency: 5, saveTraces, }) From 91516e6147b447864fa2f3e127d128948df9cced Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 17:32:22 -0700 Subject: [PATCH 7/9] Add a missing env to eval-codebuff.json --- evals/buffbench/eval-codebuff.json | 1 + 1 file changed, 1 insertion(+) diff --git a/evals/buffbench/eval-codebuff.json b/evals/buffbench/eval-codebuff.json index 9b5fac55d7..67ef66a02f 100644 --- a/evals/buffbench/eval-codebuff.json +++ b/evals/buffbench/eval-codebuff.json @@ -28,6 +28,7 @@ "STRIPE_SECRET_KEY": "test-stripe-key", "STRIPE_WEBHOOK_SECRET_KEY": "test-stripe-webhook", "STRIPE_TEAM_FEE_PRICE_ID": "test-team-price-id", + "STRIPE_USAGE_PRICE_ID": "test-usage-price-id", "LOOPS_API_KEY": "test-loops", 
"DISCORD_PUBLIC_KEY": "test-discord-public", "DISCORD_BOT_TOKEN": "test-discord-bot", From 6aed18d4efa548d82408370a9ae7cb3db9e5a275 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 17:56:01 -0700 Subject: [PATCH 8/9] Add thinker-with-files-gemini, which is cheaper since has no context except prompt + passed in files --- agents/base2/base2.ts | 10 ++-- agents/thinker/thinker-with-files-gemini.ts | 61 +++++++++++++++++++++ common/src/constants/free-agents.ts | 3 + 3 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 agents/thinker/thinker-with-files-gemini.ts diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index 58d8c8b979..3d504edfb2 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -87,7 +87,7 @@ export function createBase2( isFree && 'code-reviewer-lite', isDefault && 'code-reviewer', isMax && 'code-reviewer-multi-prompt', - isFree && 'thinker-gemini', + isFree && 'thinker-with-files-gemini', 'thinker-gpt', 'context-pruner', ), @@ -143,7 +143,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u ${buildArray( '- Spawn context-gathering agents (file pickers and web/docs researchers) before making edits. Use the code_search, list_directory, and glob tools directly for searching and exploring the codebase.', isFree && 'Do not spawn the thinker-gpt agent, unless the user asks. Not everyone has connected their ChatGPT subscription to Codebuff to allow for it.', - isFree && 'You must spawn the thinker-gemini agent to think through and plan the reponse to most requests, unless the request is trivial. This agent is extremely useful as it is very smart!', + isFree && 'You must spawn the thinker-with-files-gemini agent to think through and plan the reponse to most requests, unless the request is trivial. This agent is extremely useful as it is very smart! 
You must pass the relevant filePaths when spawning it, since it does not have access to the conversation history.', isDefault && '- Spawn the editor agent to implement the changes after you have gathered all the context you need.', (isDefault || isMax) && @@ -206,7 +206,7 @@ ${buildArray( [ You read a few other relevant files using the read_files tool ]${!noAskUser ? `\n\n[ You ask the user for important clarifications on their request or alternate implementation strategies using the ask_user tool ]` : '' - }${isFree ? `\n\n[ You spawn the thinker-gemini agent to plan the best response ]` : ''} + }${isFree ? `\n\n[ You spawn the thinker-with-files-gemini agent with the relevant filePaths to plan the best response ]` : ''} ${isDefault ? `[ You implement the changes using the editor agent ]` : isFast || isFree @@ -334,7 +334,7 @@ ${buildArray( (isDefault || isMax) && `- For any task requiring 3+ steps, use the write_todos tool to write out your step-by-step implementation plan. Include ALL of the applicable tasks in the list.${isFast ? '' : ' You should include a step to review the changes after you have implemented the changes.'}:${hasNoValidation ? '' : ' You should include at least one step to validate/test your changes: be specific about whether to typecheck, run tests, run lints, etc.'} You may be able to do reviewing and validation in parallel in the same step. Skip write_todos for simple tasks like quick edits or answering questions.`, isFree && - `- For most requests, spawn the thinker-gemini agent to think through and plan the best response. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it about once per user request. Gather all the necessary context *before* spawning the thinker-gemini agent.`, + `- For most requests, spawn the thinker-with-files-gemini agent to think through and plan the best response. This agent is extremely useful as it is very smart. 
You must take advantage of it and spawn it about once per user request. Gather all the necessary context *before* spawning it, and pass the relevant filePaths since it does not have access to the conversation history.`, (isDefault || isMax) && `- For quick problems, briefly explain your reasoning to the user. If you need to think longer, write your thoughts within the tags. Finally, for complex problems, spawn the thinker agent to help find the best solution. (gpt-5-agent is a last resort for complex problems)`, isDefault && @@ -380,7 +380,7 @@ function buildImplementationStepPrompt({ `Keep working until the user's request is completely satisfied${!hasNoValidation ? ' and validated' : ''}, or until you require more information from the user.`, 'You must use the skill tool to load any potentially relevant skills.', isFree && - `You must spawn the thinker-gemini agent once per user request to plan the best response.`, + `You must spawn the thinker-with-files-gemini agent once per user request to plan the best response. 
Pass the relevant filePaths since it does not have access to the conversation history.`, isMax && `You must spawn the 'editor-multi-prompt' agent to implement code changes rather than using the str_replace or write_file tools, since it will generate the best code changes.`, (isDefault || isMax) && diff --git a/agents/thinker/thinker-with-files-gemini.ts b/agents/thinker/thinker-with-files-gemini.ts new file mode 100644 index 0000000000..0f9ec5ad33 --- /dev/null +++ b/agents/thinker/thinker-with-files-gemini.ts @@ -0,0 +1,61 @@ +import { publisher } from '../constants' + +import type { SecretAgentDefinition } from '../types/secret-agent-definition' + +const definition: SecretAgentDefinition = { + id: 'thinker-with-files-gemini', + publisher, + model: 'google/gemini-3.1-pro-preview', + displayName: 'Theo the Theorizer with Files (Gemini)', + reasoningOptions: { + effort: 'low', + }, + spawnerPrompt: + 'Does deep thinking given the prompt and provided files using Gemini. Use this to help you solve a specific problem. This agent has no context on the conversation history so it cannot see files you have read or previous discussion. Instead, you must provide all the relevant context via the prompt or filePaths for this agent to work well.', + inputSchema: { + prompt: { + type: 'string', + description: 'The problem you are trying to solve', + }, + params: { + type: 'object', + properties: { + filePaths: { + type: 'array', + items: { + type: 'string', + description: 'The path to a file', + }, + description: + 'A list of relevant file paths to read before thinking. Try to provide ALL the files that could be relevant to your request.', + }, + }, + required: ['filePaths'], + }, + }, + outputMode: 'last_message', + outputSchema: undefined, + includeMessageHistory: false, + inheritParentSystemPrompt: false, + spawnableAgents: [], + toolNames: [], + + instructionsPrompt: `You are the thinker-with-files-gemini agent. 
Think about the user request and when satisfied, write out a very concise response that captures the most important points. DO NOT be verbose -- say the absolute minimum needed to answer the user's question correctly. + +The parent agent will see your response. DO NOT call any tools. No need to spawn the thinker agent, because you are already the thinker agent. Just do the thinking work now.`, + + handleSteps: function* ({ params }) { + const filePaths = params?.filePaths as string[] | undefined + + if (filePaths && filePaths.length > 0) { + yield { + toolName: 'read_files', + input: { paths: filePaths }, + } + } + + yield 'STEP' + }, +} + +export default definition diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts index 2f44ca8a9a..e56e3fb58a 100644 --- a/common/src/constants/free-agents.ts +++ b/common/src/constants/free-agents.ts @@ -37,6 +37,9 @@ export const FREE_MODE_AGENT_MODELS: Record> = { // Code reviewer for free mode 'code-reviewer-lite': new Set(['minimax/minimax-m2.5']), + + // Thinker for free mode + 'thinker-with-files-gemini': new Set(['google/gemini-3.1-pro-preview']), } /** From a88787235d0c050f7aa3c628c677ca48d508fbd0 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Fri, 10 Apr 2026 18:34:11 -0700 Subject: [PATCH 9/9] Update behavior of read-files to truncate after 100k chars --- sdk/src/__tests__/read-files.test.ts | 84 ++++++++++++++++++++-------- sdk/src/tools/read-files.ts | 26 +++++++-- 2 files changed, 82 insertions(+), 28 deletions(-) diff --git a/sdk/src/__tests__/read-files.test.ts b/sdk/src/__tests__/read-files.test.ts index e03f1e18eb..547bbfaa45 100644 --- a/sdk/src/__tests__/read-files.test.ts +++ b/sdk/src/__tests__/read-files.test.ts @@ -186,12 +186,13 @@ describe('getFiles', () => { }) describe('file too large', () => { - test('should return TOO_LARGE for files over 1MB', async () => { + test('should truncate files over 100k chars to 1k chars with message', async () => { + const largeContent 
= 'x'.repeat(101_000) // 101k chars - over limit const mockFs = createMockFs({ files: { '/project/large.bin': { - content: 'x', - size: 2 * 1024 * 1024, // 2MB + content: largeContent, + size: largeContent.length, }, }, }) @@ -202,28 +203,75 @@ describe('getFiles', () => { fs: mockFs, }) - expect(result['large.bin']).toContain(FILE_READ_STATUS.TOO_LARGE) - expect(result['large.bin']).toContain('2.00MB') + // Should contain first 1k chars + expect(result['large.bin']).toContain('x'.repeat(1000)) + // Should contain truncation message + expect(result['large.bin']).toContain('FILE_TOO_LARGE') + expect(result['large.bin']).toContain('101,000 chars') + }) + + test('should read files at exactly 100k chars', async () => { + const exactly100kContent = 'x'.repeat(100_000) // exactly 100k chars + const mockFs = createMockFs({ + files: { + '/project/exactly100k.bin': { + content: exactly100kContent, + size: exactly100kContent.length, + }, + }, + }) + + const result = await getFiles({ + filePaths: ['exactly100k.bin'], + cwd: '/project', + fs: mockFs, + }) + + // Should be read fully (no truncation message) + expect(result['exactly100k.bin']).toBe(exactly100kContent) + expect(result['exactly100k.bin']).not.toContain('FILE_TOO_LARGE') }) - test('should read files exactly at 1MB limit', async () => { - const oneMBContent = 'x'.repeat(1024 * 1024) + test('should reject files over 10MB without reading them', async () => { const mockFs = createMockFs({ files: { - '/project/exactly1mb.bin': { - content: oneMBContent, - size: 1024 * 1024, // exactly 1MB + '/project/huge.bin': { + content: 'x', + size: 15 * 1024 * 1024, // 15MB }, }, }) const result = await getFiles({ - filePaths: ['exactly1mb.bin'], + filePaths: ['huge.bin'], cwd: '/project', fs: mockFs, }) - expect(result['exactly1mb.bin']).toBe(oneMBContent) + expect(result['huge.bin']).toContain(FILE_READ_STATUS.TOO_LARGE) + expect(result['huge.bin']).toContain('15.0MB') + }) + + test('should read files just under 100k chars', 
async () => { + const justUnder100k = 'x'.repeat(99_000) // under limit + const mockFs = createMockFs({ + files: { + '/project/underlimit.bin': { + content: justUnder100k, + size: justUnder100k.length, + }, + }, + }) + + const result = await getFiles({ + filePaths: ['underlimit.bin'], + cwd: '/project', + fs: mockFs, + }) + + // Should be read fully (no truncation message) + expect(result['underlimit.bin']).toBe(justUnder100k) + expect(result['underlimit.bin']).not.toContain('FILE_TOO_LARGE') }) }) @@ -347,18 +395,6 @@ describe('getFiles', () => { }, }) - // Need to also make stat fail with same error - const originalStat = mockFs.stat - Object.assign(mockFs, { - stat: async (filePath: PathLike) => { - const pathStr = String(filePath) - if (pathStr === '/project/broken.ts') { - throw createNodeError('Permission denied', 'EACCES') - } - return originalStat(pathStr) - }, - }) - const result = await getFiles({ filePaths: ['broken.ts'], cwd: '/project', diff --git a/sdk/src/tools/read-files.ts b/sdk/src/tools/read-files.ts index e2d68b95fe..351eddfb54 100644 --- a/sdk/src/tools/read-files.ts +++ b/sdk/src/tools/read-files.ts @@ -28,7 +28,11 @@ export async function getFiles(params: { const hasCustomFilter = fileFilter !== undefined const result: Record = {} - const MAX_FILE_SIZE = 1024 * 1024 // 1MB in bytes + const MAX_FILE_BYTES = 10 * 1024 * 1024 // 10MB - skip reading entirely + const MAX_CHARS = 100_000 // 100k characters threshold + const TRUNCATE_TO_CHARS = 1_000 // Show first 1k chars when over limit + const numFmt = new Intl.NumberFormat('en-US') + const fmtNum = (n: number) => numFmt.format(n) for (const filePath of filePaths) { if (!filePath) { @@ -68,13 +72,27 @@ export async function getFiles(params: { } try { + // Safety check: skip reading files over 10MB to avoid OOM const stats = await fs.stat(fullPath) - if (stats.size > MAX_FILE_SIZE) { + if (stats.size > MAX_FILE_BYTES) { result[relativePath] = FILE_READ_STATUS.TOO_LARGE + - ` [${(stats.size / (1024 
* 1024)).toFixed(2)}MB]` + ` [${(stats.size / (1024 * 1024)).toFixed(1)}MB exceeds 10MB limit. Use code_search or glob to find specific content.]` + continue + } + + const content = await fs.readFile(fullPath, 'utf8') + + if (content.length > MAX_CHARS) { + const truncated = content.slice(0, TRUNCATE_TO_CHARS) + result[relativePath] = + truncated + + '\n\n[FILE_TOO_LARGE: This file is ' + + fmtNum(content.length) + + ' chars, exceeding the 100k char limit. Only the first ' + + fmtNum(TRUNCATE_TO_CHARS) + + ' chars are shown. Use other tools to read sections of the file.]' } else { - const content = await fs.readFile(fullPath, 'utf8') // Prepend TEMPLATE marker for example files result[relativePath] = isExampleFile ? FILE_READ_STATUS.TEMPLATE + '\n' + content