diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index 9888a360f2..3d504edfb2 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -87,7 +87,7 @@ export function createBase2( isFree && 'code-reviewer-lite', isDefault && 'code-reviewer', isMax && 'code-reviewer-multi-prompt', - isFree && 'thinker-gemini', + isFree && 'thinker-with-files-gemini', 'thinker-gpt', 'context-pruner', ), @@ -143,7 +143,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u ${buildArray( '- Spawn context-gathering agents (file pickers and web/docs researchers) before making edits. Use the code_search, list_directory, and glob tools directly for searching and exploring the codebase.', isFree && 'Do not spawn the thinker-gpt agent, unless the user asks. Not everyone has connected their ChatGPT subscription to Codebuff to allow for it.', - isFree && 'You should spawn the thinker-gemini agent whenever you encounter a complex problem or the user asks you to think about a problem. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it often!', + isFree && 'You must spawn the thinker-with-files-gemini agent to think through and plan the response to most requests, unless the request is trivial. This agent is extremely useful as it is very smart! You must pass the relevant filePaths when spawning it, since it does not have access to the conversation history.', isDefault && '- Spawn the editor agent to implement the changes after you have gathered all the context you need.', (isDefault || isMax) && @@ -206,7 +206,7 @@ ${buildArray( [ You read a few other relevant files using the read_files tool ]${!noAskUser ? `\n\n[ You ask the user for important clarifications on their request or alternate implementation strategies using the ask_user tool ]` : '' - } + }${isFree ? `\n\n[ You spawn the thinker-with-files-gemini agent with the relevant filePaths to plan the best response ]` : ''} ${isDefault ? 
`[ You implement the changes using the editor agent ]` : isFast || isFree @@ -334,7 +334,7 @@ ${buildArray( (isDefault || isMax) && `- For any task requiring 3+ steps, use the write_todos tool to write out your step-by-step implementation plan. Include ALL of the applicable tasks in the list.${isFast ? '' : ' You should include a step to review the changes after you have implemented the changes.'}:${hasNoValidation ? '' : ' You should include at least one step to validate/test your changes: be specific about whether to typecheck, run tests, run lints, etc.'} You may be able to do reviewing and validation in parallel in the same step. Skip write_todos for simple tasks like quick edits or answering questions.`, isFree && - `- For complex problems, spawn the thinker-gemini agent to help find the best solution. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it often!`, + `- For most requests, spawn the thinker-with-files-gemini agent to think through and plan the best response. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it about once per user request. Gather all the necessary context *before* spawning it, and pass the relevant filePaths since it does not have access to the conversation history.`, (isDefault || isMax) && `- For quick problems, briefly explain your reasoning to the user. If you need to think longer, write your thoughts within the tags. Finally, for complex problems, spawn the thinker agent to help find the best solution. (gpt-5-agent is a last resort for complex problems)`, isDefault && @@ -379,6 +379,8 @@ function buildImplementationStepPrompt({ isMax && `Keep working until the user's request is completely satisfied${!hasNoValidation ? 
' and validated' : ''}, or until you require more information from the user.`, 'You must use the skill tool to load any potentially relevant skills.', + isFree && + `You must spawn the thinker-with-files-gemini agent once per user request to plan the best response. Pass the relevant filePaths since it does not have access to the conversation history.`, isMax && `You must spawn the 'editor-multi-prompt' agent to implement code changes rather than using the str_replace or write_file tools, since it will generate the best code changes.`, (isDefault || isMax) && diff --git a/agents/basher.ts b/agents/basher.ts index 671437bff1..dc9dc689d1 100644 --- a/agents/basher.ts +++ b/agents/basher.ts @@ -11,7 +11,7 @@ const basher: AgentDefinition = { model: 'google/gemini-3.1-flash-lite-preview', displayName: 'Basher', spawnerPrompt: - 'Runs a single terminal command and describes its output using an LLM. A lightweight shell command executor.', + 'Runs a single terminal command and describes its output using an LLM. A lightweight shell command executor. Requires both a shell command and a prompt.', inputSchema: { prompt: { @@ -24,7 +24,7 @@ const basher: AgentDefinition = { properties: { command: { type: 'string', - description: 'Terminal command to run', + description: 'Terminal command to run in bash shell', }, timeout_seconds: { type: 'number', diff --git a/agents/thinker/thinker-gemini.ts b/agents/thinker/thinker-gemini.ts index b8ab3f1a59..015461ed29 100644 --- a/agents/thinker/thinker-gemini.ts +++ b/agents/thinker/thinker-gemini.ts @@ -7,10 +7,13 @@ const definition: SecretAgentDefinition = { id: 'thinker-gemini', model: 'google/gemini-3.1-pro-preview', providerOptions: undefined, + reasoningOptions: { + effort: 'low', + }, outputSchema: undefined, outputMode: 'last_message', inheritParentSystemPrompt: false, - instructionsPrompt: `You are the thinker-gemini agent. Think deeply about the user request and when satisfied, write out your response. 
+ instructionsPrompt: `You are the thinker-gemini agent. Think about the user request and when satisfied, write out a very concise response that captures the most important points. DO NOT be verbose -- say the absolute minimum needed to answer the user's question correctly. The parent agent will see your response. DO NOT call any tools. No need to spawn the thinker agent, because you are already the thinker agent. Just do the thinking work now.`, handleSteps: function* () { diff --git a/agents/thinker/thinker-with-files-gemini.ts b/agents/thinker/thinker-with-files-gemini.ts new file mode 100644 index 0000000000..0f9ec5ad33 --- /dev/null +++ b/agents/thinker/thinker-with-files-gemini.ts @@ -0,0 +1,61 @@ +import { publisher } from '../constants' + +import type { SecretAgentDefinition } from '../types/secret-agent-definition' + +const definition: SecretAgentDefinition = { + id: 'thinker-with-files-gemini', + publisher, + model: 'google/gemini-3.1-pro-preview', + displayName: 'Theo the Theorizer with Files (Gemini)', + reasoningOptions: { + effort: 'low', + }, + spawnerPrompt: + 'Does deep thinking given the prompt and provided files using Gemini. Use this to help you solve a specific problem. This agent has no context on the conversation history so it cannot see files you have read or previous discussion. Instead, you must provide all the relevant context via the prompt or filePaths for this agent to work well.', + inputSchema: { + prompt: { + type: 'string', + description: 'The problem you are trying to solve', + }, + params: { + type: 'object', + properties: { + filePaths: { + type: 'array', + items: { + type: 'string', + description: 'The path to a file', + }, + description: + 'A list of relevant file paths to read before thinking. 
Try to provide ALL the files that could be relevant to your request.', + }, + }, + required: ['filePaths'], + }, + }, + outputMode: 'last_message', + outputSchema: undefined, + includeMessageHistory: false, + inheritParentSystemPrompt: false, + spawnableAgents: [], + toolNames: [], + + instructionsPrompt: `You are the thinker-with-files-gemini agent. Think about the user request and when satisfied, write out a very concise response that captures the most important points. DO NOT be verbose -- say the absolute minimum needed to answer the user's question correctly. + +The parent agent will see your response. DO NOT call any tools. No need to spawn the thinker agent, because you are already the thinker agent. Just do the thinking work now.`, + + handleSteps: function* ({ params }) { + const filePaths = params?.filePaths as string[] | undefined + + if (filePaths && filePaths.length > 0) { + yield { + toolName: 'read_files', + input: { paths: filePaths }, + } + } + + yield 'STEP' + }, +} + +export default definition diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts index 2f44ca8a9a..e56e3fb58a 100644 --- a/common/src/constants/free-agents.ts +++ b/common/src/constants/free-agents.ts @@ -37,6 +37,9 @@ export const FREE_MODE_AGENT_MODELS: Record> = { // Code reviewer for free mode 'code-reviewer-lite': new Set(['minimax/minimax-m2.5']), + + // Thinker for free mode + 'thinker-with-files-gemini': new Set(['google/gemini-3.1-pro-preview']), } /** diff --git a/common/src/tools/params/tool/skill.ts b/common/src/tools/params/tool/skill.ts index bb8c18f7a7..a8640d6481 100644 --- a/common/src/tools/params/tool/skill.ts +++ b/common/src/tools/params/tool/skill.ts @@ -34,9 +34,11 @@ export const AVAILABLE_SKILLS_PLACEHOLDER = '{{AVAILABLE_SKILLS}}' // Base description - the full description with available skills is generated dynamically const baseDescription = `Load a skill by name to get its full instructions. 
Skills provide reusable behaviors and domain-specific knowledge that you can use to complete tasks. -The following are the only skills that are currently available (do not try to use any other skills): +The following are the pre-loaded skills available at session start: ${AVAILABLE_SKILLS_PLACEHOLDER} +Note: You can also load any skill that was created during this session by specifying its name. The skill will be loaded dynamically from disk. + Example: ${$getNativeToolCallExampleString({ toolName, diff --git a/evals/buffbench/eval-codebuff.json b/evals/buffbench/eval-codebuff.json index 9b5fac55d7..67ef66a02f 100644 --- a/evals/buffbench/eval-codebuff.json +++ b/evals/buffbench/eval-codebuff.json @@ -28,6 +28,7 @@ "STRIPE_SECRET_KEY": "test-stripe-key", "STRIPE_WEBHOOK_SECRET_KEY": "test-stripe-webhook", "STRIPE_TEAM_FEE_PRICE_ID": "test-team-price-id", + "STRIPE_USAGE_PRICE_ID": "test-usage-price-id", "LOOPS_API_KEY": "test-loops", "DISCORD_PUBLIC_KEY": "test-discord-public", "DISCORD_BOT_TOKEN": "test-discord-bot", diff --git a/evals/buffbench/main-hard-tasks.ts b/evals/buffbench/main-hard-tasks.ts index 989d049d97..0d03c20f0d 100644 --- a/evals/buffbench/main-hard-tasks.ts +++ b/evals/buffbench/main-hard-tasks.ts @@ -13,6 +13,8 @@ function loadTaskIds(evalPath: string): string[] { } async function main() { + const saveTraces = process.argv.includes('--save-traces') + const evalPaths = [ path.join(__dirname, 'eval-codebuff2.json'), path.join(__dirname, 'eval-manifold2.json'), @@ -33,6 +35,7 @@ async function main() { agents: ['base2', 'external:claude'], taskIds: allTaskIds, taskConcurrency: 4, + saveTraces, }) process.exit(0) diff --git a/evals/buffbench/main-nightly.ts b/evals/buffbench/main-nightly.ts index ff5f89980b..df3c6f0ea5 100644 --- a/evals/buffbench/main-nightly.ts +++ b/evals/buffbench/main-nightly.ts @@ -8,6 +8,8 @@ import type { MetaAnalysisResult } from './meta-analyzer' import type { AgentEvalResults } from './types' async function main() { + 
const saveTraces = process.argv.includes('--save-traces') + console.log('Starting nightly buffbench evaluation...') console.log('Eval set: codebuff') console.log() @@ -16,6 +18,7 @@ async function main() { evalDataPaths: [ path.join(__dirname, 'eval-codebuff.json')], agents: ['base2-free'], taskConcurrency: 3, + saveTraces, }) console.log('\nNightly buffbench evaluation completed successfully!') diff --git a/evals/buffbench/main-single-eval.ts b/evals/buffbench/main-single-eval.ts index 229251932f..bae330cdcf 100644 --- a/evals/buffbench/main-single-eval.ts +++ b/evals/buffbench/main-single-eval.ts @@ -3,10 +3,13 @@ import path from 'path' import { runBuffBench } from './run-buffbench' async function main() { + const saveTraces = process.argv.includes('--save-traces') + await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], agents: ['base2'], taskIds: ['filter-system-history'], + saveTraces, }) process.exit(0) diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index c96acbe0c0..aeb462abe3 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -3,13 +3,16 @@ import path from 'path' import { runBuffBench } from './run-buffbench' async function main() { + const saveTraces = process.argv.includes('--save-traces') + // Compare Codebuff agents against external CLI agents // Use 'external:claude' for Claude Code CLI // Use 'external:codex' for OpenAI Codex CLI await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], - agents: ['base2-free'], + agents: ['base2-free-evals'], taskConcurrency: 5, + saveTraces, }) process.exit(0) diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts index a086f092eb..c501425dd2 100644 --- a/evals/buffbench/run-buffbench.ts +++ b/evals/buffbench/run-buffbench.ts @@ -57,6 +57,7 @@ async function runTask(options: { printEvents: boolean finalCheckCommands?: string[] disableAnalysis?: boolean + saveTraces?: boolean }) { const { client, @@ -74,6 
+75,7 @@ async function runTask(options: { printEvents, finalCheckCommands, disableAnalysis, + saveTraces = false, } = options console.log( @@ -173,6 +175,21 @@ async function runTask(options: { finalCheckOutputs: agentResult.finalCheckOutputs, }) + // Save the agent trace to a separate file if saveTraces is enabled + if (saveTraces) { + const tracesDir = path.join(logsDir, 'traces') + if (!fs.existsSync(tracesDir)) { + fs.mkdirSync(tracesDir, { recursive: true }) + } + + // Save agent trace only (not judge traces) + const agentTracePath = path.join( + tracesDir, + `${index + 1}-${safeTaskId}-${safeAgentId}-${safeCommitShort}-agent.json`, + ) + fs.writeFileSync(agentTracePath, JSON.stringify(agentResult.trace, null, 2)) + } + fs.writeFileSync( tracePath, JSON.stringify(commitTraces[commitTraces.length - 1], null, 2), @@ -300,6 +317,7 @@ export async function runBuffBench(options: { taskIds?: string[] extractLessons?: boolean disableAnalysis?: boolean + saveTraces?: boolean }) { const { evalDataPaths, @@ -308,6 +326,7 @@ export async function runBuffBench(options: { taskIds, extractLessons = false, disableAnalysis = false, + saveTraces = false, } = options if (evalDataPaths.length === 0) { @@ -453,6 +472,7 @@ export async function runBuffBench(options: { printEvents: agents.length === 1 && taskConcurrency === 1, finalCheckCommands: evalData.finalCheckCommands, disableAnalysis, + saveTraces, }), ) }) diff --git a/packages/agent-runtime/src/tools/handlers/tool/skill.ts b/packages/agent-runtime/src/tools/handlers/tool/skill.ts index 0c2956a117..9eaf2ccb7a 100644 --- a/packages/agent-runtime/src/tools/handlers/tool/skill.ts +++ b/packages/agent-runtime/src/tools/handlers/tool/skill.ts @@ -1,4 +1,10 @@ import { jsonToolResult } from '@codebuff/common/util/messages' +import { SKILLS_DIR_NAME, SKILL_FILE_NAME } from '@codebuff/common/constants/skills' +import { SkillFrontmatterSchema, type SkillDefinition } from '@codebuff/common/types/skill' +import fs from 'fs' +import path 
from 'path' +import os from 'os' +import matter from 'gray-matter' import type { CodebuffToolHandlerFunction } from '../handler-function-type' import type { @@ -7,6 +13,73 @@ import type { } from '@codebuff/common/tools/list' import type { ProjectFileContext } from '@codebuff/common/util/file' +/** + * Dynamically load a single skill from disk. + * Used when a skill is not found in the pre-loaded cache but may have been created during the session. + */ +async function loadSkillFromDisk( + projectRoot: string, + skillName: string, +): Promise { + const home = os.homedir() + const skillsDirs = [ + // Global directories first + path.join(home, '.agents', SKILLS_DIR_NAME), + path.join(home, '.claude', SKILLS_DIR_NAME), + // Project directories (later takes precedence for overwriting) + path.join(projectRoot, '.agents', SKILLS_DIR_NAME), + path.join(projectRoot, '.claude', SKILLS_DIR_NAME), + ] + + for (const skillsDir of skillsDirs) { + const skillDir = path.join(skillsDir, skillName) + const skillFilePath = path.join(skillDir, SKILL_FILE_NAME) + + try { + // Check if the skill directory and file exist + const stat = fs.statSync(skillDir) + if (!stat.isDirectory()) continue + + fs.statSync(skillFilePath) // Will throw if file doesn't exist + + // Read and parse the skill file + const content = fs.readFileSync(skillFilePath, 'utf8') + const parsed = matter(content) + + if (!parsed.data || Object.keys(parsed.data).length === 0) { + continue + } + + // Validate frontmatter + const result = SkillFrontmatterSchema.safeParse(parsed.data) + if (!result.success) { + continue + } + + const frontmatter = result.data + + // Verify name matches directory name + if (frontmatter.name !== skillName) { + continue + } + + return { + name: frontmatter.name, + description: frontmatter.description, + content, + license: frontmatter.license, + filePath: skillFilePath, + metadata: frontmatter.metadata, + } + } catch { + // Skill doesn't exist in this directory, try the next one + continue + 
} + } + + return null +} + type ToolName = 'skill' export const handleSkill = (async (params: { @@ -20,14 +93,24 @@ export const handleSkill = (async (params: { await previousToolCallFinished const skills = fileContext.skills ?? {} - const skill = skills[name] + const cachedSkill = skills[name] + + // If skill not in cache, try to load it dynamically from disk + // This supports skills created during the session + const diskSkill = cachedSkill + ? null + : fileContext.projectRoot + ? await loadSkillFromDisk(fileContext.projectRoot, name) + : null + + const skill = cachedSkill ?? diskSkill if (!skill) { const availableSkills = Object.keys(skills) const suggestion = availableSkills.length > 0 - ? ` Available skills: ${availableSkills.join(', ')}` - : ' No skills are currently available.' + ? ` Available skills: ${availableSkills.join(', ')}. You can also load skills created during this session by name.` + : ' No skills are currently available. You can load skills created during this session by name.' 
return { output: jsonToolResult({ diff --git a/sdk/src/__tests__/read-files.test.ts b/sdk/src/__tests__/read-files.test.ts index e03f1e18eb..547bbfaa45 100644 --- a/sdk/src/__tests__/read-files.test.ts +++ b/sdk/src/__tests__/read-files.test.ts @@ -186,12 +186,13 @@ describe('getFiles', () => { }) describe('file too large', () => { - test('should return TOO_LARGE for files over 1MB', async () => { + test('should truncate files over 100k chars to 1k chars with message', async () => { + const largeContent = 'x'.repeat(101_000) // 101k chars - over limit const mockFs = createMockFs({ files: { '/project/large.bin': { - content: 'x', - size: 2 * 1024 * 1024, // 2MB + content: largeContent, + size: largeContent.length, }, }, }) @@ -202,28 +203,75 @@ describe('getFiles', () => { fs: mockFs, }) - expect(result['large.bin']).toContain(FILE_READ_STATUS.TOO_LARGE) - expect(result['large.bin']).toContain('2.00MB') + // Should contain first 1k chars + expect(result['large.bin']).toContain('x'.repeat(1000)) + // Should contain truncation message + expect(result['large.bin']).toContain('FILE_TOO_LARGE') + expect(result['large.bin']).toContain('101,000 chars') + }) + + test('should read files at exactly 100k chars', async () => { + const exactly100kContent = 'x'.repeat(100_000) // exactly 100k chars + const mockFs = createMockFs({ + files: { + '/project/exactly100k.bin': { + content: exactly100kContent, + size: exactly100kContent.length, + }, + }, + }) + + const result = await getFiles({ + filePaths: ['exactly100k.bin'], + cwd: '/project', + fs: mockFs, + }) + + // Should be read fully (no truncation message) + expect(result['exactly100k.bin']).toBe(exactly100kContent) + expect(result['exactly100k.bin']).not.toContain('FILE_TOO_LARGE') }) - test('should read files exactly at 1MB limit', async () => { - const oneMBContent = 'x'.repeat(1024 * 1024) + test('should reject files over 10MB without reading them', async () => { const mockFs = createMockFs({ files: { - 
'/project/exactly1mb.bin': { - content: oneMBContent, - size: 1024 * 1024, // exactly 1MB + '/project/huge.bin': { + content: 'x', + size: 15 * 1024 * 1024, // 15MB }, }, }) const result = await getFiles({ - filePaths: ['exactly1mb.bin'], + filePaths: ['huge.bin'], cwd: '/project', fs: mockFs, }) - expect(result['exactly1mb.bin']).toBe(oneMBContent) + expect(result['huge.bin']).toContain(FILE_READ_STATUS.TOO_LARGE) + expect(result['huge.bin']).toContain('15.0MB') + }) + + test('should read files just under 100k chars', async () => { + const justUnder100k = 'x'.repeat(99_000) // under limit + const mockFs = createMockFs({ + files: { + '/project/underlimit.bin': { + content: justUnder100k, + size: justUnder100k.length, + }, + }, + }) + + const result = await getFiles({ + filePaths: ['underlimit.bin'], + cwd: '/project', + fs: mockFs, + }) + + // Should be read fully (no truncation message) + expect(result['underlimit.bin']).toBe(justUnder100k) + expect(result['underlimit.bin']).not.toContain('FILE_TOO_LARGE') }) }) @@ -347,18 +395,6 @@ describe('getFiles', () => { }, }) - // Need to also make stat fail with same error - const originalStat = mockFs.stat - Object.assign(mockFs, { - stat: async (filePath: PathLike) => { - const pathStr = String(filePath) - if (pathStr === '/project/broken.ts') { - throw createNodeError('Permission denied', 'EACCES') - } - return originalStat(pathStr) - }, - }) - const result = await getFiles({ filePaths: ['broken.ts'], cwd: '/project', diff --git a/sdk/src/tools/read-files.ts b/sdk/src/tools/read-files.ts index e2d68b95fe..351eddfb54 100644 --- a/sdk/src/tools/read-files.ts +++ b/sdk/src/tools/read-files.ts @@ -28,7 +28,11 @@ export async function getFiles(params: { const hasCustomFilter = fileFilter !== undefined const result: Record = {} - const MAX_FILE_SIZE = 1024 * 1024 // 1MB in bytes + const MAX_FILE_BYTES = 10 * 1024 * 1024 // 10MB - skip reading entirely + const MAX_CHARS = 100_000 // 100k characters threshold + const 
TRUNCATE_TO_CHARS = 1_000 // Show first 1k chars when over limit + const numFmt = new Intl.NumberFormat('en-US') + const fmtNum = (n: number) => numFmt.format(n) for (const filePath of filePaths) { if (!filePath) { @@ -68,13 +72,27 @@ export async function getFiles(params: { } try { + // Safety check: skip reading files over 10MB to avoid OOM const stats = await fs.stat(fullPath) - if (stats.size > MAX_FILE_SIZE) { + if (stats.size > MAX_FILE_BYTES) { result[relativePath] = FILE_READ_STATUS.TOO_LARGE + - ` [${(stats.size / (1024 * 1024)).toFixed(2)}MB]` + ` [${(stats.size / (1024 * 1024)).toFixed(1)}MB exceeds 10MB limit. Use code_search or glob to find specific content.]` + continue + } + + const content = await fs.readFile(fullPath, 'utf8') + + if (content.length > MAX_CHARS) { + const truncated = content.slice(0, TRUNCATE_TO_CHARS) + result[relativePath] = + truncated + + '\n\n[FILE_TOO_LARGE: This file is ' + + fmtNum(content.length) + + ' chars, exceeding the 100k char limit. Only the first ' + + fmtNum(TRUNCATE_TO_CHARS) + + ' chars are shown. Use other tools to read sections of the file.]' } else { - const content = await fs.readFile(fullPath, 'utf8') // Prepend TEMPLATE marker for example files result[relativePath] = isExampleFile ? FILE_READ_STATUS.TEMPLATE + '\n' + content