Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions agents/base2/base2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ export function createBase2(
isFree && 'code-reviewer-lite',
isDefault && 'code-reviewer',
isMax && 'code-reviewer-multi-prompt',
isFree && 'thinker-gemini',
isFree && 'thinker-with-files-gemini',
'thinker-gpt',
'context-pruner',
),
Expand Down Expand Up @@ -143,7 +143,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u
${buildArray(
'- Spawn context-gathering agents (file pickers and web/docs researchers) before making edits. Use the code_search, list_directory, and glob tools directly for searching and exploring the codebase.',
isFree && 'Do not spawn the thinker-gpt agent, unless the user asks. Not everyone has connected their ChatGPT subscription to Codebuff to allow for it.',
isFree && 'You should spawn the thinker-gemini agent whenever you encounter a complex problem or the user asks you to think about a problem. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it often!',
isFree && 'You must spawn the thinker-with-files-gemini agent to think through and plan the response to most requests, unless the request is trivial. This agent is extremely useful as it is very smart! You must pass the relevant filePaths when spawning it, since it does not have access to the conversation history.',
isDefault &&
'- Spawn the editor agent to implement the changes after you have gathered all the context you need.',
(isDefault || isMax) &&
Expand Down Expand Up @@ -206,7 +206,7 @@ ${buildArray(
[ You read a few other relevant files using the read_files tool ]${!noAskUser
? `\n\n[ You ask the user for important clarifications on their request or alternate implementation strategies using the ask_user tool ]`
: ''
}
}${isFree ? `\n\n[ You spawn the thinker-with-files-gemini agent with the relevant filePaths to plan the best response ]` : ''}
${isDefault
? `[ You implement the changes using the editor agent ]`
: isFast || isFree
Expand Down Expand Up @@ -334,7 +334,7 @@ ${buildArray(
(isDefault || isMax) &&
`- For any task requiring 3+ steps, use the write_todos tool to write out your step-by-step implementation plan. Include ALL of the applicable tasks in the list.${isFast ? '' : ' You should include a step to review the changes after you have implemented the changes.'}:${hasNoValidation ? '' : ' You should include at least one step to validate/test your changes: be specific about whether to typecheck, run tests, run lints, etc.'} You may be able to do reviewing and validation in parallel in the same step. Skip write_todos for simple tasks like quick edits or answering questions.`,
isFree &&
`- For complex problems, spawn the thinker-gemini agent to help find the best solution. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it often!`,
`- For most requests, spawn the thinker-with-files-gemini agent to think through and plan the best response. This agent is extremely useful as it is very smart. You must take advantage of it and spawn it about once per user request. Gather all the necessary context *before* spawning it, and pass the relevant filePaths since it does not have access to the conversation history.`,
(isDefault || isMax) &&
`- For quick problems, briefly explain your reasoning to the user. If you need to think longer, write your thoughts within the <think> tags. Finally, for complex problems, spawn the thinker agent to help find the best solution. (gpt-5-agent is a last resort for complex problems)`,
isDefault &&
Expand Down Expand Up @@ -379,6 +379,8 @@ function buildImplementationStepPrompt({
isMax &&
`Keep working until the user's request is completely satisfied${!hasNoValidation ? ' and validated' : ''}, or until you require more information from the user.`,
'You must use the skill tool to load any potentially relevant skills.',
isFree &&
`You must spawn the thinker-with-files-gemini agent once per user request to plan the best response. Pass the relevant filePaths since it does not have access to the conversation history.`,
isMax &&
`You must spawn the 'editor-multi-prompt' agent to implement code changes rather than using the str_replace or write_file tools, since it will generate the best code changes.`,
(isDefault || isMax) &&
Expand Down
4 changes: 2 additions & 2 deletions agents/basher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ const basher: AgentDefinition = {
model: 'google/gemini-3.1-flash-lite-preview',
displayName: 'Basher',
spawnerPrompt:
'Runs a single terminal command and describes its output using an LLM. A lightweight shell command executor.',
'Runs a single terminal command and describes its output using an LLM. A lightweight shell command executor. Requires both a shell command and a prompt.',

inputSchema: {
prompt: {
Expand All @@ -24,7 +24,7 @@ const basher: AgentDefinition = {
properties: {
command: {
type: 'string',
description: 'Terminal command to run',
description: 'Terminal command to run in bash shell',
},
timeout_seconds: {
type: 'number',
Expand Down
5 changes: 4 additions & 1 deletion agents/thinker/thinker-gemini.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@ const definition: SecretAgentDefinition = {
id: 'thinker-gemini',
model: 'google/gemini-3.1-pro-preview',
providerOptions: undefined,
reasoningOptions: {
effort: 'low',
},
outputSchema: undefined,
outputMode: 'last_message',
inheritParentSystemPrompt: false,
instructionsPrompt: `You are the thinker-gemini agent. Think deeply about the user request and when satisfied, write out your response.
instructionsPrompt: `You are the thinker-gemini agent. Think about the user request and when satisfied, write out a very concise response that captures the most important points. DO NOT be verbose -- say the absolute minimum needed to answer the user's question correctly.

The parent agent will see your response. DO NOT call any tools. No need to spawn the thinker agent, because you are already the thinker agent. Just do the thinking work now.`,
handleSteps: function* () {
Expand Down
61 changes: 61 additions & 0 deletions agents/thinker/thinker-with-files-gemini.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { publisher } from '../constants'

import type { SecretAgentDefinition } from '../types/secret-agent-definition'

/**
 * Agent definition for `thinker-with-files-gemini`.
 *
 * The agent is configured without message history, without the parent's
 * system prompt, and with no tools or spawnable agents of its own. Its only
 * context is the spawn prompt plus the files listed in `params.filePaths`,
 * which `handleSteps` requests through `read_files` before the model runs.
 */
const definition: SecretAgentDefinition = {
  id: 'thinker-with-files-gemini',
  publisher,
  model: 'google/gemini-3.1-pro-preview',
  displayName: 'Theo the Theorizer with Files (Gemini)',
  reasoningOptions: { effort: 'low' },

  // Shown to parent agents so they know when (and how) to spawn this agent.
  spawnerPrompt:
    'Does deep thinking given the prompt and provided files using Gemini. Use this to help you solve a specific problem. This agent has no context on the conversation history so it cannot see files you have read or previous discussion. Instead, you must provide all the relevant context via the prompt or filePaths for this agent to work well.',

  inputSchema: {
    prompt: {
      type: 'string',
      description: 'The problem you are trying to solve',
    },
    params: {
      type: 'object',
      properties: {
        // Paths are mandatory (see `required` below) because the agent sees
        // nothing of the parent conversation.
        filePaths: {
          type: 'array',
          items: {
            type: 'string',
            description: 'The path to a file',
          },
          description:
            'A list of relevant file paths to read before thinking. Try to provide ALL the files that could be relevant to your request.',
        },
      },
      required: ['filePaths'],
    },
  },

  // Output: the final message is what gets returned; no structured schema.
  outputMode: 'last_message',
  outputSchema: undefined,

  // Isolation: start from a clean slate and expose no tools or sub-agents.
  includeMessageHistory: false,
  inheritParentSystemPrompt: false,
  spawnableAgents: [],
  toolNames: [],

  instructionsPrompt: `You are the thinker-with-files-gemini agent. Think about the user request and when satisfied, write out a very concise response that captures the most important points. DO NOT be verbose -- say the absolute minimum needed to answer the user's question correctly.

The parent agent will see your response. DO NOT call any tools. No need to spawn the thinker agent, because you are already the thinker agent. Just do the thinking work now.`,

  /**
   * Yields a `read_files` request for the supplied paths (skipped when none
   * were given), then yields 'STEP'.
   * NOTE(review): 'STEP' presumably hands control to the model for a single
   * step -- confirm against the agent runtime.
   */
  handleSteps: function* ({ params }) {
    // A missing list is treated the same as an empty one.
    const requestedPaths = (params?.filePaths as string[] | undefined) ?? []

    if (requestedPaths.length > 0) {
      yield {
        toolName: 'read_files',
        input: { paths: requestedPaths },
      }
    }

    yield 'STEP'
  },
}

export default definition
3 changes: 3 additions & 0 deletions common/src/constants/free-agents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {

// Code reviewer for free mode
'code-reviewer-lite': new Set(['minimax/minimax-m2.5']),

// Thinker for free mode
'thinker-with-files-gemini': new Set(['google/gemini-3.1-pro-preview']),
}

/**
Expand Down
4 changes: 3 additions & 1 deletion common/src/tools/params/tool/skill.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@ export const AVAILABLE_SKILLS_PLACEHOLDER = '{{AVAILABLE_SKILLS}}'
// Base description - the full description with available skills is generated dynamically
const baseDescription = `Load a skill by name to get its full instructions. Skills provide reusable behaviors and domain-specific knowledge that you can use to complete tasks.

The following are the only skills that are currently available (do not try to use any other skills):
The following are the pre-loaded skills available at session start:
${AVAILABLE_SKILLS_PLACEHOLDER}

Note: You can also load any skill that was created during this session by specifying its name. The skill will be loaded dynamically from disk.

Example:
${$getNativeToolCallExampleString({
toolName,
Expand Down
1 change: 1 addition & 0 deletions evals/buffbench/eval-codebuff.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"STRIPE_SECRET_KEY": "test-stripe-key",
"STRIPE_WEBHOOK_SECRET_KEY": "test-stripe-webhook",
"STRIPE_TEAM_FEE_PRICE_ID": "test-team-price-id",
"STRIPE_USAGE_PRICE_ID": "test-usage-price-id",
"LOOPS_API_KEY": "test-loops",
"DISCORD_PUBLIC_KEY": "test-discord-public",
"DISCORD_BOT_TOKEN": "test-discord-bot",
Expand Down
3 changes: 3 additions & 0 deletions evals/buffbench/main-hard-tasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ function loadTaskIds(evalPath: string): string[] {
}

async function main() {
const saveTraces = process.argv.includes('--save-traces')

const evalPaths = [
path.join(__dirname, 'eval-codebuff2.json'),
path.join(__dirname, 'eval-manifold2.json'),
Expand All @@ -33,6 +35,7 @@ async function main() {
agents: ['base2', 'external:claude'],
taskIds: allTaskIds,
taskConcurrency: 4,
saveTraces,
})

process.exit(0)
Expand Down
3 changes: 3 additions & 0 deletions evals/buffbench/main-nightly.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import type { MetaAnalysisResult } from './meta-analyzer'
import type { AgentEvalResults } from './types'

async function main() {
const saveTraces = process.argv.includes('--save-traces')

console.log('Starting nightly buffbench evaluation...')
console.log('Eval set: codebuff')
console.log()
Expand All @@ -16,6 +18,7 @@ async function main() {
evalDataPaths: [ path.join(__dirname, 'eval-codebuff.json')],
agents: ['base2-free'],
taskConcurrency: 3,
saveTraces,
})

console.log('\nNightly buffbench evaluation completed successfully!')
Expand Down
3 changes: 3 additions & 0 deletions evals/buffbench/main-single-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ import path from 'path'
import { runBuffBench } from './run-buffbench'

async function main() {
const saveTraces = process.argv.includes('--save-traces')

await runBuffBench({
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
agents: ['base2'],
taskIds: ['filter-system-history'],
saveTraces,
})

process.exit(0)
Expand Down
5 changes: 4 additions & 1 deletion evals/buffbench/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@ import path from 'path'
import { runBuffBench } from './run-buffbench'

async function main() {
const saveTraces = process.argv.includes('--save-traces')

// Compare Codebuff agents against external CLI agents
// Use 'external:claude' for Claude Code CLI
// Use 'external:codex' for OpenAI Codex CLI
await runBuffBench({
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
agents: ['base2-free'],
agents: ['base2-free-evals'],
taskConcurrency: 5,
saveTraces,
})

process.exit(0)
Expand Down
20 changes: 20 additions & 0 deletions evals/buffbench/run-buffbench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ async function runTask(options: {
printEvents: boolean
finalCheckCommands?: string[]
disableAnalysis?: boolean
saveTraces?: boolean
}) {
const {
client,
Expand All @@ -74,6 +75,7 @@ async function runTask(options: {
printEvents,
finalCheckCommands,
disableAnalysis,
saveTraces = false,
} = options

console.log(
Expand Down Expand Up @@ -173,6 +175,21 @@ async function runTask(options: {
finalCheckOutputs: agentResult.finalCheckOutputs,
})

// Save the agent trace to its own file if saveTraces is enabled
if (saveTraces) {
const tracesDir = path.join(logsDir, 'traces')
if (!fs.existsSync(tracesDir)) {
fs.mkdirSync(tracesDir, { recursive: true })
}

// Save agent trace only (not judge traces)
const agentTracePath = path.join(
tracesDir,
`${index + 1}-${safeTaskId}-${safeAgentId}-${safeCommitShort}-agent.json`,
)
fs.writeFileSync(agentTracePath, JSON.stringify(agentResult.trace, null, 2))
}

fs.writeFileSync(
tracePath,
JSON.stringify(commitTraces[commitTraces.length - 1], null, 2),
Expand Down Expand Up @@ -300,6 +317,7 @@ export async function runBuffBench(options: {
taskIds?: string[]
extractLessons?: boolean
disableAnalysis?: boolean
saveTraces?: boolean
}) {
const {
evalDataPaths,
Expand All @@ -308,6 +326,7 @@ export async function runBuffBench(options: {
taskIds,
extractLessons = false,
disableAnalysis = false,
saveTraces = false,
} = options

if (evalDataPaths.length === 0) {
Expand Down Expand Up @@ -453,6 +472,7 @@ export async function runBuffBench(options: {
printEvents: agents.length === 1 && taskConcurrency === 1,
finalCheckCommands: evalData.finalCheckCommands,
disableAnalysis,
saveTraces,
}),
)
})
Expand Down
Loading
Loading