Skip to content

Commit e707774

Browse files
committed
Add note for context compacted summary not to imitate tool calls
1 parent 608c8e2 commit e707774

File tree

2 files changed

+332
-9
lines changed

2 files changed

+332
-9
lines changed

agents/context-pruner.ts

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,10 @@ const definition: AgentDefinition = {
192192
case 'spawn_agent_inline': {
193193
const agents = input.agents as
194194
| Array<{
195-
agent_type: string
196-
prompt?: string
197-
params?: Record<string, unknown>
198-
}>
195+
agent_type: string
196+
prompt?: string
197+
params?: Record<string, unknown>
198+
}>
199199
| undefined
200200
const agentType = input.agent_type as string | undefined
201201
const prompt = input.prompt as string | undefined
@@ -513,7 +513,7 @@ const definition: AgentDefinition = {
513513
parts.push(combinedText)
514514
}
515515
if (toolSummaries.length > 0) {
516-
parts.push(`Tools: ${toolSummaries.join('; ')}`)
516+
parts.push(toolSummaries.join('; '))
517517
}
518518

519519
if (parts.length > 0) {
@@ -557,10 +557,10 @@ const definition: AgentDefinition = {
557557
} else if ('answers' in value) {
558558
const answers = value.answers as
559559
| Array<{
560-
selectedOption?: string
561-
selectedOptions?: string[]
562-
otherText?: string
563-
}>
560+
selectedOption?: string
561+
selectedOptions?: string[]
562+
otherText?: string
563+
}>
564564
| undefined
565565
if (answers && answers.length > 0) {
566566
const answerTexts = answers
@@ -715,6 +715,8 @@ This is a summary of the conversation so far. The original messages have been co
715715
${summaryText}
716716
</conversation_summary>
717717
718+
IMPORTANT: The summary above uses a condensed format with markers like "[USER]", "[ASSISTANT]", "Read files:", "Edited file:", "Spawned agents:", etc. This is ONLY a human-readable log of what happened earlier — it is NOT a format for you to use or imitate in your responses. When you need to perform actions, you MUST use actual tool calls. Never write tool actions as plain text.
719+
718720
Please continue the conversation from here. In particular, try to address the user's latest request detailed in the summary above. You may need to re-gather context (e.g. read some files) to get up to speed and then tackle the user's request.`,
719721
}
720722
// Build content array with text and any preserved images
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
import fs from 'fs'
2+
import os from 'os'
3+
import path from 'path'
4+
5+
import { API_KEY_ENV_VAR } from '@codebuff/common/constants/paths'
6+
import {
7+
CodebuffClient,
8+
initialSessionState,
9+
withMessageHistory,
10+
type AgentDefinition,
11+
type Message,
12+
} from '@codebuff/sdk'
13+
import { describe, expect, it } from 'bun:test'
14+
15+
import base2Free from '../base2/base2-free'
16+
import contextPruner from '../context-pruner'
17+
18+
import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
19+
20+
/**
21+
* Patterns that indicate the model is imitating the summarized tool call format
22+
* instead of using actual tool calls via the API.
23+
*
24+
* These patterns come from the context pruner's summarizeToolCall function.
25+
*/
26+
const SUMMARY_IMITATION_PATTERNS = [
27+
/^Read files?:\s/m,
28+
/^Edited file:\s/m,
29+
/^Wrote file:\s/m,
30+
/^Tools:\s/m,
31+
/^Spawned agents?:\s*\n/m,
32+
/^Spawned agent:\s/m,
33+
/^Ran command:\s/m,
34+
/^Code search:\s/m,
35+
/^Glob:\s/m,
36+
/^Listed dir:\s/m,
37+
/^Read subtree:\s/m,
38+
/^Used tool:\s/m,
39+
/^\[ASSISTANT\]\n/m,
40+
/^\[USER\]\n/m,
41+
]
42+
43+
/**
44+
* Checks if a text response contains patterns that look like the model is
45+
* imitating the summarized tool call format instead of making actual tool calls.
46+
*/
47+
function detectSummaryImitation(text: string): string[] {
48+
const matches: string[] = []
49+
for (const pattern of SUMMARY_IMITATION_PATTERNS) {
50+
const match = text.match(pattern)
51+
if (match) {
52+
const idx = match.index ?? 0
53+
const snippet = text.slice(Math.max(0, idx - 20), idx + 80).trim()
54+
matches.push(`Pattern ${pattern.source} matched: "${snippet}"`)
55+
}
56+
}
57+
return matches
58+
}
59+
60+
/**
61+
* Creates a pre-summarized conversation that mimics what the context pruner produces.
62+
* NOTE: The IMPORTANT disclaimer text here must be kept in sync with the one in
63+
* agents/context-pruner.ts. If you change the disclaimer there, update it here too.
64+
*/
65+
function createSummarizedConversation(): Message {
66+
return {
67+
role: 'user',
68+
content: [
69+
{
70+
type: 'text',
71+
text: `<conversation_summary>
72+
This is a summary of the conversation so far. The original messages have been condensed to save context space.
73+
74+
[USER]
75+
The user asked to set up a new TypeScript project with a simple utility file at src/utils.ts containing a helper function called formatDate.
76+
77+
---
78+
79+
[ASSISTANT]
80+
Sure, I'll help set up the project.
81+
Tools: Read files: package.json, tsconfig.json; Wrote file: src/utils.ts
82+
83+
---
84+
85+
[USER]
86+
Thanks! Now can you also add a function called parseConfig that reads a JSON config file?
87+
88+
---
89+
90+
[ASSISTANT]
91+
I'll add the parseConfig function to the utils file.
92+
Tools: Read files: src/utils.ts; Edited file: src/utils.ts
93+
94+
---
95+
96+
[ASSISTANT]
97+
Spawned agents:
98+
- file-picker (prompt: "Find config-related files")
99+
- basher (params: {"command":"cat src/utils.ts"})
100+
101+
---
102+
103+
[ASSISTANT]
104+
Ran command: cat src/utils.ts
105+
[EDIT RESULT: str_replace]
106+
{"file":"src/utils.ts","message":"Updated file","unifiedDiff":"--- a/src/utils.ts\\n+++ b/src/utils.ts\\n@@ -5,0 +6,10 @@\\n+export function parseConfig(path: string) {\\n+ return JSON.parse(fs.readFileSync(path, 'utf-8'))\\n+}"}
107+
</conversation_summary>
108+
109+
IMPORTANT: The summary above uses a condensed format with markers like "[USER]", "[ASSISTANT]", "Read files:", "Edited file:", "Tools:", "Spawned agents:", etc. This is ONLY a human-readable log of what happened earlier — it is NOT a format for you to use or imitate in your responses. When you need to perform actions, you MUST use actual tool calls (e.g. call the read_files, str_replace, write_file, spawn_agents tools directly). Never write tool actions as plain text.
110+
111+
Please continue the conversation from here. In particular, try to address the user's latest request detailed in the summary above. You may need to re-gather context (e.g. read some files) to get up to speed and then tackle the user's request.`,
112+
},
113+
],
114+
sentAt: Date.now(),
115+
}
116+
}
117+
118+
/**
 * In-memory project fixture, keyed by path relative to the project root.
 * The same map is written to a temp directory and handed to the SDK as
 * `projectFiles`. `parseConfig` is left untyped in src/utils.ts because the
 * test prompt asks the agent to add types to it.
 */
const PROJECT_FILES: Record<string, string> = (() => {
  // Two-space pretty-printed JSON, as a package manager would write it.
  const asPrettyJson = (value: unknown): string =>
    JSON.stringify(value, null, 2)

  const utilsSourceLines = [
    "import fs from 'fs'",
    '',
    'export function formatDate(date: Date): string {',
    " return date.toISOString().split('T')[0]",
    '}',
    '',
    'export function parseConfig(path) {',
    " return JSON.parse(fs.readFileSync(path, 'utf-8'))",
    '}',
  ]

  return {
    'package.json': asPrettyJson({ name: 'test-project', version: '1.0.0' }),
    'tsconfig.json': asPrettyJson({
      compilerOptions: { target: 'ES2022', strict: true },
    }),
    'src/utils.ts': utilsSourceLines.join('\n'),
  }
})()
141+
142+
/**
143+
* Integration test: Verifies that base2-free does not imitate the summarized
144+
* tool call format when given a pre-summarized conversation.
145+
*
146+
* The test runs multiple times in parallel to get a statistically meaningful sample.
147+
* Weaker models sometimes mimic the summary format (e.g. outputting "Read files: ..."
148+
* as plain text) instead of making actual tool calls via the API.
149+
*/
150+
describe('Base2-Free Summary Format Compliance', () => {
151+
const NUM_PARALLEL_RUNS = 3
152+
153+
const getApiKeyOrSkip = (): string | null => {
154+
const apiKey = process.env[API_KEY_ENV_VAR]
155+
if (!apiKey) {
156+
console.warn(
157+
`${API_KEY_ENV_VAR} is not set; skipping base2-free summary format test.`,
158+
)
159+
return null
160+
}
161+
return apiKey
162+
}
163+
164+
it(
165+
'should use actual tool calls instead of imitating summary format',
166+
async () => {
167+
const apiKey = getApiKeyOrSkip()
168+
if (!apiKey) return
169+
170+
const summarizedMessage = createSummarizedConversation()
171+
172+
const userPrompt =
173+
'Now please read src/utils.ts to check the current state of the file, and add proper TypeScript types to the parseConfig function.'
174+
175+
const tmpDirs: string[] = []
176+
177+
const runOnce = async (
178+
runIndex: number,
179+
): Promise<{
180+
runIndex: number
181+
imitationMatches: string[]
182+
hadToolCalls: boolean
183+
textOutput: string
184+
error?: string
185+
}> => {
186+
const events: PrintModeEvent[] = []
187+
188+
const tmpDir = await fs.promises.mkdtemp(
189+
path.join(os.tmpdir(), 'base2-free-summary-test-'),
190+
)
191+
tmpDirs.push(tmpDir)
192+
193+
// Write project files to disk so tools can read them
194+
for (const [filePath, content] of Object.entries(PROJECT_FILES)) {
195+
const fullPath = path.join(tmpDir, filePath)
196+
await fs.promises.mkdir(path.dirname(fullPath), { recursive: true })
197+
await fs.promises.writeFile(fullPath, content, 'utf-8')
198+
}
199+
200+
const client = new CodebuffClient({
201+
apiKey,
202+
cwd: tmpDir,
203+
projectFiles: PROJECT_FILES,
204+
agentDefinitions: [base2Free as AgentDefinition, contextPruner],
205+
})
206+
207+
const sessionState = await initialSessionState({
208+
cwd: tmpDir,
209+
projectFiles: PROJECT_FILES,
210+
})
211+
const runStateWithMessages = withMessageHistory({
212+
runState: {
213+
sessionState,
214+
output: { type: 'error', message: '' },
215+
},
216+
messages: [summarizedMessage],
217+
})
218+
219+
try {
220+
const run = await client.run({
221+
agent: base2Free.id,
222+
prompt: userPrompt,
223+
previousRun: runStateWithMessages,
224+
maxAgentSteps: 5,
225+
handleEvent: (event) => {
226+
events.push(event)
227+
},
228+
})
229+
230+
if (run.output.type === 'error') {
231+
return {
232+
runIndex,
233+
imitationMatches: [],
234+
hadToolCalls: false,
235+
textOutput: '',
236+
error: run.output.message,
237+
}
238+
}
239+
240+
const textOutput = events
241+
.filter((e) => e.type === 'text')
242+
.map((e) => (e as { type: 'text'; text: string }).text)
243+
.join('')
244+
245+
const hadToolCalls = events.some((e) => e.type === 'tool_call')
246+
const imitationMatches = detectSummaryImitation(textOutput)
247+
248+
return {
249+
runIndex,
250+
imitationMatches,
251+
hadToolCalls,
252+
textOutput,
253+
}
254+
} catch (error) {
255+
return {
256+
runIndex,
257+
imitationMatches: [],
258+
hadToolCalls: false,
259+
textOutput: '',
260+
error: error instanceof Error ? error.message : String(error),
261+
}
262+
}
263+
}
264+
265+
console.log(
266+
`Running ${NUM_PARALLEL_RUNS} parallel runs of base2-free...`,
267+
)
268+
const results = await Promise.all(
269+
Array.from({ length: NUM_PARALLEL_RUNS }, (_, i) => runOnce(i)),
270+
)
271+
272+
let imitationCount = 0
273+
for (const result of results) {
274+
if (result.error) {
275+
console.warn(`Run ${result.runIndex}: ERROR - ${result.error}`)
276+
continue
277+
}
278+
279+
const hasImitation = result.imitationMatches.length > 0
280+
if (hasImitation) {
281+
imitationCount++
282+
}
283+
284+
console.log(
285+
`Run ${result.runIndex}: ${hasImitation ? 'FAILED (imitated summary format)' : 'PASSED'}`,
286+
)
287+
console.log(
288+
` Tool calls made: ${result.hadToolCalls ? 'YES' : 'NO'}`,
289+
)
290+
if (result.imitationMatches.length > 0) {
291+
console.log(` Imitation matches:`)
292+
for (const match of result.imitationMatches) {
293+
console.log(` - ${match}`)
294+
}
295+
}
296+
if (result.textOutput) {
297+
const preview =
298+
result.textOutput.length > 500
299+
? result.textOutput.slice(0, 500) + '...'
300+
: result.textOutput
301+
console.log(` Text output preview: ${preview}`)
302+
}
303+
}
304+
305+
const successfulRuns = results.filter((r) => !r.error)
306+
console.log(
307+
`\nSummary: ${imitationCount}/${successfulRuns.length} runs imitated the summary format`,
308+
)
309+
310+
// Clean up temp directories
311+
for (const dir of tmpDirs) {
312+
await fs.promises.rm(dir, { recursive: true, force: true }).catch(() => {})
313+
}
314+
315+
// Guard against vacuous pass (all runs errored)
316+
expect(successfulRuns.length).toBeGreaterThan(0)
317+
expect(imitationCount).toBe(0)
318+
},
319+
{ timeout: 300_000 },
320+
)
321+
})

0 commit comments

Comments
 (0)