diff --git a/scenarios/chat-agent-advanced/scenario.ts b/scenarios/chat-agent-advanced/scenario.ts index b044b2f..36ec8d2 100644 --- a/scenarios/chat-agent-advanced/scenario.ts +++ b/scenarios/chat-agent-advanced/scenario.ts @@ -13,6 +13,9 @@ export const chatAgentAdvanced: Scenario = { "commands; only create and edit files. When finished, briefly list the files you created.", ].join(" "), fixtureDir: fileURLToPath(new URL("../../fixtures/backend-ts", import.meta.url)), + // The hardest skill (sessions/HITL/sub-agents): the full HITL task legitimately needs + // longer than the global default, so both arms were timing out at 300s. + timeoutMs: 600_000, assertions: [ fileMatches("uses-chat-agent", "defines an agent with chat.agent", /chat\.agent\(/), noSdkV3(), diff --git a/src/runner.ts b/src/runner.ts index d11501e..051d0f1 100644 --- a/src/runner.ts +++ b/src/runner.ts @@ -63,7 +63,8 @@ function parseMetrics(stdout: string): RunMetrics | undefined { async function runAgent( dir: string, prompt: string, - config: EvalConfig + config: EvalConfig, + timeoutMs: number ): Promise<{ error?: string; metrics?: RunMetrics }> { // --output-format json gives us num_turns + token usage + cost as a final JSON line. const args = ["-p", prompt, "--permission-mode", config.permissionMode, "--output-format", "json"]; @@ -72,7 +73,7 @@ async function runAgent( try { const { stdout, stderr } = await exec("claude", args, { cwd: dir, - timeout: config.timeoutMs, + timeout: timeoutMs, maxBuffer: 64 * 1024 * 1024, }); await writeFile(log, `${stdout}\n${stderr}`).catch(() => {}); @@ -101,7 +102,12 @@ export async function runArmSample( if (arm === "withskills") await installSkills(runDir, config); - const { error: agentError, metrics } = await runAgent(runDir, scenario.prompt, config); + const { error: agentError, metrics } = await runAgent( + runDir, + scenario.prompt, + config, + scenario.timeoutMs ?? config.timeoutMs + ); const ctx: RunContext = { dir: runDir, arm, sample }; const result = await gradeRun(scenario, ctx, await relFiles(runDir), agentError); result.metrics = metrics; diff --git a/src/types.ts b/src/types.ts index a5f6a3b..7864612 100644 --- a/src/types.ts +++ b/src/types.ts @@ -65,6 +65,9 @@ export interface Scenario { /** Absolute path to the fixture template copied into each run. */ fixtureDir: string; assertions: Assertion[]; + /** Per-agent-run timeout for this scenario. Overrides config.timeoutMs (e.g. heavier + * scenarios that legitimately need longer than the global default). */ + timeoutMs?: number; } /** Cost signal for a single agent run, parsed from `claude -p --output-format json`. */