diff --git a/.changeset/tall-nights-care.md b/.changeset/tall-nights-care.md new file mode 100644 index 000000000..03e6b31f5 --- /dev/null +++ b/.changeset/tall-nights-care.md @@ -0,0 +1,5 @@ +--- +"braintrust": patch +--- + +fix(claude-agent-sdk): Nest built-in tools under sub-agents diff --git a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.1.span-events.json b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.1.span-events.json index 78a9015fa..dc7c8bf54 100644 --- a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.1.span-events.json +++ b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.1.span-events.json @@ -291,7 +291,7 @@ "metric_keys": [], "name": "Claude Agent", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], @@ -307,7 +307,7 @@ "metric_keys": [], "name": "tool: calculator/calculator", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], diff --git a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.76.span-events.json b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.76.span-events.json index 1a12cdb33..0c96f2827 100644 --- a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.76.span-events.json +++ b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.76.span-events.json @@ -293,7 +293,7 @@ "metric_keys": [], "name": "Claude Agent", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], @@ -309,7 +309,7 @@ "metric_keys": [], "name": "tool: calculator/calculator", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], diff --git a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.79.span-events.json b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.79.span-events.json index 1a12cdb33..0c96f2827 100644 --- a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.79.span-events.json +++ b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.79.span-events.json @@ -293,7 +293,7 @@ "metric_keys": [], "name": "Claude Agent", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], @@ -309,7 +309,7 @@ "metric_keys": [], "name": "tool: calculator/calculator", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], diff --git a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.81.span-events.json b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.81.span-events.json index 1a12cdb33..0c96f2827 100644 --- a/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.81.span-events.json +++ b/e2e/scenarios/claude-agent-sdk-instrumentation/__snapshots__/claude-agent-sdk-v0.2.81.span-events.json @@ -293,7 +293,7 @@ "metric_keys": [], "name": "Claude Agent", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], @@ -309,7 +309,7 @@ "metric_keys": [], "name": "tool: calculator/calculator", "root_span_id": "", - "span_id": "", + "span_id": "", "span_parents": [ "" ], diff --git a/e2e/scenarios/claude-agent-sdk-instrumentation/assertions.ts b/e2e/scenarios/claude-agent-sdk-instrumentation/assertions.ts index ffc8e6b06..f47ed9230 100644 --- a/e2e/scenarios/claude-agent-sdk-instrumentation/assertions.ts +++ b/e2e/scenarios/claude-agent-sdk-instrumentation/assertions.ts @@ -175,6 +175,38 @@ function findSpanById( return events.find((event) => event.span.id === spanId); } +function isDescendantOf( + events: CapturedLogEvent[], + event: CapturedLogEvent | undefined, + ancestorId: string | undefined, +): boolean { + if (!event || !ancestorId) { + return false; + } + + const spanById = new Map(events.map((span) => [span.span.id, span] as const)); + const queue = [...event.span.parentIds]; + const visited = new Set(); + + while (queue.length > 0) { + const parentId = queue.shift(); + if (!parentId || visited.has(parentId)) { + continue; + } + if (parentId === ancestorId) { + return true; + } + + visited.add(parentId); + const parentSpan = spanById.get(parentId); + if (parentSpan) { + queue.push(...parentSpan.span.parentIds); + } + } + + return false; +} + function hasSubAgentHandoffToolName( event: CapturedLogEvent | undefined, ): boolean { @@ -191,10 +223,13 @@ function hasSubAgentHandoffToolName( function findSubAgentTaskSpan( events: CapturedLogEvent[], + ancestorId?: string, ): CapturedLogEvent | undefined { return events.find( (event) => - event.span.type === "task" && event.span.name?.startsWith("Agent:"), + event.span.type === "task" && + event.span.name?.startsWith("Agent:") && + (!ancestorId || isDescendantOf(events, event, ancestorId)), ); } @@ -208,6 +243,31 @@ function findSubAgentHandoffTool( return hasSubAgentHandoffToolName(parentSpan) ? parentSpan : undefined; } +function findLatestTaskLlmBeforeSpan( + events: CapturedLogEvent[], + taskId: string | undefined, + childStartTime: number | undefined, +): CapturedLogEvent | undefined { + return findChildSpans(events, "anthropic.messages.create", taskId) + .filter((event) => { + if (childStartTime === undefined) { + return true; + } + return ( + Number(event.metrics?.start ?? Number.NaN) <= Number(childStartTime) + ); + }) + .at(-1); +} + +function findOperationTaskRoot( + events: CapturedLogEvent[], + operationName: string, +): CapturedLogEvent | undefined { + const operation = findLatestSpan(events, operationName); + return findChildSpans(events, "Claude Agent", operation?.span.id).at(-1); +} + function buildSpanSummary(events: CapturedLogEvent[]): Json { const root = findLatestSpan(events, ROOT_NAME); const basicOperation = findLatestSpan(events, "claude-agent-basic-operation"); @@ -258,7 +318,7 @@ function buildSpanSummary(events: CapturedLogEvent[]): Json { const input = event.input as Array<{ content?: string }> | undefined; return Array.isArray(input) && input.some((item) => item.content); }); - const subAgentTask = findSubAgentTaskSpan(events); + const subAgentTask = findSubAgentTaskSpan(events, subAgentTaskRoot?.span.id); const subAgentLlm = findChildSpans( events, "anthropic.messages.create", @@ -274,9 +334,16 @@ function buildSpanSummary(events: CapturedLogEvent[]): Json { const basicTool = findToolSpanByLocalHandler(events, "calculator-local-handler-multiply") ?? findToolSpanByOperation(events, "multiply"); - const subAgentTool = + const subAgentToolCandidate = findToolSpanByLocalHandler(events, "calculator-local-handler-add") ?? findToolSpanByOperation(events, "add"); + const subAgentTool = isDescendantOf( + events, + subAgentToolCandidate, + subAgentTask?.span.id, + ) + ? subAgentToolCandidate + : undefined; const failureTool = findToolSpanByLocalHandler(events, "calculator-local-handler-divide") ?? findToolSpanByOperation(events, "divide"); @@ -453,16 +520,23 @@ export function defineClaudeAgentSDKInstrumentationAssertions(options: { events, "claude-agent-subagent-operation", ); - const taskRoot = findChildSpans( + const taskRoot = findOperationTaskRoot( events, - "Claude Agent", - operation?.span.id, - ).at(-1); - const llm = findAllSpans(events, "anthropic.messages.create").find( - (event) => event.span.parentIds.includes(taskRoot?.span.id ?? ""), + "claude-agent-subagent-operation", ); - const nestedTask = findSubAgentTaskSpan(events); + const nestedTask = findSubAgentTaskSpan(events, taskRoot?.span.id); const handoffTool = findSubAgentHandoffTool(events, nestedTask); + const llm = findLatestTaskLlmBeforeSpan( + events, + taskRoot?.span.id, + typeof handoffTool?.metrics?.start === "number" + ? handoffTool.metrics.start + : undefined, + ); + const handoffToolParent = findSpanById( + events, + handoffTool?.span.parentIds[0], + ); const nestedTaskLlm = findChildSpans( events, "anthropic.messages.create", @@ -490,6 +564,10 @@ export function defineClaudeAgentSDKInstrumentationAssertions(options: { } expect(handoffTool).toBeDefined(); expect(hasSubAgentHandoffToolName(handoffTool)).toBe(true); + expect(handoffToolParent?.span.id).toBe(taskRoot?.span.id); + expect(Number(llm?.metrics?.start ?? Number.NaN)).toBeLessThanOrEqual( + Number(handoffTool?.metrics?.start ?? Number.NaN), + ); expect(nestedTask?.span.parentIds).toEqual([handoffTool?.span.id ?? ""]); expect(nestedTaskLlm).toBeDefined(); expect(nestedTaskLlm?.span.parentIds).toContain( @@ -508,6 +586,74 @@ export function defineClaudeAgentSDKInstrumentationAssertions(options: { } }); + if (options.expectTaskLifecycleDetails) { + test( + "orders built-in Agent and Bash after their llm siblings", + testConfig, + () => { + const operation = findLatestSpan( + events, + "claude-agent-subagent-built-in-tool-operation", + ); + const taskRoot = findOperationTaskRoot( + events, + "claude-agent-subagent-built-in-tool-operation", + ); + const nestedTask = findSubAgentTaskSpan(events, taskRoot?.span.id); + const handoffTool = findSubAgentHandoffTool(events, nestedTask); + const taskRootLlm = findLatestTaskLlmBeforeSpan( + events, + taskRoot?.span.id, + typeof handoffTool?.metrics?.start === "number" + ? handoffTool.metrics.start + : undefined, + ); + const handoffToolParent = findSpanById( + events, + handoffTool?.span.parentIds[0], + ); + const bashTool = findAllSpans(events, "tool: Bash").find((event) => + isDescendantOf(events, event, taskRoot?.span.id), + ); + const nestedTaskLlm = findLatestTaskLlmBeforeSpan( + events, + nestedTask?.span.id, + typeof bashTool?.metrics?.start === "number" + ? bashTool.metrics.start + : undefined, + ); + const bashToolParent = findSpanById( + events, + bashTool?.span.parentIds[0], + ); + + expect(operation).toBeDefined(); + expect(taskRoot).toBeDefined(); + expect(nestedTask).toBeDefined(); + expect(handoffTool).toBeDefined(); + expect(handoffToolParent?.span.id).toBe(taskRoot?.span.id); + expect( + Number(taskRootLlm?.metrics?.start ?? Number.NaN), + ).toBeLessThanOrEqual( + Number(handoffTool?.metrics?.start ?? Number.NaN), + ); + expect(nestedTaskLlm).toBeDefined(); + expect(bashTool).toBeDefined(); + expect(isDescendantOf(events, bashTool, nestedTask?.span.id)).toBe( + true, + ); + expect(bashTool?.span.parentIds).not.toContain( + taskRootLlm?.span.id ?? "", + ); + expect(bashToolParent?.span.type).toBe("task"); + expect(bashToolParent?.span.id).toBe(nestedTask?.span.id); + expect( + Number(nestedTaskLlm?.metrics?.start ?? Number.NaN), + ).toBeLessThanOrEqual(Number(bashTool?.metrics?.start ?? Number.NaN)); + }, + ); + } + test("captures tool failure details", testConfig, () => { const operation = findLatestSpan( events, diff --git a/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.impl.mjs b/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.impl.mjs index 683918c11..4d565d2a0 100644 --- a/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.impl.mjs +++ b/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.impl.mjs @@ -7,6 +7,7 @@ import { import { z } from "zod"; const CLAUDE_AGENT_MODEL = "claude-haiku-4-5"; +const CLAUDE_AGENT_TOP_LEVEL_MODEL = "claude-sonnet-4-5"; export const ROOT_NAME = "claude-agent-sdk-root"; export const SCENARIO_NAME = "claude-agent-sdk-traces"; @@ -139,6 +140,33 @@ async function runClaudeAgentSDKScenario({ decorateSDK, sdk }) { }, ); + await runOperation( + "claude-agent-subagent-built-in-tool-operation", + "subagent-built-in-tool", + async () => { + await collectAsync( + query({ + prompt: + 'You MUST call the Agent tool now with subagent_type="echo" and description "echo greeting". Do not call Bash yourself. Do not answer with text yourself; delegate to the echo sub-agent.', + options: { + agents: { + echo: { + description: "Runs one bash echo and reports back.", + model: CLAUDE_AGENT_MODEL, + prompt: + "Run `echo hello` via Bash exactly once, then reply with only the word done.", + tools: ["Bash"], + }, + }, + allowedTools: ["Agent", "Bash"], + model: CLAUDE_AGENT_TOP_LEVEL_MODEL, + permissionMode: "bypassPermissions", + }, + }), + ); + }, + ); + await runOperation( "claude-agent-failure-operation", "failure", diff --git a/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.test.ts b/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.test.ts index bbc10b088..2f3900430 100644 --- a/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.test.ts +++ b/e2e/scenarios/claude-agent-sdk-instrumentation/scenario.test.ts @@ -9,7 +9,7 @@ import { defineClaudeAgentSDKInstrumentationAssertions } from "./assertions"; const scenarioDir = await prepareScenarioDir({ scenarioDir: resolveScenarioDir(import.meta.url), }); -const TIMEOUT_MS = 120_000; +const TIMEOUT_MS = 180_000; const claudeAgentSDKScenarios = await Promise.all( [ { diff --git a/js/src/instrumentation/plugins/claude-agent-sdk-plugin.ts b/js/src/instrumentation/plugins/claude-agent-sdk-plugin.ts index 9458b6127..bf8bcaa9c 100644 --- a/js/src/instrumentation/plugins/claude-agent-sdk-plugin.ts +++ b/js/src/instrumentation/plugins/claude-agent-sdk-plugin.ts @@ -38,7 +38,10 @@ type ParsedToolName = { rawToolName: string; toolName: string; }; -type ParentSpanResolver = (toolUseID: string) => Promise; +type ParentSpanResolver = ( + toolUseID: string, + context?: { agentId?: string; preferTaskSiblingParent?: boolean }, +) => Promise; type LLMSpanResult = { finalMessage: ClaudeConversationMessage | undefined; spanExport: string; @@ -62,6 +65,10 @@ function isSubAgentDelegationToolName(toolName: string): boolean { return toolName === "Agent" || toolName === "Task"; } +function shouldParentToolAsTaskSibling(toolName: string): boolean { + return toolName === "Agent" || toolName === "Task" || toolName === "Bash"; +} + function filterSerializableOptions( options: ClaudeAgentSDKQueryOptions, ): Record { @@ -188,6 +195,18 @@ function resolveTaskToolUseId( return undefined; } +function seedTaskToolUseIdMapping( + taskIdToToolUseId: Map, + message: ClaudeAgentSDKMessage, +): void { + if ( + typeof message.task_id === "string" && + typeof message.tool_use_id === "string" + ) { + taskIdToToolUseId.set(message.task_id, message.tool_use_id); + } +} + function extractUsageFromMessage( message: ClaudeAgentSDKMessage, ): Record { @@ -479,7 +498,10 @@ function createToolTracingHooks( }, }, name: parsed.displayName, - parent: await resolveParentSpan(toolUseID), + parent: await resolveParentSpan(toolUseID, { + agentId: input.agent_id, + preferTaskSiblingParent: shouldParentToolAsTaskSibling(parsed.toolName), + }), spanAttributes: { type: SpanTypeAttribute.TOOL }, }); @@ -869,6 +891,8 @@ function maybeTrackToolUseContext( state: QueryState, message: ClaudeAgentSDKMessage, ): void { + seedTaskToolUseIdMapping(state.taskIdToToolUseId, message); + if ( message.type !== "assistant" || !Array.isArray(message.message?.content) @@ -961,6 +985,45 @@ async function ensureSubAgentSpan( return subAgentSpan; } +async function ensureActiveLlmSpanForParentToolUse( + rootSpan: Span, + activeLlmSpansByParentToolUse: Map, + subAgentDetailsByToolUseId: Map, + activeToolSpans: Map, + subAgentSpans: Map, + parentToolUseId: string | null, + startTime: number, +): Promise { + const parentKey = llmParentKey(parentToolUseId); + const existingLlmSpan = activeLlmSpansByParentToolUse.get(parentKey); + if (existingLlmSpan) { + return existingLlmSpan; + } + + let llmParentSpan = await rootSpan.export(); + if (parentToolUseId) { + const subAgentSpan = await ensureSubAgentSpan( + subAgentDetailsByToolUseId, + rootSpan, + activeToolSpans, + subAgentSpans, + parentToolUseId, + ); + llmParentSpan = await subAgentSpan.export(); + } + + const llmSpan = startSpan({ + name: "anthropic.messages.create", + parent: llmParentSpan, + spanAttributes: { + type: SpanTypeAttribute.LLM, + }, + startTime, + }); + activeLlmSpansByParentToolUse.set(parentKey, llmSpan); + return llmSpan; +} + async function maybeHandleTaskLifecycleMessage( state: QueryState, message: ClaudeAgentSDKMessage, @@ -1098,30 +1161,15 @@ async function handleStreamMessage( if (message.type === "assistant" && message.message?.usage) { const parentToolUseId = message.parent_tool_use_id ?? null; - const parentKey = llmParentKey(parentToolUseId); - if (!state.activeLlmSpansByParentToolUse.has(parentKey)) { - let llmParentSpan = await state.span.export(); - if (parentToolUseId) { - const subAgentSpan = await ensureSubAgentSpan( - state.subAgentDetailsByToolUseId, - state.span, - state.activeToolSpans, - state.subAgentSpans, - parentToolUseId, - ); - llmParentSpan = await subAgentSpan.export(); - } - - const llmSpan = startSpan({ - name: "anthropic.messages.create", - parent: llmParentSpan, - spanAttributes: { - type: SpanTypeAttribute.LLM, - }, - startTime: state.currentMessageStartTime, - }); - state.activeLlmSpansByParentToolUse.set(parentKey, llmSpan); - } + await ensureActiveLlmSpanForParentToolUse( + state.span, + state.activeLlmSpansByParentToolUse, + state.subAgentDetailsByToolUseId, + state.activeToolSpans, + state.subAgentSpans, + parentToolUseId, + state.currentMessageStartTime, + ); state.currentMessages.push(message); } @@ -1311,10 +1359,47 @@ export class ClaudeAgentSDKPlugin extends BasePlugin { hasLocalToolHandlers; const resolveToolUseParentSpan: ParentSpanResolver = async ( toolUseID, + context, ) => { - const parentToolUseId = toolUseToParent.get(toolUseID) ?? null; + const trackedParentToolUseId = toolUseToParent.get(toolUseID); + const parentToolUseId = + trackedParentToolUseId ?? + (context?.agentId + ? (taskIdToToolUseId.get(context.agentId) ?? null) + : null); const parentKey = llmParentKey(parentToolUseId); const activeLlmSpan = activeLlmSpansByParentToolUse.get(parentKey); + + if (context?.preferTaskSiblingParent) { + // Built-in Claude tools should be siblings of the driving LLM turn, + // but we still materialize that LLM span first so trace ordering + // reflects that the tool call was produced by the model. + if (!activeLlmSpan) { + await ensureActiveLlmSpanForParentToolUse( + span, + activeLlmSpansByParentToolUse, + subAgentDetailsByToolUseId, + activeToolSpans, + subAgentSpans, + parentToolUseId, + getCurrentUnixTimestamp(), + ); + } + + if (parentToolUseId) { + const subAgentSpan = await ensureSubAgentSpan( + subAgentDetailsByToolUseId, + span, + activeToolSpans, + subAgentSpans, + parentToolUseId, + ); + return subAgentSpan.export(); + } + + return span.export(); + } + if (activeLlmSpan) { return activeLlmSpan.export(); } diff --git a/js/src/vendor-sdk-types/claude-agent-sdk.ts b/js/src/vendor-sdk-types/claude-agent-sdk.ts index 48ebb5beb..0f7e80ce5 100644 --- a/js/src/vendor-sdk-types/claude-agent-sdk.ts +++ b/js/src/vendor-sdk-types/claude-agent-sdk.ts @@ -23,6 +23,7 @@ interface BaseHookInput { session_id: string; transcript_path: string; cwd: string; + agent_id?: string; permission_mode?: string; }