diff --git a/.agents/skills/nemoclaw-user-reference/references/commands.md b/.agents/skills/nemoclaw-user-reference/references/commands.md index b6f96b32ae..d959470c32 100644 --- a/.agents/skills/nemoclaw-user-reference/references/commands.md +++ b/.agents/skills/nemoclaw-user-reference/references/commands.md @@ -1299,6 +1299,21 @@ These flags toggle optional behaviors during onboarding; set them before running | `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. | | `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. | +### Onboard Profiling Traces + +Set `NEMOCLAW_TRACE=1` before `nemoclaw onboard` to write an OpenTelemetry-style JSON trace for the run. +When no explicit path is provided, NemoClaw writes a timestamped file under `.e2e/traces/` in the current working directory. +Use `NEMOCLAW_TRACE_DIR` to choose the output directory, or `NEMOCLAW_TRACE_FILE` to choose the exact output file. + +```console +$ NEMOCLAW_TRACE=1 nemoclaw onboard +$ NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces nemoclaw onboard +$ NEMOCLAW_TRACE_FILE=/tmp/nemoclaw-onboard-trace.json nemoclaw onboard +``` + +Trace artifacts include onboard phase timing, sandbox and dashboard readiness waits, policy application, inference validation probes, curl probe results, and sandbox build progress events. +Secret-like metadata such as API keys, bearer tokens, cookies, and credentials is redacted before the file is written. + ### Probe Timeouts These tune how long internal probes wait before giving up. diff --git a/.github/workflows/e2e-branch-validation.yaml b/.github/workflows/e2e-branch-validation.yaml index 0a35f70b1d..c041da7006 100644 --- a/.github/workflows/e2e-branch-validation.yaml +++ b/.github/workflows/e2e-branch-validation.yaml @@ -322,6 +322,7 @@ jobs: 'set +e mkdir -p /tmp/nc-debug cp /tmp/nemoclaw-onboard.log /tmp/nc-debug/ 2>/dev/null || true + cp -R /tmp/nemoclaw-traces /tmp/nc-debug/traces 2>/dev/null || true timeout 15s openshell sandbox list > /tmp/nc-debug/sandbox-list.txt 2>&1 timeout 15s openshell gateway status > /tmp/nc-debug/gateway-status.txt 2>&1 timeout 15s docker ps -a > /tmp/nc-debug/docker-ps.txt 2>&1 diff --git a/.github/workflows/regression-e2e.yaml b/.github/workflows/regression-e2e.yaml index 29409fb1bd..8a6574e155 100644 --- a/.github/workflows/regression-e2e.yaml +++ b/.github/workflows/regression-e2e.yaml @@ -227,6 +227,7 @@ jobs: env: NEMOCLAW_NON_INTERACTIVE: "1" NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces run: bash test/e2e/test-onboard-inference-smoke.sh - name: Upload onboard inference smoke logs on failure @@ -239,6 +240,14 @@ jobs: /tmp/nemoclaw-e2e-onboard-inference-smoke-node.log if-no-files-found: ignore + - name: Upload onboard profiling traces + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: onboard-inference-smoke-traces + path: /tmp/nemoclaw-traces/ + if-no-files-found: ignore + # ── Gateway drift preflight E2E ───────────────────────────── # Coverage guard for #3399 / #3423. A stale OpenShell gateway image can # make sandbox-state RPCs fail with protobuf invalid-wire decode errors. diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx index f891cb9e99..f1aa836283 100644 --- a/docs/reference/commands.mdx +++ b/docs/reference/commands.mdx @@ -1307,6 +1307,21 @@ These flags toggle optional behaviors during onboarding; set them before running | `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. | | `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. | +### Onboard Profiling Traces + +Set `NEMOCLAW_TRACE=1` before `nemoclaw onboard` to write an OpenTelemetry-style JSON trace for the run. +When no explicit path is provided, NemoClaw writes a timestamped file under `.e2e/traces/` in the current working directory. +Use `NEMOCLAW_TRACE_DIR` to choose the output directory, or `NEMOCLAW_TRACE_FILE` to choose the exact output file. + +```console +$ NEMOCLAW_TRACE=1 nemoclaw onboard +$ NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces nemoclaw onboard +$ NEMOCLAW_TRACE_FILE=/tmp/nemoclaw-onboard-trace.json nemoclaw onboard +``` + +Trace artifacts include onboard phase timing, sandbox and dashboard readiness waits, policy application, inference validation probes, curl probe results, and sandbox build progress events. +Secret-like metadata such as API keys, bearer tokens, cookies, and credentials is redacted before the file is written. + ### Probe Timeouts These tune how long internal probes wait before giving up. diff --git a/src/lib/adapters/http/probe.test.ts b/src/lib/adapters/http/probe.test.ts index 0669c8181e..ad07ed6f55 100644 --- a/src/lib/adapters/http/probe.test.ts +++ b/src/lib/adapters/http/probe.test.ts @@ -2,9 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it } from "vitest"; +import { flushTrace, resetTraceForTests, TRACE_FILE_ENV, type TraceArtifact } from "../../trace"; import { getCurlTimingArgs, runChatCompletionsStreamingProbe, @@ -15,6 +17,19 @@ import { summarizeProbeFailure, } from "./probe"; +function withTraceFile(fn: (traceFile: string) => T): T { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-probe-trace-test-")); + const traceFile = path.join(tmpDir, "trace.json"); + process.env[TRACE_FILE_ENV] = traceFile; + resetTraceForTests(); + return fn(traceFile); +} + +afterEach(() => { + delete process.env[TRACE_FILE_ENV]; + resetTraceForTests(); +}); + describe("http-probe helpers", () => { it("returns explicit curl timeouts", () => { expect(getCurlTimingArgs()).toEqual(["--connect-timeout", "10", "--max-time", "60"]); @@ -157,6 +172,27 @@ describe("runChatCompletionsStreamingProbe", () => { expect(result.ok).toBe(false); expect(result.message).toContain("did not return SSE data"); }); + + it("records curl_result metadata for chat streaming probes", () => { + withTraceFile((traceFile) => { + const result = runChatCompletionsStreamingProbe( + ["-sS", "--max-time", "120", "https://example.test/v1/chat/completions"], + { spawnSyncImpl: mockStreaming("", 28, "200") }, + ); + + expect(result.ok).toBe(false); + flushTrace(); + const artifact = JSON.parse(fs.readFileSync(traceFile, "utf8")) as TraceArtifact; + const span = artifact.resource_spans[0].scope_spans[0].spans.find( + (entry) => entry.name === "nemoclaw.inference.curl_streaming_probe", + ); + expect(span?.events[0].attributes).toMatchObject({ + ok: false, + http_status: 200, + curl_status: 28, + }); + }); + }); }); describe("runStreamingEventProbe", () => { @@ -307,4 +343,25 @@ describe("runStreamingEventProbe", () => { expect(fs.existsSync(outputPath)).toBe(false); expect(fs.existsSync(path.dirname(outputPath))).toBe(false); }); + + it("records curl_result metadata for responses streaming probes", () => { + withTraceFile((traceFile) => { + const result = runStreamingEventProbe( + ["-sS", "--max-time", "15", "https://example.test/v1/responses"], + { spawnSyncImpl: mockStreaming("event: response.created\ndata: {}\n") }, + ); + + expect(result.ok).toBe(false); + flushTrace(); + const artifact = JSON.parse(fs.readFileSync(traceFile, "utf8")) as TraceArtifact; + const span = artifact.resource_spans[0].scope_spans[0].spans.find( + (entry) => entry.name === "nemoclaw.inference.curl_streaming_event_probe", + ); + expect(span?.events[0].attributes).toMatchObject({ + ok: false, + missing_events_count: 1, + curl_status: 0, + }); + }); + }); }); diff --git a/src/lib/adapters/http/probe.ts b/src/lib/adapters/http/probe.ts index dcc08b187d..e787e0dcad 100644 --- a/src/lib/adapters/http/probe.ts +++ b/src/lib/adapters/http/probe.ts @@ -13,6 +13,7 @@ import { isErrnoException } from "../../core/errno"; import { compactText } from "../../core/url-utils"; import type { ProbeResult } from "../../onboard/types"; import { ROOT } from "../../state/paths"; +import { addTraceEvent, withTraceSpan } from "../../trace"; export type CurlProbeResult = ProbeResult; @@ -68,6 +69,37 @@ export function getCurlTimingArgs(): string[] { return ["--connect-timeout", "10", "--max-time", "60"]; } +function sanitizeCurlUrl(value: string): string { + try { + const url = new URL(value); + url.username = ""; + url.password = ""; + for (const key of [...url.searchParams.keys()]) { + url.searchParams.set(key, ""); + } + url.hash = ""; + return url.toString(); + } catch { + return value.replace(/(Bearer\s+)\S+/gi, "$1"); + } +} + +function getCurlProbeTraceAttributes(argv: string[], opts: CurlProbeOptions): Record { + const url = argv.at(-1) || ""; + const methodIndex = argv.findIndex((arg) => arg === "-X" || arg === "--request"); + const method = + methodIndex >= 0 && argv[methodIndex + 1] ? argv[methodIndex + 1].toUpperCase() : "POST"; + return { + "http.url": sanitizeCurlUrl(String(url)), + "http.request.method": method, + "process.timeout_ms": opts.timeoutMs ?? 30_000, + }; +} + +function emitCurlResultTraceEvent(attributes: Record): void { + addTraceEvent("curl_result", attributes); +} + export function summarizeCurlFailure(curlStatus = 0, stderr = "", body = ""): string { const detail = compactText(stderr || body); return detail @@ -130,6 +162,12 @@ export function summarizeProbeFailure(body = "", status = 0, curlStatus = 0, std } export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlProbeResult { + return withTraceSpan("nemoclaw.inference.curl_probe", getCurlProbeTraceAttributes(argv, opts), () => + runCurlProbeImpl(argv, opts), + ); +} + +function runCurlProbeImpl(argv: string[], opts: CurlProbeOptions = {}): CurlProbeResult { const bodyFile = secureTempFile("nemoclaw-curl-probe", ".json"); try { const args = [...argv]; @@ -154,7 +192,7 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP const errorMessage = compactText( `${result.error.message || String(result.error)} ${String(result.stderr || "")}`, ); - return { + const failure = { ok: false, httpStatus: 0, curlStatus: errorCode, @@ -162,9 +200,11 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP stderr: errorMessage, message: summarizeProbeFailure(body, 0, errorCode, errorMessage), }; + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: errorCode }); + return failure; } const status = Number(String(result.stdout || "").trim()); - return { + const probeResult = { ok: result.status === 0 && status >= 200 && status < 300, httpStatus: Number.isFinite(status) ? status : 0, curlStatus: result.status || 0, @@ -177,9 +217,15 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP String(result.stderr || ""), ), }; + emitCurlResultTraceEvent({ + ok: probeResult.ok, + http_status: probeResult.httpStatus, + curl_status: probeResult.curlStatus, + }); + return probeResult; } catch (error) { const detail = error instanceof Error ? error.message : String(error); - return { + const probeResult = { ok: false, httpStatus: 0, curlStatus: @@ -191,6 +237,8 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP detail, ), }; + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: probeResult.curlStatus }); + return probeResult; } finally { cleanupTempDir(bodyFile, "nemoclaw-curl-probe"); } @@ -218,6 +266,17 @@ function hasChatCompletionsStreamingData(body: string): boolean { export function runChatCompletionsStreamingProbe( argv: string[], opts: CurlProbeOptions = {}, +): CurlProbeResult { + return withTraceSpan( + "nemoclaw.inference.curl_streaming_probe", + getCurlProbeTraceAttributes(argv, opts), + () => runChatCompletionsStreamingProbeImpl(argv, opts), + ); +} + +function runChatCompletionsStreamingProbeImpl( + argv: string[], + opts: CurlProbeOptions = {}, ): CurlProbeResult { const bodyFile = secureTempFile("nemoclaw-chat-streaming-probe", ".sse"); try { @@ -247,6 +306,7 @@ export function runChatCompletionsStreamingProbe( const errorMessage = compactText( `${result.error.message || String(result.error)} ${String(result.stderr || "")}`, ); + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: errorCode }); return { ok: false, httpStatus: 0, @@ -262,6 +322,7 @@ export function runChatCompletionsStreamingProbe( const hasStreamingData = hasChatCompletionsStreamingData(body); const httpOk = Number.isFinite(status) && status >= 200 && status < 300; if (httpOk && hasStreamingData && (curlStatus === 0 || curlStatus === 28)) { + emitCurlResultTraceEvent({ ok: true, http_status: status, curl_status: curlStatus }); return { ok: true, httpStatus: status, @@ -276,6 +337,11 @@ export function runChatCompletionsStreamingProbe( httpOk && !hasStreamingData ? `HTTP ${status}: chat completions stream did not return SSE data` : summarizeProbeFailure(body, status || 0, curlStatus, String(result.stderr || "")); + emitCurlResultTraceEvent({ + ok: false, + http_status: Number.isFinite(status) ? status : 0, + curl_status: curlStatus, + }); return { ok: false, httpStatus: Number.isFinite(status) ? status : 0, @@ -286,17 +352,16 @@ export function runChatCompletionsStreamingProbe( }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); + const curlStatus = + typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1; + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: curlStatus }); return { ok: false, httpStatus: 0, - curlStatus: - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, + curlStatus, body: "", stderr: detail, - message: summarizeCurlFailure( - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, - detail, - ), + message: summarizeCurlFailure(curlStatus, detail), }; } finally { cleanupTempDir(bodyFile, "nemoclaw-chat-streaming-probe"); @@ -323,6 +388,17 @@ const REQUIRED_STREAMING_EVENTS = ["response.output_text.delta"]; export function runStreamingEventProbe( argv: string[], opts: CurlProbeOptions = {}, +): StreamingProbeResult { + return withTraceSpan( + "nemoclaw.inference.curl_streaming_event_probe", + getCurlProbeTraceAttributes(argv, opts), + () => runStreamingEventProbeImpl(argv, opts), + ); +} + +function runStreamingEventProbeImpl( + argv: string[], + opts: CurlProbeOptions = {}, ): StreamingProbeResult { const bodyFile = secureTempFile("nemoclaw-streaming-probe", ".sse"); try { @@ -347,6 +423,11 @@ export function runStreamingEventProbe( const detail = result.error ? String(result.error.message || result.error) : String(result.stderr || ""); + emitCurlResultTraceEvent({ + ok: false, + missing_events_count: REQUIRED_STREAMING_EVENTS.length, + curl_status: result.status ?? 1, + }); return { ok: false, missingEvents: REQUIRED_STREAMING_EVENTS, @@ -366,6 +447,11 @@ export function runStreamingEventProbe( const missing = REQUIRED_STREAMING_EVENTS.filter((e) => !eventTypes.has(e)); if (missing.length > 0) { + emitCurlResultTraceEvent({ + ok: false, + missing_events_count: missing.length, + curl_status: result.status ?? 0, + }); return { ok: false, missingEvents: missing, @@ -375,9 +461,17 @@ export function runStreamingEventProbe( }; } + emitCurlResultTraceEvent({ ok: true, missing_events_count: 0, curl_status: result.status ?? 0 }); return { ok: true, missingEvents: [], message: "" }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); + const curlStatus = + typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1; + emitCurlResultTraceEvent({ + ok: false, + missing_events_count: REQUIRED_STREAMING_EVENTS.length, + curl_status: curlStatus, + }); return { ok: false, missingEvents: REQUIRED_STREAMING_EVENTS, diff --git a/src/lib/inference/onboard-probes.ts b/src/lib/inference/onboard-probes.ts index ee3248312d..a86a1afb95 100644 --- a/src/lib/inference/onboard-probes.ts +++ b/src/lib/inference/onboard-probes.ts @@ -24,6 +24,7 @@ const { runChatCompletionsStreamingProbe, runStreamingEventProbe, } = httpProbe; +const trace = require("../trace"); // ── Helpers ────────────────────────────────────────────────────── @@ -232,16 +233,37 @@ function isCurlTimeout(result) { } function executeProbeWithHttpRetry(probe) { - let result = probe.execute(); - for (const delayMs of HTTP_PROBE_RETRY_DELAYS_MS) { - if (!shouldRetryHttpProbe(result)) break; - console.log( - ` ${probe.name} validation returned HTTP ${result.httpStatus}; retrying in ${Math.round(delayMs / 1000)}s...`, - ); - sleepSync(delayMs); - result = probe.execute(); - } - return result; + return trace.withTraceSpan( + "nemoclaw.inference.validation_probe", + { probe_name: probe.name, api: probe.api || null }, + () => { + let attempt = 1; + let result = probe.execute(); + trace.addTraceEvent("probe_result", { + attempt, + ok: result.ok, + http_status: result.httpStatus, + curl_status: result.curlStatus, + }); + for (const delayMs of HTTP_PROBE_RETRY_DELAYS_MS) { + if (!shouldRetryHttpProbe(result)) break; + console.log( + ` ${probe.name} validation returned HTTP ${result.httpStatus}; retrying in ${Math.round(delayMs / 1000)}s...`, + ); + trace.addTraceEvent("probe_retry_sleep", { delay_ms: delayMs, http_status: result.httpStatus }); + sleepSync(delayMs); + attempt += 1; + result = probe.execute(); + trace.addTraceEvent("probe_result", { + attempt, + ok: result.ok, + http_status: result.httpStatus, + curl_status: result.curlStatus, + }); + } + return result; + }, + ); } // ── Responses API probe ────────────────────────────────────────── diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 2a4706f61f..c6488b7e07 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -82,6 +82,8 @@ const { }: typeof import("./onboard/gateway-gpu-passthrough") = require("./onboard/gateway-gpu-passthrough"); const { syncPresetSelection }: typeof import("./onboard/policy-preset-sync") = require("./onboard/policy-preset-sync"); const { maybeForceE2eStepFailure }: typeof import("./onboard/e2e-failure-injection") = require("./onboard/e2e-failure-injection"); +const onboardTracing: typeof import("./onboard/tracing") = require("./onboard/tracing"); +const sandboxReadinessTracing: typeof import("./onboard/sandbox-readiness-tracing") = require("./onboard/sandbox-readiness-tracing"); const { gatherWechatConfig, hasWechatConfigDrift, @@ -3799,6 +3801,7 @@ async function createSandbox( return false; }, failureCheck: dockerGpuCreatePatch.createFailureMessage, + traceEvent: onboardTracing.addTraceEvent, }); if (initialSandboxPolicy.cleanup && initialSandboxPolicy.cleanup()) { @@ -3848,16 +3851,13 @@ async function createSandbox( // without this gate, NemoClaw registers a phantom sandbox that // causes "sandbox not found" on every subsequent connect/status call. console.log(" Waiting for sandbox to become ready..."); - let ready = false; - const readyAttempts = Math.max(1, Math.ceil(sandboxReadyTimeoutSecs / 2)); - for (let i = 0; i < readyAttempts; i++) { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) { - ready = true; - break; - } - if (i < readyAttempts - 1) sleepSeconds(2); - } + const ready = sandboxReadinessTracing.waitForCreatedSandboxReadyWithTrace({ + sandboxName, + timeoutSecs: sandboxReadyTimeoutSecs, + runCaptureOpenshell, + isSandboxReady, + sleep: sleepSeconds, + }); const restoreBackupPath = pendingStateRestore?.manifest?.backupPath ?? pendingStateRestoreBackupPath; @@ -3910,23 +3910,12 @@ async function createSandbox( // Probes /health endpoint and accepts 200 or 401 (device auth) as "alive". // Previously used `curl -sf` which failed on 401, causing false negatives. Fixes #2342. console.log(" Waiting for NemoClaw dashboard to become ready..."); - for (let i = 0; i < 15; i++) { - const readyOutput = runCaptureOpenshell( - ["sandbox", "exec", "-n", sandboxName, "--", "curl", "-so", "/dev/null", "-w", "%{http_code}", - "--max-time", "3", `http://localhost:${effectiveDashboardPort}/health`], - { ignoreError: true }, - ); - const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; - if (readyCode === 200 || readyCode === 401) { - console.log(" ✓ Dashboard is live"); - break; - } - if (i === 14) { - console.warn(" Dashboard taking longer than expected to start. Continuing..."); - } else { - sleepSeconds(2); - } - } + sandboxReadinessTracing.waitForDashboardReadyWithTrace({ + sandboxName, + port: effectiveDashboardPort, + runCaptureOpenshell, + sleep: sleepSeconds, + }); if (effectiveSandboxGpuConfig.sandboxGpuEnabled) { try { @@ -6836,7 +6825,10 @@ async function onboard(opts: OnboardOptions = {}): Promise { }; process.once("exit", releaseOnboardLock); + let onboardTrace: ReturnType = { collector: null, span: null }; + let traceCompleted = false; try { + onboardTrace = onboardTracing.startOnboardTrace(opts, process.env); let session: Session | null; let selectedMessagingChannels: string[] = []; // Merged, absolute fromDockerfile: explicit flag/env takes precedence; on @@ -7406,9 +7398,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { log: (message) => console.log(message), }, }); + traceCompleted = true; } finally { releaseOnboardLock(); onboardRuntimeBoundary.clear(); + onboardTracing.finishOnboardTrace(onboardTrace, traceCompleted); } } diff --git a/src/lib/onboard/gateway-http-readiness.ts b/src/lib/onboard/gateway-http-readiness.ts index 4647e9cb7c..bec301c289 100644 --- a/src/lib/onboard/gateway-http-readiness.ts +++ b/src/lib/onboard/gateway-http-readiness.ts @@ -16,6 +16,7 @@ import http2 from "node:http2"; import { getGatewayHttpEndpoint } from "../core/gateway-address"; import { GATEWAY_PORT } from "../core/ports"; import { sleepSeconds } from "../core/wait"; +import { addTraceEvent, withTraceSpan } from "../trace"; import { envInt } from "./env"; /** @@ -73,6 +74,18 @@ export function isGatewayHttpReady( timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/`, method: "GET" | "POST" = "GET", +): Promise { + return withTraceSpan( + "nemoclaw.gateway.http_probe", + { timeout_ms: timeoutMs, url, method }, + () => isGatewayHttpReadyImpl(timeoutMs, url, method), + ); +} + +function isGatewayHttpReadyImpl( + timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, + url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/`, + method: "GET" | "POST" = "GET", ): Promise { const effectiveTimeout = Number.isFinite(timeoutMs) && timeoutMs > 0 @@ -103,6 +116,17 @@ export function isGatewayHttpReady( export function isDockerDriverGatewayHttpReady( timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/openshell.v1.OpenShell/Health`, +): Promise { + return withTraceSpan( + "nemoclaw.gateway.docker_driver_http_probe", + { timeout_ms: timeoutMs, url }, + () => isDockerDriverGatewayHttpReadyImpl(timeoutMs, url), + ); +} + +function isDockerDriverGatewayHttpReadyImpl( + timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, + url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/openshell.v1.OpenShell/Health`, ): Promise { const effectiveTimeout = Number.isFinite(timeoutMs) && timeoutMs > 0 @@ -205,33 +229,43 @@ export function isDockerDriverGatewayHttpReady( export async function waitForGatewayHttpReady( opts: WaitForGatewayHttpReadyOpts = {}, ): Promise { - const probe = opts.probe ?? (() => isGatewayHttpReady()); - const sleeper = opts.sleeper ?? sleepSeconds; - const config = getGatewayReuseHealthWaitConfig(); - // Always probe at least once, even if the caller passed a non-positive - // maxAttempts. Non-finite (NaN, Infinity) values fall back to safe defaults - // — Math.max alone would let Infinity through and hang the loop, and NaN - // would propagate into sleeper(). - const rawAttempts = opts.maxAttempts ?? config.count; - const maxAttempts = Number.isFinite(rawAttempts) ? Math.max(1, Math.round(rawAttempts)) : 1; - const rawInterval = opts.intervalSeconds ?? config.interval; - const intervalSeconds = Number.isFinite(rawInterval) ? Math.max(0, rawInterval) : 0; - - // The default probe (isGatewayHttpReady) never rejects, but injected probes - // can. Treat a rejection as "not ready this attempt" so we exhaust the - // budget instead of bailing on the first transient failure. - const safeProbe = async (): Promise => { - try { - return await probe(); - } catch { - return false; - } - }; + return withTraceSpan("nemoclaw.gateway.http_readiness_wait", {}, async () => { + const probe = opts.probe ?? (() => isGatewayHttpReady()); + const sleeper = opts.sleeper ?? sleepSeconds; + const config = getGatewayReuseHealthWaitConfig(); + // Always probe at least once, even if the caller passed a non-positive + // maxAttempts. Non-finite (NaN, Infinity) values fall back to safe defaults + // — Math.max alone would let Infinity through and hang the loop, and NaN + // would propagate into sleeper(). + const rawAttempts = opts.maxAttempts ?? config.count; + const maxAttempts = Number.isFinite(rawAttempts) ? Math.max(1, Math.round(rawAttempts)) : 1; + const rawInterval = opts.intervalSeconds ?? config.interval; + const intervalSeconds = Number.isFinite(rawInterval) ? Math.max(0, rawInterval) : 0; + addTraceEvent("wait_config", { max_attempts: maxAttempts, interval_seconds: intervalSeconds }); - if (await safeProbe()) return true; - for (let attempt = 1; attempt < maxAttempts; attempt++) { - sleeper(intervalSeconds); - if (await safeProbe()) return true; - } - return false; + // The default probe (isGatewayHttpReady) never rejects, but injected probes + // can. Treat a rejection as "not ready this attempt" so we exhaust the + // budget instead of bailing on the first transient failure. + const safeProbe = async (): Promise => { + try { + return await probe(); + } catch { + return false; + } + }; + + if (await safeProbe()) { + addTraceEvent("ready", { attempt: 1 }); + return true; + } + for (let attempt = 1; attempt < maxAttempts; attempt++) { + sleeper(intervalSeconds); + if (await safeProbe()) { + addTraceEvent("ready", { attempt: attempt + 1 }); + return true; + } + } + addTraceEvent("not_ready", { attempts: maxAttempts }); + return false; + }); } diff --git a/src/lib/onboard/gateway-tcp-readiness.ts b/src/lib/onboard/gateway-tcp-readiness.ts index f0a01e3ec1..0a8a0e3bce 100644 --- a/src/lib/onboard/gateway-tcp-readiness.ts +++ b/src/lib/onboard/gateway-tcp-readiness.ts @@ -52,6 +52,7 @@ import net from "node:net"; import { getGatewayConnectHost } from "../core/gateway-address"; import { GATEWAY_PORT } from "../core/ports"; +import { withTraceSpan } from "../trace"; const ISGATEWAY_TCP_READY_DEFAULT_TIMEOUT_MS = 500; @@ -83,6 +84,18 @@ export function isGatewayTcpReady( port: number = GATEWAY_PORT, timeoutMs: number = ISGATEWAY_TCP_READY_DEFAULT_TIMEOUT_MS, host = getGatewayConnectHost(), +): Promise { + return withTraceSpan( + "nemoclaw.gateway.tcp_probe", + { host, port, timeout_ms: timeoutMs }, + () => isGatewayTcpReadyImpl(port, timeoutMs, host), + ); +} + +function isGatewayTcpReadyImpl( + port: number = GATEWAY_PORT, + timeoutMs: number = ISGATEWAY_TCP_READY_DEFAULT_TIMEOUT_MS, + host = getGatewayConnectHost(), ): Promise { const effectiveTimeout = Number.isFinite(timeoutMs) && timeoutMs > ISGATEWAY_TCP_READY_MIN_TIMEOUT_MS diff --git a/src/lib/onboard/machine/handlers/gateway.ts b/src/lib/onboard/machine/handlers/gateway.ts index b7c9d93e55..78009aae2e 100644 --- a/src/lib/onboard/machine/handlers/gateway.ts +++ b/src/lib/onboard/machine/handlers/gateway.ts @@ -2,9 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 import type { NvidiaPlatform } from "../../../inference/nim"; -import type { GatewayContainerState } from "../../gateway-container-running"; -import type { Session } from "../../../state/onboard-session"; import type { GatewayReuseState } from "../../../state/gateway"; +import type { Session } from "../../../state/onboard-session"; +import type { GatewayContainerState } from "../../gateway-container-running"; +import { withGatewayTrace } from "../../tracing"; export interface GatewayStateOptions { resume: boolean; @@ -176,7 +177,9 @@ export async function handleGatewayState({ gatewayReuseState = "missing"; } await deps.startRecordedStep("gateway"); - await deps.startGateway(gpu, { gpuPassthrough }); + await withGatewayTrace(gatewayReuseState, gpuPassthrough, () => + deps.startGateway(gpu, { gpuPassthrough }), + ); session = await deps.recordStepComplete("gateway"); } diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts index 80a648c204..5d26c5c9d3 100644 --- a/src/lib/onboard/machine/handlers/preflight.ts +++ b/src/lib/onboard/machine/handlers/preflight.ts @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 import type { Session } from "../../../state/onboard-session"; +import { withPreflightTrace } from "../../tracing"; export type PreflightSandboxGpuFlag = "enable" | "disable" | null; @@ -128,7 +129,7 @@ export async function handlePreflightState< ); } else { await deps.startRecordedStep("preflight"); - gpu = await deps.runPreflight({ optedOutGpuPassthrough: noGpu }); + gpu = await withPreflightTrace(() => deps.runPreflight({ optedOutGpuPassthrough: noGpu })); session = await deps.recordStepComplete("preflight"); } diff --git a/src/lib/onboard/machine/handlers/provider-inference.ts b/src/lib/onboard/machine/handlers/provider-inference.ts index 139b643cf2..f84d682943 100644 --- a/src/lib/onboard/machine/handlers/provider-inference.ts +++ b/src/lib/onboard/machine/handlers/provider-inference.ts @@ -3,6 +3,7 @@ import type { WebSearchConfig } from "../../../inference/web-search"; import type { Session, SessionUpdates } from "../../../state/onboard-session"; +import { withInferenceTrace, withProviderSelectionTrace } from "../../tracing"; export type ProviderInferenceRetry = { retry: "selection" } | { ok: true; retry?: undefined }; @@ -211,7 +212,11 @@ export async function handleProviderInferenceState({ } } else { await deps.startRecordedStep("provider_selection"); - const selection = await deps.setupNim(gpu, sandboxName, agent); + const selection = await withProviderSelectionTrace( + sandboxName, + (agent as { name?: string } | null)?.name, + () => deps.setupNim(gpu, sandboxName, agent), + ); model = selection.model; provider = selection.provider; endpointUrl = selection.endpointUrl; @@ -224,8 +229,10 @@ export async function handleProviderInferenceState({ } const selected = requireSelection(provider, model, deps); - provider = selected.provider; - model = selected.model; + const selectedProvider = selected.provider; + const selectedModel = selected.model; + provider = selectedProvider; + model = selectedModel; if (shouldRecordProviderSelection) { session = await deps.recordStepComplete( "provider_selection", @@ -254,15 +261,23 @@ export async function handleProviderInferenceState({ let inferenceResult: ProviderInferenceRetry; try { if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent); + const confirmedSandboxName = sandboxName; await deps.startRecordedStep("inference", { provider, model }); - inferenceResult = await deps.setupInference( - sandboxName, - model, - provider, - endpointUrl, + inferenceResult = await withInferenceTrace( + confirmedSandboxName, + selectedProvider, + selectedModel, credentialEnv, - hermesAuthMethod, - hermesToolGateways, + () => + deps.setupInference( + confirmedSandboxName, + selectedModel, + selectedProvider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + ), ); } finally { clearStagedCredentialEnv(deps, credentialEnv); @@ -302,6 +317,7 @@ export async function handleProviderInferenceState({ let inferenceResult: ProviderInferenceRetry; try { if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent); + const confirmedSandboxName = sandboxName; const buildEstimateNote = env.NEMOCLAW_IGNORE_RUNTIME_RESOURCES === "1" ? null @@ -315,7 +331,7 @@ export async function handleProviderInferenceState({ webSearchConfig, hermesToolGateways, enabledChannels: selectedMessagingChannels.length > 0 ? selectedMessagingChannels : null, - sandboxName, + sandboxName: confirmedSandboxName, notes: buildEstimateNote ? [buildEstimateNote] : [], }), ); @@ -330,14 +346,21 @@ export async function handleProviderInferenceState({ } await deps.startRecordedStep("inference", { provider, model }); - inferenceResult = await deps.setupInference( - sandboxName, - model, - provider, - endpointUrl, + inferenceResult = await withInferenceTrace( + confirmedSandboxName, + selectedProvider, + selectedModel, credentialEnv, - hermesAuthMethod, - hermesToolGateways, + () => + deps.setupInference( + confirmedSandboxName, + selectedModel, + selectedProvider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + ), ); } finally { clearStagedCredentialEnv(deps, credentialEnv); diff --git a/src/lib/onboard/machine/handlers/sandbox.ts b/src/lib/onboard/machine/handlers/sandbox.ts index 032fc74276..efa5cf0adb 100644 --- a/src/lib/onboard/machine/handlers/sandbox.ts +++ b/src/lib/onboard/machine/handlers/sandbox.ts @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 import type { Session, SessionUpdates } from "../../../state/onboard-session"; +import { withSandboxPhaseTrace } from "../../tracing"; export interface SandboxStateOptions { resume: boolean; @@ -275,22 +276,30 @@ export async function handleSandboxState + deps.createSandbox( + gpu, + model, + provider, + preferredInferenceApi, + confirmedSandboxName, + nextWebSearchConfig, + selectedMessagingChannels, + fromDockerfile, + agent, + controlUiPort, + sandboxGpuConfig, + resourceProfile, + hermesToolGateways, + ), ); webSearchConfig = nextWebSearchConfig; deps.updateSandboxRegistry(sandboxName, { diff --git a/src/lib/onboard/policy-selection.ts b/src/lib/onboard/policy-selection.ts index c2e7755ea3..9c45b2b192 100644 --- a/src/lib/onboard/policy-selection.ts +++ b/src/lib/onboard/policy-selection.ts @@ -13,6 +13,7 @@ import { pruneDisabledMessagingPolicyPresets, requiredMessagingChannelPolicyPresets, } from "./messaging-policy-presets"; +import { withPolicyApplicationTrace } from "./tracing"; type Preset = { name: string; access?: string }; type SupportOptions = { webSearchSupported?: boolean | null }; @@ -231,6 +232,16 @@ export async function setupPoliciesWithSelection( deps: SetupPolicySelectionDeps, sandboxName: string, options: SetupPolicySelectionOptions = {}, +): Promise { + return withPolicyApplicationTrace(sandboxName, options, () => + setupPoliciesWithSelectionInner(deps, sandboxName, options), + ); +} + +async function setupPoliciesWithSelectionInner( + deps: SetupPolicySelectionDeps, + sandboxName: string, + options: SetupPolicySelectionOptions = {}, ): Promise { const selectedPresets = Array.isArray(options.selectedPresets) ? options.selectedPresets : null; const onSelection = typeof options.onSelection === "function" ? options.onSelection : null; diff --git a/src/lib/onboard/sandbox-readiness-tracing.ts b/src/lib/onboard/sandbox-readiness-tracing.ts new file mode 100644 index 0000000000..33d49ef8b9 --- /dev/null +++ b/src/lib/onboard/sandbox-readiness-tracing.ts @@ -0,0 +1,133 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { addTraceEvent, withDashboardReadinessTrace, withSandboxReadinessTrace } from "./tracing"; + +type RunCaptureOpenshell = ( + args: string[], + options?: { ignoreError?: boolean }, +) => string; + +export function waitForSandboxReadyWithTrace(options: { + sandboxName: string; + attempts: number; + delaySeconds: number; + runCaptureOpenshell: RunCaptureOpenshell; + isSandboxReady: (output: string, sandboxName: string) => boolean; + isLinuxDockerDriverGatewayEnabled: () => boolean; + sleep: (seconds: number) => void; +}): boolean { + const { + sandboxName, + attempts, + delaySeconds, + runCaptureOpenshell, + isSandboxReady, + isLinuxDockerDriverGatewayEnabled, + sleep, + } = options; + return withSandboxReadinessTrace(sandboxName, { attempts, delay_seconds: delaySeconds }, () => { + for (let i = 0; i < attempts; i += 1) { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) { + addTraceEvent("ready", { attempt: i + 1, source: "sandbox_list" }); + return true; + } + + // Package-managed OpenShell gateways report readiness through + // `sandbox list`; legacy Kubernetes gateways may still expose pod state. + if (isLinuxDockerDriverGatewayEnabled()) { + if (i < attempts - 1) sleep(delaySeconds); + continue; + } + const podPhase = runCaptureOpenshell( + [ + "doctor", + "exec", + "--", + "kubectl", + "-n", + "openshell", + "get", + "pod", + sandboxName, + "-o", + "jsonpath={.status.phase}", + ], + { ignoreError: true }, + ); + if (podPhase === "Running") { + addTraceEvent("ready", { attempt: i + 1, source: "pod_phase" }); + return true; + } + if (i < attempts - 1) sleep(delaySeconds); + } + addTraceEvent("not_ready", { attempts }); + return false; + }); +} + +export function waitForCreatedSandboxReadyWithTrace(options: { + sandboxName: string; + timeoutSecs: number; + runCaptureOpenshell: RunCaptureOpenshell; + isSandboxReady: (output: string, sandboxName: string) => boolean; + sleep: (seconds: number) => void; +}): boolean { + const { sandboxName, timeoutSecs, runCaptureOpenshell, isSandboxReady, sleep } = options; + return withSandboxReadinessTrace(sandboxName, { timeout_seconds: timeoutSecs }, () => { + const readyAttempts = Math.max(1, Math.ceil(timeoutSecs / 2)); + for (let i = 0; i < readyAttempts; i++) { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) { + addTraceEvent("ready", { attempt: i + 1 }); + return true; + } + if (i < readyAttempts - 1) sleep(2); + } + addTraceEvent("not_ready", { attempts: readyAttempts }); + return false; + }); +} + +export function waitForDashboardReadyWithTrace(options: { + sandboxName: string; + port: string | number; + runCaptureOpenshell: RunCaptureOpenshell; + sleep: (seconds: number) => void; +}): void { + const { sandboxName, port, runCaptureOpenshell, sleep } = options; + withDashboardReadinessTrace(sandboxName, port, 15, () => { + for (let i = 0; i < 15; i++) { + const readyOutput = runCaptureOpenshell( + [ + "sandbox", + "exec", + "-n", + sandboxName, + "--", + "curl", + "-so", + "/dev/null", + "-w", + "%{http_code}", + "--max-time", + "3", + `http://localhost:${port}/health`, + ], + { ignoreError: true }, + ); + const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; + addTraceEvent("dashboard_probe", { attempt: i + 1, http_status: readyCode }); + if (readyCode === 200 || readyCode === 401) { + console.log(" ✓ Dashboard is live"); + return; + } + if (i === 14) { + console.warn(" Dashboard taking longer than expected to start. Continuing..."); + } else { + sleep(2); + } + } + }); +} diff --git a/src/lib/onboard/tracing.ts b/src/lib/onboard/tracing.ts new file mode 100644 index 0000000000..121710e66f --- /dev/null +++ b/src/lib/onboard/tracing.ts @@ -0,0 +1,177 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { GatewayReuseState } from "../state/gateway"; +import * as trace from "../trace"; + +type TraceFn = () => T; + +const TRACE_TRUTHY_VALUES = new Set(["1", "true", "yes", "on"]); + +export interface OnboardTraceOptions { + resume?: boolean; + fresh?: boolean; + nonInteractive?: boolean; + agent?: string | null; +} + +export interface OnboardTraceHandle { + collector: ReturnType; + span: ReturnType>["startSpan"]> | null; +} + +export function isTruthyTraceEnv(value: unknown): boolean { + return TRACE_TRUTHY_VALUES.has(String(value ?? "").trim().toLowerCase()); +} + +function hasTracePath(value: unknown): boolean { + return typeof value === "string" && value.trim().length > 0; +} + +export function startOnboardTrace( + opts: OnboardTraceOptions, + env: NodeJS.ProcessEnv, +): OnboardTraceHandle { + const collector = trace.getTraceCollector(); + const span = collector?.startSpan("nemoclaw.onboard", { + resume: opts.resume === true, + fresh: opts.fresh === true, + non_interactive: opts.nonInteractive === true || env.NEMOCLAW_NON_INTERACTIVE === "1", + agent: opts.agent || env.NEMOCLAW_AGENT || null, + trace_enabled: isTruthyTraceEnv(env.NEMOCLAW_TRACE), + trace_file_enabled: hasTracePath(env.NEMOCLAW_TRACE_FILE), + trace_dir_enabled: hasTracePath(env.NEMOCLAW_TRACE_DIR), + }); + return { collector, span: span ?? null }; +} + +export function finishOnboardTrace(handle: OnboardTraceHandle, completed: boolean): void { + if (!handle.span) return; + handle.collector?.endSpan(handle.span, completed ? "OK" : "ERROR"); + trace.flushTrace(completed ? "OK" : "ERROR"); +} + +export function withPreflightTrace(fn: TraceFn): T { + return trace.withTraceSpan("nemoclaw.onboard.phase.preflight", {}, fn); +} + +export function withGatewayTrace( + reuseState: GatewayReuseState, + gpuPassthrough: boolean, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.gateway", + { reuse_state: reuseState, gpu_passthrough: gpuPassthrough }, + fn, + ); +} + +export function withProviderSelectionTrace( + sandboxName: string | null, + agentName: string | null | undefined, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.provider_selection", + { sandbox_name: sandboxName, agent: agentName ?? null }, + fn, + ); +} + +export function withInferenceTrace( + sandboxName: string, + provider: string, + model: string, + credentialEnv: string | null, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.inference", + { sandbox_name: sandboxName, provider, model, credential_env: credentialEnv }, + fn, + ); +} + +export function withSandboxPhaseTrace( + sandboxName: string, + provider: string, + model: string, + agentName: string | null | undefined, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.sandbox", + { sandbox_name: sandboxName, provider, model, agent: agentName ?? null }, + fn, + ); +} + +export function withSandboxCreateStreamTrace( + attrs: { + sandboxName: string; + provider: string; + model: string; + timeoutSeconds: number; + fromDockerfile: boolean; + gpuEnabled: boolean; + }, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.sandbox.create_stream", + { + sandbox_name: attrs.sandboxName, + provider: attrs.provider, + model: attrs.model, + timeout_seconds: attrs.timeoutSeconds, + from_dockerfile: attrs.fromDockerfile, + gpu_enabled: attrs.gpuEnabled, + }, + fn, + ); +} + +export function withSandboxReadinessTrace( + sandboxName: string, + attrs: Record, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.sandbox.readiness_wait", + { sandbox_name: sandboxName, ...attrs }, + fn, + ); +} + +export function withDashboardReadinessTrace( + sandboxName: string, + port: string | number, + attempts: number, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.dashboard.readiness_wait", + { sandbox_name: sandboxName, port, attempts }, + fn, + ); +} + +export function withPolicyApplicationTrace( + sandboxName: string, + options: { selectedPresets?: unknown; provider?: unknown; webSearchSupported?: unknown }, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.policy.application", + { + sandbox_name: sandboxName, + selected_presets: options.selectedPresets ?? null, + provider: options.provider ?? null, + web_search_supported: options.webSearchSupported ?? null, + }, + fn, + ); +} + +export const addTraceEvent = trace.addTraceEvent; diff --git a/src/lib/sandbox/create-stream.test.ts b/src/lib/sandbox/create-stream.test.ts index 96d210de7b..55e22bda63 100644 --- a/src/lib/sandbox/create-stream.test.ts +++ b/src/lib/sandbox/create-stream.test.ts @@ -74,8 +74,10 @@ describe("sandbox-create-stream", () => { it("streams BuildKit progress lines as build output", async () => { const child = new FakeChild(); const logLine = vi.fn(); + const traceEvent = vi.fn(); const promise = streamSandboxCreate("echo create", process.env, { logLine, + traceEvent, spawnImpl: () => child as never, heartbeatIntervalMs: 1_000, silentPhaseMs: 10_000, @@ -95,6 +97,62 @@ describe("sandbox-create-stream", () => { expect(logLine).toHaveBeenCalledWith("#1 [internal] load build definition from Dockerfile"); expect(logLine).toHaveBeenCalledWith("#2 CACHED"); expect(logLine).toHaveBeenCalledWith("#3 DONE 0.1s"); + expect(traceEvent).toHaveBeenCalledWith( + "docker_buildkit_progress", + expect.objectContaining({ + step: 1, + detail: "[internal] load build definition from Dockerfile", + }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_buildkit_progress", + expect.objectContaining({ step: 2, detail: "CACHED" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_buildkit_progress", + expect.objectContaining({ step: 3, detail: "DONE 0.1s" }), + ); + }); + + it("records classic Docker build steps as trace events", async () => { + const child = new FakeChild(); + const traceEvent = vi.fn(); + const promise = streamSandboxCreate("echo create", process.env, { + traceEvent, + logLine: vi.fn(), + spawnImpl: () => child as never, + heartbeatIntervalMs: 1_000, + silentPhaseMs: 10_000, + }); + + child.stdout.emit( + "data", + Buffer.from( + " Step 1/3 : FROM base\n" + + " Step 2/3 : RUN npm ci\n" + + " Step 3/3 : COPY . /workspace\n" + + "Successfully built abc123\n", + ), + ); + child.emit("close", 0); + + await expect(promise).resolves.toMatchObject({ status: 0, sawProgress: true }); + expect(traceEvent).toHaveBeenCalledWith( + "sandbox_create_phase", + expect.objectContaining({ phase: "build" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_build_step_start", + expect.objectContaining({ step: "Step 1/3", index: 1, total: 3, instruction: "FROM base" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_build_step_end", + expect.objectContaining({ status: "completed", step: "Step 1/3", instruction: "FROM base" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_build_end", + expect.objectContaining({ status: "completed" }), + ); }); it("forces success when the sandbox becomes ready before the stream exits", async () => { diff --git a/src/lib/sandbox/create-stream.ts b/src/lib/sandbox/create-stream.ts index f489b033cb..1e93cf74dd 100644 --- a/src/lib/sandbox/create-stream.ts +++ b/src/lib/sandbox/create-stream.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import { spawn, type SpawnOptions } from "node:child_process"; +import { type SpawnOptions, spawn } from "node:child_process"; import { ROOT } from "../state/paths"; @@ -19,6 +19,7 @@ export interface StreamSandboxCreateOptions { heartbeatIntervalMs?: number; silentPhaseMs?: number; logLine?: (line: string) => void; + traceEvent?: (name: string, attributes?: Record) => void; // Optional guard for the early-ready escape hatch. When set, readyCheck() // alone cannot detach the create stream until at least one streamed output // line matches a configured pattern. @@ -97,6 +98,7 @@ const VISIBLE_PROGRESS_PATTERNS: readonly RegExp[] = [ const VM_READY_DETACH_OUTPUT_PATTERNS: readonly RegExp[] = [/Setting up NemoClaw/]; const CLASSIC_DOCKER_STEP_RE = /^\s*Step (\d+)\/(\d+) : (.+)$/; +const BUILDKIT_STEP_RE = /^#(\d+)\s+(.+)$/; function matchesAny(line: string, patterns: readonly RegExp[]) { return patterns.some((pattern) => pattern.test(line)); @@ -133,6 +135,7 @@ export function streamSandboxCreate( }); const logLine = options.logLine ?? console.log; + const traceEvent = options.traceEvent ?? (() => {}); const lines: string[] = []; let pending = ""; let lastPrintedLine = ""; @@ -198,6 +201,10 @@ export function streamSandboxCreate( printProgressLine(line); } + function emitTraceEvent(name: string, attributes: Record = {}) { + traceEvent(name, attributes); + } + function markBuildStarted(nowMs: number = timingNow()) { if (buildStartedAtMs === null) { buildStartedAtMs = nowMs; @@ -206,11 +213,18 @@ export function streamSandboxCreate( function finishActiveBuildStep(status: "completed" | "stopped", nowMs: number = timingNow()) { if (!activeBuildStep) return; + const elapsedMs = nowMs - activeBuildStep.startedAtMs; const phrase = status === "completed" ? "completed in" : "stopped after"; - const elapsed = formatDuration(nowMs - activeBuildStep.startedAtMs); + const elapsed = formatDuration(elapsedMs); appendTimingLine( ` ${activeBuildStep.label} ${phrase} ${elapsed} (${activeBuildStep.instruction})`, ); + emitTraceEvent("docker_build_step_end", { + status, + step: activeBuildStep.label, + instruction: activeBuildStep.instruction, + duration_ms: elapsedMs, + }); activeBuildStep = null; } @@ -218,10 +232,15 @@ export function streamSandboxCreate( if (buildTimingFinished) return; finishActiveBuildStep(status, nowMs); if (buildStartedAtMs !== null) { + const elapsedMs = nowMs - buildStartedAtMs; const phrase = status === "completed" ? "completed in" : "stopped after"; appendTimingLine( - ` Sandbox image build ${phrase} ${formatDuration(nowMs - buildStartedAtMs)}`, + ` Sandbox image build ${phrase} ${formatDuration(elapsedMs)}`, ); + emitTraceEvent("docker_build_end", { + status, + duration_ms: elapsedMs, + }); } buildTimingFinished = true; } @@ -237,6 +256,24 @@ export function streamSandboxCreate( instruction: match[3].trim().replace(/\s+/g, " "), startedAtMs: nowMs, }; + emitTraceEvent("docker_build_step_start", { + step: activeBuildStep.label, + index: Number(match[1]), + total: Number(match[2]), + instruction: activeBuildStep.instruction, + }); + } + + function maybeRecordBuildKitStep(line: string) { + const match = line.match(BUILDKIT_STEP_RE); + if (!match) return; + if (!matchesAny(line, BUILD_PROGRESS_PATTERNS) && !matchesAny(line, PULL_PROGRESS_PATTERNS)) { + return; + } + emitTraceEvent("docker_buildkit_progress", { + step: Number(match[1]), + detail: match[2].trim().replace(/\s+/g, " "), + }); } function elapsedSeconds() { @@ -260,6 +297,10 @@ export function streamSandboxCreate( : nextPhase === "ready" ? " Waiting for sandbox to become ready..." : null; + emitTraceEvent("sandbox_create_phase", { + phase: nextPhase, + elapsed_seconds: elapsedSeconds(), + }); if (phaseLine) printProgressLine(phaseLine); } @@ -275,6 +316,7 @@ export function streamSandboxCreate( markBuildStarted(); } maybeStartClassicBuildStep(line); + maybeRecordBuildKitStep(line); if (/^(?:Successfully built | {2}Built image )/.test(line)) { finishBuildTiming("completed"); } diff --git a/src/lib/trace.test.ts b/src/lib/trace.test.ts new file mode 100644 index 0000000000..430cd59213 --- /dev/null +++ b/src/lib/trace.test.ts @@ -0,0 +1,185 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import { runCurlProbe } from "./adapters/http/probe"; +import { + addTraceEvent, + flushTrace, + getTraceCollector, + resetTraceForTests, + sanitizeTraceAttributes, + TRACE_DIR_ENV, + TRACE_ENABLED_ENV, + TRACE_FILE_ENV, + type TraceArtifact, + withTraceSpan, +} from "./trace"; + +function withTraceFile(fn: (traceFile: string) => T): T { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-test-")); + const traceFile = path.join(tmpDir, "trace.json"); + process.env[TRACE_FILE_ENV] = traceFile; + resetTraceForTests(); + return fn(traceFile); +} + +afterEach(() => { + delete process.env[TRACE_ENABLED_ENV]; + delete process.env[TRACE_FILE_ENV]; + delete process.env[TRACE_DIR_ENV]; + resetTraceForTests(); +}); + +describe("onboard trace artifacts", () => { + it("writes OpenTelemetry-style spans and a slowest-span summary", () => { + withTraceFile((traceFile) => { + withTraceSpan("nemoclaw.onboard.phase.gateway", { provider: "nvidia-prod" }, () => { + addTraceEvent("ready", { attempt: 1 }); + }); + + expect(flushTrace()).toBe(traceFile); + const artifact = JSON.parse(fs.readFileSync(traceFile, "utf8")) as TraceArtifact; + const spans = artifact.resource_spans[0].scope_spans[0].spans; + + expect(artifact.resource_spans[0].scope_spans[0].scope.name).toBe("nemoclaw.onboard"); + expect(spans).toHaveLength(1); + expect(spans[0].trace_id).toBe(artifact.summary.trace_id); + expect(spans[0].span_id).toMatch(/^[0-9a-f]{16}$/); + expect(spans[0].duration_ms).toBeGreaterThanOrEqual(0); + expect(spans[0].events[0]).toMatchObject({ + name: "ready", + attributes: { attempt: 1 }, + }); + expect(artifact.summary.slowest_spans[0].name).toBe("nemoclaw.onboard.phase.gateway"); + }); + }); + + it("redacts secret-like metadata before writing artifacts", () => { + withTraceFile((traceFile) => { + withTraceSpan( + "nemoclaw.inference.validation_probe", + { + api_key: "nvapi-secret-value", + authorization: "Bearer sk-secret-value", + endpoint_url: "https://user:pass@example.test/v1?token=secret", + }, + () => undefined, + ); + + flushTrace(); + const text = fs.readFileSync(traceFile, "utf8"); + + expect(text).not.toContain("nvapi-secret-value"); + expect(text).not.toContain("sk-secret-value"); + expect(text).not.toContain("user:pass"); + expect(text).not.toContain("token=secret"); + expect(text).toContain(""); + }); + }); + + it("sanitizes curl probe URLs and records status metadata", () => { + withTraceFile((traceFile) => { + const result = runCurlProbe( + [ + "-sS", + "-H", + "Authorization: Bearer should-not-appear", + "https://example.test/v1/chat/completions?key=secret", + ], + { + spawnSyncImpl: () => + ({ + status: 0, + stdout: "200", + stderr: "", + error: undefined, + }) as never, + }, + ); + + expect(result.ok).toBe(true); + flushTrace(); + const text = fs.readFileSync(traceFile, "utf8"); + const artifact = JSON.parse(text) as TraceArtifact; + const span = artifact.resource_spans[0].scope_spans[0].spans.find( + (entry) => entry.name === "nemoclaw.inference.curl_probe", + ); + + expect(text).not.toContain("should-not-appear"); + expect(text).not.toContain("key=secret"); + expect(span?.attributes["http.url"]).toBe( + "https://example.test/v1/chat/completions?key=%3CREDACTED%3E", + ); + expect(span?.events[0].attributes).toMatchObject({ ok: true, http_status: 200 }); + }); + }); + + it("redacts nested sensitive attributes", () => { + expect( + sanitizeTraceAttributes({ + nested: { token: "xoxb-secret", ok: true }, + credential_env: "NVIDIA_API_KEY", + }), + ).toMatchObject({ + nested: '{"token":"","ok":true}', + credential_env: "NVIDIA_API_KEY", + }); + }); + + it("creates a readable timestamped trace file when NEMOCLAW_TRACE is enabled", () => { + const originalCwd = process.cwd(); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-enabled-")); + try { + process.chdir(tmpDir); + process.env[TRACE_ENABLED_ENV] = "1"; + resetTraceForTests(); + + const collector = getTraceCollector(); + expect(collector?.outputPath).toMatch( + /[/\\]\.e2e[/\\]traces[/\\]nemoclaw-trace-\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}-\d{3}Z-pid-\d+\.json$/, + ); + + withTraceSpan("nemoclaw.onboard.phase.preflight", {}, () => undefined); + const outputPath = flushTrace(); + expect(outputPath).toBe(collector?.outputPath); + expect(fs.existsSync(String(outputPath))).toBe(true); + } finally { + process.chdir(originalCwd); + } + }); + + it("treats false-like NEMOCLAW_TRACE values as disabled", () => { + process.env[TRACE_ENABLED_ENV] = "false"; + resetTraceForTests(); + + expect(getTraceCollector()).toBeNull(); + }); + + it("removes the registered exit listener when resetting tests", () => { + const before = process.listenerCount("exit"); + withTraceFile(() => { + expect(getTraceCollector()).not.toBeNull(); + expect(process.listenerCount("exit")).toBe(before + 1); + resetTraceForTests(); + expect(process.listenerCount("exit")).toBe(before); + }); + }); + + it("does not mark traces flushed when artifact writes fail", () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-blocker-")); + fs.chmodSync(tmpDir, 0o700); + const blocker = path.join(tmpDir, "not-a-directory"); + fs.writeFileSync(blocker, "not a directory"); + process.env[TRACE_FILE_ENV] = path.join(blocker, "trace.json"); + resetTraceForTests(); + const collector = getTraceCollector(); + expect(collector).not.toBeNull(); + + expect(() => flushTrace()).toThrow(); + expect(() => collector?.flush()).toThrow(); + }); +}); diff --git a/src/lib/trace.ts b/src/lib/trace.ts new file mode 100644 index 0000000000..f0cd0dc5c6 --- /dev/null +++ b/src/lib/trace.ts @@ -0,0 +1,323 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import crypto from "node:crypto"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { redactForLog, redactFull, redactUrl } from "./security/redact"; + +type TraceStatusCode = "OK" | "ERROR" | "UNSET"; +type TraceAttribute = string | number | boolean | null | string[] | number[] | boolean[]; +type TraceAttributes = Record; + +export interface TraceEvent { + name: string; + time_unix_nano: string; + attributes?: TraceAttributes; +} + +export interface TraceSpan { + trace_id: string; + span_id: string; + parent_span_id?: string; + name: string; + kind: "INTERNAL"; + start_time_unix_nano: string; + end_time_unix_nano?: string; + duration_ms?: number; + status: { + code: TraceStatusCode; + message?: string; + }; + attributes: TraceAttributes; + events: TraceEvent[]; +} + +export interface TraceArtifact { + resource_spans: [ + { + resource: { + attributes: TraceAttributes; + }; + scope_spans: [ + { + scope: { + name: string; + version: string; + }; + spans: TraceSpan[]; + }, + ]; + }, + ]; + summary: { + trace_id: string; + generated_at: string; + total_duration_ms: number; + slowest_spans: Array<{ name: string; duration_ms: number; status: TraceStatusCode }>; + output_path: string; + }; +} + +const TRACE_SCOPE_NAME = "nemoclaw.onboard"; +const TRACE_SCOPE_VERSION = "1.0.0"; +export const TRACE_ENABLED_ENV = "NEMOCLAW_TRACE"; +export const TRACE_FILE_ENV = "NEMOCLAW_TRACE_FILE"; +export const TRACE_DIR_ENV = "NEMOCLAW_TRACE_DIR"; +const DEFAULT_TRACE_DIR = path.join(".e2e", "traces"); +const MAX_ATTRIBUTE_STRING_LENGTH = 240; +const SENSITIVE_ATTRIBUTE_KEY = + /(?:api[_-]?key|token|secret|password|authorization|bearer|cookie|set-cookie)/i; + +function nowNs(): bigint { + return BigInt(Date.now()) * 1_000_000n; +} + +function durationMs(startNs: bigint, endNs: bigint): number { + return Number(endNs - startNs) / 1_000_000; +} + +function randomHex(bytes: number): string { + return crypto.randomBytes(bytes).toString("hex"); +} + +function sanitizeAttributeValue(key: string, value: unknown): TraceAttribute | undefined { + if (value === undefined) return undefined; + if (value === null || typeof value === "boolean") return value; + if (typeof value === "number") return Number.isFinite(value) ? value : String(value); + if (typeof value === "bigint") return String(value); + if (typeof value === "string") { + if (SENSITIVE_ATTRIBUTE_KEY.test(key)) return ""; + const urlValue = redactUrl(value); + const redacted = redactFull(urlValue ?? value); + return redacted.slice(0, MAX_ATTRIBUTE_STRING_LENGTH); + } + if (Array.isArray(value)) { + const entries = value + .map((entry) => sanitizeAttributeValue(key, entry)) + .filter((entry): entry is string | number | boolean => { + return typeof entry === "string" || typeof entry === "number" || typeof entry === "boolean"; + }); + if (entries.every((entry) => typeof entry === "string")) return entries as string[]; + if (entries.every((entry) => typeof entry === "number")) return entries as number[]; + if (entries.every((entry) => typeof entry === "boolean")) return entries as boolean[]; + return entries.map((entry) => String(entry)); + } + const redacted = redactForLog(value); + try { + return JSON.stringify(redacted).slice(0, MAX_ATTRIBUTE_STRING_LENGTH); + } catch { + return "[unserializable]"; + } +} + +export function sanitizeTraceAttributes(attributes: Record = {}): TraceAttributes { + const safe: TraceAttributes = {}; + for (const [key, value] of Object.entries(attributes)) { + const sanitized = sanitizeAttributeValue(key, value); + if (sanitized !== undefined) safe[key] = sanitized; + } + return safe; +} + +function isTraceFlagEnabled(value: string | undefined): boolean { + const normalized = value?.trim().toLowerCase(); + if (!normalized) return false; + return !["0", "false", "no", "off"].includes(normalized); +} + +function traceFileName(): string { + const stamp = new Date().toISOString().replace("T", "-").replace(/[:.]/g, "-"); + return `nemoclaw-trace-${stamp}-pid-${process.pid}.json`; +} + +function resolveTracePath(env: NodeJS.ProcessEnv): string | null { + const traceFile = env[TRACE_FILE_ENV]?.trim(); + if (traceFile) return path.resolve(traceFile); + const traceDir = env[TRACE_DIR_ENV]?.trim(); + if (traceDir) return path.resolve(traceDir, traceFileName()); + if (isTraceFlagEnabled(env[TRACE_ENABLED_ENV])) { + return path.resolve(DEFAULT_TRACE_DIR, traceFileName()); + } + return null; +} + +export class TraceCollector { + readonly traceId = randomHex(16); + readonly outputPath: string; + private readonly spans: TraceSpan[] = []; + private readonly spanStack: TraceSpan[] = []; + private readonly startNs = nowNs(); + private flushed = false; + + constructor(outputPath: string) { + this.outputPath = outputPath; + } + + startSpan(name: string, attributes: Record = {}): TraceSpan { + const parent = this.spanStack.at(-1); + const span: TraceSpan & { _startNs?: bigint } = { + trace_id: this.traceId, + span_id: randomHex(8), + ...(parent ? { parent_span_id: parent.span_id } : {}), + name, + kind: "INTERNAL", + start_time_unix_nano: nowNs().toString(), + status: { code: "UNSET" }, + attributes: sanitizeTraceAttributes(attributes), + events: [], + }; + span._startNs = BigInt(span.start_time_unix_nano); + this.spans.push(span); + this.spanStack.push(span); + return span; + } + + endSpan(span: TraceSpan, status: TraceStatusCode = "OK", message?: string): void { + const internal = span as TraceSpan & { _startNs?: bigint }; + if (span.end_time_unix_nano) return; + const endNs = nowNs(); + span.end_time_unix_nano = endNs.toString(); + span.duration_ms = durationMs(internal._startNs ?? BigInt(span.start_time_unix_nano), endNs); + span.status = message ? { code: status, message: redactFull(message).slice(0, 200) } : { code: status }; + const index = this.spanStack.lastIndexOf(span); + if (index >= 0) this.spanStack.splice(index, 1); + delete internal._startNs; + } + + addEvent(span: TraceSpan, name: string, attributes: Record = {}): void { + span.events.push({ + name, + time_unix_nano: nowNs().toString(), + attributes: sanitizeTraceAttributes(attributes), + }); + } + + activeSpan(): TraceSpan | null { + return this.spanStack.at(-1) ?? null; + } + + flush(finalStatus: TraceStatusCode = "OK", message?: string): string | null { + if (this.flushed) return this.outputPath; + while (this.spanStack.length > 0) { + const span = this.spanStack.pop(); + if (span) this.endSpan(span, finalStatus, message); + } + const endNs = nowNs(); + const slowest = [...this.spans] + .filter((span) => span.name !== "nemoclaw.onboard") + .filter((span) => typeof span.duration_ms === "number") + .sort((a, b) => (b.duration_ms ?? 0) - (a.duration_ms ?? 0)) + .slice(0, 10) + .map((span) => ({ + name: span.name, + duration_ms: Number((span.duration_ms ?? 0).toFixed(3)), + status: span.status.code, + })); + const artifact: TraceArtifact = { + resource_spans: [ + { + resource: { + attributes: sanitizeTraceAttributes({ + "service.name": "nemoclaw", + "service.version": process.env.npm_package_version || "unknown", + "host.type": os.type(), + "os.platform": process.platform, + "process.pid": process.pid, + "ci": process.env.CI === "true" || process.env.GITHUB_ACTIONS === "true", + }), + }, + scope_spans: [ + { + scope: { + name: TRACE_SCOPE_NAME, + version: TRACE_SCOPE_VERSION, + }, + spans: this.spans, + }, + ], + }, + ], + summary: { + trace_id: this.traceId, + generated_at: new Date().toISOString(), + total_duration_ms: Number(durationMs(this.startNs, endNs).toFixed(3)), + slowest_spans: slowest, + output_path: this.outputPath, + }, + }; + fs.mkdirSync(path.dirname(this.outputPath), { recursive: true, mode: 0o700 }); + fs.writeFileSync(this.outputPath, `${JSON.stringify(artifact, null, 2)}\n`, { mode: 0o600 }); + this.flushed = true; + return this.outputPath; + } +} + +let collector: TraceCollector | null | undefined; +let exitHandler: ((code: number) => void) | null = null; + +export function getTraceCollector(): TraceCollector | null { + if (collector !== undefined) return collector; + const tracePath = resolveTracePath(process.env); + collector = tracePath ? new TraceCollector(tracePath) : null; + if (collector) { + exitHandler = (code) => { + collector?.flush(code === 0 ? "OK" : "ERROR", `process exited with code ${code}`); + }; + process.once("exit", exitHandler); + } + return collector; +} + +export function resetTraceForTests(): void { + if (exitHandler) { + process.removeListener("exit", exitHandler); + } + exitHandler = null; + collector = undefined; +} + +export function withTraceSpan( + name: string, + attributes: Record, + fn: () => T, +): T { + const trace = getTraceCollector(); + if (!trace) return fn(); + const span = trace.startSpan(name, attributes); + try { + const result = fn(); + const maybePromise = result as unknown; + if ( + maybePromise !== null && + typeof maybePromise === "object" && + typeof (maybePromise as { then?: unknown }).then === "function" + ) { + return (maybePromise as Promise) + .then((value) => { + trace.endSpan(span, "OK"); + return value; + }) + .catch((error) => { + trace.endSpan(span, "ERROR", error instanceof Error ? error.message : String(error)); + throw error; + }) as T; + } + trace.endSpan(span, "OK"); + return result; + } catch (error) { + trace.endSpan(span, "ERROR", error instanceof Error ? error.message : String(error)); + throw error; + } +} + +export function addTraceEvent(name: string, attributes: Record = {}): void { + const trace = getTraceCollector(); + const span = trace?.activeSpan() ?? null; + if (trace && span) trace.addEvent(span, name, attributes); +} + +export function flushTrace(status: TraceStatusCode = "OK", message?: string): string | null { + return getTraceCollector()?.flush(status, message) ?? null; +} diff --git a/test/e2e/brev-e2e.test.ts b/test/e2e/brev-e2e.test.ts index 8386100403..473e89f011 100644 --- a/test/e2e/brev-e2e.test.ts +++ b/test/e2e/brev-e2e.test.ts @@ -270,6 +270,7 @@ function sshEnv( `export NEMOCLAW_NON_INTERACTIVE=1`, `export NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1`, `export NEMOCLAW_SANDBOX_NAME=e2e-test`, + `export NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces`, ]; if (GPU_TEST_SUITE) { // This suite validates Docker GPU passthrough and sandbox inference wiring.